提交 abc885cf · 作者:Frederic

Same speed up for SoftmaxWithBias

上级 9d73a2d5
...@@ -95,7 +95,7 @@ class SoftmaxWithBias(gof.Op): ...@@ -95,7 +95,7 @@ class SoftmaxWithBias(gof.Op):
return ['<iostream>', '<cmath>'] return ['<iostream>', '<cmath>']
@staticmethod @staticmethod
def c_code_template(): def c_code_template(dtype):
# this implementation was lifted from # this implementation was lifted from
# /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx # /u/bergstrj/cvs/bergstrj/src/feb07/nn.cxx
...@@ -107,6 +107,10 @@ class SoftmaxWithBias(gof.Op): ...@@ -107,6 +107,10 @@ class SoftmaxWithBias(gof.Op):
#TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1] #TODO: use this to accept float32 and int32: node.inputs[0].type.dtype_specs()[1]
init_decl = """ init_decl = """
npy_intp* Nx = PyArray_DIMS(%(x)s); npy_intp* Nx = PyArray_DIMS(%(x)s);
npy_intp Sx = 0;
npy_intp Sb = 0;
npy_intp Ssm = 0;
if (PyArray_NDIM(%(x)s) != 2) if (PyArray_NDIM(%(x)s) != 2)
{ {
...@@ -151,6 +155,10 @@ class SoftmaxWithBias(gof.Op): ...@@ -151,6 +155,10 @@ class SoftmaxWithBias(gof.Op):
%(fail)s %(fail)s
} }
} }
Sx = PyArray_STRIDES(%(x)s)[1]/sizeof(dtype_%(x)s);
Sb = PyArray_STRIDES(%(b)s)[0]/sizeof(dtype_%(b)s);
Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
""" """
begin_row_loop = """ begin_row_loop = """
...@@ -201,6 +209,54 @@ class SoftmaxWithBias(gof.Op): ...@@ -201,6 +209,54 @@ class SoftmaxWithBias(gof.Op):
""" """
# Get the vectorized version of exp if it exist
try:
vec_exp = theano.scalar.exp.c_code_contiguous_raw(dtype,
"Nx[1]", "sm_i", "sm_i")
inside_row_loop_contig = """
size_t row_max_j=0;
dtype_%%(sm)s row_max = x_i[0] + b_i[0];
//std::cout << "0 " << row_max << "\\n";
// Get the maximum value of the row
for (j = 1; j < Nx[1]; ++j)
{
dtype_%%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb];
//std::cout << "1 " << row_ij << "\\n";
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
for (j = 0; j < Nx[1]; ++j)
{
dtype_%%(sm)s row_ij = x_i[j * Sx] + b_i[j * Sb];
//std::cout << "2 " << j << " " << row_ij << " " << row_max << "\\n";
dtype_%%(sm)s sm_ij = row_ij - row_max;
//std::cout << "3 " << j << " " << sm_ij << "\\n";
sm_i[j * Ssm] = sm_ij;
}
%(vec_exp)s;
for (j = 0; j < Nx[1]; ++j)
{
sum += sm_i[j * Ssm];
}
//cblas_dscal(x.N, 1.0 / sum, &mat_at(s,i,0), s.n);
double sum_inv = 1.0 / sum;
for (j = 0; j < Nx[1]; ++j)
{
sm_i[j * Ssm] *= sum_inv;
}
""" % locals()
inside_row_loop = """
if(Ssm == 1){
%(inside_row_loop_contig)s
}else{
%(inside_row_loop)s
}
""" % locals()
except theano.gof.utils.MethodNotDefined:
pass
end_row_loop = """ end_row_loop = """
} }
""" """
...@@ -210,12 +266,13 @@ class SoftmaxWithBias(gof.Op): ...@@ -210,12 +266,13 @@ class SoftmaxWithBias(gof.Op):
def c_code(self, node, name, inp, out, sub):
    """Assemble the C implementation of SoftmaxWithBias for this node.

    Joins the C-code template pieces produced by ``c_code_template`` --
    selected according to the input's C dtype -- then substitutes the
    input/output C variable names (``x``, ``b``, ``sm``) and the entries
    of ``sub`` (e.g. the ``fail`` snippet) into the template via
    %-formatting.

    Parameters are the standard Theano ``Op.c_code`` arguments; the
    returned value is the C source string for this apply node.
    """
    x, b = inp
    sm, = out
    # dtype_specs()[1] is the C type name for the input's dtype; the
    # template uses it to pick a vectorized exp() when one exists.
    code_template = ''.join(self.c_code_template(
        node.inputs[0].type.dtype_specs()[1]))
    return code_template % dict(locals(), **sub)
@staticmethod @staticmethod
def c_code_cache_version(): def c_code_cache_version():
return (6,) return (7,)
softmax_with_bias = SoftmaxWithBias() softmax_with_bias = SoftmaxWithBias()
......
Markdown 格式
0%
您添加了 0 人到此讨论,请谨慎操作。
请先完成此评论的编辑!
注册 或者 登录 后发表评论