提交 6465523c authored 作者: nouiz's avatar nouiz

Merge pull request #583 from lamblin/preallocated_out_ops

Fix ops to accept any kind of preallocated output memory
......@@ -30,7 +30,7 @@ class GpuDot22(GpuOp):
return Apply(self, [x, y], [otype()])
def c_code_cache_version(self):
return (1, 1)
return (1, 2)
def c_code(self, node, nodename, inputs, outputs, sub):
x, y = inputs
......@@ -51,9 +51,14 @@ class GpuDot22(GpuOp):
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1]))
CudaNdarray_HOST_DIMS(%(y)s)[1])
|| (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
|| ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
&& (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
{
//if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
npy_intp dims[2];
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
......@@ -108,7 +113,7 @@ class GpuDot22Scalar(GpuOp):
return Apply(self, [x, y, a], [otype()])
def c_code_cache_version(self):
return (1, 1)
return (1, 2)
def c_code(self, node, name, inputs, outputs, sub):
x, y, a = inputs
......@@ -135,7 +140,13 @@ class GpuDot22Scalar(GpuOp):
(CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
(CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1]))
CudaNdarray_HOST_DIMS(%(y)s)[1])
|| (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
|| ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
&& (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
{
//if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
......@@ -790,7 +801,7 @@ class GpuDownsampleFactorMax(GpuOp):
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return (3)
return (4)
def c_code(self, node, nodename, inp, out, sub):
x, = inp
......@@ -856,7 +867,11 @@ class GpuDownsampleFactorMax(GpuOp):
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s));
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_HOST_STRIDES(%(z)s)[2],
CudaNdarray_HOST_STRIDES(%(z)s)[3]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -883,7 +898,7 @@ class GpuDownsampleFactorMax(GpuOp):
__global__ void kMaxPool_%(nodename)s(
int D0, int D1, int D2, int D3, int xD2, int xD3,
const float * x, int xS0, int xS1, int xS2, int xS3,
float *z)
float *z, int zS0, int zS1, int zS2, int zS3)
{
float cur_max, cur_x;
int i0 = blockIdx.x %% D0;
......@@ -932,7 +947,7 @@ class GpuDownsampleFactorMax(GpuOp):
}
//store the result to global memory
z[i0 * D1*D2*D3 + i1*D2*D3 + i2*D3 + threadIdx.x] = cur_max;
z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
}
""" % locals()
......
......@@ -36,8 +36,8 @@ class SupportCodeError(Exception):
class NaiveAlgo(object):
verbose = 0 # 1, 2 or 3 for more verbose output.
cache_version = ()
cache_version = (14, verbose)
#cache_version = ()
cache_version = (15, verbose)
def __init__(self, scalar_op, sync=True, inplace_pattern={}):
"""
......@@ -541,7 +541,7 @@ class NaiveAlgo(object):
if nb_inputs > 0 and nd > 0:
print >> sio, """
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_outputs)s][%(nd)s];
""" % locals()
else:
print >> sio, """
......@@ -928,6 +928,11 @@ nd_collapse_[i]=0;
%(oname)s = NULL;
}
}
if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
{
Py_XDECREF(%(oname)s);
%(oname)s = NULL;
}
if (NULL == %(oname)s)
{
%(oname)s = (CudaNdarray*)CudaNdarray_New();
......
......@@ -199,8 +199,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self):
return (4,)
#return ()
return (5,)
def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp
dx, = out
......@@ -257,7 +257,9 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
CudaNdarray_DEV_DATA(%(y_idx)s),
CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(dx)s) //guaranteed c-contiguous
CudaNdarray_DEV_DATA(%(dx)s),
CudaNdarray_HOST_STRIDES(%(dx)s)[0],
CudaNdarray_HOST_STRIDES(%(dx)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
......@@ -277,7 +279,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
const float * dnll, const int dnll_s0,
const float * sm, const int sm_s0, const int sm_s1,
const float * y_idx, const int y_idx_s0,
float * dx)
float * dx, const int dx_s0, const int dx_s1)
{
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
......@@ -288,14 +290,14 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
{
if (y_i == j)
{
dx[i * K + j] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
dx[i * dx_s0 + j * dx_s1] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
}
else
{
dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1];
dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
}
//dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*K+j] = 0;
//dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
}
}
}
......@@ -319,7 +321,7 @@ class GpuSoftmax (GpuOp):
return shape
def c_code_cache_version(self):
#return ()
return (4,) + inline_softmax.code_version
return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, = inp
z, = out
......@@ -364,7 +366,9 @@ class GpuSoftmax (GpuOp):
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
......@@ -381,7 +385,7 @@ class GpuSoftmax (GpuOp):
return nvcc_kernel("kSoftmax_%s"%nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm'],
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
......@@ -393,7 +397,7 @@ class GpuSoftmax (GpuOp):
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * N + tx] = buf[tx]",# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",# This set all value correctly
"}",
"__syncthreads()",
"}",
......@@ -419,7 +423,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (5,) + inline_softmax.code_version
return (6,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub):
x, b = inp
......@@ -481,7 +485,9 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_DEV_DATA(%(b)s),
CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
......@@ -503,7 +509,7 @@ class GpuSoftmaxWithBias (GpuOp):
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm'],
'float * sm', 'const int ssm0', 'const int ssm1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
......@@ -516,7 +522,7 @@ class GpuSoftmaxWithBias (GpuOp):
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * N + tx] = buf[tx]",
"sm[blockIDX * ssm0 + tx * ssm1] = buf[tx]",
"}",
"__syncthreads()",
"}",
......
......@@ -33,7 +33,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
batch_size = 4097
n_out = 1250
if theano.config.mode != "DEBUG_MODE":
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
......
......@@ -147,19 +147,21 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
return (7,)
return (8,)
def c_support_code_apply(self, node, nodename):
return """
static __global__ void k_multi_warp_%(nodename)s(
const int nb_multi,
const int nb_outcomes,
const int pvals_row_strides,
const int pvals_col_strides,
const int unis_stride,
float * global_pvals,
const int pvals_row_stride,
const int pvals_col_stride,
float * global_unis,
float * global_outs
const int unis_stride,
float * global_outs,
const int outs_row_stride,
const int outs_col_stride
)
{
// each thread takes care of one multinomial draw
......@@ -174,7 +176,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
float current_out = 0.;
if (!done)
{
cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
if (unis_n < cummul)
{
current_out = 1.;
......@@ -182,7 +184,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
}
}
//write out transposed for speed.
global_outs[n + m * nb_multi] = current_out;
global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
}
}
}
......@@ -262,12 +264,14 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s)
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
......
......@@ -314,7 +314,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (7,)
return (8,)
def c_support_code_apply(self, node, nodename):
mode = self.mode
......@@ -333,6 +333,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
......@@ -375,7 +376,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col + c*d*z_row;
int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
......@@ -395,6 +396,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4,
const int out_s0, const int out_s1,
float * global_out
)
{
......@@ -437,7 +439,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i;
int z_idx = z_col + c*d*z_row;
int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx];
}
}
......@@ -573,7 +575,9 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int, int, int ,int,
int, int,
int, int, int, int,
float*, float*);
float*,
int, int,
float*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
......@@ -591,6 +595,8 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
CudaNdarray_DEV_DATA(%(ten4)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_DEV_DATA(%(z)s)
);
CNDA_THREAD_SYNC;
......
......@@ -239,7 +239,9 @@ class mrg_uniform(mrg_uniform_base):
npy_intp odims[%(ndim)s];
int n_elements = 1;
int n_streams = 0;
int must_alloc_sample = ((NULL == %(o_sample)s) || (%(o_sample)s->nd != %(ndim)s));
int must_alloc_sample = ((NULL == %(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s)
|| !(PyArray_ISCONTIGUOUS(%(o_sample)s)));
%(otype)s * sample_data;
npy_int32 * state_data;
......@@ -280,7 +282,6 @@ class mrg_uniform(mrg_uniform_base):
n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (%(o_sample)s->dimensions[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
// TODO CHECK STRIDES OF o_sample?
}
if (must_alloc_sample)
{
......@@ -370,9 +371,11 @@ class mrg_uniform(mrg_uniform_base):
state_data_i[5]= x23;
}
//////// </ code generated by mrg_uniform>
""" %locals()
""" % locals()
def c_code_cache_version(self):
return (1,)
return (2,)
class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
#GPU VERSION
......@@ -496,6 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s)
|| !CudaNdarray_is_c_contiguous(%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s));
if (%(size)s->nd != 1)
......@@ -590,7 +594,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
//////// </ code generated by mrg_uniform>
""" %locals()
def c_code_cache_version(self):
return (5,)
return (6,)
def guess_n_streams(size, warn=True):
......
......@@ -1847,6 +1847,9 @@ class StructuredDotGradCSC(gof.Op):
g_a_data[i_idx] = dot_val
out[0] = g_a_data
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):
if node.inputs[2].type.dtype in ('complex64', 'complex128'):
......@@ -1870,17 +1873,13 @@ class StructuredDotGradCSC(gof.Op):
if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}
if (!%(_zout)s)
if (!%(_zout)s
|| (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
{
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
npy_intp nnz = %(_indices)s->dimensions[0];
npy_intp N = %(_indptr)s->dimensions[0]-1; //TODO: error checking with this
......@@ -1971,6 +1970,9 @@ class StructuredDotGradCSR(gof.Op):
g_a_data[j_idx] = dot_val
out[0] = g_a_data
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):
if node.inputs[2].type.dtype in ('complex64', 'complex128'):
......@@ -1994,17 +1996,13 @@ class StructuredDotGradCSR(gof.Op):
if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}
if (!%(_zout)s)
if (!%(_zout)s
|| (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
{
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
}
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions.
npy_intp nnz = %(_indices)s->dimensions[0];
// extract number of rows
......@@ -2411,8 +2409,6 @@ class UsmmCscDense(gof.Op):
npy_intp K = %(y)s->dimensions[0];
// pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
dtype_%(zn)s* __restrict__ Dzn = (dtype_%(zn)s*)%(zn)s->data;
const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
......@@ -2428,7 +2424,11 @@ class UsmmCscDense(gof.Op):
if (!(%(inplace)s))
{
memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s));
if (PyArray_CopyInto(%(zn)s, %(z)s))
{
Py_XDECREF(%(zn)s);
%(fail)s;
}
}
for (npy_int32 k = 0; k < K; ++k)
......@@ -2439,9 +2439,16 @@ class UsmmCscDense(gof.Op):
const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
// axpy expects pointer to the beginning of memory arrays,
// so when the stride is negative, we need to get the
// last element
if (Sy < 0)
y_row += (K - 1) * Sy;
const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
if (Szn < 0)
z_row += (N - 1) * Szn;
%(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
}
......@@ -2451,6 +2458,9 @@ class UsmmCscDense(gof.Op):
return rval
def c_code_cache_version(self):
return (1,)
usmm_csc_dense = UsmmCscDense(inplace=False)
usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
......
......@@ -2849,40 +2849,47 @@ class Alloc(gof.Op):
out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inp, out, sub):
# TODO: use the elemwise code generator here
if python_all(node.inputs[0].broadcastable):
# filling with a scalar is a common use of alloc
# that we can implement relatively easily
vv = inp[0]
zz, = out
fail = sub['fail']
if node.outputs[0].ndim == 1:
N0 = inp[1]
return """
npy_intp N0 = ((dtype_%(N0)s*)%(N0)s->data)[0];
dtype_%(vv)s vv;
dtype_%(zz)s* zz;
if ((NULL == %(zz)s) || (%(zz)s->dimensions[0] != N0))
{
if (%(zz)s) Py_XDECREF(%(zz)s);
%(zz)s = (PyArrayObject*)PyArray_SimpleNew(1,
&N0, type_num_%(vv)s);
if(!%(zz)s) {
PyErr_SetString(PyExc_MemoryError, "alloc failed");
%(fail)s
}
}
vv = ((dtype_%(vv)s*)%(vv)s->data)[0];
zz = ((dtype_%(zz)s*)%(zz)s->data);
assert (%(zz)s->strides[0] == sizeof(dtype_%(zz)s));
for (int i = 0; i < N0; ++i)
vv = inp[0]
ndim = len(inp[1:])
zz, = out
fail = sub['fail']
code = """
npy_intp shape[%(ndim)s];
""" % dict(ndim=ndim)
# Initialize shape
for i, shp_i in enumerate(inp[1:]):
code += """
shape[%(i)s] = ((dtype_%(shp_i)s*) %(shp_i)s->data)[0];
""" % dict(i=i, shp_i=shp_i)
code += """
int need_new_out = (NULL == %(zz)s);
for (int i = 0; i < %(ndim)s; i++)
need_new_out = (need_new_out
|| (%(zz)s->dimensions[i] != shape[i]));
if (need_new_out)
{
Py_XDECREF(%(zz)s);
%(zz)s = (PyArrayObject*) PyArray_SimpleNew(%(ndim)s,
shape, type_num_%(vv)s);
if (!%(zz)s)
{
zz[i] = vv;
PyErr_SetString(PyExc_MemoryError, "alloc failed");
%(fail)s
}
""" % locals()
}
// This function takes care of broadcasting
PyArray_CopyInto(%(zz)s, %(vv)s);
""" % dict(vv=vv, ndim=ndim, zz=zz, fail=fail)
# else pretend this never happened
return super(Alloc, self).c_code(node, name, inp, out, sub)
return code
def c_code_cache_version(self):
return (1,)
def infer_shape(self, node, input_shapes):
return [node.inputs[1:]]
......
......@@ -845,9 +845,15 @@ class Gemm(GemmRelated):
setup_z_Nz_Sz_outplace = """
if ((NULL == %(_zout)s)
|| (%(_zout)s->dimensions[0] != %(_z)s->dimensions[0])
|| (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1]))
|| (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1])
|| (%(_zout)s->strides[0] <= 0)
|| (%(_zout)s->strides[1] <= 0)
|| (%(_zout)s->strides[0] MOD type_size)
|| (%(_zout)s->strides[1] MOD type_size)
|| ((%(_zout)s->strides[0] != type_size)
&& (%(_zout)s->strides[1] != type_size)))
{
if (%(_zout)s) Py_XDECREF(%(_zout)s);
Py_XDECREF(%(_zout)s);
npy_intp dims[2];
dims[0] = %(_z)s->dimensions[0];
dims[1] = %(_z)s->dimensions[1];
......@@ -862,42 +868,44 @@ class Gemm(GemmRelated):
}
Nz = %(_zout)s->dimensions;
Sz = %(_zout)s->strides;
if (1) // COPY z -> zout
if (%(_zout)s->descr->type_num == PyArray_FLOAT)
{
if (%(_zout)s->descr->type_num == PyArray_FLOAT)
float * zoutdata = (float*)%(_zout)s->data;
int zoi = Sz[0] / sizeof(float);
int zoj = Sz[1] / sizeof(float);
const float * zdata = (float*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(float);
int zj = %(_z)s->strides[1]/sizeof(float);
for (int i = 0; i < Nz[0]; ++i)
{
float * zoutdata = (float*)%(_zout)s->data;
const float * zdata = (float*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(float);
int zj = %(_z)s->strides[1]/sizeof(float);
for (int i = 0; i < Nz[0]; ++i)
for (int j = 0; j < Nz[1]; ++j)
{
for (int j = 0; j < Nz[1]; ++j)
{
zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
}
zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
}
}
else if (%(_zout)s->descr->type_num == PyArray_DOUBLE)
}
else if (%(_zout)s->descr->type_num == PyArray_DOUBLE)
{
double * zoutdata = (double*) %(_zout)s->data;
int zoi = Sz[0] / sizeof(double);
int zoj = Sz[1] / sizeof(double);
const double * zdata = (double*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(double);
int zj = %(_z)s->strides[1]/sizeof(double);
for (int i = 0; i < Nz[0]; ++i)
{
double * zoutdata = (double*) %(_zout)s->data;
const double * zdata = (double*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(double);
int zj = %(_z)s->strides[1]/sizeof(double);
for (int i = 0; i < Nz[0]; ++i)
for (int j = 0; j < Nz[1]; ++j)
{
for (int j = 0; j < Nz[1]; ++j)
{
zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
}
zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
}
}
else
{
PyErr_SetString(PyExc_AssertionError,
"neither float nor double dtype");
%(fail)s
}
}
else
{
PyErr_SetString(PyExc_AssertionError,
"neither float nor double dtype");
%(fail)s
}
"""
......@@ -938,7 +946,7 @@ class Gemm(GemmRelated):
def c_code_cache_version(self):
gv = self.build_gemm_version()
if gv:
return (3,) + gv
return (4,) + gv
else:
return gv
......
......@@ -12,7 +12,7 @@ class ConvTransp3D(theano.Op):
return hash(type(self))
def c_code_cache_version(self):
return (2,)
return (3,)
def make_node(self, W, b, d, H, RShape = None):
"""
......@@ -232,23 +232,8 @@ class ConvTransp3D(theano.Op):
}
}
for (int i = 0; i < 3; i++)
if (%(R)s->strides[i] < %(R)s->strides[4])
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
%(fail)s
}
{ // for fail 6
memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
(videoHeight-1) * %(R)s->strides[1] +
(videoWidth-1) * %(R)s->strides[2] +
(videoDur-1) * %(R)s->strides[3]);
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论