提交 6465523c authored 作者: nouiz's avatar nouiz

Merge pull request #583 from lamblin/preallocated_out_ops

Fix ops to accept any kind of preallocated output memory
...@@ -30,7 +30,7 @@ class GpuDot22(GpuOp): ...@@ -30,7 +30,7 @@ class GpuDot22(GpuOp):
return Apply(self, [x, y], [otype()]) return Apply(self, [x, y], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 1) return (1, 2)
def c_code(self, node, nodename, inputs, outputs, sub): def c_code(self, node, nodename, inputs, outputs, sub):
x, y = inputs x, y = inputs
...@@ -51,9 +51,14 @@ class GpuDot22(GpuOp): ...@@ -51,9 +51,14 @@ class GpuDot22(GpuOp):
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != || (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1])) CudaNdarray_HOST_DIMS(%(y)s)[1])
|| (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
|| ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
&& (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
{ {
//if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
...@@ -108,7 +113,7 @@ class GpuDot22Scalar(GpuOp): ...@@ -108,7 +113,7 @@ class GpuDot22Scalar(GpuOp):
return Apply(self, [x, y, a], [otype()]) return Apply(self, [x, y, a], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 1) return (1, 2)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
x, y, a = inputs x, y, a = inputs
...@@ -135,7 +140,13 @@ class GpuDot22Scalar(GpuOp): ...@@ -135,7 +140,13 @@ class GpuDot22Scalar(GpuOp):
(CudaNdarray_HOST_DIMS(%(z)s)[0] != (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
CudaNdarray_HOST_DIMS(%(x)s)[0]) || CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
(CudaNdarray_HOST_DIMS(%(z)s)[1] != (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1])) CudaNdarray_HOST_DIMS(%(y)s)[1])
|| (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
|| (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
|| ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
&& (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
&& (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
{ {
//if (%(z)s) Py_DECREF(%(z)s); //if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
...@@ -790,7 +801,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -790,7 +801,7 @@ class GpuDownsampleFactorMax(GpuOp):
#def perform(self, node, input_storage, output_storage): #def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') #raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return (3) return (4)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
...@@ -856,7 +867,11 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -856,7 +867,11 @@ class GpuDownsampleFactorMax(GpuOp):
CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2], CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_HOST_STRIDES(%(x)s)[3], CudaNdarray_HOST_STRIDES(%(x)s)[3],
CudaNdarray_DEV_DATA(%(z)s)); CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_HOST_STRIDES(%(z)s)[2],
CudaNdarray_HOST_STRIDES(%(z)s)[3]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -883,7 +898,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -883,7 +898,7 @@ class GpuDownsampleFactorMax(GpuOp):
__global__ void kMaxPool_%(nodename)s( __global__ void kMaxPool_%(nodename)s(
int D0, int D1, int D2, int D3, int xD2, int xD3, int D0, int D1, int D2, int D3, int xD2, int xD3,
const float * x, int xS0, int xS1, int xS2, int xS3, const float * x, int xS0, int xS1, int xS2, int xS3,
float *z) float *z, int zS0, int zS1, int zS2, int zS3)
{ {
float cur_max, cur_x; float cur_max, cur_x;
int i0 = blockIdx.x %% D0; int i0 = blockIdx.x %% D0;
...@@ -932,7 +947,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -932,7 +947,7 @@ class GpuDownsampleFactorMax(GpuOp):
} }
//store the result to global memory //store the result to global memory
z[i0 * D1*D2*D3 + i1*D2*D3 + i2*D3 + threadIdx.x] = cur_max; z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
} }
""" % locals() """ % locals()
......
...@@ -36,8 +36,8 @@ class SupportCodeError(Exception): ...@@ -36,8 +36,8 @@ class SupportCodeError(Exception):
class NaiveAlgo(object): class NaiveAlgo(object):
verbose = 0 # 1, 2 or 3 for more verbose output. verbose = 0 # 1, 2 or 3 for more verbose output.
cache_version = () #cache_version = ()
cache_version = (14, verbose) cache_version = (15, verbose)
def __init__(self, scalar_op, sync=True, inplace_pattern={}): def __init__(self, scalar_op, sync=True, inplace_pattern={}):
""" """
...@@ -541,7 +541,7 @@ class NaiveAlgo(object): ...@@ -541,7 +541,7 @@ class NaiveAlgo(object):
if nb_inputs > 0 and nd > 0: if nb_inputs > 0 and nd > 0:
print >> sio, """ print >> sio, """
int local_str[%(nb_inputs)s][%(nd)s]; int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s]; int local_ostr[%(nb_outputs)s][%(nd)s];
""" % locals() """ % locals()
else: else:
print >> sio, """ print >> sio, """
...@@ -928,6 +928,11 @@ nd_collapse_[i]=0; ...@@ -928,6 +928,11 @@ nd_collapse_[i]=0;
%(oname)s = NULL; %(oname)s = NULL;
} }
} }
if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
{
Py_XDECREF(%(oname)s);
%(oname)s = NULL;
}
if (NULL == %(oname)s) if (NULL == %(oname)s)
{ {
%(oname)s = (CudaNdarray*)CudaNdarray_New(); %(oname)s = (CudaNdarray*)CudaNdarray_New();
......
...@@ -199,8 +199,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -199,8 +199,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
def make_node(self, dy, sm, y_idx): def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()]) return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,)
#return () #return ()
return (5,)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
...@@ -257,7 +257,9 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -257,7 +257,9 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_DEV_DATA(%(y_idx)s),
CudaNdarray_HOST_STRIDES(%(y_idx)s)[0], CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
CudaNdarray_DEV_DATA(%(dx)s) //guaranteed c-contiguous CudaNdarray_DEV_DATA(%(dx)s),
CudaNdarray_HOST_STRIDES(%(dx)s)[0],
CudaNdarray_HOST_STRIDES(%(dx)s)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
...@@ -277,7 +279,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -277,7 +279,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
const float * dnll, const int dnll_s0, const float * dnll, const int dnll_s0,
const float * sm, const int sm_s0, const int sm_s1, const float * sm, const int sm_s0, const int sm_s1,
const float * y_idx, const int y_idx_s0, const float * y_idx, const int y_idx_s0,
float * dx) float * dx, const int dx_s0, const int dx_s1)
{ {
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
...@@ -288,14 +290,14 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -288,14 +290,14 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
{ {
if (y_i == j) if (y_i == j)
{ {
dx[i * K + j] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0); dx[i * dx_s0 + j * dx_s1] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
} }
else else
{ {
dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1]; dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
} }
//dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1]; //dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*K+j] = 0; //dx[i*dx_s0+j*dx_s1] = 0;
} }
} }
} }
...@@ -319,7 +321,7 @@ class GpuSoftmax (GpuOp): ...@@ -319,7 +321,7 @@ class GpuSoftmax (GpuOp):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (4,) + inline_softmax.code_version return (5,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
...@@ -364,7 +366,9 @@ class GpuSoftmax (GpuOp): ...@@ -364,7 +366,9 @@ class GpuSoftmax (GpuOp):
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
...@@ -381,7 +385,7 @@ class GpuSoftmax (GpuOp): ...@@ -381,7 +385,7 @@ class GpuSoftmax (GpuOp):
return nvcc_kernel("kSoftmax_%s"%nodename, return nvcc_kernel("kSoftmax_%s"%nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
...@@ -393,7 +397,7 @@ class GpuSoftmax (GpuOp): ...@@ -393,7 +397,7 @@ class GpuSoftmax (GpuOp):
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * N + tx] = buf[tx]",# This set all value correctly "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",# This set all value correctly
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
...@@ -419,7 +423,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -419,7 +423,7 @@ class GpuSoftmaxWithBias (GpuOp):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (5,) + inline_softmax.code_version return (6,) + inline_softmax.code_version
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, b = inp x, b = inp
...@@ -481,7 +485,9 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -481,7 +485,9 @@ class GpuSoftmaxWithBias (GpuOp):
CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(b)s),
CudaNdarray_HOST_STRIDES(%(b)s)[0], CudaNdarray_HOST_STRIDES(%(b)s)[0],
CudaNdarray_DEV_DATA(%(z)s) //guarantee c contig CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
...@@ -503,7 +509,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -503,7 +509,7 @@ class GpuSoftmaxWithBias (GpuOp):
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0', 'const float * b', 'const int sb0',
'float * sm'], 'float * sm', 'const int ssm0', 'const int ssm1'],
body=[ body=[
"extern __shared__ float buf[]", "extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
...@@ -516,7 +522,7 @@ class GpuSoftmaxWithBias (GpuOp): ...@@ -516,7 +522,7 @@ class GpuSoftmaxWithBias (GpuOp):
"__syncthreads()", "__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'), inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * N + tx] = buf[tx]", "sm[blockIDX * ssm0 + tx * ssm1] = buf[tx]",
"}", "}",
"__syncthreads()", "__syncthreads()",
"}", "}",
......
...@@ -33,7 +33,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -33,7 +33,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
batch_size = 4097 batch_size = 4097
n_out = 1250 n_out = 1250
if theano.config.mode != "DEBUG_MODE": if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098 n_in = 4098
n_out = 4099 n_out = 4099
......
...@@ -147,19 +147,21 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp): ...@@ -147,19 +147,21 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
return Apply(self, [pvals, unis], [pvals.type()]) return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
static __global__ void k_multi_warp_%(nodename)s( static __global__ void k_multi_warp_%(nodename)s(
const int nb_multi, const int nb_multi,
const int nb_outcomes, const int nb_outcomes,
const int pvals_row_strides,
const int pvals_col_strides,
const int unis_stride,
float * global_pvals, float * global_pvals,
const int pvals_row_stride,
const int pvals_col_stride,
float * global_unis, float * global_unis,
float * global_outs const int unis_stride,
float * global_outs,
const int outs_row_stride,
const int outs_col_stride
) )
{ {
// each thread takes care of one multinomial draw // each thread takes care of one multinomial draw
...@@ -174,7 +176,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp): ...@@ -174,7 +176,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
float current_out = 0.; float current_out = 0.;
if (!done) if (!done)
{ {
cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides]; cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
if (unis_n < cummul) if (unis_n < cummul)
{ {
current_out = 1.; current_out = 1.;
...@@ -182,7 +184,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp): ...@@ -182,7 +184,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
} }
} }
//write out transposed for speed. //write out transposed for speed.
global_outs[n + m * nb_multi] = current_out; global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
} }
} }
} }
...@@ -262,12 +264,14 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp): ...@@ -262,12 +264,14 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_HOST_DIMS(%(z)s)[1], CudaNdarray_HOST_DIMS(%(z)s)[1],
CudaNdarray_HOST_DIMS(%(z)s)[0], CudaNdarray_HOST_DIMS(%(z)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_HOST_STRIDES(%(pvals)s)[0], CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
CudaNdarray_HOST_STRIDES(%(pvals)s)[1], CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(pvals)s),
CudaNdarray_DEV_DATA(%(unis)s), CudaNdarray_DEV_DATA(%(unis)s),
CudaNdarray_DEV_DATA(%(z)s) CudaNdarray_HOST_STRIDES(%(unis)s)[0],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
......
...@@ -314,7 +314,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -314,7 +314,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
mode = self.mode mode = self.mode
...@@ -333,6 +333,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -333,6 +333,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
const int grid_d, const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3, const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4, float * global_ten4,
const int out_s0, const int out_s1,
float * global_out float * global_out
) )
{ {
...@@ -375,7 +376,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -375,7 +376,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i; int z_col = j + d * i;
int z_idx = z_col + c*d*z_row; int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx]; global_out[z_idx] = global_ten4[ten4_idx];
} }
} }
...@@ -395,6 +396,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -395,6 +396,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
const int grid_d, const int grid_d,
const int stride0, const int stride1, const int stride2, const int stride3, const int stride0, const int stride1, const int stride2, const int stride3,
float * global_ten4, float * global_ten4,
const int out_s0, const int out_s1,
float * global_out float * global_out
) )
{ {
...@@ -437,7 +439,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -437,7 +439,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
int z_col = j + d * i; int z_col = j + d * i;
int z_idx = z_col + c*d*z_row; int z_idx = z_col * out_s1 + z_row * out_s0;
global_out[z_idx] = global_ten4[ten4_idx]; global_out[z_idx] = global_ten4[ten4_idx];
} }
} }
...@@ -573,7 +575,9 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -573,7 +575,9 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
int, int, int ,int, int, int, int ,int,
int, int, int, int,
int, int, int, int, int, int, int, int,
float*, float*); float*,
int, int,
float*);
if(n_threads.x==d && n_threads.y==c){ if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s; f = k_multi_warp_less_%(name)s;
}else{ }else{
...@@ -591,6 +595,8 @@ class GpuImages2Neibs(Images2Neibs, GpuOp): ...@@ -591,6 +595,8 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
CudaNdarray_HOST_STRIDES(%(ten4)s)[2], CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
CudaNdarray_HOST_STRIDES(%(ten4)s)[3], CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
CudaNdarray_DEV_DATA(%(ten4)s), CudaNdarray_DEV_DATA(%(ten4)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1],
CudaNdarray_DEV_DATA(%(z)s) CudaNdarray_DEV_DATA(%(z)s)
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
......
...@@ -239,7 +239,9 @@ class mrg_uniform(mrg_uniform_base): ...@@ -239,7 +239,9 @@ class mrg_uniform(mrg_uniform_base):
npy_intp odims[%(ndim)s]; npy_intp odims[%(ndim)s];
int n_elements = 1; int n_elements = 1;
int n_streams = 0; int n_streams = 0;
int must_alloc_sample = ((NULL == %(o_sample)s) || (%(o_sample)s->nd != %(ndim)s)); int must_alloc_sample = ((NULL == %(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s)
|| !(PyArray_ISCONTIGUOUS(%(o_sample)s)));
%(otype)s * sample_data; %(otype)s * sample_data;
npy_int32 * state_data; npy_int32 * state_data;
...@@ -280,7 +282,6 @@ class mrg_uniform(mrg_uniform_base): ...@@ -280,7 +282,6 @@ class mrg_uniform(mrg_uniform_base):
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (%(o_sample)s->dimensions[i] != odims[i]); must_alloc_sample = must_alloc_sample || (%(o_sample)s->dimensions[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]); //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
// TODO CHECK STRIDES OF o_sample?
} }
if (must_alloc_sample) if (must_alloc_sample)
{ {
...@@ -370,9 +371,11 @@ class mrg_uniform(mrg_uniform_base): ...@@ -370,9 +371,11 @@ class mrg_uniform(mrg_uniform_base):
state_data_i[5]= x23; state_data_i[5]= x23;
} }
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
class GPU_mrg_uniform(mrg_uniform_base, GpuOp): class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
#GPU VERSION #GPU VERSION
...@@ -496,6 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -496,6 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
int n_streams, n_streams_used_in_this_call; int n_streams, n_streams_used_in_this_call;
int must_alloc_sample = ((NULL == %(o_sample)s) int must_alloc_sample = ((NULL == %(o_sample)s)
|| !CudaNdarray_Check(py_%(o_sample)s) || !CudaNdarray_Check(py_%(o_sample)s)
|| !CudaNdarray_is_c_contiguous(%(o_sample)s)
|| (%(o_sample)s->nd != %(ndim)s)); || (%(o_sample)s->nd != %(ndim)s));
if (%(size)s->nd != 1) if (%(size)s->nd != 1)
...@@ -590,7 +594,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -590,7 +594,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def guess_n_streams(size, warn=True): def guess_n_streams(size, warn=True):
......
...@@ -1847,6 +1847,9 @@ class StructuredDotGradCSC(gof.Op): ...@@ -1847,6 +1847,9 @@ class StructuredDotGradCSC(gof.Op):
g_a_data[i_idx] = dot_val g_a_data[i_idx] = dot_val
out[0] = g_a_data out[0] = g_a_data
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub): def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):
if node.inputs[2].type.dtype in ('complex64', 'complex128'): if node.inputs[2].type.dtype in ('complex64', 'complex128'):
...@@ -1870,17 +1873,13 @@ class StructuredDotGradCSC(gof.Op): ...@@ -1870,17 +1873,13 @@ class StructuredDotGradCSC(gof.Op):
if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1]) if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;} {PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}
if (!%(_zout)s) if (!%(_zout)s
|| (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
{ {
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num); %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
} }
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions. { //makes it compile even though labels jump over variable definitions.
npy_intp nnz = %(_indices)s->dimensions[0]; npy_intp nnz = %(_indices)s->dimensions[0];
npy_intp N = %(_indptr)s->dimensions[0]-1; //TODO: error checking with this npy_intp N = %(_indptr)s->dimensions[0]-1; //TODO: error checking with this
...@@ -1971,6 +1970,9 @@ class StructuredDotGradCSR(gof.Op): ...@@ -1971,6 +1970,9 @@ class StructuredDotGradCSR(gof.Op):
g_a_data[j_idx] = dot_val g_a_data[j_idx] = dot_val
out[0] = g_a_data out[0] = g_a_data
def c_code_cache_version(self):
return (1,)
def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub): def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):
if node.inputs[2].type.dtype in ('complex64', 'complex128'): if node.inputs[2].type.dtype in ('complex64', 'complex128'):
...@@ -1994,17 +1996,13 @@ class StructuredDotGradCSR(gof.Op): ...@@ -1994,17 +1996,13 @@ class StructuredDotGradCSR(gof.Op):
if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1]) if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
{PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;} {PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}
if (!%(_zout)s) if (!%(_zout)s
|| (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
{ {
Py_XDECREF(%(_zout)s);
%(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num); %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
} }
if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
{
PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
%(fail)s;
}
{ //makes it compile even though labels jump over variable definitions. { //makes it compile even though labels jump over variable definitions.
npy_intp nnz = %(_indices)s->dimensions[0]; npy_intp nnz = %(_indices)s->dimensions[0];
// extract number of rows // extract number of rows
...@@ -2411,8 +2409,6 @@ class UsmmCscDense(gof.Op): ...@@ -2411,8 +2409,6 @@ class UsmmCscDense(gof.Op):
npy_intp K = %(y)s->dimensions[0]; npy_intp K = %(y)s->dimensions[0];
// pointers to access actual data in the arrays passed as params. // pointers to access actual data in the arrays passed as params.
dtype_%(z)s* __restrict__ Dz = (dtype_%(z)s*)%(z)s->data;
dtype_%(zn)s* __restrict__ Dzn = (dtype_%(zn)s*)%(zn)s->data;
const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data; const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data; const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data; const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
...@@ -2428,7 +2424,11 @@ class UsmmCscDense(gof.Op): ...@@ -2428,7 +2424,11 @@ class UsmmCscDense(gof.Op):
if (!(%(inplace)s)) if (!(%(inplace)s))
{ {
memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s)); if (PyArray_CopyInto(%(zn)s, %(z)s))
{
Py_XDECREF(%(zn)s);
%(fail)s;
}
} }
for (npy_int32 k = 0; k < K; ++k) for (npy_int32 k = 0; k < K; ++k)
...@@ -2439,9 +2439,16 @@ class UsmmCscDense(gof.Op): ...@@ -2439,9 +2439,16 @@ class UsmmCscDense(gof.Op):
const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location
const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k); dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
// axpy expects pointer to the beginning of memory arrays,
// so when the stride is negative, we need to get the
// last element
if (Sy < 0)
y_row += (K - 1) * Sy;
const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m); dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
if (Szn < 0)
z_row += (N - 1) * Szn;
%(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn); %(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
} }
...@@ -2451,6 +2458,9 @@ class UsmmCscDense(gof.Op): ...@@ -2451,6 +2458,9 @@ class UsmmCscDense(gof.Op):
return rval return rval
def c_code_cache_version(self):
return (1,)
usmm_csc_dense = UsmmCscDense(inplace=False) usmm_csc_dense = UsmmCscDense(inplace=False)
usmm_csc_dense_inplace = UsmmCscDense(inplace=True) usmm_csc_dense_inplace = UsmmCscDense(inplace=True)
......
...@@ -2849,40 +2849,47 @@ class Alloc(gof.Op): ...@@ -2849,40 +2849,47 @@ class Alloc(gof.Op):
out[0][...] = v # broadcast v to fill us up out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
# TODO: use the elemwise code generator here vv = inp[0]
if python_all(node.inputs[0].broadcastable): ndim = len(inp[1:])
# filling with a scalar is a common use of alloc zz, = out
# that we can implement relatively easily fail = sub['fail']
vv = inp[0]
zz, = out code = """
fail = sub['fail'] npy_intp shape[%(ndim)s];
if node.outputs[0].ndim == 1: """ % dict(ndim=ndim)
N0 = inp[1]
return """ # Initialize shape
npy_intp N0 = ((dtype_%(N0)s*)%(N0)s->data)[0]; for i, shp_i in enumerate(inp[1:]):
dtype_%(vv)s vv; code += """
dtype_%(zz)s* zz; shape[%(i)s] = ((dtype_%(shp_i)s*) %(shp_i)s->data)[0];
if ((NULL == %(zz)s) || (%(zz)s->dimensions[0] != N0)) """ % dict(i=i, shp_i=shp_i)
{
if (%(zz)s) Py_XDECREF(%(zz)s); code += """
%(zz)s = (PyArrayObject*)PyArray_SimpleNew(1, int need_new_out = (NULL == %(zz)s);
&N0, type_num_%(vv)s); for (int i = 0; i < %(ndim)s; i++)
if(!%(zz)s) { need_new_out = (need_new_out
PyErr_SetString(PyExc_MemoryError, "alloc failed"); || (%(zz)s->dimensions[i] != shape[i]));
%(fail)s
} if (need_new_out)
} {
vv = ((dtype_%(vv)s*)%(vv)s->data)[0]; Py_XDECREF(%(zz)s);
zz = ((dtype_%(zz)s*)%(zz)s->data); %(zz)s = (PyArrayObject*) PyArray_SimpleNew(%(ndim)s,
assert (%(zz)s->strides[0] == sizeof(dtype_%(zz)s)); shape, type_num_%(vv)s);
for (int i = 0; i < N0; ++i) if (!%(zz)s)
{ {
zz[i] = vv; PyErr_SetString(PyExc_MemoryError, "alloc failed");
%(fail)s
} }
""" % locals() }
// This function takes care of broadcasting
PyArray_CopyInto(%(zz)s, %(vv)s);
""" % dict(vv=vv, ndim=ndim, zz=zz, fail=fail)
# else pretend this never happened return code
return super(Alloc, self).c_code(node, name, inp, out, sub)
def c_code_cache_version(self):
return (1,)
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [node.inputs[1:]] return [node.inputs[1:]]
......
...@@ -845,9 +845,15 @@ class Gemm(GemmRelated): ...@@ -845,9 +845,15 @@ class Gemm(GemmRelated):
setup_z_Nz_Sz_outplace = """ setup_z_Nz_Sz_outplace = """
if ((NULL == %(_zout)s) if ((NULL == %(_zout)s)
|| (%(_zout)s->dimensions[0] != %(_z)s->dimensions[0]) || (%(_zout)s->dimensions[0] != %(_z)s->dimensions[0])
|| (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1])) || (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1])
|| (%(_zout)s->strides[0] <= 0)
|| (%(_zout)s->strides[1] <= 0)
|| (%(_zout)s->strides[0] MOD type_size)
|| (%(_zout)s->strides[1] MOD type_size)
|| ((%(_zout)s->strides[0] != type_size)
&& (%(_zout)s->strides[1] != type_size)))
{ {
if (%(_zout)s) Py_XDECREF(%(_zout)s); Py_XDECREF(%(_zout)s);
npy_intp dims[2]; npy_intp dims[2];
dims[0] = %(_z)s->dimensions[0]; dims[0] = %(_z)s->dimensions[0];
dims[1] = %(_z)s->dimensions[1]; dims[1] = %(_z)s->dimensions[1];
...@@ -862,42 +868,44 @@ class Gemm(GemmRelated): ...@@ -862,42 +868,44 @@ class Gemm(GemmRelated):
} }
Nz = %(_zout)s->dimensions; Nz = %(_zout)s->dimensions;
Sz = %(_zout)s->strides; Sz = %(_zout)s->strides;
if (1) // COPY z -> zout
if (%(_zout)s->descr->type_num == PyArray_FLOAT)
{ {
if (%(_zout)s->descr->type_num == PyArray_FLOAT) float * zoutdata = (float*)%(_zout)s->data;
int zoi = Sz[0] / sizeof(float);
int zoj = Sz[1] / sizeof(float);
const float * zdata = (float*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(float);
int zj = %(_z)s->strides[1]/sizeof(float);
for (int i = 0; i < Nz[0]; ++i)
{ {
float * zoutdata = (float*)%(_zout)s->data; for (int j = 0; j < Nz[1]; ++j)
const float * zdata = (float*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(float);
int zj = %(_z)s->strides[1]/sizeof(float);
for (int i = 0; i < Nz[0]; ++i)
{ {
for (int j = 0; j < Nz[1]; ++j) zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
{
zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
}
} }
} }
else if (%(_zout)s->descr->type_num == PyArray_DOUBLE) }
else if (%(_zout)s->descr->type_num == PyArray_DOUBLE)
{
double * zoutdata = (double*) %(_zout)s->data;
int zoi = Sz[0] / sizeof(double);
int zoj = Sz[1] / sizeof(double);
const double * zdata = (double*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(double);
int zj = %(_z)s->strides[1]/sizeof(double);
for (int i = 0; i < Nz[0]; ++i)
{ {
double * zoutdata = (double*) %(_zout)s->data; for (int j = 0; j < Nz[1]; ++j)
const double * zdata = (double*)%(_z)s->data;
int zi = %(_z)s->strides[0]/sizeof(double);
int zj = %(_z)s->strides[1]/sizeof(double);
for (int i = 0; i < Nz[0]; ++i)
{ {
for (int j = 0; j < Nz[1]; ++j) zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
{
zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
}
} }
} }
else }
{ else
PyErr_SetString(PyExc_AssertionError, {
"neither float nor double dtype"); PyErr_SetString(PyExc_AssertionError,
%(fail)s "neither float nor double dtype");
} %(fail)s
} }
""" """
...@@ -938,7 +946,7 @@ class Gemm(GemmRelated): ...@@ -938,7 +946,7 @@ class Gemm(GemmRelated):
def c_code_cache_version(self): def c_code_cache_version(self):
gv = self.build_gemm_version() gv = self.build_gemm_version()
if gv: if gv:
return (3,) + gv return (4,) + gv
else: else:
return gv return gv
......
...@@ -12,7 +12,7 @@ class ConvTransp3D(theano.Op): ...@@ -12,7 +12,7 @@ class ConvTransp3D(theano.Op):
return hash(type(self)) return hash(type(self))
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def make_node(self, W, b, d, H, RShape = None): def make_node(self, W, b, d, H, RShape = None):
""" """
...@@ -232,23 +232,8 @@ class ConvTransp3D(theano.Op): ...@@ -232,23 +232,8 @@ class ConvTransp3D(theano.Op):
} }
} }
for (int i = 0; i < 3; i++)
if (%(R)s->strides[i] < %(R)s->strides[4])
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
%(fail)s
}
{ // for fail 6 { // for fail 6
memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
(videoHeight-1) * %(R)s->strides[1] +
(videoWidth-1) * %(R)s->strides[2] +
(videoDur-1) * %(R)s->strides[3]);
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] ) #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) ) #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论