提交 2681accc authored 作者: David Warde-Farley's avatar David Warde-Farley

Big whitespace cleanup.

上级 b7a6e812
...@@ -11,13 +11,13 @@ if cuda_available: ...@@ -11,13 +11,13 @@ if cuda_available:
class GpuConv3D(theano.Op): class GpuConv3D(theano.Op):
""" GPU implementation of Conv3D """ """ GPU implementation of Conv3D """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return '%s' % (self.__class__.__name__) return '%s' % (self.__class__.__name__)
...@@ -33,16 +33,16 @@ class GpuConv3D(theano.Op): ...@@ -33,16 +33,16 @@ class GpuConv3D(theano.Op):
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
return theano.Apply(self, inputs=[V_, W_, b_, d_], return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] ) outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
def c_code(self, node, nodename, (V,W,b,d), outputs, sub): def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
fail = sub['fail'] fail = sub['fail']
H = outputs[0] H = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConv3D > ///////////// < code generated by GpuConv3D >
...@@ -51,31 +51,31 @@ class GpuConv3D(theano.Op): ...@@ -51,31 +51,31 @@ class GpuConv3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->dimensions[0] != 3) if (%(d)s->dimensions[0] != 3)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]); PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s %(fail)s
} }
...@@ -87,7 +87,7 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -87,7 +87,7 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4]; const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels) if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels); PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so error handler jumps don't cause errors { //extra scope so error handler jumps don't cause errors
...@@ -115,14 +115,14 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -115,14 +115,14 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0) if (dr <= 0 || dc <= 0 || dt <= 0)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt); PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
...@@ -139,16 +139,16 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -139,16 +139,16 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
dims[2] = outputWidth; dims[2] = outputWidth;
dims[3] = outputDur; dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] || if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){ CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s); Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) { if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output"); PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s %(fail)s
} }
} }
...@@ -180,7 +180,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -180,7 +180,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
//conv_rows_stack //conv_rows_stack
dim3 grid(outputHeight*outputWidth,batchSize*outputChannels); dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
dim3 threads(outputDur); dim3 threads(outputDur);
int shared_size=0; int shared_size=0;
conv_rows_stack<<<grid, threads, shared_size>>>( conv_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
...@@ -193,7 +193,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -193,7 +193,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -206,7 +206,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -206,7 +206,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
...@@ -216,10 +216,10 @@ if(!work_complete){ ...@@ -216,10 +216,10 @@ if(!work_complete){
%(fail)s %(fail)s
} }
}}}}}}} //extra scope so error handler jumps don't cross declarations }}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D > ///////////// < /code generated by GpuConv3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -260,7 +260,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -260,7 +260,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
for (int k =0; k < kern_height; k++) { for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) { for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) { for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] * sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m]; kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
} }
} }
...@@ -278,7 +278,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -278,7 +278,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
""" """
return codeSource#renderString(codeSource,locals()) return codeSource#renderString(codeSource,locals())
gpu_convd = GpuConv3D() gpu_convd = GpuConv3D()
......
...@@ -22,14 +22,14 @@ class GpuConvGrad3D(theano.Op): ...@@ -22,14 +22,14 @@ class GpuConvGrad3D(theano.Op):
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()]) outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])
def perform_(self, node, inputs, output_storage): def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs V, d, WShape, dCdH = inputs
print "GpuConvGrad3D python code (warning not updated to new format)" print "GpuConvGrad3D python code (warning not updated to new format)"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1] outputFilters = dCdH.shape[1]
...@@ -66,7 +66,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -66,7 +66,7 @@ class GpuConvGrad3D(theano.Op):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvGrad3D > ///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n"); //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
...@@ -123,9 +123,9 @@ class GpuConvGrad3D(theano.Op): ...@@ -123,9 +123,9 @@ class GpuConvGrad3D(theano.Op):
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4]; const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (WShape[4] != inputChannels) if (WShape[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels); PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so fail works { //extra scope so fail works
const int filterHeight = WShape[1]; const int filterHeight = WShape[1];
...@@ -149,7 +149,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -149,7 +149,7 @@ class GpuConvGrad3D(theano.Op):
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth); PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
...@@ -167,7 +167,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -167,7 +167,7 @@ class GpuConvGrad3D(theano.Op):
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1; const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1; const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize || if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels || CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight || CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth || CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
...@@ -185,10 +185,10 @@ class GpuConvGrad3D(theano.Op): ...@@ -185,10 +185,10 @@ class GpuConvGrad3D(theano.Op):
dims[2] = filterWidth; dims[2] = filterWidth;
dims[3] = filterDur; dims[3] = filterDur;
if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] || if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){ CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s); Py_XDECREF(%(dCdW)s);
%(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
...@@ -219,7 +219,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -219,7 +219,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
//conv_rows_stack //conv_rows_stack
dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth); dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
dim3 threads(WShape[3]); dim3 threads(WShape[3]);
int shared_size=0; int shared_size=0;
convgrad_rows_stack<<<grid, threads, shared_size>>>( convgrad_rows_stack<<<grid, threads, shared_size>>>(
...@@ -235,7 +235,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -235,7 +235,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -248,18 +248,18 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -248,18 +248,18 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
if(!work_complete){ if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!"); PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s %(fail)s
} }
}}}}} // extra scope for fail }}}}} // extra scope for fail
///////////// < /code generated by GpuConvGrad3D > ///////////// < /code generated by GpuConvGrad3D >
""" """
return strutls.render_string(codeSource,locals()) return strutls.render_string(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -329,7 +329,7 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW, ...@@ -329,7 +329,7 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m] dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
*/ */
""" """
return codeSource#renderString(codeSource,locals()) return codeSource#renderString(codeSource,locals())
gpu_conv_grad3d = GpuConvGrad3D() gpu_conv_grad3d = GpuConvGrad3D()
......
...@@ -14,10 +14,10 @@ class GpuConvTransp3D(theano.Op): ...@@ -14,10 +14,10 @@ class GpuConvTransp3D(theano.Op):
""" The gpu version of ConvTransp3D """ """ The gpu version of ConvTransp3D """
def __eq__(self,other): def __eq__(self,other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, W, b, d, H, RShape = None): def make_node(self, W, b, d, H, RShape = None):
W_ = as_cuda_ndarray_variable(W) W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
...@@ -27,9 +27,9 @@ class GpuConvTransp3D(theano.Op): ...@@ -27,9 +27,9 @@ class GpuConvTransp3D(theano.Op):
RShape_ = T.as_tensor_variable(RShape) RShape_ = T.as_tensor_variable(RShape)
else: else:
RShape_ = T.as_tensor_variable([-1,-1,-1]) RShape_ = T.as_tensor_variable([-1,-1,-1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype, outputs = [CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()]) broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
...@@ -51,7 +51,7 @@ class GpuConvTransp3D(theano.Op): ...@@ -51,7 +51,7 @@ class GpuConvTransp3D(theano.Op):
R = outputs[0] R = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvTransp3D > ///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n"); //printf("\t\t\t\tGpuConvTransp c code\\n");
...@@ -59,25 +59,25 @@ class GpuConvTransp3D(theano.Op): ...@@ -59,25 +59,25 @@ class GpuConvTransp3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(H)s->nd != 5) if (%(H)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s %(fail)s
} }
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
%(fail)s %(fail)s
} }
...@@ -106,7 +106,7 @@ class GpuConvTransp3D(theano.Op): ...@@ -106,7 +106,7 @@ class GpuConvTransp3D(theano.Op):
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels) if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{ {
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]); PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
%(fail)s %(fail)s
} }
{ // for fail { // for fail
...@@ -173,10 +173,10 @@ class GpuConvTransp3D(theano.Op): ...@@ -173,10 +173,10 @@ class GpuConvTransp3D(theano.Op):
dims[2] = videoWidth; dims[2] = videoWidth;
dims[3] = videoDur; dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] || if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){ CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s); Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
...@@ -213,7 +213,7 @@ if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_comple ...@@ -213,7 +213,7 @@ if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_comple
//conv_transp_rows_stack //conv_transp_rows_stack
dim3 grid(batchSize * inputChannels, videoHeight * videoWidth); dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
dim3 threads(videoDur); dim3 threads(videoDur);
HERE HERE
int shared_size=0; int shared_size=0;
...@@ -230,7 +230,7 @@ HERE ...@@ -230,7 +230,7 @@ HERE
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -243,7 +243,7 @@ HERE ...@@ -243,7 +243,7 @@ HERE
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
...@@ -258,8 +258,8 @@ if(!work_complete){ ...@@ -258,8 +258,8 @@ if(!work_complete){
}}}}}} // for fail }}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D > ///////////// < /code generated by GpuConvTransp3D >
""" """
return renderString(codeSource,locals()) return renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -358,9 +358,9 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -358,9 +358,9 @@ def computeR(W,b,d,H,Rshape = None):
assert len(W.shape) == 5 assert len(W.shape) == 5
assert len(H.shape) == 5 assert len(H.shape) == 5
assert len(b.shape) == 1 assert len(b.shape) == 1
assert len(d) == 3 assert len(d) == 3
outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
assert outputChannelsAgain == outputChannels assert outputChannelsAgain == outputChannels
...@@ -382,18 +382,18 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -382,18 +382,18 @@ def computeR(W,b,d,H,Rshape = None):
assert Rshape[1] >= videoWidth assert Rshape[1] >= videoWidth
assert Rshape[2] >= videoDur assert Rshape[2] >= videoDur
#print "setting video size to Rshape = "+str(Rshape) #print "setting video size to Rshape = "+str(Rshape)
videoHeight, videoWidth, videoDur = Rshape videoHeight, videoWidth, videoDur = Rshape
#else: #else:
# print "No Rshape passed in" # print "No Rshape passed in"
#print "video size: "+str((videoHeight, videoWidth, videoDur)) #print "video size: "+str((videoHeight, videoWidth, videoDur))
R = N.zeros( (batchSize, inputChannels, videoHeight, R = N.zeros( (batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] #R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize): for i in xrange(0,batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize) #print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels): for j in xrange(0,inputChannels):
...@@ -403,7 +403,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -403,7 +403,7 @@ def computeR(W,b,d,H,Rshape = None):
for c in xrange(0,videoWidth): for c in xrange(0,videoWidth):
for t in xrange(0,videoDur): for t in xrange(0,videoDur):
R[i,j,r,c,t] = b[j] R[i,j,r,c,t] = b[j]
ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ]) ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ])
fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ]) fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ])
...@@ -424,7 +424,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -424,7 +424,7 @@ def computeR(W,b,d,H,Rshape = None):
tk = t - tc * dt tk = t - tc * dt
if tk < 0: if tk < 0:
break break
R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] ) R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
tc += 1 tc += 1
...@@ -439,7 +439,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -439,7 +439,7 @@ def computeR(W,b,d,H,Rshape = None):
"" #close loop over r "" #close loop over r
"" #close loop over j "" #close loop over j
"" #close loop over i "" #close loop over i
return R return R
......
...@@ -33,14 +33,14 @@ class ConvGrad3D(theano.Op): ...@@ -33,14 +33,14 @@ class ConvGrad3D(theano.Op):
dLdd = None #not differentiable, since d is not continuous dLdd = None #not differentiable, since d is not continuous
dLdWShape = None #not differentiable, since d is not continuous dLdWShape = None #not differentiable, since d is not continuous
dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d) dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d)
return [ dLdC, dLdd, dLdWShape, dLdB ] return [ dLdC, dLdd, dLdWShape, dLdB ]
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs V, d, WShape, dCdH = inputs
print "ConvGradW3D python code" print "ConvGradW3D python code"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[4] outputFilters = dCdH.shape[4]
...@@ -82,7 +82,7 @@ class ConvGrad3D(theano.Op): ...@@ -82,7 +82,7 @@ class ConvGrad3D(theano.Op):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by ConvGradW3D > ///////////// < code generated by ConvGradW3D >
//printf("\t\t\t\tConvGradW3D c code\\n"); //printf("\t\t\t\tConvGradW3D c code\\n");
...@@ -90,13 +90,13 @@ class ConvGrad3D(theano.Op): ...@@ -90,13 +90,13 @@ class ConvGrad3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(dCdH)s->nd != 5) if (%(dCdH)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
...@@ -131,16 +131,16 @@ class ConvGrad3D(theano.Op): ...@@ -131,16 +131,16 @@ class ConvGrad3D(theano.Op):
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous"); PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
%(fail)s %(fail)s
} }
{ //extra scope so that fail will not jump over declarations { //extra scope so that fail will not jump over declarations
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data; dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels = WShape[0]; const int outputChannels = WShape[0];
const int inputChannels = %(V)s->dimensions[4]; const int inputChannels = %(V)s->dimensions[4];
if (WShape[4] != inputChannels) if (WShape[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels); PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so fail works { //extra scope so fail works
const int filterHeight = WShape[1]; const int filterHeight = WShape[1];
...@@ -184,7 +184,7 @@ class ConvGrad3D(theano.Op): ...@@ -184,7 +184,7 @@ class ConvGrad3D(theano.Op):
if (%(dCdH)s->dimensions[0] != batchSize || if (%(dCdH)s->dimensions[0] != batchSize ||
%(dCdH)s->dimensions[4] != outputChannels || %(dCdH)s->dimensions[4] != outputChannels ||
%(dCdH)s->dimensions[1] != outputHeight || %(dCdH)s->dimensions[1] != outputHeight ||
%(dCdH)s->dimensions[2] != outputWidth || %(dCdH)s->dimensions[2] != outputWidth ||
...@@ -202,10 +202,10 @@ class ConvGrad3D(theano.Op): ...@@ -202,10 +202,10 @@ class ConvGrad3D(theano.Op):
dims[2] = filterWidth; dims[2] = filterWidth;
dims[3] = filterDur; dims[3] = filterDur;
if(!(%(dCdW)s) || %(dCdW)s->dimensions[0]!=dims[0] || if(!(%(dCdW)s) || %(dCdW)s->dimensions[0]!=dims[0] ||
%(dCdW)s->dimensions[1]!=dims[1] || %(dCdW)s->dimensions[1]!=dims[1] ||
%(dCdW)s->dimensions[2]!=dims[2] || %(dCdW)s->dimensions[2]!=dims[2] ||
%(dCdW)s->dimensions[3]!=dims[3] || %(dCdW)s->dimensions[3]!=dims[3] ||
%(dCdW)s->dimensions[4]!=dims[4] ){ %(dCdW)s->dimensions[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s); Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num); %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
...@@ -241,9 +241,9 @@ class ConvGrad3D(theano.Op): ...@@ -241,9 +241,9 @@ class ConvGrad3D(theano.Op):
for (int p = 0; p < outputHeight; p++) { for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) { for (int q = 0; q < outputWidth; q++) {
int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ; int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ;
int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] + (dr * p+k) * %(V)s->strides[1] + (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3]; int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] + (dr * p+k) * %(V)s->strides[1] + (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3];
for (int r = 0; r < outputDur; r++) { for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z); writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos); //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3; Hpos += dhs3;
...@@ -258,11 +258,11 @@ class ConvGrad3D(theano.Op): ...@@ -258,11 +258,11 @@ class ConvGrad3D(theano.Op):
} }
} }
}}}}}}} // extra scope for fail }}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D > ///////////// < /code generated by ConvGradW3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
convGrad3D = ConvGrad3D() convGrad3D = ConvGrad3D()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论