Commit 2681accc, authored by David Warde-Farley

Big whitespace cleanup.

Parent commit: b7a6e812
...@@ -11,13 +11,13 @@ if cuda_available: ...@@ -11,13 +11,13 @@ if cuda_available:
class GpuConv3D(theano.Op): class GpuConv3D(theano.Op):
""" GPU implementation of Conv3D """ """ GPU implementation of Conv3D """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return '%s' % (self.__class__.__name__) return '%s' % (self.__class__.__name__)
...@@ -33,16 +33,16 @@ class GpuConv3D(theano.Op): ...@@ -33,16 +33,16 @@ class GpuConv3D(theano.Op):
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
return theano.Apply(self, inputs=[V_, W_, b_, d_], return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] ) outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
def c_code(self, node, nodename, (V,W,b,d), outputs, sub): def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
fail = sub['fail'] fail = sub['fail']
H = outputs[0] H = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConv3D > ///////////// < code generated by GpuConv3D >
...@@ -51,31 +51,31 @@ class GpuConv3D(theano.Op): ...@@ -51,31 +51,31 @@ class GpuConv3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
if (%(d)s->dimensions[0] != 3) if (%(d)s->dimensions[0] != 3)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]); PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s %(fail)s
} }
...@@ -87,7 +87,7 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -87,7 +87,7 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4]; const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels) if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels); PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so error handler jumps don't cause errors { //extra scope so error handler jumps don't cause errors
...@@ -115,14 +115,14 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -115,14 +115,14 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0) if (dr <= 0 || dc <= 0 || dt <= 0)
{ {
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt); PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
...@@ -139,16 +139,16 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); ...@@ -139,16 +139,16 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
dims[2] = outputWidth; dims[2] = outputWidth;
dims[3] = outputDur; dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] || if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){ CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s); Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) { if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output"); PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s %(fail)s
} }
} }
...@@ -180,7 +180,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -180,7 +180,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
//conv_rows_stack //conv_rows_stack
dim3 grid(outputHeight*outputWidth,batchSize*outputChannels); dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
dim3 threads(outputDur); dim3 threads(outputDur);
int shared_size=0; int shared_size=0;
conv_rows_stack<<<grid, threads, shared_size>>>( conv_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
...@@ -193,7 +193,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -193,7 +193,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -206,7 +206,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 & ...@@ -206,7 +206,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
...@@ -216,10 +216,10 @@ if(!work_complete){ ...@@ -216,10 +216,10 @@ if(!work_complete){
%(fail)s %(fail)s
} }
}}}}}}} //extra scope so error handler jumps don't cross declarations }}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D > ///////////// < /code generated by GpuConv3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -260,7 +260,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -260,7 +260,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
for (int k =0; k < kern_height; k++) { for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) { for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) { for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] * sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m]; kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
} }
} }
...@@ -278,7 +278,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -278,7 +278,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
""" """
return codeSource#renderString(codeSource,locals()) return codeSource#renderString(codeSource,locals())
gpu_convd = GpuConv3D() gpu_convd = GpuConv3D()
......
...@@ -22,14 +22,14 @@ class GpuConvGrad3D(theano.Op): ...@@ -22,14 +22,14 @@ class GpuConvGrad3D(theano.Op):
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()]) outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])
def perform_(self, node, inputs, output_storage): def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs V, d, WShape, dCdH = inputs
print "GpuConvGrad3D python code (warning not updated to new format)" print "GpuConvGrad3D python code (warning not updated to new format)"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1] outputFilters = dCdH.shape[1]
...@@ -66,7 +66,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -66,7 +66,7 @@ class GpuConvGrad3D(theano.Op):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvGrad3D > ///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n"); //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
...@@ -123,9 +123,9 @@ class GpuConvGrad3D(theano.Op): ...@@ -123,9 +123,9 @@ class GpuConvGrad3D(theano.Op):
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4]; const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (WShape[4] != inputChannels) if (WShape[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels); PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so fail works { //extra scope so fail works
const int filterHeight = WShape[1]; const int filterHeight = WShape[1];
...@@ -149,7 +149,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -149,7 +149,7 @@ class GpuConvGrad3D(theano.Op):
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth); PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
%(fail)s %(fail)s
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
...@@ -167,7 +167,7 @@ class GpuConvGrad3D(theano.Op): ...@@ -167,7 +167,7 @@ class GpuConvGrad3D(theano.Op):
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1; const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1; const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize || if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels || CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight || CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth || CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
...@@ -185,10 +185,10 @@ class GpuConvGrad3D(theano.Op): ...@@ -185,10 +185,10 @@ class GpuConvGrad3D(theano.Op):
dims[2] = filterWidth; dims[2] = filterWidth;
dims[3] = filterDur; dims[3] = filterDur;
if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] || if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){ CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s); Py_XDECREF(%(dCdW)s);
%(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
...@@ -219,7 +219,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -219,7 +219,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
//conv_rows_stack //conv_rows_stack
dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth); dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
dim3 threads(WShape[3]); dim3 threads(WShape[3]);
int shared_size=0; int shared_size=0;
convgrad_rows_stack<<<grid, threads, shared_size>>>( convgrad_rows_stack<<<grid, threads, shared_size>>>(
...@@ -235,7 +235,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -235,7 +235,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -248,18 +248,18 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple ...@@ -248,18 +248,18 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
if(!work_complete){ if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!"); PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s %(fail)s
} }
}}}}} // extra scope for fail }}}}} // extra scope for fail
///////////// < /code generated by GpuConvGrad3D > ///////////// < /code generated by GpuConvGrad3D >
""" """
return strutls.render_string(codeSource,locals()) return strutls.render_string(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -329,7 +329,7 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW, ...@@ -329,7 +329,7 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m] dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
*/ */
""" """
return codeSource#renderString(codeSource,locals()) return codeSource#renderString(codeSource,locals())
gpu_conv_grad3d = GpuConvGrad3D() gpu_conv_grad3d = GpuConvGrad3D()
......
...@@ -14,10 +14,10 @@ class GpuConvTransp3D(theano.Op): ...@@ -14,10 +14,10 @@ class GpuConvTransp3D(theano.Op):
""" The gpu version of ConvTransp3D """ """ The gpu version of ConvTransp3D """
def __eq__(self,other): def __eq__(self,other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, W, b, d, H, RShape = None): def make_node(self, W, b, d, H, RShape = None):
W_ = as_cuda_ndarray_variable(W) W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
...@@ -27,9 +27,9 @@ class GpuConvTransp3D(theano.Op): ...@@ -27,9 +27,9 @@ class GpuConvTransp3D(theano.Op):
RShape_ = T.as_tensor_variable(RShape) RShape_ = T.as_tensor_variable(RShape)
else: else:
RShape_ = T.as_tensor_variable([-1,-1,-1]) RShape_ = T.as_tensor_variable([-1,-1,-1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype, outputs = [CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()]) broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
...@@ -51,7 +51,7 @@ class GpuConvTransp3D(theano.Op): ...@@ -51,7 +51,7 @@ class GpuConvTransp3D(theano.Op):
R = outputs[0] R = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvTransp3D > ///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n"); //printf("\t\t\t\tGpuConvTransp c code\\n");
...@@ -59,25 +59,25 @@ class GpuConvTransp3D(theano.Op): ...@@ -59,25 +59,25 @@ class GpuConvTransp3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(H)s->nd != 5) if (%(H)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s %(fail)s
} }
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector"); PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
%(fail)s %(fail)s
} }
...@@ -106,7 +106,7 @@ class GpuConvTransp3D(theano.Op): ...@@ -106,7 +106,7 @@ class GpuConvTransp3D(theano.Op):
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels) if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{ {
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]); PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
%(fail)s %(fail)s
} }
{ // for fail { // for fail
...@@ -173,10 +173,10 @@ class GpuConvTransp3D(theano.Op): ...@@ -173,10 +173,10 @@ class GpuConvTransp3D(theano.Op):
dims[2] = videoWidth; dims[2] = videoWidth;
dims[3] = videoDur; dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] || if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] || CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] || CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] || CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){ CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s); Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims); %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
...@@ -213,7 +213,7 @@ if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_comple ...@@ -213,7 +213,7 @@ if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_comple
//conv_transp_rows_stack //conv_transp_rows_stack
dim3 grid(batchSize * inputChannels, videoHeight * videoWidth); dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
dim3 threads(videoDur); dim3 threads(videoDur);
HERE HERE
int shared_size=0; int shared_size=0;
...@@ -230,7 +230,7 @@ HERE ...@@ -230,7 +230,7 @@ HERE
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
...@@ -243,7 +243,7 @@ HERE ...@@ -243,7 +243,7 @@ HERE
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
%(fail)s %(fail)s
} }
} }
...@@ -258,8 +258,8 @@ if(!work_complete){ ...@@ -258,8 +258,8 @@ if(!work_complete){
}}}}}} // for fail }}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D > ///////////// < /code generated by GpuConvTransp3D >
""" """
return renderString(codeSource,locals()) return renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
...@@ -358,9 +358,9 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -358,9 +358,9 @@ def computeR(W,b,d,H,Rshape = None):
assert len(W.shape) == 5 assert len(W.shape) == 5
assert len(H.shape) == 5 assert len(H.shape) == 5
assert len(b.shape) == 1 assert len(b.shape) == 1
assert len(d) == 3 assert len(d) == 3
outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
assert outputChannelsAgain == outputChannels assert outputChannelsAgain == outputChannels
...@@ -382,18 +382,18 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -382,18 +382,18 @@ def computeR(W,b,d,H,Rshape = None):
assert Rshape[1] >= videoWidth assert Rshape[1] >= videoWidth
assert Rshape[2] >= videoDur assert Rshape[2] >= videoDur
#print "setting video size to Rshape = "+str(Rshape) #print "setting video size to Rshape = "+str(Rshape)
videoHeight, videoWidth, videoDur = Rshape videoHeight, videoWidth, videoDur = Rshape
#else: #else:
# print "No Rshape passed in" # print "No Rshape passed in"
#print "video size: "+str((videoHeight, videoWidth, videoDur)) #print "video size: "+str((videoHeight, videoWidth, videoDur))
R = N.zeros( (batchSize, inputChannels, videoHeight, R = N.zeros( (batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] #R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize): for i in xrange(0,batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize) #print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels): for j in xrange(0,inputChannels):
...@@ -403,7 +403,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -403,7 +403,7 @@ def computeR(W,b,d,H,Rshape = None):
for c in xrange(0,videoWidth): for c in xrange(0,videoWidth):
for t in xrange(0,videoDur): for t in xrange(0,videoDur):
R[i,j,r,c,t] = b[j] R[i,j,r,c,t] = b[j]
ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ]) ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ])
fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ]) fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ])
...@@ -424,7 +424,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -424,7 +424,7 @@ def computeR(W,b,d,H,Rshape = None):
tk = t - tc * dt tk = t - tc * dt
if tk < 0: if tk < 0:
break break
R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] ) R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
tc += 1 tc += 1
...@@ -439,7 +439,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -439,7 +439,7 @@ def computeR(W,b,d,H,Rshape = None):
"" #close loop over r "" #close loop over r
"" #close loop over j "" #close loop over j
"" #close loop over i "" #close loop over i
return R return R
......
...@@ -27,15 +27,15 @@ from theano.tensor.blas import ldflags ...@@ -27,15 +27,15 @@ from theano.tensor.blas import ldflags
#partial C / partial b_j = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j ) #partial C / partial b_j = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j )
# = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * delta(k = j) # = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * delta(k = j)
# = sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] ) # = sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] )
#partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z]) #partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * # = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) *
# (partial sum_s sum_u sum_v sum_a W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z]) # (partial sum_s sum_u sum_v sum_a W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * # = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *
# (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z]) # (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * V[i,dr*p+k,dc*q+l,dt*r+m,z] # = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * V[i,dr*p+k,dc*q+l,dt*r+m,z]
#derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since dr, dc, dt are natural numbers. #derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since dr, dc, dt are natural numbers.
...@@ -76,7 +76,7 @@ class Conv3D(theano.Op): ...@@ -76,7 +76,7 @@ class Conv3D(theano.Op):
#print "dCdH.broadcastable" #print "dCdH.broadcastable"
#quit(-1) #quit(-1)
#dCdH = printing.Print("dCdH = ",["shape"]) #dCdH = printing.Print("dCdH = ",["shape"])
dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0,0,0,0,:]), d, dCdH, V.shape[1:4] ) dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0,0,0,0,:]), d, dCdH, V.shape[1:4] )
WShape = W.shape WShape = W.shape
dCdW = ConvGrad3D.convGrad3D(V,d,WShape,dCdH) dCdW = ConvGrad3D.convGrad3D(V,d,WShape,dCdH)
...@@ -92,7 +92,7 @@ class Conv3D(theano.Op): ...@@ -92,7 +92,7 @@ class Conv3D(theano.Op):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
V,W,b,d = node.inputs V,W,b,d = node.inputs
V_shape, W_shape, b_shape, d_shape = input_shapes V_shape, W_shape, b_shape, d_shape = input_shapes
dr = d[0] dr = d[0]
dc = d[1] dc = d[1]
dt = d[2] dt = d[2]
...@@ -103,12 +103,12 @@ class Conv3D(theano.Op): ...@@ -103,12 +103,12 @@ class Conv3D(theano.Op):
vidWidth = V_shape[2] vidWidth = V_shape[2]
filterWidth = W_shape[2] filterWidth = W_shape[2]
vidDur = V_shape[3] vidDur = V_shape[3]
filterDur = W_shape[3] filterDur = W_shape[3]
output_height = T.floor( (vidHeight - filterHeight) / dr )+1 output_height = T.floor( (vidHeight - filterHeight) / dr )+1
output_width = T.floor( (vidWidth - filterWidth) / dc )+1 output_width = T.floor( (vidWidth - filterWidth) / dc )+1
output_dur = T.floor( (vidDur - filterDur) / dt ) +1 output_dur = T.floor( (vidDur - filterDur) / dt ) +1
return [(batch_size, output_height, output_width, output_dur, output_channels )] return [(batch_size, output_height, output_width, output_dur, output_channels )]
...@@ -133,9 +133,9 @@ class Conv3D(theano.Op): ...@@ -133,9 +133,9 @@ class Conv3D(theano.Op):
fail = sub['fail'] fail = sub['fail']
H = outputs[0] H = outputs[0]
codeSource = """
codeSource = """
///////////// < code generated by Conv3D > ///////////// < code generated by Conv3D >
//printf("\t\t\t\tConv3D c code\\n"); //printf("\t\t\t\tConv3D c code\\n");
...@@ -143,14 +143,14 @@ class Conv3D(theano.Op): ...@@ -143,14 +143,14 @@ class Conv3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
...@@ -180,13 +180,13 @@ class Conv3D(theano.Op): ...@@ -180,13 +180,13 @@ class Conv3D(theano.Op):
if (%(W)s->dimensions[4] != inputChannels) if (%(W)s->dimensions[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels); PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels);
%(fail)s %(fail)s
} }
if (%(b)s->dimensions[0] != outputChannels) if (%(b)s->dimensions[0] != outputChannels)
{ {
PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels); PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels);
%(fail)s %(fail)s
} }
...@@ -221,7 +221,7 @@ class Conv3D(theano.Op): ...@@ -221,7 +221,7 @@ class Conv3D(theano.Op):
} }
{ // extra scope so fail works { // extra scope so fail works
//Read and check stride arguments //Read and check stride arguments
const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0); const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1); const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
...@@ -249,10 +249,10 @@ class Conv3D(theano.Op): ...@@ -249,10 +249,10 @@ class Conv3D(theano.Op):
if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] || if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] ||
%(H)s->dimensions[1]!=dims[1] || %(H)s->dimensions[1]!=dims[1] ||
%(H)s->dimensions[2]!=dims[2] || %(H)s->dimensions[2]!=dims[2] ||
%(H)s->dimensions[3]!=dims[3] || %(H)s->dimensions[3]!=dims[3] ||
%(H)s->dimensions[4]!=dims[4]){ %(H)s->dimensions[4]!=dims[4]){
Py_XDECREF(%(H)s); Py_XDECREF(%(H)s);
%(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num); %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
...@@ -287,11 +287,11 @@ class Conv3D(theano.Op): ...@@ -287,11 +287,11 @@ class Conv3D(theano.Op):
// ex: filterDur == 1 && batchSize == 1 && dt = 1 (for SFA) // ex: filterDur == 1 && batchSize == 1 && dt = 1 (for SFA)
// ex: inputChannels == 1 """ // ex: inputChannels == 1 """
#if the data types are not mixed, we can insert special case optimizations based on BLAS #if the data types are not mixed, we can insert special case optimizations based on BLAS
...@@ -326,7 +326,7 @@ class Conv3D(theano.Op): ...@@ -326,7 +326,7 @@ class Conv3D(theano.Op):
long long Hposi = Hpos; long long Hposi = Hpos;
long long Vposi = Vpos; long long Vposi = Vpos;
for (int r = 0; r < outputHeight; r++) { for (int r = 0; r < outputHeight; r++) {
long long Hposr = Hpos; long long Hposr = Hpos;
long long Vposr = Vpos; long long Vposr = Vpos;
...@@ -357,7 +357,7 @@ class Conv3D(theano.Op): ...@@ -357,7 +357,7 @@ class Conv3D(theano.Op):
dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos); dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);
for (int k =0; k < filterHeight; k++) { for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos; int Wposk = Wpos;
long long Vposk = Vpos; long long Vposk = Vpos;
...@@ -368,7 +368,7 @@ class Conv3D(theano.Op): ...@@ -368,7 +368,7 @@ class Conv3D(theano.Op):
//H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:]) //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])
//note: changing the weights so that outputChannels and inputChannels were the last two rather than //note: changing the weights so that outputChannels and inputChannels were the last two rather than
//the first and last elements did not speed this up, even for extremely large input sizes //the first and last elements did not speed this up, even for extremely large input sizes
...@@ -395,11 +395,11 @@ class Conv3D(theano.Op): ...@@ -395,11 +395,11 @@ class Conv3D(theano.Op):
Hpos = Hposr + %(H)s->strides[1]; Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr; Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r } //closes r
Hpos = Hposi + %(H)s->strides[0]; Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0]; Vpos = Vposi + %(V)s->strides[0];
} //closes i } //closes i
} //closes "lots of channels" special case code } //closes "lots of channels" special case code
else else
""" """
...@@ -414,7 +414,7 @@ class Conv3D(theano.Op): ...@@ -414,7 +414,7 @@ class Conv3D(theano.Op):
long long Hposi = Hpos; long long Hposi = Hpos;
long long Vposi = Vpos; long long Vposi = Vpos;
for (int r = 0; r < outputHeight; r++) { for (int r = 0; r < outputHeight; r++) {
long long Hposr = Hpos; long long Hposr = Hpos;
long long Vposr = Vpos; long long Vposr = Vpos;
...@@ -441,14 +441,14 @@ class Conv3D(theano.Op): ...@@ -441,14 +441,14 @@ class Conv3D(theano.Op):
long long Hposj = Hpos; long long Hposj = Hpos;
long long Vposj = Vpos; long long Vposj = Vpos;
int Wposj = Wpos; int Wposj = Wpos;
// H[i,r,c,t,j] = b[j] // H[i,r,c,t,j] = b[j]
dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos); dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);
writePos = ELEM_AT(%(b)s,bPos); writePos = ELEM_AT(%(b)s,bPos);
for (int k =0; k < filterHeight; k++) { for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos; int Wposk = Wpos;
...@@ -462,9 +462,9 @@ class Conv3D(theano.Op): ...@@ -462,9 +462,9 @@ class Conv3D(theano.Op):
for (int z = 0; z < inputChannels; z++) { for (int z = 0; z < inputChannels; z++) {
//H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z] //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos); writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
Wpos += ws4; Wpos += ws4;
Vpos += vs4; Vpos += vs4;
} // close z } // close z
...@@ -478,7 +478,7 @@ class Conv3D(theano.Op): ...@@ -478,7 +478,7 @@ class Conv3D(theano.Op):
Vpos = Vposk + %(V)s->strides[1]; Vpos = Vposk + %(V)s->strides[1];
} //close k } //close k
bPos += bs; bPos += bs;
Wpos = Wposj + ws0; Wpos = Wposj + ws0;
Hpos = Hposj + hs4; Hpos = Hposj + hs4;
...@@ -495,15 +495,15 @@ class Conv3D(theano.Op): ...@@ -495,15 +495,15 @@ class Conv3D(theano.Op):
Hpos = Hposr + %(H)s->strides[1]; Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr; Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r } //closes r
Hpos = Hposi + %(H)s->strides[0]; Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0]; Vpos = Vposi + %(V)s->strides[0];
} //closes i } //closes i
} //closes general case code } //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations }}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by Conv3D > ///////////// < /code generated by Conv3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
global conv3D global conv3D
conv3D = Conv3D() conv3D = Conv3D()
...@@ -514,7 +514,7 @@ def computeH(V,W,b,d): ...@@ -514,7 +514,7 @@ def computeH(V,W,b,d):
if len(b.shape) != 1: if len(b.shape) != 1:
print b.shape print b.shape
assert False assert False
assert len(d) == 3 assert len(d) == 3
batchSize = V.shape[0] batchSize = V.shape[0]
outputChannels = W.shape[0] outputChannels = W.shape[0]
...@@ -539,7 +539,7 @@ def computeH(V,W,b,d): ...@@ -539,7 +539,7 @@ def computeH(V,W,b,d):
outputWidth = int( (vidWidth - filterWidth) / dy )+1 outputWidth = int( (vidWidth - filterWidth) / dy )+1
outputDur = int( (vidDur - filterDur) / dt ) +1 outputDur = int( (vidDur - filterDur) / dt ) +1
H = N.zeros( (batchSize, outputHeight, H = N.zeros( (batchSize, outputHeight,
outputWidth, outputDur, outputChannels ), dtype=V.dtype ) outputWidth, outputDur, outputChannels ), dtype=V.dtype )
...@@ -563,7 +563,7 @@ def computeH(V,W,b,d): ...@@ -563,7 +563,7 @@ def computeH(V,W,b,d):
v = V[i,d[0]*x+k, d[1]*y+l, d[2]*t+m,z] v = V[i,d[0]*x+k, d[1]*y+l, d[2]*t+m,z]
#if i == 0 and x == 0 and y == 0 and t == 0 and j == 0: #if i == 0 and x == 0 and y == 0 and t == 0 and j == 0:
# print 'setting H[0] += '+str(w*v)+' W['+str((j,z,k,l,m))+']='+str(w)+' V['+str((i,d[0]*x+k,d[1]*y+l,d[2]*t+m,z))+']='+str(v) # print 'setting H[0] += '+str(w*v)+' W['+str((j,z,k,l,m))+']='+str(w)+' V['+str((i,d[0]*x+k,d[1]*y+l,d[2]*t+m,z))+']='+str(v)
H[i,x,y,t,j] += w * v H[i,x,y,t,j] += w * v
return H return H
......
...@@ -33,14 +33,14 @@ class ConvGrad3D(theano.Op): ...@@ -33,14 +33,14 @@ class ConvGrad3D(theano.Op):
dLdd = None #not differentiable, since d is not continuous dLdd = None #not differentiable, since d is not continuous
dLdWShape = None #not differentiable, since d is not continuous dLdWShape = None #not differentiable, since d is not continuous
dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d) dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d)
return [ dLdC, dLdd, dLdWShape, dLdB ] return [ dLdC, dLdd, dLdWShape, dLdB ]
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs V, d, WShape, dCdH = inputs
print "ConvGradW3D python code" print "ConvGradW3D python code"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[4] outputFilters = dCdH.shape[4]
...@@ -82,7 +82,7 @@ class ConvGrad3D(theano.Op): ...@@ -82,7 +82,7 @@ class ConvGrad3D(theano.Op):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by ConvGradW3D > ///////////// < code generated by ConvGradW3D >
//printf("\t\t\t\tConvGradW3D c code\\n"); //printf("\t\t\t\tConvGradW3D c code\\n");
...@@ -90,13 +90,13 @@ class ConvGrad3D(theano.Op): ...@@ -90,13 +90,13 @@ class ConvGrad3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(dCdH)s->nd != 5) if (%(dCdH)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
if (%(V)s->nd != 5) if (%(V)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor"); PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
%(fail)s %(fail)s
} }
...@@ -131,16 +131,16 @@ class ConvGrad3D(theano.Op): ...@@ -131,16 +131,16 @@ class ConvGrad3D(theano.Op):
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous"); PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
%(fail)s %(fail)s
} }
{ //extra scope so that fail will not jump over declarations { //extra scope so that fail will not jump over declarations
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data; dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels = WShape[0]; const int outputChannels = WShape[0];
const int inputChannels = %(V)s->dimensions[4]; const int inputChannels = %(V)s->dimensions[4];
if (WShape[4] != inputChannels) if (WShape[4] != inputChannels)
{ {
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels); PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
%(fail)s %(fail)s
} }
{ //extra scope so fail works { //extra scope so fail works
const int filterHeight = WShape[1]; const int filterHeight = WShape[1];
...@@ -184,7 +184,7 @@ class ConvGrad3D(theano.Op): ...@@ -184,7 +184,7 @@ class ConvGrad3D(theano.Op):
if (%(dCdH)s->dimensions[0] != batchSize || if (%(dCdH)s->dimensions[0] != batchSize ||
%(dCdH)s->dimensions[4] != outputChannels || %(dCdH)s->dimensions[4] != outputChannels ||
%(dCdH)s->dimensions[1] != outputHeight || %(dCdH)s->dimensions[1] != outputHeight ||
%(dCdH)s->dimensions[2] != outputWidth || %(dCdH)s->dimensions[2] != outputWidth ||
...@@ -202,10 +202,10 @@ class ConvGrad3D(theano.Op): ...@@ -202,10 +202,10 @@ class ConvGrad3D(theano.Op):
dims[2] = filterWidth; dims[2] = filterWidth;
dims[3] = filterDur; dims[3] = filterDur;
if(!(%(dCdW)s) || %(dCdW)s->dimensions[0]!=dims[0] || if(!(%(dCdW)s) || %(dCdW)s->dimensions[0]!=dims[0] ||
%(dCdW)s->dimensions[1]!=dims[1] || %(dCdW)s->dimensions[1]!=dims[1] ||
%(dCdW)s->dimensions[2]!=dims[2] || %(dCdW)s->dimensions[2]!=dims[2] ||
%(dCdW)s->dimensions[3]!=dims[3] || %(dCdW)s->dimensions[3]!=dims[3] ||
%(dCdW)s->dimensions[4]!=dims[4] ){ %(dCdW)s->dimensions[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s); Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num); %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
...@@ -241,9 +241,9 @@ class ConvGrad3D(theano.Op): ...@@ -241,9 +241,9 @@ class ConvGrad3D(theano.Op):
for (int p = 0; p < outputHeight; p++) { for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) { for (int q = 0; q < outputWidth; q++) {
int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ; int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ;
int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] + (dr * p+k) * %(V)s->strides[1] + (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3]; int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] + (dr * p+k) * %(V)s->strides[1] + (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3];
for (int r = 0; r < outputDur; r++) { for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z); writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos); //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3; Hpos += dhs3;
...@@ -258,11 +258,11 @@ class ConvGrad3D(theano.Op): ...@@ -258,11 +258,11 @@ class ConvGrad3D(theano.Op):
} }
} }
}}}}}}} // extra scope for fail }}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D > ///////////// < /code generated by ConvGradW3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
convGrad3D = ConvGrad3D() convGrad3D = ConvGrad3D()
......
...@@ -8,10 +8,10 @@ class ConvTransp3D(theano.Op): ...@@ -8,10 +8,10 @@ class ConvTransp3D(theano.Op):
""" "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """ """ "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """
def __eq__(self,other): def __eq__(self,other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def make_node(self, W, b, d, H, RShape = None): def make_node(self, W, b, d, H, RShape = None):
""" """
:param W: Weights, filter :param W: Weights, filter
...@@ -27,7 +27,7 @@ class ConvTransp3D(theano.Op): ...@@ -27,7 +27,7 @@ class ConvTransp3D(theano.Op):
RShape_ = T.as_tensor_variable(RShape) RShape_ = T.as_tensor_variable(RShape)
else: else:
RShape_ = T.as_tensor_variable([-1,-1,-1]) RShape_ = T.as_tensor_variable([-1,-1,-1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], outputs = [ T.TensorType(H_.dtype, (False,False,False,False,False))() ] ) return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], outputs = [ T.TensorType(H_.dtype, (False,False,False,False,False))() ] )
def c_compile_args(self): def c_compile_args(self):
...@@ -62,7 +62,7 @@ class ConvTransp3D(theano.Op): ...@@ -62,7 +62,7 @@ class ConvTransp3D(theano.Op):
R = outputs[0] R = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by ConvTransp3D > ///////////// < code generated by ConvTransp3D >
//printf("\t\t\t\tConvTransp3D c code\\n"); //printf("\t\t\t\tConvTransp3D c code\\n");
...@@ -70,25 +70,25 @@ class ConvTransp3D(theano.Op): ...@@ -70,25 +70,25 @@ class ConvTransp3D(theano.Op):
//Check dimensionality of inputs //Check dimensionality of inputs
if (%(H)s->nd != 5) if (%(H)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd); PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s %(fail)s
} }
if (%(W)s->nd != 5) if (%(W)s->nd != 5)
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor"); PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
%(fail)s %(fail)s
} }
if (%(b)s->nd != 1) if (%(b)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector"); PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
%(fail)s %(fail)s
} }
if (%(d)s->nd != 1) if (%(d)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector"); PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
%(fail)s %(fail)s
} }
...@@ -103,14 +103,14 @@ class ConvTransp3D(theano.Op): ...@@ -103,14 +103,14 @@ class ConvTransp3D(theano.Op):
int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0); int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1); int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2); int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0) if (dr <= 0 || dc <= 0 || dt <= 0)
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s %(fail)s
} }
//Read and check sizes of inputs //Read and check sizes of inputs
{ // for fail 2 { // for fail 2
...@@ -119,7 +119,7 @@ class ConvTransp3D(theano.Op): ...@@ -119,7 +119,7 @@ class ConvTransp3D(theano.Op):
if (%(H)s->dimensions[4] != outputChannels) if (%(H)s->dimensions[4] != outputChannels)
{ {
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]); PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]);
%(fail)s %(fail)s
} }
...@@ -150,16 +150,16 @@ class ConvTransp3D(theano.Op): ...@@ -150,16 +150,16 @@ class ConvTransp3D(theano.Op):
{ {
if (%(RShape)s->nd != 1) if (%(RShape)s->nd != 1)
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector"); PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
%(fail)s %(fail)s
} }
if (%(RShape)s->dimensions[0] != 3) if (%(RShape)s->dimensions[0] != 3)
{ {
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )"); PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s %(fail)s
} }
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0); dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1); dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2); dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
...@@ -188,16 +188,16 @@ class ConvTransp3D(theano.Op): ...@@ -188,16 +188,16 @@ class ConvTransp3D(theano.Op):
dims[2] = videoWidth; dims[2] = videoWidth;
dims[3] = videoDur; dims[3] = videoDur;
if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] || if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] ||
%(R)s->dimensions[1]!=dims[1] || %(R)s->dimensions[1]!=dims[1] ||
%(R)s->dimensions[2]!=dims[2] || %(R)s->dimensions[2]!=dims[2] ||
%(R)s->dimensions[3]!=dims[3] || %(R)s->dimensions[3]!=dims[3] ||
%(R)s->dimensions[4]!=dims[4]) %(R)s->dimensions[4]!=dims[4])
{ {
Py_XDECREF(%(R)s); Py_XDECREF(%(R)s);
%(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num); %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num);
if (!(%(R)s)) { if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R"); PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
%(fail)s %(fail)s
} }
} }
...@@ -205,25 +205,25 @@ class ConvTransp3D(theano.Op): ...@@ -205,25 +205,25 @@ class ConvTransp3D(theano.Op):
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
if (%(R)s->strides[i] < %(R)s->strides[4]) if (%(R)s->strides[i] < %(R)s->strides[4])
{ {
PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)"); PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
%(fail)s %(fail)s
} }
{ // for fail 6 { // for fail 6
memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] + memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
(videoHeight-1) * %(R)s->strides[1] + (videoHeight-1) * %(R)s->strides[1] +
(videoWidth-1) * %(R)s->strides[2] + (videoWidth-1) * %(R)s->strides[2] +
(videoDur-1) * %(R)s->strides[3]); (videoDur-1) * %(R)s->strides[3]);
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] ) #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) ) #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data; dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data;
int rs4 = %(R)s->strides[4]; int rs4 = %(R)s->strides[4];
...@@ -232,60 +232,60 @@ class ConvTransp3D(theano.Op): ...@@ -232,60 +232,60 @@ class ConvTransp3D(theano.Op):
int hs4 = %(H)s->strides[4]; int hs4 = %(H)s->strides[4];
// Compute R // Compute R
// R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k] // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
for (int i = 0; i < batchSize; i++) { for (int i = 0; i < batchSize; i++) {
for (int r = 0; r < videoHeight; r++) { for (int r = 0; r < videoHeight; r++) {
const int frc = std::max(0.0, ceil(float(r-filterHeight+1)/float(dr))); const int frc = std::max(0.0, ceil(float(r-filterHeight+1)/float(dr)));
for (int c = 0; c < videoWidth; c++) { for (int c = 0; c < videoWidth; c++) {
const int fcc = std::max(0.0, ceil(float(c-filterWidth +1)/float(dc))); const int fcc = std::max(0.0, ceil(float(c-filterWidth +1)/float(dc)));
for (int t = 0; t < videoDur; t++) { for (int t = 0; t < videoDur; t++) {
const int ftc = std::max(0.0, ceil(float(t-filterDur +1) /float(dt))); const int ftc = std::max(0.0, ceil(float(t-filterDur +1) /float(dt)));
long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3]; long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3];
long long Rpos = Rpost; long long Rpos = Rpost;
for (int j = 0; j < inputChannels; j++) for (int j = 0; j < inputChannels; j++)
{ {
//ELEM5(%(R)s, i,r,c,t,j) = b[j]; //ELEM5(%(R)s, i,r,c,t,j) = b[j];
ELEM_AT(%(R)s,Rpos) = b[j]; ELEM_AT(%(R)s,Rpos) = b[j];
Rpos += rs4; Rpos += rs4;
} }
for (int rc = frc; rc < outputHeight; rc++) { for (int rc = frc; rc < outputHeight; rc++) {
const int rk = r - rc * dr; const int rk = r - rc * dr;
if (rk < 0) break; if (rk < 0) break;
for (int cc = fcc; cc < outputWidth; cc++) { for (int cc = fcc; cc < outputWidth; cc++) {
const int ck = c - cc * dc; const int ck = c - cc * dc;
if (ck < 0) break; if (ck < 0) break;
for (int tc = ftc; tc < outputDur; tc++) for (int tc = ftc; tc < outputDur; tc++)
{ {
const int tk = t - tc * dt; const int tk = t - tc * dt;
if (tk < 0) break; if (tk < 0) break;
int Wpos = rk * %(W)s->strides[1] + ck * %(W)s->strides[2] + tk * %(W)s->strides[3]; int Wpos = rk * %(W)s->strides[1] + ck * %(W)s->strides[2] + tk * %(W)s->strides[3];
int Hpostc = i * %(H)s->strides[0] + rc * %(H)s->strides[1] + cc * %(H)s->strides[2] + tc * %(H)s->strides[3]; int Hpostc = i * %(H)s->strides[0] + rc * %(H)s->strides[1] + cc * %(H)s->strides[2] + tc * %(H)s->strides[3];
Rpos = Rpost; Rpos = Rpost;
for (int j = 0; j < inputChannels; j++) for (int j = 0; j < inputChannels; j++)
{ {
int Wposj = Wpos; int Wposj = Wpos;
dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos); dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
int Hpos = Hpostc; int Hpos = Hpostc;
for (int k = 0; k < outputChannels; k++) { for (int k = 0; k < outputChannels; k++) {
//TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
//writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k); //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
//writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos); writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
Wpos += ws0; Wpos += ws0;
Hpos += hs4; Hpos += hs4;
} //close the k loop } //close the k loop
Rpos += rs4; Rpos += rs4;
Wpos = Wposj + ws4; Wpos = Wposj + ws4;
...@@ -304,9 +304,9 @@ class ConvTransp3D(theano.Op): ...@@ -304,9 +304,9 @@ class ConvTransp3D(theano.Op):
} //for fail 2 } //for fail 2
} // for fail 1 } // for fail 1
///////////// < /code generated by ConvTransp3D > ///////////// < /code generated by ConvTransp3D >
""" """
return strutil.renderString(codeSource,locals()) return strutil.renderString(codeSource,locals())
convTransp3D = ConvTransp3D() convTransp3D = ConvTransp3D()
...@@ -316,14 +316,14 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -316,14 +316,14 @@ def computeR(W,b,d,H,Rshape = None):
assert len(W.shape) == 5 assert len(W.shape) == 5
assert len(H.shape) == 5 assert len(H.shape) == 5
assert len(b.shape) == 1 assert len(b.shape) == 1
assert len(d) == 3 assert len(d) == 3
outputChannels, filterHeight, filterWidth, filterDur, inputChannels = W.shape outputChannels, filterHeight, filterWidth, filterDur, inputChannels = W.shape
batchSize, outputHeight, outputWidth, outputDur, outputChannelsAgain = H.shape batchSize, outputHeight, outputWidth, outputDur, outputChannelsAgain = H.shape
assert outputChannelsAgain == outputChannels assert outputChannelsAgain == outputChannels
assert b.shape[0] == inputChannels assert b.shape[0] == inputChannels
dr,dc,dt = d dr,dc,dt = d
assert dr > 0 assert dr > 0
...@@ -341,18 +341,18 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -341,18 +341,18 @@ def computeR(W,b,d,H,Rshape = None):
assert Rshape[1] >= videoWidth assert Rshape[1] >= videoWidth
assert Rshape[2] >= videoDur assert Rshape[2] >= videoDur
#print "setting video size to Rshape = "+str(Rshape) #print "setting video size to Rshape = "+str(Rshape)
videoHeight, videoWidth, videoDur = Rshape videoHeight, videoWidth, videoDur = Rshape
#else: #else:
# print "No Rshape passed in" # print "No Rshape passed in"
#print "video size: "+str((videoHeight, videoWidth, videoDur)) #print "video size: "+str((videoHeight, videoWidth, videoDur))
R = N.zeros( (batchSize, videoHeight, R = N.zeros( (batchSize, videoHeight,
videoWidth, videoDur, inputChannels ) , dtype=H.dtype) videoWidth, videoDur, inputChannels ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] #R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize): for i in xrange(0,batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize) #print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels): for j in xrange(0,inputChannels):
...@@ -362,7 +362,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -362,7 +362,7 @@ def computeR(W,b,d,H,Rshape = None):
for c in xrange(0,videoWidth): for c in xrange(0,videoWidth):
for t in xrange(0,videoDur): for t in xrange(0,videoDur):
R[i,r,c,t,j] = b[j] R[i,r,c,t,j] = b[j]
ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ]) ftc = max([0, int(N.ceil(float(t-filterDur +1 )/float(dt))) ])
fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ]) fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ])
...@@ -383,7 +383,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -383,7 +383,7 @@ def computeR(W,b,d,H,Rshape = None):
tk = t - tc * dt tk = t - tc * dt
if tk < 0: if tk < 0:
break break
R[i,r,c,t,j] += N.dot(W[:,rk,ck,tk,j], H[i,rc,cc,tc,:] ) R[i,r,c,t,j] += N.dot(W[:,rk,ck,tk,j], H[i,rc,cc,tc,:] )
tc += 1 tc += 1
...@@ -398,7 +398,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -398,7 +398,7 @@ def computeR(W,b,d,H,Rshape = None):
"" #close loop over r "" #close loop over r
"" #close loop over j "" #close loop over j
"" #close loop over i "" #close loop over i
return R return R
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论