提交 fd8541cb authored 作者: David Warde-Farley's avatar David Warde-Farley

Replace tabs with 4 spaces in GpuConvTransp3D.py.

上级 703fb32e
......@@ -15,10 +15,10 @@ from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu
class GpuConvTransp3D(theano.Op):
""" The gpu version of ConvTransp3D """
def __eq__(self, other):
    """Ops of this type carry no parameters, so equality is exact-type identity."""
    return type(other) == type(self)
def __hash__(self):
    """Hash consistently with __eq__: every instance of this type hashes alike."""
    own_type = type(self)
    return hash(own_type)
def make_node(self, W, b, d, H, RShape = None):
W_ = as_cuda_ndarray_variable(W)
......@@ -41,153 +41,153 @@ class GpuConvTransp3D(theano.Op):
def perform_(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape)
W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape)
def c_code_cache_version(self):
    """Return an empty tuple: the generated C code is not version-cached."""
    return tuple()
def c_code(self, node, nodename, (W, b, d, H, RShape), outputs, sub):
fail = sub['fail']
fail = sub['fail']
R = outputs[0]
R = outputs[0]
codeSource = """
///////////// < code generated by GpuConvTransp3D >
codeSource = """
///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n");
//printf("\t\t\t\tGpuConvTransp c code\\n");
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s
}
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
%(fail)s
}
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
%(fail)s
}
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
}
{ // for fail
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
//Read and check sizes of inputs
{ // for fail
const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
%(fail)s
}
}
{ // for fail
const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
{
if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
%(fail)s
}
%(fail)s
}
{ // for fail
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "RShape must be a vector");
%(fail)s
}
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "RShape must be a vector");
%(fail)s
}
if (%(RShape)s->dimensions[0] != 3)
{
if (%(RShape)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
}
{ // for fail
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
}
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
}
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError,"Could not allocate R");
%(fail)s;
}
Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError,"Could not allocate R");
%(fail)s;
}
}
cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
{ // for fail
......@@ -235,7 +235,7 @@ HERE
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
}
else
......@@ -259,9 +259,9 @@ if(!work_complete){
}}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D >
"""
return strutil.renderString(codeSource,locals())
///////////// < /code generated by GpuConvTransp3D >
"""
return strutil.renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
......@@ -275,15 +275,15 @@ __global__ void
conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
int img_len, int img_wid, int img_dur,
int img_len, int img_wid, int img_dur,
int kern_len, int kern_wid, int kern_dur,
int H_len, int H_wid, int H_dur,
int nkern, int nstack,
int nkern, int nstack,
int dr, int dc, int dt,
int H_stride_frame, int H_stride_col, int H_stride_row,
int H_stride_stack, int H_stride_batch,
int H_stride_stack, int H_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern,
int kern_stride_stack, int kern_stride_nkern,
int bias_stride)
{
int __shared__ batch_id, stack_id;
......@@ -306,30 +306,30 @@ conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
float sum = 0;
while(rc < H_len){
int rk = r - rc * dr;
if(rk < 0)
break;
int cc = fcc;
while( cc < H_wid){
int ck = c - cc * dc;
if(ck < 0)
break;
int tc = ftc;
while(tc < H_dur){
int tk = t - tc * dt;
if(tk < 0)
break;
//R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
int rk = r - rc * dr;
if(rk < 0)
break;
int cc = fcc;
while( cc < H_wid){
int ck = c - cc * dc;
if(ck < 0)
break;
int tc = ftc;
while(tc < H_dur){
int tk = t - tc * dt;
if(tk < 0)
break;
//R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
for(int q=0;q<nkern;q++){
sum += kern[q*kern_stride_nkern+stack_id*kern_stride_stack+rk*kern_stride_row+ck*kern_stride_col+tk*kern_stride_frame]*
H[batch_id*H_stride_batch+q*H_stride_stack+rc*H_stride_row+cc*H_stride_col+tc*H_stride_frame];
}
tc += 1;
tc += 1;
}
cc += 1;
cc += 1;
}
rc += 1;
rc += 1;
}
R[batch_id*nstack*img_len*img_wid*img_dur+//the good batch
stack_id+//the output image
......@@ -357,89 +357,89 @@ def local_gpu_conv_transpd(node):
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W, b, d, H, Rshape=None):
    """Numpy reference implementation of strided 3D transposed convolution.

    R[i,j,r,c,t] = b[j] + sum over all (rc,cc,tc,k) with
    r = rc*dr + rk, c = cc*dc + ck, t = tc*dt + tk of W[k,j,rk,ck,tk] * H[i,k,rc,cc,tc].

    Parameters
    ----------
    W : 5-D array (outputChannels, inputChannels, filterHeight, filterWidth, filterDur)
    b : 1-D array of length inputChannels, added to every reconstructed voxel
    d : length-3 sequence of positive strides (dr, dc, dt)
    H : 5-D array (batchSize, outputChannels, outputHeight, outputWidth, outputDur)
    Rshape : optional length-3 sequence requesting a (possibly padded)
        reconstruction shape; Rshape[0] == -1 means "use the minimal shape".
        Each requested dim must be >= the minimal one (asserted).

    Returns
    -------
    5-D array (batchSize, inputChannels, videoHeight, videoWidth, videoDur)
    with the same dtype as H.
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal reconstruction size implied by the output size, filter size and strides.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    # "is not None" (not "!= None"): if Rshape is a numpy array, "!= None" is an
    # elementwise comparison and would be ambiguous/raise in a boolean context.
    if Rshape is not None and Rshape[0] != -1:
        # A requested shape may only pad the reconstruction, never crop it.
        if Rshape[0] < videoHeight:
            print (Rshape[0], videoHeight)
            assert False
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = numpy.zeros((batchSize, inputChannels, videoHeight,
                     videoWidth, videoDur), dtype=H.dtype)

    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                for c in range(videoWidth):
                    for t in range(videoDur):
                        R[i, j, r, c, t] = b[j]
                        # First output positions whose filter footprint can
                        # still reach voxel (r, c, t); earlier ones overshoot.
                        ftc = max(0, int(numpy.ceil(float(t - filterDur + 1) / float(dt))))
                        fcc = max(0, int(numpy.ceil(float(c - filterWidth + 1) / float(dc))))
                        rc = max(0, int(numpy.ceil(float(r - filterHeight + 1) / float(dr))))
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                # Later rc only make rk more negative.
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    # Accumulate over the output-channel axis k.
                                    R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk],
                                                                  H[i, :, rc, cc, tc])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
assert len(W.shape) == 5
assert len(H.shape) == 5
assert len(b.shape) == 1
assert len(d) == 3
outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
assert outputChannelsAgain == outputChannels
assert b.shape[0] == inputChannels
dr,dc,dt = d
assert dr > 0
assert dc > 0
assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur
if Rshape != None and Rshape[0] != -1:
if Rshape[0] < videoHeight:
print (Rshape[0], videoHeight)
assert False
assert Rshape[1] >= videoWidth
assert Rshape[2] >= videoDur
#print "setting video size to Rshape = "+str(Rshape)
videoHeight, videoWidth, videoDur = Rshape
#else:
# print "No Rshape passed in"
#print "video size: "+str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels):
#print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
for r in xrange(0,videoHeight):
#print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
for c in xrange(0,videoWidth):
for t in xrange(0,videoDur):
R[i,j,r,c,t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur +1 )/float(dt))) ])
fcc = max([0, int(numpy.ceil(float(c-filterWidth +1)/float(dc))) ])
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
while rc < outputHeight:
rk = r - rc * dr
if rk < 0:
break
cc = fcc
while cc < outputWidth:
ck = c - cc * dc
if ck < 0:
break
tc = ftc
while tc < outputDur:
tk = t - tc * dt
if tk < 0:
break
R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
tc += 1
"" #close loop over tc
cc += 1
"" #close loop over cc
rc += 1
"" #close loop over rc
"" #close loop over t
"" #close loop over c
"" #close loop over r
"" #close loop over j
"" #close loop over i
return R
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论