提交 015b42a4 authored 作者: Frederic's avatar Frederic

pep8

上级 c3a461f7
...@@ -3,12 +3,14 @@ import numpy ...@@ -3,12 +3,14 @@ import numpy
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gpu, HostFromGpu from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, HostFromGpu)
from theano.misc import strutil from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, GpuOp from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp): class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """ """ GPU implementation of Conv3D """
...@@ -32,19 +34,21 @@ class GpuConv3D(GpuOp): ...@@ -32,19 +34,21 @@ class GpuConv3D(GpuOp):
W_ = as_cuda_ndarray_variable(W) W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False)
return theano.Apply(self, inputs=[V_, W_, b_, d_], return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] ) outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
def c_code_cache_version(self):
    """Return the cache version key for the generated C code.

    An empty tuple tells Theano never to cache the compiled C code,
    forcing recompilation whenever the op is used.
    """
    return ()
def c_code(self, node, nodename, inputs, outputs, sub): def c_code(self, node, nodename, inputs, outputs, sub):
V, W, b, d = inputs V, W, b, d = inputs
fail = sub['fail'] fail = sub['fail']
H = outputs[0] H = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConv3D > ///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n"); //printf("\t\t\t\tConv3DGPU c code\\n");
...@@ -220,13 +224,13 @@ if(!work_complete){ ...@@ -220,13 +224,13 @@ if(!work_complete){
}}}}}}} //extra scope so error handler jumps don't cross declarations }}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D > ///////////// < /code generated by GpuConv3D >
""" """
return strutil.render_string(codeSource,locals()) return strutil.render_string(codeSource, locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the # It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position. # input pixels that were downsampled to that z-position.
codeSource = """ codeSource = """
__global__ void __global__ void
//thread block size = out_dur //thread block size = out_dur
//grid block size =(out_len*out_wid, nb kern *nb batch) //grid block size =(out_len*out_wid, nb kern *nb batch)
...@@ -283,11 +287,16 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out, ...@@ -283,11 +287,16 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
# Single shared instance used by the optimizer below.
gpu_convd = GpuConv3D()


@register_opt()
@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
    """Local optimizer: replace a CPU Conv3D with its GPU counterpart.

    Applies only when at least one input was transferred from the GPU
    (i.e. its owner is a HostFromGpu op) and every output is float32,
    the only dtype the CUDA implementation supports. Returns the
    replacement list, or None (implicitly) when the rewrite is skipped.
    """
    if isinstance(node.op, Conv3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                # Move V, W and b onto the GPU; d (the strides) stays a
                # host tensor. Transfer the result back so downstream
                # CPU consumers are unaffected.
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]
...@@ -12,7 +12,6 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu, ...@@ -12,7 +12,6 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp) host_from_gpu, GpuOp)
class GpuConvGrad3D(GpuOp): class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """ """ GPU version of gradient of ConvGrad3D with respect to W """
...@@ -27,9 +26,10 @@ class GpuConvGrad3D(GpuOp): ...@@ -27,9 +26,10 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()]) outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
def perform_(self, node, inputs, output_storage): def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs V, d, WShape, dCdH = inputs
...@@ -51,18 +51,18 @@ class GpuConvGrad3D(GpuOp): ...@@ -51,18 +51,18 @@ class GpuConvGrad3D(GpuOp):
dCdW = numpy.zeros(WShape, dtype=V.dtype) dCdW = numpy.zeros(WShape, dtype=V.dtype)
#block # block
for j in xrange(0,WShape[0]): for j in xrange(0, WShape[0]):
for z in xrange(0,WShape[1]): for z in xrange(0, WShape[1]):
for k in xrange(0,WShape[2]): for k in xrange(0, WShape[2]):
for l in xrange(0,WShape[3]): for l in xrange(0, WShape[3]):
#threads # threads
for m in xrange(0,WShape[4]): for m in xrange(0, WShape[4]):
#thread # thread
for i in xrange(0,batchSize): for i in xrange(0, batchSize):
for p in xrange(0,outputHeight): for p in xrange(0, outputHeight):
for q in xrange(0,outputWidth): for q in xrange(0, outputWidth):
for r in xrange(0,outputDur): for r in xrange(0, outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m] dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
output_storage[0][0] = dCdW output_storage[0][0] = dCdW
...@@ -340,11 +340,17 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW, ...@@ -340,11 +340,17 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
# Single shared instance used by the optimizer below.
gpu_conv_grad3d = GpuConvGrad3D()


@register_opt()
@local_optimizer([ConvGrad3D])
def local_gpu_conv_gradd(node):
    """Local optimizer: replace a CPU ConvGrad3D with its GPU counterpart.

    Applies only when at least one input was transferred from the GPU
    (its owner is a HostFromGpu op) and every output is float32, the
    only dtype the CUDA implementation supports. Returns the
    replacement list, or None (implicitly) when the rewrite is skipped.
    """
    if isinstance(node.op, ConvGrad3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                # V and dCdH move to the GPU; d (strides) and WShape
                # (symbolic shape) remain host tensors.
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d,
                    WShape,
                    as_cuda_ndarray_variable(dCdH)))]
...@@ -15,13 +15,13 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu, ...@@ -15,13 +15,13 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvTransp3D(GpuOp): class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """ """ The gpu version of ConvTransp3D """
def __eq__(self, other):
    # The op has no parameters, so any two instances of the same
    # concrete type are interchangeable.
    return type(self) == type(other)
def __hash__(self):
    # Consistent with __eq__ above: hash depends only on the type.
    return hash(type(self))
def make_node(self, W, b, d, H, RShape = None): def make_node(self, W, b, d, H, RShape=None):
W_ = as_cuda_ndarray_variable(W) W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b) b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
...@@ -29,22 +29,21 @@ class GpuConvTransp3D(GpuOp): ...@@ -29,22 +29,21 @@ class GpuConvTransp3D(GpuOp):
if RShape: if RShape:
RShape_ = T.as_tensor_variable(RShape) RShape_ = T.as_tensor_variable(RShape)
else: else:
RShape_ = T.as_tensor_variable([-1,-1,-1]) RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype, outputs=[CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()]) broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes):
    """Return the output shape of the transposed convolution.

    The result is (batch, output channels, row, col, time): batch size
    comes from H, channel count from W's second axis, and the three
    spatial/temporal extents from the symbolic RShape input variable.
    """
    W, b, d, H, RShape = node.inputs
    W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
    return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def perform_(self, node, inputs, output_storage): def perform_(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format" print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape) output_storage[0][0] = computeR(W, b, d, H, RShape)
def c_code_cache_version(self):
    """Return the cache version key for the generated C code.

    An empty tuple disables caching: the C code is recompiled on
    every use instead of being reused from Theano's compile cache.
    """
    return ()
...@@ -55,7 +54,7 @@ class GpuConvTransp3D(GpuOp): ...@@ -55,7 +54,7 @@ class GpuConvTransp3D(GpuOp):
R = outputs[0] R = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvTransp3D > ///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n"); //printf("\t\t\t\tGpuConvTransp c code\\n");
...@@ -263,13 +262,13 @@ if(!work_complete){ ...@@ -263,13 +262,13 @@ if(!work_complete){
}}}}}} // for fail }}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D > ///////////// < /code generated by GpuConvTransp3D >
""" """
return strutil.render_string(codeSource,locals()) return strutil.render_string(codeSource, locals())
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the # It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position. # input pixels that were downsampled to that z-position.
codeSource = """ codeSource = """
__global__ void __global__ void
//thread block size = videoDur //thread block size = videoDur
//grid block size =(batchSize * inputChannels, videoHeight * videoWidth) //grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
...@@ -347,18 +346,20 @@ conv_transp_rows_stack( float* H, float* kern, float* bias, float* R, ...@@ -347,18 +346,20 @@ conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
# Single shared instance used by the optimizer below.
gpu_conv_transpd = GpuConvTransp3D()


@register_opt()
@local_optimizer([ConvTransp3D])
def local_gpu_conv_transpd(node):
    """Local optimizer: replace a CPU ConvTransp3D with its GPU counterpart.

    Applies only when at least one input was transferred from the GPU
    (its owner is a HostFromGpu op) and every output is float32.
    Unlike the other conv optimizers, the inputs are passed through
    unchanged: GpuConvTransp3D.make_node performs the GPU transfers
    itself. Returns the replacement list, or None when skipped.
    """
    if isinstance(node.op, ConvTransp3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction #If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W,b,d,H,Rshape = None): def computeR(W, b, d, H, Rshape=None):
assert len(W.shape) == 5 assert len(W.shape) == 5
assert len(H.shape) == 5 assert len(H.shape) == 5
assert len(b.shape) == 1 assert len(b.shape) == 1
...@@ -370,7 +371,7 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -370,7 +371,7 @@ def computeR(W,b,d,H,Rshape = None):
assert outputChannelsAgain == outputChannels assert outputChannelsAgain == outputChannels
assert b.shape[0] == inputChannels assert b.shape[0] == inputChannels
dr,dc,dt = d dr, dc, dt = d
assert dr > 0 assert dr > 0
assert dc > 0 assert dc > 0
assert dt > 0 assert dt > 0
...@@ -398,14 +399,14 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -398,14 +399,14 @@ def computeR(W,b,d,H,Rshape = None):
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] #R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize): for i in xrange(0, batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize) #print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels): for j in xrange(0, inputChannels):
#print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels) #print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
for r in xrange(0,videoHeight): for r in xrange(0, videoHeight):
#print '\t\t\trow '+str(r+1)+'/'+str(videoHeight) #print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
for c in xrange(0,videoWidth): for c in xrange(0, videoWidth):
for t in xrange(0,videoDur): for t in xrange(0, videoDur):
R[i,j,r,c,t] = b[j] R[i,j,r,c,t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur +1 )/float(dt))) ]) ftc = max([0, int(numpy.ceil(float(t-filterDur +1 )/float(dt))) ])
...@@ -432,16 +433,16 @@ def computeR(W,b,d,H,Rshape = None): ...@@ -432,16 +433,16 @@ def computeR(W,b,d,H,Rshape = None):
R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] ) R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
tc += 1 tc += 1
"" #close loop over tc "" # close loop over tc
cc += 1 cc += 1
"" #close loop over cc "" # close loop over cc
rc += 1 rc += 1
"" #close loop over rc "" # close loop over rc
"" #close loop over t "" # close loop over t
"" #close loop over c "" # close loop over c
"" #close loop over r "" # close loop over r
"" #close loop over j "" # close loop over j
"" #close loop over i "" # close loop over i
return R return R
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论