提交 015b42a4，作者: Frederic

pep8

上级 c3a461f7
......@@ -3,12 +3,14 @@ import numpy
import theano
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gpu, HostFromGpu
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, HostFromGpu)
from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """
......@@ -32,19 +34,21 @@ class GpuConv3D(GpuOp):
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False)
return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, inputs, outputs, sub):
V, W, b, d = inputs
fail = sub['fail']
H = outputs[0]
codeSource = """
codeSource = """
///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n");
......@@ -220,13 +224,13 @@ if(!work_complete){
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D >
"""
return strutil.render_string(codeSource,locals())
return strutil.render_string(codeSource, locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
codeSource = """
__global__ void
//thread block size = out_dur
//grid block size =(out_len*out_wid, nb kern *nb batch)
......@@ -283,11 +287,16 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
gpu_convd = GpuConv3D()
@register_opt()
@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
if isinstance(node.op, Conv3D):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
V, W, b, d = node.inputs
return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),as_cuda_ndarray_variable(W), as_cuda_ndarray_variable(b), d))]
return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
as_cuda_ndarray_variable(W),
as_cuda_ndarray_variable(b),
d))]
......@@ -12,7 +12,6 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """
......@@ -27,9 +26,10 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])
outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs
......@@ -51,18 +51,18 @@ class GpuConvGrad3D(GpuOp):
dCdW = numpy.zeros(WShape, dtype=V.dtype)
#block
for j in xrange(0,WShape[0]):
for z in xrange(0,WShape[1]):
for k in xrange(0,WShape[2]):
for l in xrange(0,WShape[3]):
#threads
for m in xrange(0,WShape[4]):
#thread
for i in xrange(0,batchSize):
for p in xrange(0,outputHeight):
for q in xrange(0,outputWidth):
for r in xrange(0,outputDur):
# block
for j in xrange(0, WShape[0]):
for z in xrange(0, WShape[1]):
for k in xrange(0, WShape[2]):
for l in xrange(0, WShape[3]):
# threads
for m in xrange(0, WShape[4]):
# thread
for i in xrange(0, batchSize):
for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth):
for r in xrange(0, outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
output_storage[0][0] = dCdW
......@@ -340,11 +340,17 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
gpu_conv_grad3d = GpuConvGrad3D()
@register_opt()
@local_optimizer([ConvGrad3D])
def local_gpu_conv_gradd(node):
if isinstance(node.op, ConvGrad3D):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
V, d, WShape, dCdH = node.inputs
return [host_from_gpu(gpu_conv_grad3d(as_cuda_ndarray_variable(V),d, WShape, as_cuda_ndarray_variable(dCdH)))]
return [host_from_gpu(gpu_conv_grad3d(
as_cuda_ndarray_variable(V),
d,
WShape,
as_cuda_ndarray_variable(dCdH)))]
......@@ -15,13 +15,13 @@ from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """
def __eq__(self,other):
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, W, b, d, H, RShape = None):
def make_node(self, W, b, d, H, RShape=None):
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
......@@ -29,22 +29,21 @@ class GpuConvTransp3D(GpuOp):
if RShape:
RShape_ = T.as_tensor_variable(RShape)
else:
RShape_ = T.as_tensor_variable([-1,-1,-1])
RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
outputs=[CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes):
W,b,d,H,RShape = node.inputs
W, b, d, H, RShape = node.inputs
W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def perform_(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape)
output_storage[0][0] = computeR(W, b, d, H, RShape)
def c_code_cache_version(self):
return ()
......@@ -55,7 +54,7 @@ class GpuConvTransp3D(GpuOp):
R = outputs[0]
codeSource = """
codeSource = """
///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n");
......@@ -263,13 +262,13 @@ if(!work_complete){
}}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D >
"""
return strutil.render_string(codeSource,locals())
return strutil.render_string(codeSource, locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
codeSource = """
__global__ void
//thread block size = videoDur
//grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
......@@ -347,18 +346,20 @@ conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
gpu_conv_transpd = GpuConvTransp3D()
@register_opt()
@local_optimizer([ConvTransp3D])
def local_gpu_conv_transpd(node):
if isinstance(node.op, ConvTransp3D):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
W, b, d, H, RShape = node.inputs
return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W,b,d,H,Rshape = None):
def computeR(W, b, d, H, Rshape=None):
assert len(W.shape) == 5
assert len(H.shape) == 5
assert len(b.shape) == 1
......@@ -370,7 +371,7 @@ def computeR(W,b,d,H,Rshape = None):
assert outputChannelsAgain == outputChannels
assert b.shape[0] == inputChannels
dr,dc,dt = d
dr, dc, dt = d
assert dr > 0
assert dc > 0
assert dt > 0
......@@ -398,14 +399,14 @@ def computeR(W,b,d,H,Rshape = None):
videoWidth, videoDur ) , dtype=H.dtype)
#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0,batchSize):
for i in xrange(0, batchSize):
#print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0,inputChannels):
for j in xrange(0, inputChannels):
#print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
for r in xrange(0,videoHeight):
for r in xrange(0, videoHeight):
#print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
for c in xrange(0,videoWidth):
for t in xrange(0,videoDur):
for c in xrange(0, videoWidth):
for t in xrange(0, videoDur):
R[i,j,r,c,t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur +1 )/float(dt))) ])
......@@ -432,16 +433,16 @@ def computeR(W,b,d,H,Rshape = None):
R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
tc += 1
"" #close loop over tc
"" # close loop over tc
cc += 1
"" #close loop over cc
"" # close loop over cc
rc += 1
"" #close loop over rc
"" #close loop over t
"" #close loop over c
"" #close loop over r
"" #close loop over j
"" #close loop over i
"" # close loop over rc
"" # close loop over t
"" # close loop over c
"" # close loop over r
"" # close loop over j
"" # close loop over i
return R
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论