提交 12cc6f02 authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: Pascal Lamblin

update cpu gradweight

上级 3937acc7
...@@ -32,6 +32,9 @@ from theano.sandbox.cuda.opt import values_eq_approx_high_tol ...@@ -32,6 +32,9 @@ from theano.sandbox.cuda.opt import values_eq_approx_high_tol
## Cpu implementation ## Cpu implementation
from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
_logger = logging.getLogger("theano.tensor.nnet.conv2d") _logger = logging.getLogger("theano.tensor.nnet.conv2d")
...@@ -434,7 +437,7 @@ def local_conv2d_cudnn(node): ...@@ -434,7 +437,7 @@ def local_conv2d_cudnn(node):
direction_hint='bprop inputs', direction_hint='bprop inputs',
conv_mode = conv_mode) conv_mode = conv_mode)
return [rval] return [rval]
register_specialize_device(local_conv2d_cudnn) #register_specialize_device(local_conv2d_cudnn)
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
...@@ -445,7 +448,6 @@ def local_conv2d_corrmm(node): ...@@ -445,7 +448,6 @@ def local_conv2d_corrmm(node):
not isinstance(kern.type, CudaNdarrayType)): not isinstance(kern.type, CudaNdarrayType)):
return None return None
print "here"
if node.op.border_mode in ['full', 'valid']: if node.op.border_mode in ['full', 'valid']:
border_mode = node.op.border_mode border_mode = node.op.border_mode
...@@ -495,7 +497,7 @@ def local_conv2d_corrmm(node): ...@@ -495,7 +497,7 @@ def local_conv2d_corrmm(node):
rval = GpuCorrMM_gradInputs('valid', subsample)( rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
return [rval] return [rval]
register_specialize_device(local_conv2d_corrmm) #register_specialize_device(local_conv2d_corrmm)
@local_optimizer([AbstractConv2d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_corrmm(node): def local_conv2d_gradweight_corrmm(node):
...@@ -511,7 +513,7 @@ def local_conv2d_gradweight_corrmm(node): ...@@ -511,7 +513,7 @@ def local_conv2d_gradweight_corrmm(node):
subsample=node.op.subsample)( subsample=node.op.subsample)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape) gpu_contiguous(img), gpu_contiguous(topgrad), shape)
return [rval] return [rval]
register_specialize_device(local_conv2d_gradweight_corrmm) #register_specialize_device(local_conv2d_gradweight_corrmm)
@local_optimizer([AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_corrmm(node): def local_conv2d_gradinputs_corrmm(node):
...@@ -528,7 +530,7 @@ def local_conv2d_gradinputs_corrmm(node): ...@@ -528,7 +530,7 @@ def local_conv2d_gradinputs_corrmm(node):
subsample=node.op.subsample)( subsample=node.op.subsample)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape) gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval] return [rval]
register_specialize_device(local_conv2d_gradinputs_corrmm) #register_specialize_device(local_conv2d_gradinputs_corrmm)
...@@ -537,6 +539,7 @@ register_specialize_device(local_conv2d_gradinputs_corrmm) ...@@ -537,6 +539,7 @@ register_specialize_device(local_conv2d_gradinputs_corrmm)
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node): def local_conv2d_cpu(node):
import pdb; pdb.set_trace()
if not isinstance(node.op, AbstractConv2d): if not isinstance(node.op, AbstractConv2d):
return None return None
...@@ -556,33 +559,36 @@ register_specialize_device(local_conv2d_cpu) ...@@ -556,33 +559,36 @@ register_specialize_device(local_conv2d_cpu)
@local_optimizer([AbstractConv2d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node): def local_conv2d_gradweight_cpu(node):
import pdb; pdb.set_trace()
## len is 4 all the time ## len is 4 all the time
img, topgrad, shape = node.inputs img, topgrad, shape = node.inputs
if isinstance(img.type, CudaNdarrayType) or \ if isinstance(img.type, CudaNdarrayType) or \
isinstance(topgrad.type, CudaNdarrayType): isinstance(topgrad.type, CudaNdarrayType):
return None return None
if node.op.border_mode == 'valid' and node.op.subsample != (1, 1): if (node.op.border_mode == 'valid' and node.op.subsample != (1, 1)) or \
node.op.imshp is None or node.op.kshp is None:
# Use the gradient as defined in conv3D, because the implementation # Use the gradient as defined in conv3D, because the implementation
# by Conv is slow (about 3x slower than conv3D, and probably 10x # by Conv is slow (about 3x slower than conv3D, and probably 10x
# slower than it could be), and incorrect when dx or dy > 2. # slower than it could be), and incorrect when subsample > 2.
# build a "node", that should be equivalent to the one given by # build a "node", that should be equivalent to the one given by
# self.make_node, but using convGrad3D instead. # self.make_node, but using convGrad3D instead.
shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1) shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1) shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
rval = ConvGrad3D(V=shuffled_img, print shape
d=(op.subsample[0], op.subsample[1], 1), rval = convGrad3D(V=shuffled_img,
WShape=(self.kshp[0], self.kshp[1], 1), d=(node.op.subsample[0], node.op.subsample[1], 1),
dCdH_=shuffled_topgrad) WShape=(shape[0], shape[2], shape[3], 1, shape[1]),
dCdH=shuffled_topgrad)
rval = theano.tensor.addbroadcast(rval, 3)
return [rval.dimshuffle(0, 4, 1, 2)] return [rval.dimshuffle(0, 4, 1, 2)]
if node.op.imshp is None or node.op.kshp is None:
return None
####### Determine gradient on kernels ######## ####### Determine gradient on kernels ########
assert len(node.op.imshp) == 4 and len(node.op.kshp) == 4 assert len(node.op.imshp) == 4 and len(node.op.kshp) == 4
print "here0", node.op.imshp[2:], node.op.kshp[2:] print "here0", node.op.imshp[2:], node.op.kshp[2:]
import pdb; pdb.set_trace()
outshp = ConvOp.getOutputShape(node.op.imshp[2:], outshp = ConvOp.getOutputShape(node.op.imshp[2:],
node.op.kshp[2:], node.op.subsample, node.op.kshp[2:], node.op.subsample,
...@@ -592,9 +598,15 @@ def local_conv2d_gradweight_cpu(node): ...@@ -592,9 +598,15 @@ def local_conv2d_gradweight_cpu(node):
node.op.border_mode) node.op.border_mode)
print outshp, fulloutshp print outshp, fulloutshp
#newimg = img.dimshuffle((1, 0, 2, 3))
#newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
newimg = img
newtopgrad = topgrad
if node.op.border_mode == 'valid': if node.op.border_mode == 'valid':
print "here1", node.op.imshp, node.op.kshp, fulloutshp print "here1", node.op.imshp, node.op.kshp, fulloutshp
(img, filters) = (img, topgrad) (img, filters) = (newimg, newtopgrad)
kshp_logical = fulloutshp kshp_logical = fulloutshp
kshp_logical_top_aligned = False kshp_logical_top_aligned = False
imshp_logical = None imshp_logical = None
...@@ -602,15 +614,15 @@ def local_conv2d_gradweight_cpu(node): ...@@ -602,15 +614,15 @@ def local_conv2d_gradweight_cpu(node):
imshp = (bsize, node.op.imshp[1], node.op.imshp[2]) imshp = (bsize, node.op.imshp[1], node.op.imshp[2])
kshp = node.op.kshp[2:] kshp = node.op.kshp[2:]
elif node.op.border_mode == 'full': elif node.op.border_mode == 'full':
(img, filters) = (topgrad, img) (img, filters) = (newtopgrad, newimg)
kshp_logical = None kshp_logical = None
kshp_logical_top_aligned = True kshp_logical_top_aligned = True
imshp_logical = (node.op.imshp[0], imshp_logical = (node.op.imshp[0],
fulloutshp[0], fulloutshp[0],
fulloutshp[1]) ## FIXME fulloutshp[1])
(bsize, nkern) = (node.op.kshp[0], node.op.imshp[1]) (bsize, nkern) = (node.op.kshp[0], node.op.imshp[1])
imshp = (node.op.imshp[0], outshp[0], outshp[1]) ## FIXME imshp = (node.op.imshp[0], outshp[0], outshp[1])
kshp = node.op.imshp[1:] ## FIXME kshp = node.op.imshp[1:]
else: else:
raise NotImplementedError( raise NotImplementedError(
'Only [full,valid] modes are currently supported.') 'Only [full,valid] modes are currently supported.')
...@@ -629,26 +641,46 @@ def local_conv2d_gradweight_cpu(node): ...@@ -629,26 +641,46 @@ def local_conv2d_gradweight_cpu(node):
#dw = ConvOp(output_mode='valid') #dw = ConvOp(output_mode='valid')
res = dw(img, filters) res = dw(img, filters)
print "here3", node.op.imshp, node.op.kshp, fulloutshp print "here3", node.op.imshp, node.op.kshp, fulloutshp
res = res.dimshuffle((1, 0, 2, 3))
return [res] return [res]
register_specialize_device(local_conv2d_gradweight_cpu) register_specialize_device(local_conv2d_gradweight_cpu)
@local_optimizer([AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node): def local_conv2d_gradinputs_cpu(node):
import pdb; pdb.set_trace()
kern, topgrad, shape = node.inputs kern, topgrad, shape = node.inputs
if isinstance(kern.type, CudaNdarrayType) or \ if isinstance(kern.type, CudaNdarrayType) or \
isinstance(topgrad.type, CudaNdarrayType): isinstance(topgrad.type, CudaNdarrayType):
return None return None
print "here4a", node.op.imshp, node.op.kshp
if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
# Use the gradient as defined in conv3D, because the implementation
# by Conv is slow (about 3x slower than conv3D, and probably 10x
# slower than it could be), and incorrect when subsample > 2.
# build a "node", that should be equivalent to the one given by
# self.make_node, but using convGrad3D instead.
shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
b = T.zeros((kern.shape[1]))
rval = ConvTransp3D(W=shuffled_kern, b=b,
d=(op.subsample[0], op.subsample[1], 1),
H=shuffled_topgrad,
RShape=(shape[0], shape[1], 1))
return [rval.dimshuffle(0, 4, 1, 2)]
####### Determine gradient on inputs ######## ####### Determine gradient on inputs ########
mode = 'valid' mode = 'valid'
if not node.op.border_mode == 'full': if not node.op.border_mode == 'full':
mode = 'full' mode = 'full'
filters = kern.dimshuffle((1, 0, 2, 3))
if node.op.filter_flip:
filters = filters[:, :, ::-1, ::-1]
filters = kern.dimshuffle((1, 0, 2, 3))
outshp = ConvOp.getOutputShape(node.op.imshp[2:], outshp = ConvOp.getOutputShape(node.op.imshp[2:],
node.op.kshp[2:], node.op.subsample, node.op.kshp[2:], node.op.subsample,
node.op.border_mode) node.op.border_mode)
...@@ -659,6 +691,10 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -659,6 +691,10 @@ def local_conv2d_gradinputs_cpu(node):
imshp = (nkern, outshp[0], outshp[1]) imshp = (nkern, outshp[0], outshp[1])
imshp_logical = (nkern, fulloutshp[0], fulloutshp[1]) imshp_logical = (nkern, fulloutshp[0], fulloutshp[1])
if node.op.filter_flip:
filters = filters[:, :, ::-1, ::-1]
print "here4", imshp, node.op.kshp, nkern print "here4", imshp, node.op.kshp, nkern
din = ConvOp(imshp, din = ConvOp(imshp,
node.op.kshp[2:], node.op.kshp[2:],
...@@ -671,16 +707,6 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -671,16 +707,6 @@ def local_conv2d_gradinputs_cpu(node):
kshp_logical=None, kshp_logical=None,
version=-1, version=-1,
direction_hint='bprop inputs') direction_hint='bprop inputs')
#din = ConvOp()
print "here5"
din = din(topgrad, filters) din = din(topgrad, filters)
print "here6"
#assert all(o is None or o == i
# for o, i in zip(din.owner.op.outshp, node.op.imshp[1:]))
# din and dw should have the same broadcasting pattern as the
# parameters they are the gradient of (resp. inputs and kerns).
din = din
return [din] return [din]
register_specialize_device(local_conv2d_gradinputs_cpu) register_specialize_device(local_conv2d_gradinputs_cpu)
...@@ -34,8 +34,11 @@ class TestConv2d(unittest.TestCase): ...@@ -34,8 +34,11 @@ class TestConv2d(unittest.TestCase):
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32') filters_val = numpy.random.random(filters_shape).astype('float32')
inputs = shared(inputs_val) ### FIXME (CPU vs GPU)
filters = shared(filters_val) inputs = theano.tensor.shared(inputs_val)
filters = theano.tensor.shared(filters_val)
c_ref = conv_ref.conv2d(inputs, filters, c_ref = conv_ref.conv2d(inputs, filters,
border_mode="valid", border_mode="valid",
subsample=subsample) subsample=subsample)
...@@ -63,23 +66,40 @@ class TestConv2d(unittest.TestCase): ...@@ -63,23 +66,40 @@ class TestConv2d(unittest.TestCase):
def run_gradweight(self, def run_gradweight(self,
inputs_shape, inputs_shape,
filters_shape, filters_shape,
output_shape,
subsample=(1, 1), subsample=(1, 1),
verify_grad=True, verify_grad=True,
mode=mode_without_gpu): mode=mode_without_gpu,
device='gpu',
provide_shape = False):
inputs_val = numpy.random.random(inputs_shape).astype('float32') inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32') output_val = numpy.random.random(output_shape).astype('float32')
if device == 'gpu':
inputs = shared(inputs_val)
filters = shared(filters_val)
else:
inputs = theano.tensor.shared(inputs_val)
output = theano.tensor.shared(output_val)
inputs = shared(inputs_val.transpose((1, 0, 2, 3))) if provide_shape:
filters = shared(filters_val.transpose((1, 0, 2, 3))[:,:,:,:]) imshp = inputs_shape
kshp = filters_shape
else:
imshp = None,
kshp = None
c = conv.AbstractConv2d_gradWeights(border_mode="valid", c = conv.AbstractConv2d_gradWeights(border_mode="valid",
subsample=subsample) subsample=subsample,
c = c(inputs, filters, inputs_shape) imshp = imshp, kshp = kshp)
c = c(inputs, output, filters_shape)
f = theano.function([], c, mode) f = theano.function([], c, mode)
res_ref = py_conv(inputs_val, filters_val, 'valid', subsample) res_ref = py_conv(inputs_val.transpose((1, 0, 2, 3)),
output_val.transpose((1, 0, 2, 3)),
'valid', subsample).transpose((1, 0, 2, 3))
print res_ref.shape, numpy.array(f()).shape print res_ref.shape, numpy.array(f()).shape
res = numpy.array(f()).transpose((1, 0, 2, 3)) res = numpy.array(f())
utt.assert_allclose(res_ref, res) utt.assert_allclose(res_ref, res)
if verify_grad: if verify_grad:
utt.verify_grad(conv.AbstractConv2d(border_mode="valid", utt.verify_grad(conv.AbstractConv2d(border_mode="valid",
...@@ -129,15 +149,12 @@ class TestConv2d(unittest.TestCase): ...@@ -129,15 +149,12 @@ class TestConv2d(unittest.TestCase):
# verify_grad=False, mode=mode) # verify_grad=False, mode=mode)
def test_cpu(self): #def test_cpu(self):
self.run_conv(inputs_shape=(16, 1, 2, 2), #self.run_conv(inputs_shape=(16, 1, 2, 2),
filters_shape=(10, 1, 2, 2),
verify_grad=True,
mode=mode_without_gpu)
# self.run_gradweight(inputs_shape=(16, 1, 2, 2),
# filters_shape=(10, 1, 2, 2), # filters_shape=(10, 1, 2, 2),
# verify_grad=False, mode=mode_without_gpu) # verify_grad=False,
#self.run_gradinput(inputs_shape=(1, 1, 2, 2), # mode=mode_without_gpu)
# self.run_gradinput(inputs_shape=(1, 1, 2, 2),
# filters_shape=(10, 1, 2, 2), # filters_shape=(10, 1, 2, 2),
# verify_grad=False, mode=mode_without_gpu) # verify_grad=False, mode=mode_without_gpu)
...@@ -166,4 +183,16 @@ class TestConv2d(unittest.TestCase): ...@@ -166,4 +183,16 @@ class TestConv2d(unittest.TestCase):
# # subsample=(2, 2), # # subsample=(2, 2),
# # verify_grad=True,mode=mode) # # verify_grad=True,mode=mode)
def test_cpu_grad_weight(self):
self.run_gradweight(inputs_shape=(16, 1, 2, 2),
filters_shape=(10, 1, 2, 2),
output_shape=(16, 10, 1, 1),
verify_grad=False, mode=mode_without_gpu, device='cpu')
self.run_gradweight(inputs_shape=(16, 1, 2, 2),
filters_shape=(10, 1, 2, 2),
output_shape=(16, 10, 1, 1),
verify_grad=False,
mode=mode_without_gpu, device='cpu',
provide_shape=True)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论