提交 e4c6880e authored 作者: Razvan Pascanu's avatar Razvan Pascanu

merge; no conflicts

...@@ -50,7 +50,20 @@ class HostFromGpu(Op): ...@@ -50,7 +50,20 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x) z[0] = numpy.asarray(x)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
if isinstance(gz, tensor.TensorType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [gpu_from_host(gz)] return [gpu_from_host(gz)]
else:
return [gz]
def R_op(self, inputs, eval_points):
    """R-operator: forward-propagate the evaluation point through the
    GPU->host transfer.

    The output of this op lives on the host, so a GPU-typed eval point
    must be moved with ``host_from_gpu``; a host-typed eval point (which
    can happen when Rop/Lop is fed plain tensors) passes through as-is.
    """
    ev, = eval_points
    # FIX: the original tested `isinstance(ev, tensor.TensorType)`, which
    # is always False -- `ev` is a Variable, never an instance of the Type
    # class; the variable's `.type` must be inspected instead.  It also
    # applied `gpu_from_host`, which would produce a GPU value for this
    # op's host-typed output (the direction was copy-pasted from grad,
    # where the reversal is correct).
    if isinstance(ev.type, tensor.TensorType):
        # Already a host tensor: matches this op's output type.
        return [ev]
    else:
        return [host_from_gpu(ev)]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
...@@ -72,7 +85,21 @@ class GpuFromHost(Op): ...@@ -72,7 +85,21 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0]) z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)] return [host_from_gpu(gz)]
else:
return [gz]
def R_op(self, inputs, eval_points):
    """R-operator: forward-propagate the evaluation point through the
    host->GPU transfer.

    The output of this op lives on the GPU, so a host-typed eval point
    must be moved with ``gpu_from_host``; an eval point that is already a
    CudaNdarray passes through as-is.
    """
    ev, = eval_points
    # FIX: the original tested `isinstance(ev, CudaNdarrayType)`, which is
    # always False -- `ev` is a Variable, never an instance of the Type
    # class; test `ev.type` instead.  It also applied `host_from_gpu`,
    # producing a host value for this op's GPU-typed output (direction
    # copy-pasted from grad, where the reversal is correct).
    if isinstance(ev.type, CudaNdarrayType):
        # Already on the GPU: matches this op's output type.
        return [ev]
    else:
        return [gpu_from_host(ev)]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
gpu_from_host = GpuFromHost() gpu_from_host = GpuFromHost()
......
...@@ -24,5 +24,6 @@ from sharedvar import tensor_constructor as shared ...@@ -24,5 +24,6 @@ from sharedvar import tensor_constructor as shared
import nnet # used for softmax, sigmoid, etc. import nnet # used for softmax, sigmoid, etc.
from tensor_grad import Rop, Lop, grad, numeric_grad, verify_grad
差异被折叠。
...@@ -218,6 +218,13 @@ class DimShuffle(Op): ...@@ -218,6 +218,13 @@ class DimShuffle(Op):
rval.insert(augm, 1) rval.insert(augm, 1)
return [rval] return [rval]
def R_op(self, inputs, eval_points):
    """R-operator: DimShuffle is linear, so applying the op itself to the
    evaluation point gives the directional derivative directly."""
    pt, = eval_points
    if pt is None:
        # Nothing to propagate for an undefined evaluation point.
        return [None]
    return self.make_node(pt).outputs
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
input, = inp input, = inp
res, = out res, = out
...@@ -534,7 +541,78 @@ class Elemwise(Op): ...@@ -534,7 +541,78 @@ class Elemwise(Op):
else: else:
return self.name return self.name
def R_op(self, inputs, eval_points):
# R-operator for an elementwise op: for each output `o`, compute
# sum_j (d o / d input_j) * eval_point_j, reusing the symbolic partial
# derivatives that _bgrad produces for the gradient.
outs = self.make_node(*inputs).outputs
# One R-op result per output; entries stay None when no differentiable
# input had an evaluation point.
rval = [None for x in outs]
# For each output
for idx, out in enumerate(outs):
# make such that _bgrads computes only the gradients of the
# current output on the inputs ( and not all outputs)
# (zeros for every output gradient except a ones_like on `out`)
ograds = [ theano.tensor.zeros_like(x) for x in outs]
ograds[idx] = theano.tensor.ones_like(out)
bgrads = self._bgrad(inputs, ograds)
# Accumulator for sum_j bgrads[j] * eval_points[j]; kept as None until
# the first differentiable term so we avoid a spurious leading zero.
rop_out = None
for jdx, (inp, eval_point) in enumerate(zip(inputs,
eval_points)):
# if None, then we can just ignore this branch ..
# what we do is to assume that for any non-differentiable
# branch, the gradient is actually 0, which I think is not
# the right thing to do .. have to talk to Ian and James
# about it
if bgrads[jdx] is None:
pass
elif eval_point is not None:
if rop_out is None:
rop_out = bgrads[jdx]*eval_point
else:
rop_out = rop_out + bgrads[jdx]*eval_point
rval[idx] = rop_out
return rval
def grad(self, inputs, ograds): def grad(self, inputs, ograds):
#compute grad with respect to broadcasted input
rval = self._bgrad(inputs,ograds)
#sum out the broadcasted dimensions
for i, ipt in enumerate(inputs):
if rval[i] is None:
continue
# list of all the dimensions that are broadcastable for input[i] so we
# can sum over them
# todo: only count dimensions that were effectively broadcasted
to_sum = [j for j, bcast in enumerate(ipt.type.broadcastable) if bcast]
if to_sum:
shuffle = []
j = 0
for bcast in ipt.type.broadcastable:
if bcast == 1:
shuffle.append('x')
else:
shuffle.append(j)
j += 1
#close if
#close for
sr = Sum(axis = to_sum)(rval[i])
sr = sr.dimshuffle(shuffle)
#sr = DimShuffle(sr.type.broadcastable, shuffle)(sr)
rval[i] = sr
#close if
#close for
return rval
def _bgrad(self, inputs, ograds):
# returns grad, with respect to broadcasted versions of inputs
# Gradients (especially on the final costs) don't have to be symbolic # Gradients (especially on the final costs) don't have to be symbolic
# e.g., ograds will be [ 1. ] if your objective is c and the output # e.g., ograds will be [ 1. ] if your objective is c and the output
# of the current apply node is c # of the current apply node is c
...@@ -558,35 +636,17 @@ class Elemwise(Op): ...@@ -558,35 +636,17 @@ class Elemwise(Op):
broadcastable = ()), broadcastable = ()),
numpy.asarray(r.data)) # .reshape(b) numpy.asarray(r.data)) # .reshape(b)
return DimShuffle((), ['x']*nd, inplace = True)(res) return DimShuffle((), ['x']*nd, inplace = True)(res)
new_r = Elemwise(node.op, {})(*[transform(input) for input in node.inputs]) new_r = Elemwise(node.op, {})(*[transform(ipt) for ipt in node.inputs])
return new_r return new_r
ret = [] ret = []
for scalar_igrad, input in zip(scalar_igrads, inputs): for scalar_igrad, ipt in zip(scalar_igrads, inputs):
if scalar_igrad is None: if scalar_igrad is None:
# undefined gradient # undefined gradient
ret.append(None) ret.append(None)
continue continue
r = transform(scalar_igrad) ret.append( transform(scalar_igrad))
# list of all the dimensions that are broadcastable for that input so we
# can sum over them
# todo: only count dimensions that were effectively broadcasted
to_sum = [i for i, bcast in enumerate(input.type.broadcastable) if bcast]
if to_sum:
shuffle = []
j = 0
for bcast in input.type.broadcastable:
if bcast == 1:
shuffle.append('x')
else:
shuffle.append(j)
j += 1
sr = Sum(axis = to_sum)(r)
sr = DimShuffle(sr.type.broadcastable, shuffle)(sr)
ret.append(sr)
else:
ret.append(r)
return ret return ret
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage):
...@@ -1180,6 +1240,11 @@ class Sum(CAReduce): ...@@ -1180,6 +1240,11 @@ class Sum(CAReduce):
i += 1 i += 1
return Elemwise(scalar.second)(x, DimShuffle(gz.type.broadcastable, new_dims)(gz)), return Elemwise(scalar.second)(x, DimShuffle(gz.type.broadcastable, new_dims)(gz)),
def R_op(self, inputs, eval_points):
    """R-operator: summation is linear, so the op applied to the
    evaluation point is exactly the directional derivative."""
    pt, = eval_points
    if pt is None:
        return [None]
    return self.make_node(pt).outputs
def __str__(self): def __str__(self):
if self.axis is None: if self.axis is None:
return "Sum" return "Sum"
......
...@@ -347,6 +347,13 @@ class Softmax(gof.Op): ...@@ -347,6 +347,13 @@ class Softmax(gof.Op):
sm = softmax(x) sm = softmax(x)
return [softmax_grad(g_sm, sm)] return [softmax_grad(g_sm, sm)]
def R_op(self, inputs, eval_points):
    """R-operator computed via grad: the softmax Jacobian is symmetric
    (diag(s) - s s^T for row softmax), so J*v equals J^T*v and the
    L-operator implementation can be reused unchanged."""
    pt, = eval_points
    if pt is None:
        return [None]
    return self.grad(inputs, [pt])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
......
...@@ -469,6 +469,11 @@ class MakeVector(T.Op): ...@@ -469,6 +469,11 @@ class MakeVector(T.Op):
grads.append(output_gradients[0][i]) grads.append(output_gradients[0][i])
return grads return grads
def R_op(self, inputs, eval_points):
    """R-operator: MakeVector is linear in each input, so applying the op
    to the evaluation points yields the directional derivative."""
    for pt in eval_points:
        if pt is None:
            # Any missing eval point makes the whole result undefined.
            return [None]
    return self.make_node(*eval_points).outputs
make_vector = MakeVector() make_vector = MakeVector()
class MakeVectorPrinter: class MakeVectorPrinter:
......
差异被折叠。
...@@ -2,6 +2,48 @@ ...@@ -2,6 +2,48 @@
WRITE ME WRITE ME
Tests for the R operator / L operator Tests for the R operator / L operator
Ops that do not yet support the R-op:
PermuteRowElements
Tile
AdvancedSubtensor
TensorDot
Outer
Prod
MulwithoutZeros
ProdWithoutZeros
list of ops that support R-op:
* Alloc
* Split
* ARange
* ScalarFromTensor
* Shape
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor
* Rebroadcast
* Join
* Reshape
* Flatten
* AdvancedSubtensor1
* AdvancedIncSubtensor1
* AdvancedIncSubtensor
* Dot
* DimShuffle
* Elemwise
* Sum
* Softmax
* Scan
""" """
import unittest import unittest
...@@ -10,52 +52,208 @@ from theano import function ...@@ -10,52 +52,208 @@ from theano import function
import theano import theano
import theano.tensor as TT import theano.tensor as TT
import numpy import numpy
from theano.gof import Op, Apply
class test_rop(unittest.TestCase): '''
Special Op created to test what happens when you have one op that is not
differentiable in the computational graph
'''
class BreakRop(Op):
    """Identity op whose grad and R_op are deliberately undefined.

    The tests insert this op into a graph to verify that ``Rop`` raises
    when a non-differentiable node is encountered.
    """
    def __eq__(self, other):
        # All instances are interchangeable: equality is by type.
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def make_node(self, x):
        return Apply(self, [x], [x.type()])
    def perform(self, node, inp, out_):
        # Pure pass-through of the single input.
        x, = inp
        out, = out_
        out[0] = x
    def grad(self, inp, grads):
        # Intentionally undefined gradient.
        return [None]
    def R_op(self, inputs, eval_points):
        # Intentionally undefined R-operator.
        return [None]
def test_specifyshape(self): break_op = BreakRop()
rng = numpy.random.RandomState(utt.fetch_seed())
vx = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX)
vv = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX) class test_RopLop(unittest.TestCase):
x = TT.vector('x') def setUp(self):
v = TT.vector('v') # Using vectors make things a lot simpler for generating the same
y = TT.specify_shape(x, (5,)) # computations using scan
yv = TT.Rop(y,x,v) self.x = TT.vector('x')
rop_f = function([x,v], yv) self.v = TT.vector('v')
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x), self.rng = numpy.random.RandomState(utt.fetch_seed())
sequences = TT.arange(x.shape[0]), self.in_shape = ( 5+self.rng.randint(30),)
non_sequences = [y,x]) self.mx = TT.matrix('mx')
sy = TT.dot(J, v) self.mv = TT.matrix('mv')
self.mat_in_shape = ( 5 + self.rng.randint(30),
5+self.rng.randint(30))
def check_nondiff_rop(self, y):
    """Assert that building the R-op of `y` raises ValueError.

    `y` is expected to contain a non-differentiable op, so ``TT.Rop``
    must refuse to construct the expression.
    """
    # FIX: dropped the unused `tmp` binding and the manual raised-flag in
    # favour of try/return; also corrected the grammar of the failure
    # message ("did not raised" -> "did not raise").
    try:
        TT.Rop(y, self.x, self.v)
    except ValueError:
        return
    self.fail('Op did not raise an error even though the function'
              ' is not differentiable')
def check_mat_rop_lop(self, y, out_shape):
vx = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=self.mat_in_shape), theano.config.floatX)
yv = TT.Rop(y, self.mx, self.mv)
rop_f = function([self.mx, self.mv], yv)
sy, _ = theano.scan( lambda i,y,x,v: (TT.grad(y[i],x)*v).sum(),
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.mx,self.mv])
scan_f = function([self.mx,self.mv], sy)
scan_f = function([x,v], sy)
v1 = rop_f(vx,vv) v1 = rop_f(vx,vv)
v2 = scan_f(vx,vv) v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2) assert numpy.allclose(v1,v2)
self.check_nondiff_rop( theano.clone(y,
replace={self.mx:break_op(self.mx)}))
vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
yv = TT.Lop(y, self.mx, self.v)
lop_f = function([self.mx, self.v], yv)
sy, _ = theano.scan( lambda i,y,x,v: (TT.grad(y[i]*v[i],x))[i],
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.mx,self.v])
scan_f = function([self.mx, self.v], sy)
class test_lop(unittest.TestCase):
def test_specifyshape(self): v1 = lop_f(vx,vv)
rng = numpy.random.RandomState(utt.fetch_seed()) v2 = scan_f(vx,vv)
vx = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX) assert numpy.allclose(v1,v2)
vv = numpy.asarray(rng.uniform(size=(5,)), theano.config.floatX)
x = TT.vector('x') def check_rop_lop(self, y, out_shape):
v = TT.vector('v') # TEST ROP
y = TT.specify_shape(x, (5,)) vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
yv = TT.Lop(y,x,v) vv = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
rop_f = function([x,v], yv)
yv = TT.Rop(y,self.x,self.v)
rop_f = function([self.x,self.v], yv)
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x), J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x),
sequences = TT.arange(x.shape[0]), sequences = TT.arange(y.shape[0]),
non_sequences = [y,x]) non_sequences = [y,self.x])
sy = TT.dot(v, J) sy = TT.dot(J, self.v)
scan_f = function([x,v], sy) scan_f = function([self.x,self.v], sy)
v1 = rop_f(vx,vv) v1 = rop_f(vx,vv)
v2 = scan_f(vx,vv) v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2) assert numpy.allclose(v1,v2)
self.check_nondiff_rop( theano.clone(y,
replace={self.x:break_op(self.x)}))
# TEST LOP
vx = numpy.asarray(self.rng.uniform(size=self.in_shape), theano.config.floatX)
vv = numpy.asarray(self.rng.uniform(size=out_shape), theano.config.floatX)
yv = TT.Lop(y,self.x,self.v)
lop_f = function([self.x,self.v], yv)
J, _ = theano.scan( lambda i,y,x: TT.grad(y[i],x),
sequences = TT.arange(y.shape[0]),
non_sequences = [y,self.x])
sy = TT.dot(self.v, J)
scan_f = function([self.x,self.v], sy)
v1 = lop_f(vx,vv)
v2 = scan_f(vx,vv)
assert numpy.allclose(v1,v2)
def test_shape(self):
# Shape is not differentiable, so Rop on it must raise.
self.check_nondiff_rop( self.x.shape[0])
def test_specifyshape(self):
# SpecifyShape is an identity op with a shape assertion; R-op and L-op
# should both match the scan-based reference.
self.check_rop_lop(TT.specify_shape(self.x, self.in_shape),
self.in_shape)
def test_max_argmax(self):
    """max along axis 1 is differentiable: check R-op and L-op."""
    # FIX: the original called `check_map_rop_lop`, a typo for
    # `check_mat_rop_lop` that would raise AttributeError if run.
    self.check_mat_rop_lop(TT.max(self.mx, axis=1),
                           (self.mat_in_shape[0],))

def test_argmax(self):
    """argmax is not differentiable: Rop must raise."""
    # FIX: renamed from a second `test_max_argmax` definition that
    # silently shadowed the method above, so only one test ever ran.
    self.check_nondiff_rop(TT.argmax(self.mx, axis=1))
def test_subtensor(self):
# Basic slicing: output has 4 of the input's elements.
self.check_rop_lop(self.x[:4], (4,))
def test_incsubtensor1(self):
    """inc_subtensor where the differentiable input is the target."""
    tv = numpy.asarray(self.rng.uniform(size=(3,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.inc_subtensor(self.x[:3], t)
    self.check_rop_lop(out, self.in_shape)

def test_incsubtensor2(self):
    """inc_subtensor where the differentiable input is the increment."""
    # FIX: renamed from a duplicate `test_incsubtensor1` definition that
    # shadowed the method above, so only one of the two cases ever ran.
    tv = numpy.asarray(self.rng.uniform(size=(10,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.inc_subtensor(t[:4], self.x[:4])
    self.check_rop_lop(out, (10,))
def test_setsubtensor1(self):
    """set_subtensor where the differentiable input is the target."""
    tv = numpy.asarray(self.rng.uniform(size=(3,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.set_subtensor(self.x[:3], t)
    self.check_rop_lop(out, self.in_shape)

def test_setsubtensor2(self):
    """set_subtensor where the differentiable input is the new value."""
    # FIX: renamed from a duplicate `test_setsubtensor1` definition that
    # shadowed the method above, so only one of the two cases ever ran.
    tv = numpy.asarray(self.rng.uniform(size=(10,)),
                       theano.config.floatX)
    t = theano.shared(tv)
    out = TT.set_subtensor(t[:4], self.x[:4])
    self.check_rop_lop(out, (10,))
def test_join(self):
# Join with a constant (shared) second operand; output length is the
# sum of the two operand lengths.
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
out = TT.join(0, self.x, t)
self.check_rop_lop(out, (self.in_shape[0]+10,))
def test_dot(self):
# Vector-matrix dot with a fixed (shared) square weight matrix.
insh = self.in_shape[0]
vW = numpy.asarray(self.rng.uniform(size=(insh,insh)),
theano.config.floatX)
W = theano.shared(vW)
self.check_rop_lop( TT.dot(self.x, W), self.in_shape)
def test_elemwise0(self):
# Simple differentiable elemwise chain: add then square.
self.check_rop_lop( (self.x+1)**2, self.in_shape)
def test_elemwise1(self):
# Elemwise with a mixed-dtype operand: the int32 cast branch is
# non-differentiable and should contribute nothing.
self.check_rop_lop( self.x+TT.cast(self.x, 'int32'),
self.in_shape)
def test_sum(self):
# Row-wise sum of a matrix; result is a vector of row sums.
self.check_mat_rop_lop(self.mx.sum(axis=1), (self.mat_in_shape[0],))
def test_softmax(self):
# Softmax adds an extra dimension (it returns a 2-D result for a
# vector input), hence the [0] indexing.
self.check_rop_lop( TT.nnet.softmax(self.x)[0], self.in_shape)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论