nnet_conv1 passes with all computations on GPU

71338c0b · James Bergstra · 9c146bd8 · 71338c0b · 71338c0b · 71338c0b
--- a/basic_ops.py
+++ b/basic_ops.py
@@ -609,3 +609,124 @@ class GpuSum(Op):
    def perform(self, node, (x,), (z,)):
        z[0] = x.reduce_sum(self.reduce_mask)

+class GpuReshape(tensor.Reshape):  # Reshape variant whose output is a CudaNdarrayType
+    def make_node(self, x, shp):  # x: variable to reshape, shp: the target shape
+        return Apply(self, [x, shp], [CudaNdarrayType([False]*self.ndim)()])  # self.ndim dims, none broadcastable
+    def perform(self, node, (x, shp), (out,)):
+        if (len(shp) != self.ndim):  # the runtime shape must have exactly self.ndim entries
+            raise ValueError('shape argument to Reshape.perform has incorrect length %i'
+                    ', should be %i' % (len(shp), self.ndim), shp)
+        out[0] = x.reshape(tuple(shp))  # delegate to the input's own reshape()
+
+class GpuDimFlip(Op):
+    """This Op implements a very special case of Subtensor, in which some (or all) of the
+    strides are negated.
+
+    This Op should be erased when a proper GpuSubtensor is implemented.
+    """
+
+    def __init__(self, mask):
+        Op.__init__(self)
+        self.mask = mask  # one flag per dimension; a true flag means "flip this dimension"
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self.mask == other.mask
+
+    def __hash__(self):
+        return hash(type(self)) ^ hash(self.mask)  # needs a hashable mask (e.g. a tuple)
+
+    def __str__(self):
+        return '%s{%s}' %(self.__class__.__name__, str(self.mask))
+
+    def perform(self, node, (x,), (out,)):
+        z = x.view()  # NOTE(review): the dim/str/dev_data attribute names below are unverified here
+        total_dev_data_offset = 0
+        for i, f in enumerate(self.mask):
+            if f and z.shape[i] > 1:
+                total_dev_data_offset += (z.dim[i] - 1) * z.str[i]  # start at the last element
+                z.str[i] *= -1  # ...and walk the dimension backwards
+        z.dev_data += total_dev_data_offset
+        out[0] = z
+
+
+class GpuSubtensor(tensor.Subtensor):
+    def make_node(self, x, *inputs):  # reuse Subtensor.make_node, then patch input 0 and the output type
+        rval = tensor.Subtensor.make_node(self, x, *inputs)
+        rval.inputs[0] = x # clobber the 'astensor'
+        rval.outputs[0].type = CudaNdarrayType(rval.outputs[0].type.broadcastable)
+        return rval
+
+    def perform(self, node, inputs, (out, )):  # slices a view of inputs[0] in place; integer indices unsupported
+        indices = list(reversed(inputs[1:]))  # reversed so pop() hands the values out in input order
+
+        def convert(entry):  # replace Type placeholders of idx_list by the runtime index values
+            if isinstance(entry, Type):
+                return indices.pop()
+            elif isinstance(entry, slice):
+                return slice(convert(entry.start),
+                             convert(entry.stop),
+                             convert(entry.step))
+            else:
+                return entry
+
+        x = inputs[0].view()
+        out[0] = x
+        #todo; when this works, put it into CudaNdarray.__getitem__
+        #      (sequence protocol)
+        x_shape = x.shape
+        x_strides = x._strides
+        offset = 0
+        for i, thing in enumerate(map(convert, self.idx_list)):
+            if isinstance(thing, int):
+                #this requires reducing the rank of the
+                # view....
+                raise NotImplementedError()
+
+            if isinstance(thing, slice):
+                #stride
+                if thing.step is None:
+                    stride = 1
+                else:
+                    stride = thing.step
+
+                #start (a negative index counts from the end: shape + index)
+                if thing.start is None:
+                    if stride > 0:
+                        start = 0
+                    else:
+                        start = x_shape[i]-1
+                else:
+                    if thing.start < 0:
+                        start = x_shape[i] + thing.start
+                    else:
+                        start = thing.start
+
+                #stop (exclusive; -1 means "one before index 0" for a negative stride)
+                if thing.stop is None:
+                    if stride > 0:
+                        stop = x_shape[i]
+                    else:
+                        stop = -1
+                else:
+                    if thing.stop < 0:
+                        stop = x_shape[i] + thing.stop
+                    else:
+                        stop = thing.stop
+
+                newlen = max(0, -((start - stop) // stride))  # ceil((stop-start)/stride), never negative
+                offset += x_strides[i] * start
+                x._set_shape_i(i, newlen)
+                x._set_stride(i, x_strides[i] * stride)
+
+            # NOTE(review): start/stop are not clamped to the dimension's bounds -- confirm callers stay in range
+        sizeof_float = 4  # offset is counted in elements; presumably _dev_data is in bytes -- TODO confirm
+        x._dev_data += offset * sizeof_float
+        # out[0] already refers to x, so the in-place shape/stride/offset updates above
+        # are the result of this Op.
+
+
+class GpuShape(tensor.Shape):  # Shape op that takes x directly as the Apply input
+    def make_node(self, x):
+        return Apply(self, [x], [tensor.lvector()])  # the single output is a tensor.lvector
+gpu_shape = GpuShape()  # module-level instance, used by the local_gpu_shape optimization
+
--- a/blas.py
+++ b/blas.py
@@ -2,6 +2,8 @@ from theano import Op, Type, Apply, Variable, Constant
 from theano import tensor, scalar
 import StringIO

+import cuda_ndarray
+
 class GpuDot22(Op):
    def __str__(self):
        return 'GpuDot22'
@@ -109,3 +111,76 @@ class GpuGemm(Op):
        Py_INCREF(cnda_%(z_out)s);
        """ % locals()
 gpu_gemm = GpuGemm()
+
+##
+# Not really a BLAS operation, but whatever.
+#
+class GpuConv(Op):
+    @staticmethod
+    def logical_output_shape_2d(imshp, kshp, mode):  # (rows, cols) produced by a 'valid' or 'full' 2d conv
+        if mode == 'valid':
+            return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
+        if mode == 'full':
+            return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
+        raise ValueError(mode)
+
+    def __init__(self, border_mode,
+            subsample=(1,1),
+            logical_img_hw=None,
+            logical_kern_hw=None,
+            logical_kern_align_top=True):
+        self.border_mode = border_mode
+        self.subsample = subsample
+        if logical_img_hw is not None:
+            #TODO: reconsider this... since shapes are not given in constructor, maybe a
+            # multiplier + offset is a more appropriate way of passing this logical grid
+            logical_img_hw = tuple(logical_img_hw) # tuple so that __hash__ works
+            h,w = logical_img_hw # raises unless it is a (height, width) pair
+        self.logical_img_hw = logical_img_hw # stays None when no logical shape was given
+        if logical_kern_hw is not None:
+            #TODO: reconsider this... since shapes are not given in constructor, maybe a
+            # multiplier + offset is a more appropriate way of passing this logical grid
+            logical_kern_hw = tuple(logical_kern_hw) # tuple so that __hash__ works
+            h,w = logical_kern_hw # raises unless it is a (height, width) pair
+        self.logical_kern_hw = logical_kern_hw # stays None when no logical shape was given
+        self.logical_kern_align_top = logical_kern_align_top
+
+    def __eq__(self, other):  # equal when same type and all constructor parameters match
+        return type(self) == type(other) \
+            and self.border_mode == other.border_mode \
+            and self.subsample == other.subsample \
+            and self.logical_img_hw == other.logical_img_hw \
+            and self.logical_kern_hw == other.logical_kern_hw \
+            and self.logical_kern_align_top == other.logical_kern_align_top
+
+    def __hash__(self):  # combines exactly the fields compared in __eq__
+        return hash(type(self)) \
+            ^ hash(self.border_mode) \
+            ^ hash(self.subsample) \
+            ^ hash(self.logical_img_hw) \
+            ^ hash(self.logical_kern_hw) \
+            ^ hash(self.logical_kern_align_top)
+
+    def __str__(self):
+        return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
+                self.border_mode,
+                str(self.subsample),
+                str(self.logical_img_hw),
+                str(self.logical_kern_hw),
+                str(self.logical_kern_align_top))
+
+    def make_node(self, img, kern):  # img must be 4D and kern must have the same type as img
+        if img.type.ndim != 4:
+            raise TypeError('img must be 4D tensor')
+        if img.type != kern.type:
+            raise TypeError('img and kern must have same type')
+        return Apply(self, [img, kern], [img.type()])
+
+    def perform(self, node, (img, kern), (out,)):  # forwards the stored settings to cuda_ndarray.conv
+        out[0] = cuda_ndarray.conv(img, kern,
+                mode=self.border_mode,
+                subsample=self.subsample,
+                logical_img_shape=self.logical_img_hw,
+                logical_kern_shape=self.logical_kern_hw,
+                kern_align=self.logical_kern_align_top)
+
--- a/opt.py
+++ b/opt.py
@@ -3,7 +3,7 @@ from theano import tensor, scalar, compile
 from theano.gof import local_optimizer, EquilibriumDB, SequenceDB

 from .basic_ops import *
-from .blas import gpu_dot22, gpu_gemm
+from .blas import gpu_dot22, gpu_gemm, GpuConv

 from theano.compile import optdb
 #optdb.print_summary()  # this shows what is currently registered (in a so-far crude way...)
@@ -153,3 +153,79 @@ def local_gpu_sum(node):
                return [host_from_gpu(GpuSum(reduce_mask)(gpu_from_host(x)))]
    return False

+import theano.sandbox.conv
+@register_opt()
+@local_optimizer([])
+def local_gpu_conv(node):
+    """
+    gpu_from_host(conv) -> gpu_conv(gpu_from_host)
+    conv(host_from_gpu) -> host_from_gpu(gpu_conv(gpu_from_host))
+    Returns the replacement outputs as a list, or False when neither pattern matches.
+    """
+    def GpuConvOp_from_ConvOp(op):  # build a GpuConv carrying the same settings as the ConvOp
+        return GpuConv(border_mode=op.out_mode,
+                    subsample=(op.dx, op.dy),
+                    logical_img_hw=op.imshp_logical[1:3],
+                    logical_kern_hw=op.kshp_logical,
+                    logical_kern_align_top=op.kshp_logical_top_aligned
+                    )
+    if node.op == gpu_from_host:
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op, theano.sandbox.conv.ConvOp):
+            gpu_conv = GpuConvOp_from_ConvOp(host_input.owner.op)
+            img, kern = host_input.owner.inputs
+            return [gpu_conv(gpu_from_host(img), gpu_from_host(kern))]
+
+    if isinstance(node.op, theano.sandbox.conv.ConvOp):
+        img, kern = node.inputs
+        img_on_gpu = (img.owner and img.owner.op == host_from_gpu)
+        kern_on_gpu = (kern.owner and kern.owner.op == host_from_gpu)
+        if img_on_gpu or kern_on_gpu:  # one GPU-resident input is enough to move the conv
+            gpu_conv = GpuConvOp_from_ConvOp(node.op)
+            return [host_from_gpu(gpu_conv(gpu_from_host(img), gpu_from_host(kern)))]
+    return False  # explicit "no change", like the other local_gpu_* optimizers
+
+@register_opt()
+@local_optimizer([])
+def local_gpu_reshape(node):  # returns a replacement list, or False when nothing matches
+    if node.op == gpu_from_host:  # case 1: gpu_from_host(reshape(x, shp))
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op, tensor.Reshape):
+            rshp = host_input.owner.op
+            x, shp = host_input.owner.inputs
+            return [GpuReshape(rshp.ndim)(gpu_from_host(x), shp)]  # shp is passed through unchanged
+    if isinstance(node.op, tensor.Reshape):  # case 2: reshape(host_from_gpu(gpu_x), shp)
+        x, shp = node.inputs
+        if x.owner and x.owner.op == host_from_gpu:
+            gpu_x, = x.owner.inputs
+            return [host_from_gpu(GpuReshape(node.op.ndim)(gpu_x, shp))]  # result wrapped in host_from_gpu
+    return False
+
+@register_opt()
+@local_optimizer([])
+def local_gpu_subtensor(node):  # returns a replacement list, or False when nothing matches
+    if node.op == gpu_from_host:  # case 1: gpu_from_host(subtensor(x, *coords))
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op, tensor.Subtensor):
+            subt = host_input.owner.op
+            x = host_input.owner.inputs[0]
+            coords = host_input.owner.inputs[1:]  # index inputs are passed through unchanged
+            return [GpuSubtensor(subt.idx_list)(gpu_from_host(x), *coords)]
+    if isinstance(node.op, tensor.Subtensor):  # case 2: subtensor(host_from_gpu(gpu_x), *coords)
+        x  = node.inputs[0]
+        coords = node.inputs[1:]
+        if x.owner and x.owner.op == host_from_gpu:
+            gpu_x, = x.owner.inputs
+            return [host_from_gpu(GpuSubtensor(node.op.idx_list)(gpu_x, *coords))]  # result wrapped in host_from_gpu
+    return False
+
+@register_opt()
+@local_optimizer([])
+def local_gpu_shape(node):  # shape(host_from_gpu(gpu_x)) -> gpu_shape(gpu_x); False when no match
+    if isinstance(node.op, tensor.Shape):
+        x, = node.inputs
+        if x.owner and x.owner.op == host_from_gpu:
+            gpu_x, = x.owner.inputs
+            return [gpu_shape(gpu_x)]  # no host_from_gpu wrapper: GpuShape outputs tensor.lvector()
+    return False
+
--- a/tests/test_nnet.py
+++ b/tests/test_nnet.py
 import sys, time
-import theano
+import theano, theano.sandbox.conv
 from theano.compile.sandbox.sharedvalue import shared
 from theano.compile.sandbox.pfunc import pfunc
 from theano import tensor
@@ -58,10 +58,65 @@ def run_nnet(use_gpu):
    mode.print_summary()
    return rval
    
-def test_nnet_cpu_gpu():
+def test_run_nnet():
    numpy.random.seed(23456)
    rval_cpu = run_nnet(False)
    numpy.random.seed(23456)
    rval_gpu = run_nnet(True)
    assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)

+
+def run_conv_nnet1(shared_fn):  # shared_fn(value, name) builds the parameters; returns the last train() result
+    n_batch = 16
+    n_kern = 20
+    shape_img = (n_batch, 1, 32, 32)
+    shape_kern = (n_kern, 1, 5, 5)
+
+    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32,32),(5,5), 'valid')
+    n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]  # flattened conv output per example
+    n_out = 10
+
+    w = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w')
+    b = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b')
+    v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'v')
+    c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+
+    x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
+    y = tensor.fmatrix('y')
+    lr = tensor.fscalar('lr')
+
+    conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
+
+    hid = tensor.tanh(conv_op(x, w)+b)
+    hid_flat = hid.reshape((n_batch, n_hid))
+    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
+    loss = tensor.sum(0.5 * (out-y)**2 * lr)  # lr scales the loss; the updates below are plain p - g
+    print 'loss type', loss.type
+
+    params = [w, b, v, c]
+    gparams = tensor.grad(loss, params)
+
+    mode = theano.compile.ProfileMode()
+
+    print 'building pfunc ...'
+    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
+
+    for i, n in enumerate(train.maker.env.toposort()):  # print the compiled graph, one node per line
+        print i, n
+
+    xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
+    yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
+    lrval = numpy.asarray(0.01, dtype='float32')  # separate name: keep the symbolic lr intact
+
+    for i in xrange(10):
+        rval = train(xval, yval, lrval)
+    mode.print_summary()
+    return rval
+
+def test_conv_nnet1():  # run the same network with both shared constructors and compare
+    numpy.random.seed(23456)  # identical seed before each run so both draw the same random values
+    rval_cpu = run_conv_nnet1(shared)
+    numpy.random.seed(23456)
+    rval_gpu = run_conv_nnet1(tcn.shared_constructor)
+    assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)  # results must agree within tolerance
+