Commit b3528941 authored by James Bergstra

profiling a basic nnet

Parent 0f60bf1a
...@@ -131,7 +131,7 @@ class GpuElemwise(Op): ...@@ -131,7 +131,7 @@ class GpuElemwise(Op):
def c_src_kernel(self, node, nodename): def c_src_kernel(self, node, nodename):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
sio = StringIO.StringIO() sio = StringIO.StringIO()
print 'C_SRC_KERNEL', sio.getvalue() #print 'C_SRC_KERNEL', sio.getvalue()
def _logical_scalar(x): def _logical_scalar(x):
return all(x.type.broadcastable) return all(x.type.broadcastable)
...@@ -202,7 +202,7 @@ class GpuElemwise(Op): ...@@ -202,7 +202,7 @@ class GpuElemwise(Op):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}" print >> sio, "}"
print sio.getvalue() #print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -582,3 +582,25 @@ class GpuDimShuffle(Op): ...@@ -582,3 +582,25 @@ class GpuDimShuffle(Op):
return sio.getvalue() return sio.getvalue()
class GpuSum(Op):
    """Sum a CudaNdarray over the axes flagged in ``reduce_mask``.

    ``reduce_mask`` holds one 0/1 flag per input dimension: dimensions
    flagged 1 are summed out, dimensions flagged 0 are kept in the output.
    """
    def __init__(self, reduce_mask):
        # Stored as a tuple so the op is hashable and cheaply comparable.
        self.reduce_mask = tuple(reduce_mask)

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.reduce_mask == other.reduce_mask

    def __hash__(self):
        # Combine class identity with the mask so distinct masks hash apart.
        return hash(type(self)) ^ hash(self.reduce_mask)

    def __str__(self):
        return "GpuSum{%s}" % str(self.reduce_mask)

    def make_node(self, x):
        """Check x's rank against the mask and build the reduced-type Apply."""
        if x.type.ndim != len(self.reduce_mask):
            raise TypeError("x must have rank %i" % len(self.reduce_mask))
        # The output keeps only the broadcastable flags of non-reduced dims.
        kept = [bc for bc, m in zip(x.type.broadcastable, self.reduce_mask)
                if not m]
        return Apply(self, [x], [CudaNdarrayType(kept)()])

    def perform(self, node, inp, out):
        # Delegate the actual reduction to the CudaNdarray method.
        x, = inp
        z, = out
        z[0] = x.reduce_sum(self.reduce_mask)
...@@ -3,6 +3,8 @@ from theano import tensor, scalar ...@@ -3,6 +3,8 @@ from theano import tensor, scalar
import StringIO import StringIO
class GpuDot22(Op): class GpuDot22(Op):
def __str__(self):
return 'GpuDot22'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -67,6 +69,8 @@ gpu_dot22 = GpuDot22() ...@@ -67,6 +69,8 @@ gpu_dot22 = GpuDot22()
class GpuGemm(Op): class GpuGemm(Op):
destroy_map = {0:[0]} destroy_map = {0:[0]}
def __str__(self):
return 'GpuGemm'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
......
import sys
from theano import tensor, scalar, compile from theano import tensor, scalar, compile
from theano.gof import local_optimizer, EquilibriumDB from theano.gof import local_optimizer, EquilibriumDB, SequenceDB
from .basic_ops import * from .basic_ops import *
from .blas import gpu_dot22, gpu_gemm from .blas import gpu_dot22, gpu_gemm
...@@ -8,8 +9,12 @@ from theano.compile import optdb ...@@ -8,8 +9,12 @@ from theano.compile import optdb
#optdb.print_summary() # this shows what is currently registered (in a so-far crude way...) #optdb.print_summary() # this shows what is currently registered (in a so-far crude way...)
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, 'fast_run', 'inplace')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2, 'fast_run', 'inplace')
optdb.register('gpu', optdb.register('gpu',
gpu_optimizer, gpu_seqopt,
optdb.__priority__.get('inplace_opt', 75) + 5, optdb.__priority__.get('inplace_opt', 75) + 5,
'fast_run', 'fast_run',
'inplace') 'inplace')
...@@ -21,25 +26,23 @@ def register_opt(*tags, **kwargs): ...@@ -21,25 +26,23 @@ def register_opt(*tags, **kwargs):
return local_opt return local_opt
return f return f
@register_opt() @local_optimizer([])
@local_optimizer([GpuFromHost(), None]) def local_cut_gpu_host_gpu(node):
def local_gpu_host_gpu(node): if tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()):
if not tensor.opt.opt.check_chain(node, GpuFromHost(), HostFromGpu()): return [node.inputs[0].owner.inputs[0]]
return False if tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
return [node.inputs[0].owner.inputs[0]] return [node.inputs[0].owner.inputs[0]]
return False
@register_opt() gpu_cut_copies.register('cut_gpu_host_transfers', local_cut_gpu_host_gpu, 'fast_run', 'inplace', 'gpu')
@local_optimizer([HostFromGpu(), None])
def local_host_gpu_host(node):
if not tensor.opt.opt.check_chain(node, HostFromGpu(), GpuFromHost()):
return False
return [node.inputs[0].owner.inputs[0]]
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_elemwise_0(node): def local_gpu_elemwise_0(node):
if isinstance(node.op, tensor.Elemwise): if isinstance(node.op, tensor.Elemwise):
if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs): if any(hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs):
if any(o.type.dtype == 'float64' for o in node.outputs):
print 'EXITING FROM local_gpu_elemwise_0', node
sys.exit()
# move the add to a GpuAdd # move the add to a GpuAdd
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern) new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))] return [host_from_gpu(new_op(*(gpu_from_host(i) for i in node.inputs)))]
...@@ -132,3 +135,21 @@ def local_gpu_gemm(node): ...@@ -132,3 +135,21 @@ def local_gpu_gemm(node):
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))] return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
    """Move a sum (CAReduce with scalar.add) onto the GPU when its input
    already lives there (i.e. arrives through a host_from_gpu transfer).

    Returns a one-element replacement list on success, False otherwise.
    """
    op = node.op
    if not (isinstance(op, tensor.elemwise.CAReduce)
            and op.scalar_op == scalar.add):
        return False
    x, = node.inputs
    if not (x.owner and x.owner.op == host_from_gpu):
        return False
    # Translate the reduce axes into a per-dimension 0/1 mask for GpuSum.
    if op.axis is None:
        mask = [1] * x.type.ndim  # axis=None means sum over everything
    else:
        mask = [0] * x.type.ndim
        for a in op.axis:
            assert mask[a] == 0  # each axis may be listed at most once
            mask[a] = 1
    return [host_from_gpu(GpuSum(mask)(gpu_from_host(x)))]
# Benchmark script: profile a small MLP on CPU vs GPU (theano_cuda_ndarray).
import sys, time
import theano
from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc
from theano import tensor
import numpy
import theano_cuda_ndarray as tcn

import logging
# Raise the gradient logger to INFO to quiet per-op debug chatter.
logging.getLogger('theano.gradient').setLevel(logging.INFO)
def run_nnet(use_gpu):
n_batch = 16
n_in = 1024
n_hid = 2048
n_out = 10
if use_gpu:
w = tcn.shared_constructor(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(numpy.zeros(n_hid), 'b')
v = tcn.shared_constructor(numpy.zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(numpy.zeros(n_out), 'c')
else:
w = shared(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
b = shared(numpy.zeros(n_hid), 'b')
v = shared(numpy.zeros((n_hid, n_out)), 'c')
c = shared(numpy.zeros(n_out), 'c')
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = theano.compile.ProfileMode()
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = numpy.asarray(numpy.random.rand(n_batch, n_in), dtype='float32')
yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
lr = numpy.asarray(0.01, dtype='float32')
for i in xrange(100):
train(xval, yval, lr)
mode.print_summary()
def test_nnet_cpu():
    """Profile the MLP on the host (CPU) path."""
    run_nnet(use_gpu=False)
def test_nnet_gpu():
    """Profile the MLP on the GPU (CudaNdarray) path."""
    run_nnet(use_gpu=True)
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment