提交 57f60f1f authored 作者: Frederic Bastien's avatar Frederic Bastien

Make the pycuda example work for larger matrix sizes.

上级 171e716c
...@@ -106,21 +106,23 @@ class PycudaElemwiseSourceModuleOp(Op): ...@@ -106,21 +106,23 @@ class PycudaElemwiseSourceModuleOp(Op):
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim) otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
assert self.nout == 1 assert self.nout == 1
#TODO change the scalar op with the good c_code!
fct_name = "pycuda_elemwise_%s"%str(self.scalar_op) fct_name = "pycuda_elemwise_%s"%str(self.scalar_op)
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)]) out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i"+str(id) for id in range(len(inputs))] in_name = ["i"+str(id) for id in range(len(inputs))]
out_name = ["o"+str(id) for id in range(self.nout)] out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {}) c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]) c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"])
mod = SourceModule(""" mod = SourceModule("""
#include<Python.h> #include<Python.h>
#include <numpy/arrayobject.h> #include <numpy/arrayobject.h>
__global__ void %s(%s) __global__ void %s(%s)
{ {
int i = threadIdx.x + threadIdx.y*blockDim.x; int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
i += threadIdx.x + threadIdx.y*blockDim.x;
if(i<size){
%s %s
} }
}
"""%(fct_name,c_code_param,c_code)) """%(fct_name,c_code_param,c_code))
self.pycuda_fct = mod.get_function(fct_name) self.pycuda_fct = mod.get_function(fct_name)
return out_node return out_node
...@@ -131,7 +133,16 @@ class PycudaElemwiseSourceModuleOp(Op): ...@@ -131,7 +133,16 @@ class PycudaElemwiseSourceModuleOp(Op):
z, = out z, = out
if z[0] is None or z[0].shape!=inputs[0].shape: if z[0] is None or z[0].shape!=inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape) z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
self.pycuda_fct(inputs[0],inputs[1],z[0], block=(inputs[0].shape[0],inputs[0].shape[1],1)) if inputs[0].shape != inputs[1].shape:
raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!")
if inputs[0].size > 512:
grid = (int(numpy.ceil(inputs[0].size / 512.)),1)
block = (512,1,1)
else:
grid = (1,1)
block = (inputs[0].shape[0],inputs[0].shape[1],1)
self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseKernelOp(Op): class PycudaElemwiseKernelOp(Op):
......
...@@ -24,23 +24,27 @@ else: ...@@ -24,23 +24,27 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module(): def test_pycuda_elemwise_source_module():
for shape in [(5,5), (10,49), (50,49),(500,501),(5000,5001)]:
for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
x=T.fmatrix('x') x=T.fmatrix('x')
y=T.fmatrix('y') y=T.fmatrix('y')
f=theano.function([x,y],x*y, mode=mode_with_gpu) pycuda_op = PycudaElemwiseSourceModuleOp(op)
print f.maker.env.toposort() elemwise_op = theano.tensor.Elemwise(op)
f2 = theano.function([x,y],x*y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise")) f=theano.function([x,y], elemwise_op(x,y), mode=mode_with_gpu)
print f2.maker.env.toposort() f2 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op(x,y)))
f3 = theano.function([x,y], elemwise_op(x,y),
mode=mode_with_gpu.including("local_pycuda_gpu_elemwise"))
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()]) assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()]) assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f3.maker.env.toposort()])
val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32') val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
#val1 = numpy.ones((5,5))
#val2 = numpy.arange(25).reshape(5,5)
assert (f(val1,val2) == f2(val1,val2)).all() assert (f(val1,val2) == f2(val1,val2)).all()
print f(val1,val2) assert (f(val1,val2) == f3(val1,val2)).all()
print f2(val1,val2) #print f(val1,val2)
#print f2(val1,val2)
def test_pycuda_elemwise_kernel(): def test_pycuda_elemwise_kernel():
x=T.fmatrix('x') x=T.fmatrix('x')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论