提交 3f5e0726 authored 作者: Ian Goodfellow's avatar Ian Goodfellow

merged in changes

"""
This file show how we can use Pycuda compiled fct in a Theano Op. Do no use them in production code. See the TODO.
This file shows how we can use PyCUDA-compiled functions in a Theano Op. Do not use these ops in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModule op use pycuda code generated with pycuda.compiler.SourceModule
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code generated with pycuda.compiler.SourceModule
The PycudaElemwiseKernel op use pycuda code generated with pycuda.elementwise.ElementwiseKernel
The PycudaElemwiseKernelOp op use pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapper by TheanoElementwiseKernel.
There is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda don't support that, but we make sure we don't introduce problem.
This doesn't work with broadcasting or non-contiguous memory, as PyCUDA doesn't support them, but we make sure we don't introduce problems.
If the memory is non-contiguous, we create a new copy that is contiguous.
If there are broadcasted dimensions, we raise an error.
"""
import numpy
......@@ -25,10 +26,51 @@ from theano.sandbox.cuda.opt import gpu_seqopt
from pycuda.elementwise import ElementwiseKernel
from pycuda.compiler import SourceModule
from pycuda.gpuarray import splay
from pycuda.tools import VectorArg
import pycuda.autoinit
class PycudaElemwiseSourceModule(Op):
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration that may use NumPy dtype names.

    Rewrites Theano/NumPy C type names (``npy_float32``, ``npy_uint8``,
    ...) into plain C type names, then delegates the actual parsing to
    ``pycuda.tools.parse_c_arg``.
    """
    # Ordered translation table from npy_* names to C type names.
    _NPY_TO_C = (
        ('npy_float32', 'float'),
        ('npy_float64', 'double'),
        ('npy_int32', 'int'),
        ('npy_int8', 'char'),
        ('npy_ucs4', 'unsigned int'),
        ('npy_uint32', 'unsigned int'),
        ('npy_uint16', 'unsigned short'),
        ('npy_uint8', 'unsigned char'),
    )
    for npy_name, c_name in _NPY_TO_C:
        c_arg = c_arg.replace(npy_name, c_name)
    return pycuda.tools.parse_c_arg(c_arg)
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
    """ElementwiseKernel that understands Theano/NumPy C type names.

    When ``arguments`` is given as a string, each declaration is run
    through ``theano_parse_c_arg`` so dtypes such as ``npy_float32`` can
    appear in the signature before pycuda parses them.
    """

    def __init__(self, arguments, operation,
                 name="kernel", keep=False, options=None, **kwargs):
        # Fix: avoid a mutable default argument ([]); a shared list could
        # be mutated across instances.  None keeps the call backward
        # compatible and is converted to a fresh list here.
        if options is None:
            options = []
        if isinstance(arguments, str):
            # Translate npy_* dtype names to C before pycuda parses them.
            arguments = [theano_parse_c_arg(arg)
                         for arg in arguments.split(",")]
        pycuda.elementwise.ElementwiseKernel.__init__(
            self, arguments, operation, name, keep, options, **kwargs)

    def __call__(self, *args):
        """Launch the kernel.

        Reuses the ``_block``/``_grid`` launch configuration attached to
        the first vector argument when present (set by the Theano op);
        otherwise falls back to pycuda's ``splay`` heuristic.
        """
        vectors = []
        invocation_args = []
        for arg, arg_descr in zip(args, self.arguments):
            if isinstance(arg_descr, VectorArg):
                # Vector (array) arguments are passed by device pointer.
                vectors.append(arg)
                invocation_args.append(arg.gpudata)
            else:
                invocation_args.append(arg)
        # NOTE(review): assumes at least one vector argument; raises
        # IndexError otherwise — confirm callers always pass one.
        repr_vec = vectors[0]
        invocation_args.append(repr_vec.mem_size)
        if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
            self.func.set_block_shape(*repr_vec._block)
            self.func.prepared_call(repr_vec._grid, *invocation_args)
        else:
            _grid, _block = pycuda.gpuarray.splay(repr_vec.mem_size)
            self.func.set_block_shape(*_block)
            self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseSourceModuleOp(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
......@@ -42,9 +84,9 @@ class PycudaElemwiseSourceModule(Op):
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "PycudaElemwiseSourceModule{%s}%s" % (self.scalar_op, str(items))
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
else:
return "PycudaElemwiseSourceModule{%s}" % (self.scalar_op)
return self.__class__.__name__+"{%s}" % (self.scalar_op)
else:
return self.name
......@@ -56,7 +98,8 @@ class PycudaElemwiseSourceModule(Op):
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
assert not any([any(i.type.broadcastable) for i in inputs])
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
......@@ -89,7 +132,7 @@ class PycudaElemwiseSourceModule(Op):
self.pycuda_fct(inputs[0],inputs[1],z[0], block=(inputs[0].shape[0],inputs[0].shape[1],1))
class PycudaElemwiseKernel(Op):
class PycudaElemwiseKernelOp(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
......@@ -103,9 +146,9 @@ class PycudaElemwiseKernel(Op):
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "PycudaElemwiseKernel{%s}%s" % (self.scalar_op, str(items))
return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
else:
return "PycudaElemwiseKernel{%s}" % (self.scalar_op)
return self.__class__.__name__+"{%s}" % (self.scalar_op)
else:
return self.name
......@@ -117,7 +160,8 @@ class PycudaElemwiseKernel(Op):
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
assert not any([any(i.type.broadcastable) for i in inputs])
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs)==2#TODO remove
# output is broadcastable only along dimensions where all inputs are broadcastable
......@@ -139,7 +183,7 @@ class PycudaElemwiseKernel(Op):
out_name = ["o"+str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
self.pycuda_fct = ElementwiseKernel(
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
c_code,
"pycuda_elemwise_kernel_%s"%str(self.scalar_op),
......@@ -152,8 +196,7 @@ class PycudaElemwiseKernel(Op):
if z[0] is None or z[0].shape!=inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
sp = splay(i[0].mem_size)
self.pycuda_fct(*i)#, grid=sp[0], block=sp[1])
self.pycuda_fct(*i)
# Optimizer database holding the pycuda rewrites; registered inside the
# GPU optimization sequence so it runs after the GPU transfer opts.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
......@@ -161,11 +204,11 @@ gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")
@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    Replace a GpuElemwise node with the pycuda SourceModule-based op when
    no input has a broadcastable dimension and every input is at most 2d
    (the pycuda op supports neither broadcasting nor higher ranks).
    """
    # Fix: the merged diff left the stale pre-rename line instantiating
    # the removed name PycudaElemwiseSourceModule interleaved with the
    # new one; keep only the renamed PycudaElemwiseSourceModuleOp call.
    if isinstance(node.op, GpuElemwise):
        if (not any([any(i.type.broadcastable) for i in node.inputs])
                and all([i.ndim <= 2 for i in node.inputs])):
            new_op = PycudaElemwiseSourceModuleOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)
......@@ -173,11 +216,11 @@ pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise
@local_optimizer([])
def local_pycuda_gpu_elemwise_kernel(node):
    """
    GpuElemwise -> PycudaElemwiseKernelOp

    Replace a GpuElemwise node with the pycuda ElementwiseKernel-based op
    when no input has a broadcastable dimension (pycuda does not support
    broadcasting).
    """
    # Fix: the merged diff left the stale pre-rename line instantiating
    # the removed name PycudaElemwiseKernel interleaved with the new
    # one; keep only the renamed PycudaElemwiseKernelOp call.
    if isinstance(node.op, GpuElemwise):
        if not any([any(i.type.broadcastable) for i in node.inputs]):
            new_op = PycudaElemwiseKernelOp(
                node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
            return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
......@@ -6,22 +6,36 @@ except ImportError:
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
# Skip this whole test module when no CUDA device is available.
# Fix: idiomatic truthiness test instead of comparing `== False`.
if not cuda_ndarray.cuda_available:
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional package cuda disabled')
import theano
import theano.tensor as T
from theano.misc.pycuda_example import PycudaElemwiseSourceModule, PycudaElemwiseKernel
from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp
from theano.sandbox.cuda import GpuContiguous
import theano.misc.pycuda_example
import theano.sandbox.cuda as cuda_ndarray
# Build GPU and non-GPU compilation modes for the tests.  FAST_COMPILE
# skips too many optimizations for the pycuda rewrites to trigger, so
# fall back to FAST_RUN in that case.
if theano.config.mode=='FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
x=T.fmatrix('x')
y=T.fmatrix('y')
f=theano.function([x,y],x*y)
f=theano.function([x,y],x*y, mode=mode_with_gpu)
print f.maker.env.toposort()
f2 = theano.function([x,y],x*y, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise"))
f2 = theano.function([x,y],x*y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise"))
print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModule) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
......@@ -34,13 +48,13 @@ def test_pycuda_elemwise_source_module():
def test_pycuda_elemwise_kernel():
x=T.fmatrix('x')
y=T.fmatrix('y')
f=theano.function([x,y],x+y)
f=theano.function([x,y],x+y, mode=mode_with_gpu)
print f.maker.env.toposort()
f2 = theano.function([x,y],x+y, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise_kernel"))
f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
print f2.maker.env.toposort()
assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernel) for node in f2.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])
val1 = numpy.random.rand(5,5)
val2 = numpy.random.rand(5,5)
......@@ -55,9 +69,9 @@ def test_pycuda_elemwise_kernel():
y3=T.ftensor3('y')
z3=T.ftensor3('y')
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=theano.compile.mode.get_default_mode().including("local_pycuda_gpu_elemwise_kernel"))
f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
print f4.maker.env.toposort()
assert any([ isinstance(node.op, PycudaElemwiseKernel) for node in f4.maker.env.toposort()])
assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])
val1 = numpy.random.rand(2,2,2)
print val1
......
......@@ -333,14 +333,14 @@ class PPrinter:
use_ascii = True
if use_ascii:
special = dict(middle_dot = "\dot",
big_sigma = "\Sigma")
greek = dict(alpha = "\alpha",
beta = "\beta",
gamma = "\gamma",
delta = "\delta",
epsilon = "\epsilon")
special = dict(middle_dot = "\\dot",
big_sigma = "\\Sigma")
greek = dict(alpha = "\\alpha",
beta = "\\beta",
gamma = "\\gamma",
delta = "\\delta",
epsilon = "\\epsilon")
else:
special = dict(middle_dot = u"\u00B7",
......
......@@ -296,7 +296,8 @@ class GpuConv(Op):
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top \
and self.version == other.version \
and self.verbose == other.verbose
and self.verbose == other.verbose \
and self.kshp == other.kshp
def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and hash(-2)==-2 in python!
......@@ -307,7 +308,8 @@ class GpuConv(Op):
^ hash(self.logical_kern_hw) \
^ hash(self.logical_kern_align_top) \
^ self.version \
^ self.verbose
^ hash(self.verbose) \
^ hash(self.kshp)
def __str__(self):
return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
......@@ -336,7 +338,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
    """Version tag for Theano's C-code cache.

    Bumped (0,6) -> (0,7) because the generated C code changed, forcing
    cached binaries to be recompiled.
    """
    # Fix: the merged diff left both the stale `return (0,6)` and the
    # new `return (0,7)`; the stale first return would have won.  Keep
    # only the new version tuple.
    return (0, 7)
def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
......@@ -1052,8 +1052,13 @@ def local_argmax_pushdown(node):
(softmax, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
if theano.config.warn.argmax_pushdown_bug:
logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: their was a bug in Theano fixed the 27 may 2010 in this case. I.E. when we take the max of a softplus, softmax, exp, log, tanh, sigmoid, softmax_with_bias op, we where doing the max of the parent of the input. To remove this warning set the Theano flags 'warn.argmax_pushdown_bug' to False")
logging.getLogger('theano.tensor.nnet.nnet').warn("WARNING: there "
"was a bug in Theano fixed on May 27th, 2010 in this case."
" I.E. when we take the max of a softplus, softmax, exp, "
"log, tanh, sigmoid, softmax_with_bias op, we were doing "
"the max of the parent of the input. To remove this "
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if node.op == tensor._max_and_argmax and node.inputs[0].owner and len(node.outputs[0].clients)==0:
x_max, x_argmax = node.outputs
......
......@@ -1569,6 +1569,7 @@ def test_constant_get_stabilized():
f2 = theano.function([x2],y2)
assert len(f2.maker.env.toposort())==1
assert f2.maker.env.toposort()[0].op==theano.tensor.nnet.sigm.softplus
raise KnownFailureTest("Theano optimize constant before stabilization! This break stabilization optimization is some case!")
assert f2(800)==800
x = T.as_tensor_variable(800)
......@@ -1576,7 +1577,6 @@ def test_constant_get_stabilized():
f = theano.function([],y)
assert len(f.maker.env.toposort())==0
assert numpy.isinf(f())
raise KnownFailureTest("Theano optimize constant before stabilization! This break stabilization optimization is some case!")
#When this error is fixed, the following line should be ok.
assert f()==800,f()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论