Merge pull request #590 from nouiz/test_fix

Test fix

Merge pull request #590 from nouiz/test_fix
c1366d70 · lamblin · a2027539 · 72986083 · c1366d70 · c1366d70
--- a/theano/misc/do_nightly_build
+++ b/theano/misc/do_nightly_build
@@ -59,7 +59,7 @@ echo "Number of elements in the compiledir:"
 ls ${COMPILEDIR}|wc -l

 echo "Executing nosetests with mode=FAST_RUN"
-THEANO_FLAGS=${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
+THEANO_FLAGS=cmodule.warn_no_version=True,${FLAGS},mode=FAST_RUN ${NOSETESTS} ${PROFILING} ${ARGS}
 echo "Number of elements in the compiledir:"
 ls ${COMPILEDIR}|wc -l


--- a/theano/misc/pycuda_example.py
+++ b/theano/misc/pycuda_example.py
-"""
-This file show how we can use Pycuda compiled fct in a Theano Op. Do no use those op in production code. See the TODO.
+"""This file shows how we can use Pycuda compiled functions in a Theano
+Op. Do not use those ops in production code. See the TODO.

 You can use them as a guide to use your pycuda code into a Theano op.

-The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code generated with pycuda.compiler.SourceModule
-
-The PycudaElemwiseKernelOp op use pycuda code generated with pycuda.elementwise.ElementwiseKernel. It must be wrapper by TheanoElementwiseKernel.
+The PycudaElemwiseSourceModuleOp is a Theano op that uses pycuda code
+generated with pycuda.compiler.SourceModule.

 Their is a test in test_pycuda.py.

-This don't work with broadcast and non-contiguous memory as pycuda don't support that, but we make sure we don't introduce problem.
+This doesn't work with broadcast and non-contiguous memory as pycuda
+doesn't support that, but we make sure we don't introduce problems.
  If the memory is non-contiguous, we create a new copy that is contiguous.
  If their is broadcasted dimensions, we raise an error.
+
+# The following is commented out as it works only with old pycuda versions:
+The PycudaElemwiseKernelOp op uses pycuda code generated with
+pycuda.elementwise.ElementwiseKernel. It must be wrapped by
+TheanoElementwiseKernel.
+
 """

 import numpy
@@ -19,7 +25,8 @@ import numpy
 import theano
 from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
 from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
-from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, gpu_contiguous
+from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
+                                           gpu_contiguous)
 from theano.sandbox.cuda.opt import gpu_seqopt

 import pycuda_init
@@ -30,30 +37,36 @@ import pycuda
 from pycuda.elementwise import ElementwiseKernel
 from pycuda.compiler import SourceModule
 from pycuda.tools import VectorArg
+import pycuda.gpuarray
+

 def theano_parse_c_arg(c_arg):
-    c_arg = c_arg.replace('npy_float32','float')
-    c_arg = c_arg.replace('npy_float64','double')
-    c_arg = c_arg.replace('npy_int32','int')
-    c_arg = c_arg.replace('npy_int8','char')
-    c_arg = c_arg.replace('npy_ucs4','unsigned int')
-    c_arg = c_arg.replace('npy_uint32','unsigned int')
-    c_arg = c_arg.replace('npy_uint16','unsigned short')
-    c_arg = c_arg.replace('npy_uint8','unsigned char')
+    c_arg = c_arg.replace('npy_float32', 'float')
+    c_arg = c_arg.replace('npy_float64', 'double')
+    c_arg = c_arg.replace('npy_int32', 'int')
+    c_arg = c_arg.replace('npy_int8', 'char')
+    c_arg = c_arg.replace('npy_ucs4', 'unsigned int')
+    c_arg = c_arg.replace('npy_uint32', 'unsigned int')
+    c_arg = c_arg.replace('npy_uint16', 'unsigned short')
+    c_arg = c_arg.replace('npy_uint8', 'unsigned char')
    return pycuda.tools.parse_c_arg(c_arg)

+"""
 class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
    def __init__(self, arguments, operation,
                 name="kernel", keep=False, options=[], **kwargs):
        if isinstance(arguments, basestring):
-            arguments = [theano_parse_c_arg(arg) for arg in arguments.split(",")]
-            pycuda.elementwise.ElementwiseKernel.__init__(self, arguments, operation, name, keep, options, **kwargs)
+            arguments = [theano_parse_c_arg(arg)
+                         for arg in arguments.split(",")]
+        pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
+                                                      operation, name, keep,
+                                                      options, **kwargs)

    def __call__(self, *args):
        vectors = []

        invocation_args = []
-        for arg, arg_descr in zip(args, self.arguments):
+        for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
            if isinstance(arg_descr, VectorArg):
                vectors.append(arg)
                invocation_args.append(arg.gpudata)
@@ -62,7 +75,7 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):

        repr_vec = vectors[0]
        invocation_args.append(repr_vec.mem_size)
-        if hasattr(repr_vec,"_block") and hasattr(repr_vec,"_grid"):
+        if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
            self.func.set_block_shape(*repr_vec._block)
            self.func.prepared_call(repr_vec._grid, *invocation_args)
        else:
@@ -71,26 +84,120 @@ class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
            self.func.prepared_call(_grid, *invocation_args)


+class PycudaElemwiseKernelOp(GpuOp):
+    nin = property(lambda self: self.scalar_op.nin)
+    nout = property(lambda self: self.scalar_op.nout)
+
+    def __init__(self, scalar_op, inplace_pattern={}, name=None):
+        self.name = name
+        self.scalar_op = scalar_op
+        self.inplace_pattern = None
+
+    def __str__(self):
+        if self.name is None:
+            if self.inplace_pattern:
+                items = self.inplace_pattern.items()
+                items.sort()
+                return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
+                                                             str(items))
+            else:
+                return self.__class__.__name__ + "{%s}" % (self.scalar_op)
+        else:
+            return self.name
+
+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self.scalar_op == other.scalar_op and
+                self.inplace_pattern == other.inplace_pattern)
+
+    def __hash__(self):
+        return (hash(type(self)) ^ hash(self.scalar_op) ^
+                hash(self.inplace_pattern))
+
+    def make_node(self, *inputs):
+        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
+        if self.nin > 0 and len(_inputs) != self.nin:
+            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
+        for i in _inputs[1:]:
+            if i.type.ndim != inputs[0].type.ndim:
+                raise TypeError('different ranks among inputs')
+
+        if any([any(i.type.broadcastable) for i in inputs]):
+            raise Exception("pycuda don't support broadcasted dimensions")
+        assert len(inputs) == 2  # TODO remove
+
+        # output is broadcastable only along dimensions where all inputs are
+        # broadcastable
+        broadcastable = []
+        for d in xrange(_inputs[0].type.ndim):
+            bcast_d = True
+            for i in _inputs:
+                if not i.type.broadcastable[d]:
+                    bcast_d = False
+                    break
+            broadcastable.append(bcast_d)
+        assert len(broadcastable) == _inputs[0].type.ndim
+
+        otype = CudaNdarrayType(broadcastable=broadcastable)
+        assert self.nout == 1
+
+        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
+        in_name = ["i" + str(id) for id in range(len(inputs))]
+        out_name = ["o" + str(id) for id in range(self.nout)]
+        c_code = self.scalar_op.c_code(out_node, "some_name",
+                                       tuple([n + "[i]"for n in in_name]),
+                                       tuple(n + "[i]"for n in out_name), {})
+
+        self.pycuda_fct = TheanoElementwiseKernel(
+            ", ".join([var.type.dtype_specs()[1] + " *" + name
+                       for var, name in (zip(inputs, in_name) +
+                                         zip(out_node.outputs, out_name))]),
+            c_code,
+            "pycuda_elemwise_kernel_%s" % str(self.scalar_op),
+            preamble=("#include<Python.h>\n"
+"#include <numpy/arrayobject.h>"))
+        return out_node
+
+    def perform(self, node, inputs, out):
+        #TODO assert all input have the same shape
+        z, = out
+        if z[0] is None or z[0].shape != inputs[0].shape:
+            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
+        i = inputs + z
+        self.pycuda_fct(*i)
+"""
+
+
 class PycudaElemwiseSourceModuleOp(GpuOp):
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op, inplace_pattern = {}, name = None):
+    def __init__(self, scalar_op, inplace_pattern={}, name=None):
        self.name = name
        self.scalar_op = scalar_op
-        self.inplace_pattern=None
+        self.inplace_pattern = None

    def __str__(self):
        if self.name is None:
            if self.inplace_pattern:
                items = self.inplace_pattern.items()
                items.sort()
-                return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
+                return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
+                                                             str(items))
            else:
-                return self.__class__.__name__+"{%s}" % (self.scalar_op)
+                return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        else:
            return self.name

+    def __eq__(self, other):
+        return (type(self) == type(other) and
+                self.scalar_op == other.scalar_op and
+                self.inplace_pattern == other.inplace_pattern)
+
+    def __hash__(self):
+        return (hash(type(self)) ^ hash(self.scalar_op) ^
+                hash(self.inplace_pattern))
+
    def make_node(self, *inputs):
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
@@ -101,17 +208,23 @@ class PycudaElemwiseSourceModuleOp(GpuOp):

        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
-        assert len(inputs)==2#TODO remove
+        assert len(inputs) == 2  # TODO remove

-        otype = CudaNdarrayType(broadcastable=[False]*_inputs[0].type.ndim)
+        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        assert self.nout == 1

-        fct_name = "pycuda_elemwise_%s"%str(self.scalar_op)
+        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
-        in_name = ["i"+str(id) for id in range(len(inputs))]
-        out_name = ["o"+str(id) for id in range(self.nout)]
-        c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
-        c_code_param = ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]+["int size"])
+        in_name = ["i" + str(id) for id in range(len(inputs))]
+        out_name = ["o" + str(id) for id in range(self.nout)]
+        c_code = self.scalar_op.c_code(out_node, "some_name",
+                                       tuple([n + "[i]" for n in in_name]),
+                                       tuple(n + "[i]" for n in out_name), {})
+        c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
+                                  for var, name in (zip(inputs, in_name) +
+                                                    zip(out_node.outputs,
+                                                        out_name))] +
+                                 ["int size"])
        mod = SourceModule("""
 #include<Python.h>
 #include <numpy/arrayobject.h>
@@ -123,7 +236,7 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
        %s
    }
  }
-  """%(fct_name,c_code_param,c_code))
+  """ % (fct_name, c_code_param, c_code))
        self.pycuda_fct = mod.get_function(fct_name)
        return out_node

@@ -131,41 +244,46 @@ class PycudaElemwiseSourceModuleOp(GpuOp):
        #TODO support broadcast!
        #TODO assert all input have the same shape
        z, = out
-        if z[0] is None or z[0].shape!=inputs[0].shape:
+        if z[0] is None or z[0].shape != inputs[0].shape:
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
        if inputs[0].shape != inputs[1].shape:
-            raise TypeError("PycudaElemwiseSourceModuleOp: inputs don't have the same shape!")
+            raise TypeError("PycudaElemwiseSourceModuleOp:"
+                            " inputs don't have the same shape!")

        if inputs[0].size > 512:
-            grid = (int(numpy.ceil(inputs[0].size / 512.)),1)
-            block = (512,1,1)
+            grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
+            block = (512, 1, 1)
        else:
-            grid = (1,1)
-            block = (inputs[0].shape[0],inputs[0].shape[1],1)
-        self.pycuda_fct(inputs[0], inputs[1], z[0], numpy.intc(inputs[1].size), block=block, grid=grid)
+            grid = (1, 1)
+            block = (inputs[0].shape[0], inputs[0].shape[1], 1)
+        self.pycuda_fct(inputs[0], inputs[1], z[0],
+                        numpy.intc(inputs[1].size), block=block, grid=grid)


-class PycudaElemwiseKernelOp(GpuOp):
+class PycudaElemwiseSourceModuleMakeThunkOp(Op):
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op, inplace_pattern = {}, name = None):
+    def __init__(self, scalar_op, inplace_pattern={}, name=None):
        self.name = name
        self.scalar_op = scalar_op
-        self.inplace_pattern=None
+        self.inplace_pattern = None

    def __str__(self):
        if self.name is None:
            if self.inplace_pattern:
                items = self.inplace_pattern.items()
                items.sort()
-                return self.__class__.__name__+"{%s}%s" % (self.scalar_op, str(items))
+                return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
+                                                             str(items))
            else:
-                return self.__class__.__name__+"{%s}" % (self.scalar_op)
+                return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        else:
            return self.name

    def make_node(self, *inputs):
+        assert self.nout == 1
+        assert len(inputs) == 2  # TODO remove
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
@@ -175,57 +293,86 @@ class PycudaElemwiseKernelOp(GpuOp):

        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
-        assert len(inputs)==2#TODO remove
-
-# output is broadcastable only along dimensions where all inputs are broadcastable
-        broadcastable = []
-        for d in xrange(_inputs[0].type.ndim):
-            bcast_d = True
-            for i in _inputs:
-                if not i.type.broadcastable[d]:
-                    bcast_d = False
-                    break
-            broadcastable.append(bcast_d)
-        assert len(broadcastable) == _inputs[0].type.ndim
-
-        otype = CudaNdarrayType(broadcastable=broadcastable)
-        assert self.nout == 1

+        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
-        in_name = ["i"+str(id) for id in range(len(inputs))]
-        out_name = ["o"+str(id) for id in range(self.nout)]
-        c_code = self.scalar_op.c_code(out_node, "some_name", tuple([n+"[i]"for n in in_name]), tuple(n+"[i]"for n in out_name), {})
-
-        self.pycuda_fct = TheanoElementwiseKernel(
-            ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
-            c_code,
-            "pycuda_elemwise_kernel_%s"%str(self.scalar_op),
-            preamble="""#include<Python.h>
-#include <numpy/arrayobject.h>""")
        return out_node

-    def perform(self, node, inputs, out):
+    def make_thunk(self, node, storage_map, _, _2):
+        #TODO support broadcast!
        #TODO assert all input have the same shape
-        z, = out
-        if z[0] is None or z[0].shape!=inputs[0].shape:
-            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
-        i = inputs + z
-        self.pycuda_fct(*i)
+        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
+        in_name = ["i" + str(id) for id in range(len(node.inputs))]
+        out_name = ["o" + str(id) for id in range(self.nout)]
+
+        c_code = self.scalar_op.c_code(node, "some_name",
+                                       tuple([n + "[i]" for n in in_name]),
+                                       tuple(n + "[i]" for n in out_name), {})
+        c_code_param = ", ".join([var.type.dtype_specs()[1] + " *" + name
+                                  for var, name in
+                                  zip(node.inputs, in_name) +
+                                  zip(node.outputs, out_name)] + ["int size"])
+        mod = SourceModule("""
+#include<Python.h>
+#include <numpy/arrayobject.h>
+  __global__ void %s(%s)
+  {
+    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
+    i += threadIdx.x + threadIdx.y*blockDim.x;
+    if(i<size){
+        %s
+    }
+  }
+  """ % (fct_name, c_code_param, c_code))
+        pycuda_fct = mod.get_function(fct_name)
+        inputs = [storage_map[v] for v in node.inputs]
+        outputs = [storage_map[v] for v in node.outputs]
+
+        def thunk():
+            z = outputs[0]
+            if z[0] is None or z[0].shape != inputs[0][0].shape:
+                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
+                    inputs[0][0].shape)
+            if inputs[0][0].shape != inputs[1][0].shape:
+                raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
+                                " inputs don't have the same shape!")
+
+            if inputs[0][0].size > 512:
+                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
+                block = (512, 1, 1)
+            else:
+                grid = (1, 1)
+                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
+            out = pycuda_fct(inputs[0][0], inputs[1][0], z[0],
+                             numpy.intc(inputs[1][0].size), block=block,
+                             grid=grid)
+        thunk.inputs = inputs
+        thunk.outputs = outputs
+        thunk.lazy = False
+
+        return thunk
+

 pycuda_optimizer = EquilibriumDB()
 gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")

+
 @local_optimizer([])
 def local_pycuda_gpu_elemwise(node):
    """
       GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    if isinstance(node.op, GpuElemwise):
-        if not any([ any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim<=2 for i in node.inputs]):
-            new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
+        if (not any([any(i.type.broadcastable) for i in node.inputs]) and
+            all([i.ndim <= 2 for i in node.inputs])):
+            new_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
+                                                  node.op.inplace_pattern)(
+                                                      *node.inputs)
            return [new_op]

-pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)
+pycuda_optimizer.register("local_pycuda_gpu_elemwise",
+                          local_pycuda_gpu_elemwise)
+

 @local_optimizer([])
 def local_pycuda_gpu_elemwise_kernel(node):
@@ -233,8 +380,11 @@ def local_pycuda_gpu_elemwise_kernel(node):
       GpuElemwise -> PycudaElemwiseKernelOp
    """
    if isinstance(node.op, GpuElemwise):
-        if not any([ any(i.type.broadcastable) for i in node.inputs]):
-            new_op = PycudaElemwiseKernelOp(node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
+        if not any([any(i.type.broadcastable) for i in node.inputs]):
+            new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
+                                            node.op.inplace_pattern)(
+                                                *node.inputs)
            return [new_op]

-pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel", local_pycuda_gpu_elemwise_kernel, 1.5)
+pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
+                          local_pycuda_gpu_elemwise_kernel, 1.5)
--- a/theano/misc/tests/test_pycuda_example.py
+++ b/theano/misc/tests/test_pycuda_example.py
@@ -5,7 +5,8 @@ import theano.misc.pycuda_init

 if not theano.misc.pycuda_init.pycuda_available:
    from nose.plugins.skip import SkipTest
-    raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda code.")
+    raise SkipTest("Pycuda not installed. Skip test of theano op"
+                   " with pycuda code.")

 import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available == False:
@@ -14,71 +15,93 @@ if cuda_ndarray.cuda_available == False:

 import theano
 import theano.tensor as T
-from theano.misc.pycuda_example import PycudaElemwiseSourceModuleOp, PycudaElemwiseKernelOp, PycudaElemwiseSourceModuleMakeThunkOp
+from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
+#                                        PycudaElemwiseKernelOp,
+                                        PycudaElemwiseSourceModuleMakeThunkOp)

-if theano.config.mode=='FAST_COMPILE':
+if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
+    mode_without_gpu = theano.compile.mode.get_mode(
+        'FAST_RUN').excluding('gpu')
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')

+
 def test_pycuda_elemwise_source_module():
-    for shape in [(5,5), (10,49), (50,49),(500,501),(5000,5001)]:
+    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
-            x=T.fmatrix('x')
-            y=T.fmatrix('y')
+            x = T.fmatrix('x')
+            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(op)
            pycuda_op = PycudaElemwiseSourceModuleOp(op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op)
-            f=theano.function([x,y], elemwise_op(x,y), mode=mode_with_gpu)
-            f2 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op(x,y)))
-            f3 = theano.function([x,y], elemwise_op(x,y),
-                                 mode=mode_with_gpu.including("local_pycuda_gpu_elemwise"))
-            f4 = theano.function([x,y], theano.sandbox.cuda.host_from_gpu(pycuda_op_thunk(x,y)))
+            f = theano.function([x, y], elemwise_op(x, y), mode=mode_with_gpu)
+            f2 = theano.function([x, y],
+                                 theano.sandbox.cuda.host_from_gpu(
+                                     pycuda_op(x, y)),
+                                 mode=mode_with_gpu)
+            mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
+            f3 = theano.function([x, y], elemwise_op(x, y),
+                                 mode=mode_pycuda)
+            f4 = theano.function([x, y],
+                                 theano.sandbox.cuda.host_from_gpu(
+                                     pycuda_op_thunk(x, y)),
+                                 mode=mode_with_gpu)

-            assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
-            assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f2.maker.env.toposort()])
-            assert any([ isinstance(node.op, PycudaElemwiseSourceModuleOp) for node in f3.maker.env.toposort()])
-            assert any([ isinstance(node.op, PycudaElemwiseSourceModuleMakeThunkOp) for node in f4.maker.env.toposort()])
+            assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
+                        for node in f.maker.env.toposort()])
+            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
+                        for node in f2.maker.env.toposort()])
+            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
+                        for node in f3.maker.env.toposort()])
+            assert any([isinstance(node.op,
+                                   PycudaElemwiseSourceModuleMakeThunkOp)
+                        for node in f4.maker.env.toposort()])

            val1 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
            val2 = numpy.asarray(numpy.random.rand(*shape), dtype='float32')
-            assert (f(val1,val2) == f2(val1,val2)).all()
-            assert (f(val1,val2) == f3(val1,val2)).all()
-            assert (f(val1,val2) == f4(val1,val2)).all()
+            assert (f(val1, val2) == f2(val1, val2)).all()
+            assert (f(val1, val2) == f3(val1, val2)).all()
+            assert (f(val1, val2) == f4(val1, val2)).all()
            #print f(val1,val2)
            #print f2(val1,val2)

+"""
+# Commented out as it works only with old pycuda versions.
 def test_pycuda_elemwise_kernel():
-    x=T.fmatrix('x')
-    y=T.fmatrix('y')
-    f=theano.function([x,y],x+y, mode=mode_with_gpu)
+    x = T.fmatrix('x')
+    y = T.fmatrix('y')
+    f = theano.function([x, y], x + y, mode=mode_with_gpu)
    print f.maker.env.toposort()
-    f2 = theano.function([x,y],x+y, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
+    mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
+    f2 = theano.function([x, y], x + y, mode=mode_pycuda)
    print f2.maker.env.toposort()

-    assert any([ isinstance(node.op, theano.sandbox.cuda.GpuElemwise) for node in f.maker.env.toposort()])
-    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f2.maker.env.toposort()])
+    assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
+                for node in f.maker.env.toposort()])
+    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
+                for node in f2.maker.env.toposort()])

-    val1 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
-    val2 = numpy.asarray(numpy.random.rand(5,5), dtype='float32')
+    val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
+    val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
    #val1 = numpy.ones((5,5))
    #val2 = numpy.arange(25).reshape(5,5)
-    assert (f(val1,val2) == f2(val1,val2)).all()
-    print f(val1,val2)
-    print f2(val1,val2)
-
+    assert (f(val1, val2) == f2(val1, val2)).all()
+    print f(val1, val2)
+    print f2(val1, val2)

-    x3=T.ftensor3('x')
-    y3=T.ftensor3('y')
-    z3=T.ftensor3('y')
+    x3 = T.ftensor3('x')
+    y3 = T.ftensor3('y')
+    z3 = T.ftensor3('y')

-    f4 = theano.function([x3,y3,z3],x3*y3+z3, mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
+    f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
    print f4.maker.env.toposort()
-    assert any([ isinstance(node.op, PycudaElemwiseKernelOp) for node in f4.maker.env.toposort()])
+    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
+                for node in f4.maker.env.toposort()])

-    val1 = numpy.random.rand(2,2,2)
+    val1 = numpy.random.rand(2, 2, 2)
    print val1
-    print f4(val1,val1,val1)
-    assert numpy.allclose(f4(val1,val1,val1),val1*val1+val1)
+    print f4(val1, val1, val1)
+    assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
+"""
--- a/theano/misc/tests/test_pycuda_theano_simple.py
+++ b/theano/misc/tests/test_pycuda_theano_simple.py
@@ -78,7 +78,10 @@ __global__ void multiply_them(float *dest, float *a, float *b)
 def test_pycuda_memory_to_theano():
    #Test that we can use the GpuArray memory space in pycuda in a CudaNdarray
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
-    print numpy.asarray(y)
+    print sys.getrefcount(y)
+    # This increases the ref count with newer pycuda. Does pycuda also
+    # cache ndarrays?
+    # print y.get()
    print "gpuarray ref count before creating a CudaNdarray",
    print sys.getrefcount(y)
    assert sys.getrefcount(y) == 2