Commit 3f1364db, authored by abergeron

Merge pull request #2043 from nouiz/mixed

Mixed stuff: pep8, comments, doc; don't enable the GpuConv*3D ops by default at module import.
@@ -1695,6 +1695,7 @@ Gradient / Differentiation
    :return: Returns lists of gradients with respect to `wrt` and `end`,
             respectively.

    .. versionadded:: 0.6.1

.. _R_op_list:
...
@@ -38,7 +38,15 @@ TODO: Give examples for how to use these things! They are pretty complicated.
    that it requires CUDA >= 5.0, scikits.cuda >= 0.5.0 and PyCUDA to run.
    To deactivate the fft optimization on a specific nnet.conv2d
    while the optimization flags are active, you can set its parameter
    ``version`` to 'no_fft'. To enable it for just one Theano function:

    .. code-block:: python

        mode = theano.compile.get_default_mode()
        mode = mode.including('conv_fft_valid', 'conv_fft_full')

        f = theano.function(..., mode=mode)
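For intuition, the speed-up these optimizations target comes from the convolution theorem: a full convolution equals the inverse FFT of the product of the two padded spectra. A numpy-only sketch of that principle (illustrative helper name, not Theano's actual implementation):

```python
import numpy


def fft_full_conv1d(x, k):
    # Full 1D convolution via the convolution theorem:
    # zero-pad both signals to len(x) + len(k) - 1, multiply
    # their spectra, then invert the transform.
    n = len(x) + len(k) - 1
    return numpy.real(numpy.fft.ifft(numpy.fft.fft(x, n) * numpy.fft.fft(k, n)))


x = numpy.array([1.0, 2.0, 3.0, 4.0])
k = numpy.array([1.0, -1.0, 2.0])
# Matches the direct (quadratic-time) convolution.
assert numpy.allclose(fft_full_conv1d(x, k), numpy.convolve(x, k, mode='full'))
```

The same identity generalizes to 2D and 3D FFTs, which is what the conv_fft optimizations exploit on the GPU.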
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
    3D Convolution. Doesn't work on the GPU.
- :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`
@@ -49,6 +57,15 @@ TODO: Give examples for how to use these things! They are pretty complicated.
    This is not enabled by default because it has some restrictions on
    input and uses more memory. Also note that it requires CUDA >= 5.0,
    scikits.cuda >= 0.5.0 and PyCUDA to run.
    To enable it for just one Theano function:

    .. code-block:: python

        mode = theano.compile.get_default_mode()
        mode = mode.including('conv3d_fft', 'convgrad3d_fft', 'convtransp3d_fft')

        f = theano.function(..., mode=mode)
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
    Another conv3d implementation that uses conv2d with data reshaping.
    It is faster in some cases than conv3d, specifically on the GPU.
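The reshaping trick behind conv3d2d is that each output time-slice of a 'valid' 3D convolution is a sum of 2D convolutions over the depth axis. A numpy-only sketch of the idea (correlation convention; helper names are illustrative, not the conv3d2d API):

```python
import numpy


def conv2d_valid(img, kern):
    # Naive 2D 'valid' cross-correlation.
    kr, kc = kern.shape
    out = numpy.zeros((img.shape[0] - kr + 1, img.shape[1] - kc + 1))
    for r in range(out.shape[0]):
        for c in range(out.shape[1]):
            out[r, c] = (img[r:r + kr, c:c + kc] * kern).sum()
    return out


def conv3d_direct(vol, kern):
    # Naive 3D 'valid' cross-correlation, for reference.
    kd, kr, kc = kern.shape
    out = numpy.zeros((vol.shape[0] - kd + 1,
                       vol.shape[1] - kr + 1,
                       vol.shape[2] - kc + 1))
    for t in range(out.shape[0]):
        for r in range(out.shape[1]):
            for c in range(out.shape[2]):
                out[t, r, c] = (vol[t:t + kd, r:r + kr, c:c + kc] * kern).sum()
    return out


def conv3d_via_conv2d(vol, kern):
    # Each output time-slice is a sum of kd 2D convolutions; conv3d2d
    # batches these 2D calls through a single reshaped conv2d.
    kd = kern.shape[0]
    out_d = vol.shape[0] - kd + 1
    return numpy.stack([sum(conv2d_valid(vol[t + k], kern[k])
                            for k in range(kd))
                        for t in range(out_d)])


rng = numpy.random.RandomState(42)
vol = rng.rand(4, 6, 6)
kern = rng.rand(2, 3, 3)
assert numpy.allclose(conv3d_direct(vol, kern), conv3d_via_conv2d(vol, kern))
```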
...
@@ -418,66 +418,56 @@ have to be jointly optimized explicitly in the code.)
as_op
=====
as_op is a Python decorator that converts a Python function into a
basic Theano op that will call the supplied function during execution.

This isn't the recommended way to build an op, but it allows for a quick
implementation.

It takes an optional ``infer_shape`` parameter that must have this
signature:

.. code-block:: python

    def infer_shape(node, input_shapes):
        # ...
        return output_shapes

- `input_shapes` and `output_shapes` are lists of tuples that
  represent the shape of the corresponding inputs/outputs.
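This contract can be exercised without compiling anything: the function simply maps input shape tuples to a list of output shape tuples. A minimal numpy-only sketch, using the shape rule for a matrix product (names are illustrative):

```python
import numpy


def infer_shape_numpy_dot(node, input_shapes):
    # `node` is unused in this sketch; Theano passes the Apply node.
    # One shape tuple per input comes in, one per output goes out.
    ashp, bshp = input_shapes
    return [ashp[:-1] + bshp[-1:]]


a = numpy.ones((3, 4), dtype='float32')
b = numpy.ones((4, 5), dtype='float32')
# The predicted shape agrees with the shape numpy.dot actually produces.
assert infer_shape_numpy_dot(None, [a.shape, b.shape]) == [numpy.dot(a, b).shape]
```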
.. note::
    Not providing an `infer_shape` function causes shape-related
    optimizations not to work with this op. For example,
    `your_op(inputs, ...).shape` will need the op to be executed just
    to get the shape.

.. note::
    As no grad is defined, you won't be able to
    differentiate paths that include this op.
.. note::
    It converts the Python function to a callable object that takes as
    inputs Theano variables that were declared.

as_op Example
-------------
.. code-block:: python

    import theano
    import numpy
    from theano.compile.ops import as_op


    def infer_shape_numpy_dot(node, input_shapes):
        ashp, bshp = input_shapes
        return [ashp[:-1] + bshp[-1:]]


    @as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
           otypes=[theano.tensor.fmatrix], infer_shape=infer_shape_numpy_dot)
    def numpy_dot(a, b):
        return numpy.dot(a, b)
@@ -494,7 +484,7 @@ You can try it as follows:

Exercise
--------

Run the code of the *numpy_dot* example above.
...
@@ -163,5 +163,41 @@ class TestSumDiffOp(utt.InferShapeTester):
                                numpy.random.rand(5, 6)],
                               self.op_class)
# as_op exercise
import theano
import numpy
from theano.compile.ops import as_op


def infer_shape_numpy_dot(node, input_shapes):
    ashp, bshp = input_shapes
    return [ashp[:-1] + bshp[-1:]]


@as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
       otypes=[theano.tensor.fmatrix], infer_shape=infer_shape_numpy_dot)
def numpy_dot(a, b):
    return numpy.dot(a, b)


def infer_shape_numpy_add_sub(node, input_shapes):
    ashp, bshp = input_shapes
    # Both inputs should have the same shape, so we just return one of them.
    return [ashp]


@as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
       otypes=[theano.tensor.fmatrix], infer_shape=infer_shape_numpy_add_sub)
def numpy_add(a, b):
    return numpy.add(a, b)


@as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
       otypes=[theano.tensor.fmatrix], infer_shape=infer_shape_numpy_add_sub)
def numpy_sub(a, b):
    return numpy.subtract(a, b)
if __name__ == "__main__":
    unittest.main()
@@ -11,5 +11,6 @@ tutorials/exercises if you need to learn it or only need a refresher:

* `Python Challenge <http://www.pythonchallenge.com/>`__
* `Dive into Python <http://diveintopython.net/>`__
* `Google Python Class <http://code.google.com/edu/languages/google-python-class/index.html>`__
* `Enthought python course <https://training.enthought.com/?utm_source=academic&utm_medium=email&utm_campaign=EToD-Launch#/courses>`__ (free for academics)

We have a tutorial on how :ref:`Python manages its memory <python-memory-management>`.
@@ -3,12 +3,14 @@ import numpy
import theano
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           host_from_gpu, HostFromGpu)
from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import CudaNdarrayType, GpuOp


class GpuConv3D(GpuOp):
    """ GPU implementation of Conv3D """
@@ -32,19 +34,21 @@ class GpuConv3D(GpuOp):
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
        broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False)
        return theano.Apply(self, inputs=[V_, W_, b_, d_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])

    def c_code_cache_version(self):
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        V, W, b, d = inputs
        fail = sub['fail']
        H = outputs[0]

        codeSource = """
        ///////////// < code generated by GpuConv3D >
        //printf("\t\t\t\tConv3DGPU c code\\n");
@@ -220,13 +224,13 @@ if(!work_complete){
        }}}}}}} //extra scope so error handler jumps don't cross declarations
        ///////////// < /code generated by GpuConv3D >
        """

        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = out_dur
        //grid block size =(out_len*out_wid, nb kern *nb batch)
@@ -283,11 +287,17 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
gpu_convd = GpuConv3D()


@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
    if isinstance(node.op, Conv3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d)
@@ -7,12 +7,11 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.misc import strutil
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
                                 host_from_gpu, GpuOp)


class GpuConvGrad3D(GpuOp):
    """ GPU version of ConvGrad3D (the gradient of Conv3D with respect to W) """
@@ -27,9 +26,10 @@ class GpuConvGrad3D(GpuOp):
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
        broad = (False,)*5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])
    def perform_(self, node, inputs, output_storage):
        V, d, WShape, dCdH = inputs
@@ -51,18 +51,18 @@ class GpuConvGrad3D(GpuOp):
        dCdW = numpy.zeros(WShape, dtype=V.dtype)

        # block
        for j in xrange(0, WShape[0]):
            for z in xrange(0, WShape[1]):
                for k in xrange(0, WShape[2]):
                    for l in xrange(0, WShape[3]):
                        # threads
                        for m in xrange(0, WShape[4]):
                            # thread
                            for i in xrange(0, batchSize):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
                                            dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]

        output_storage[0][0] = dCdW
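The nested reference loops above compute dCdW[j,z,k,l,m] = sum over i,p,q,r of dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]. That sum can be cross-checked against a strided einsum formulation; a numpy-only sketch (helper names are illustrative, not part of this diff):

```python
import numpy


def conv3d_grad_w_naive(V, dCdH, WShape, d):
    # Direct transcription of the nested reference loops.
    dr, dc, dt = d
    dCdW = numpy.zeros(WShape, dtype=V.dtype)
    batch, _, oH, oW, oD = dCdH.shape
    for j in range(WShape[0]):
        for z in range(WShape[1]):
            for k in range(WShape[2]):
                for l in range(WShape[3]):
                    for m in range(WShape[4]):
                        for i in range(batch):
                            for p in range(oH):
                                for q in range(oW):
                                    for r in range(oD):
                                        dCdW[j, z, k, l, m] += (
                                            dCdH[i, j, p, q, r] *
                                            V[i, z, dr*p + k, dc*q + l, dt*r + m])
    return dCdW


def conv3d_grad_w_einsum(V, dCdH, WShape, d):
    # Same sum, expressed with strided slices and one einsum per weight offset.
    dr, dc, dt = d
    _, _, oH, oW, oD = dCdH.shape
    dCdW = numpy.empty(WShape, dtype=V.dtype)
    for k in range(WShape[2]):
        for l in range(WShape[3]):
            for m in range(WShape[4]):
                Vs = V[:, :,
                       k:k + dr * (oH - 1) + 1:dr,
                       l:l + dc * (oW - 1) + 1:dc,
                       m:m + dt * (oD - 1) + 1:dt]
                dCdW[:, :, k, l, m] = numpy.einsum('ijpqr,izpqr->jz', dCdH, Vs)
    return dCdW


rng = numpy.random.RandomState(0)
V = rng.rand(2, 3, 5, 5, 4)      # (batch, in channels, rows, cols, dur)
dCdH = rng.rand(2, 4, 2, 2, 2)   # (batch, out channels, out r, out c, out dur)
WShape = (4, 3, 3, 3, 2)         # (out ch, in ch, k rows, k cols, k dur)
d = (2, 2, 1)                    # strides (dr, dc, dt)
assert numpy.allclose(conv3d_grad_w_naive(V, dCdH, WShape, d),
                      conv3d_grad_w_einsum(V, dCdH, WShape, d))
```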
@@ -340,11 +340,18 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
gpu_conv_grad3d = GpuConvGrad3D()


@local_optimizer([ConvGrad3D])
def local_gpu_conv_grad3d(node):
    if isinstance(node.op, ConvGrad3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d,
                    WShape,
                    as_cuda_ndarray_variable(dCdH)))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_grad3d", local_gpu_conv_grad3d)
@@ -8,20 +8,20 @@ from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
                                 host_from_gpu, GpuOp)


class GpuConvTransp3D(GpuOp):
    """ The gpu version of ConvTransp3D """

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, W, b, d, H, RShape=None):
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
@@ -29,22 +29,21 @@ class GpuConvTransp3D(GpuOp):
        if RShape:
            RShape_ = T.as_tensor_variable(RShape)
        else:
            RShape_ = T.as_tensor_variable([-1, -1, -1])
        return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
                            outputs=[CudaNdarrayType(dtype=H_.dtype,
                                                     broadcastable=(False,)*5)()])

    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
        W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
        return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]

    def perform_(self, node, inputs, output_storage):
        W, b, d, H, RShape = inputs
        print "\t\t\t\tGpuConvTransp3D python code still uses old format"
        output_storage[0][0] = computeR(W, b, d, H, RShape)

    def c_code_cache_version(self):
        return ()
@@ -55,7 +54,7 @@ class GpuConvTransp3D(GpuOp):
        R = outputs[0]

        codeSource = """
        ///////////// < code generated by GpuConvTransp3D >
        //printf("\t\t\t\tGpuConvTransp c code\\n");
@@ -263,13 +262,13 @@ if(!work_complete){
        }}}}}} // for fail
        ///////////// < /code generated by GpuConvTransp3D >
        """

        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = videoDur
        //grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
@@ -347,18 +346,21 @@ conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
gpu_conv_transpd = GpuConvTransp3D()


@local_optimizer([ConvTransp3D])
def local_gpu_conv_transp3d(node):
    if isinstance(node.op, ConvTransp3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_transp3d", local_gpu_conv_transp3d)
# If the input size wasn't a multiple of d, we may need some automatic
# padding to get the right size of reconstruction
def computeR(W, b, d, H, Rshape=None):
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
@@ -370,7 +372,7 @@ def computeR(W, b, d, H, Rshape=None):
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0
@@ -398,14 +400,14 @@ def computeR(W, b, d, H, Rshape=None):
                     videoWidth, videoDur), dtype=H.dtype)

    #R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
    for i in xrange(0, batchSize):
        #print '\texample '+str(i+1)+'/'+str(batchSize)
        for j in xrange(0, inputChannels):
            #print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
            for r in xrange(0, videoHeight):
                #print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
                for c in xrange(0, videoWidth):
                    for t in xrange(0, videoDur):
                        R[i,j,r,c,t] = b[j]

                        ftc = max([0, int(numpy.ceil(float(t-filterDur +1 )/float(dt))) ])
@@ -432,16 +434,16 @@ def computeR(W, b, d, H, Rshape=None):
                                    R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
                                    tc += 1
                                "" # close loop over tc
                                cc += 1
                            "" # close loop over cc
                            rc += 1
                        "" # close loop over rc
                    "" # close loop over t
                "" # close loop over c
            "" # close loop over r
        "" # close loop over j
    "" # close loop over i

    return R
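In 1D, the accumulation pattern of computeR (R gains W[rk] * H[rc] at position d*rc + rk) is exactly a full convolution of the kernel with a zero-upsampled input. A numpy-only sketch of that equivalence (illustrative helper names; bias and output truncation omitted):

```python
import numpy


def transp_conv1d_loops(W, H, d):
    # R[d*rc + rk] += W[rk] * H[rc], the 1D analogue of computeR (no bias).
    out_len = d * (len(H) - 1) + len(W)
    R = numpy.zeros(out_len)
    for rc in range(len(H)):
        for rk in range(len(W)):
            R[d * rc + rk] += W[rk] * H[rc]
    return R


def transp_conv1d_upsample(W, H, d):
    # Equivalent: insert d-1 zeros between the entries of H,
    # then take the full convolution with the kernel.
    up = numpy.zeros(d * (len(H) - 1) + 1)
    up[::d] = H
    return numpy.convolve(up, W, mode='full')


W = numpy.array([1.0, -2.0, 0.5])
H = numpy.array([3.0, 1.0, -1.0, 2.0])
assert numpy.allclose(transp_conv1d_loops(W, H, 2), transp_conv1d_upsample(W, H, 2))
```

This is the usual "transposed convolution as zero-stuffed convolution" identity; the 3D code above is the same idea with strides per axis plus a bias term.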
@@ -436,6 +436,7 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
/*
 * Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
 * Set the Python error on failure.
 */
template<typename inttype>
static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
@@ -448,6 +449,9 @@ static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
            Py_DECREF(rval);
            return NULL;
        }
    }else{
        PyErr_SetString(PyExc_MemoryError,
                        "Failed to allocate the CudaNdarray structure.");
    }
    return (PyObject*)rval;
}
...
@@ -303,6 +303,8 @@ class NVCC_compiler(object):
        preargs2 = [pa for pa in preargs
                    if pa not in preargs1]  # other arguments

        # Don't pass -G by default, as it slows things down.
        # We aren't sure whether -g slows things down, so we don't pass it by default.
        cmd = [nvcc_path, '-shared'] + preargs1
        if config.nvcc.compiler_bindir:
            cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
...
@@ -635,7 +635,8 @@ def test_valid(conv_gemm=False):
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
        cls = cuda.blas.GpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version = [-1]
    # Add tests with strided inputs but still square images and filters.
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
@@ -645,6 +646,7 @@ def test_valid(conv_gemm=False):
              print_=print_, ones=ones, rtol=1.1e-5,
              theano_mode=mode, cls=cls)


def test_gemm_valid():
    test_valid(conv_gemm=True)
@@ -712,12 +714,14 @@ def test_full(conv_gemm=False):
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
        cls = cuda.blas.GpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version = [-1]
    else:
        mode = cls = None

    exec_conv(version, shapes, verbose, random, 'full',
              theano_mode=mode, cls=cls)


def test_gemm_full():
    test_full(conv_gemm=True)
@@ -735,7 +739,8 @@ def test_subsample(conv_gemm=False):
    shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
    shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 2))

    # We test only the versions that implement subsample, to make the
    # test faster.
    version_valid = [-2, -1, 1, 3, 11, 12]
    version_full = [-2, -1]
    verbose = 0
@@ -749,7 +754,8 @@ def test_subsample(conv_gemm=False):
        # Test the GpuCorrMM version
        mode = theano_mode.including("conv_gemm")
        cls = cuda.blas.GpuCorrMM
        # dummy version; not used by GpuCorrMM so one version is enough
        version_valid = version_full = [-1]
    else:
        mode = cls = None
@@ -760,6 +766,7 @@ def test_subsample(conv_gemm=False):
              print_=print_, ones=ones,
              theano_mode=mode, cls=cls)


def test_gemm_subsample():
    test_subsample(conv_gemm=True)
...
@@ -49,7 +49,11 @@ class CudaNdarrayConstant(_operators, Constant):
    def __str__(self):
        if self.name is not None:
            return self.name
        try:
            data = str(numpy.asarray(self.data))
        except Exception, e:
            data = "error while transferring the value: " + str(e)
        return "CudaNdarrayConstant{" + data + "}"
CudaNdarrayType.Constant = CudaNdarrayConstant


class CudaNdarraySharedVariable(_operators, SharedVariable):
...
@@ -453,6 +453,38 @@ class T_extending(unittest.TestCase):
        simplify = gof.TopoOptimizer(local_simplify)
        simplify.optimize(e)

    def test_as_op(self):
        import theano
        import numpy
        from theano.compile.ops import as_op

        def infer_shape_numpy_dot(node, input_shapes):
            ashp, bshp = input_shapes
            return [ashp[:-1] + bshp[-1:]]

        @as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
               otypes=[theano.tensor.fmatrix],
               infer_shape=infer_shape_numpy_dot)
        def numpy_dot(a, b):
            return numpy.dot(a, b)

        def infer_shape_numpy_add_sub(node, input_shapes):
            ashp, bshp = input_shapes
            # Both inputs should have the same shape, so we just
            # return one of them.
            return [ashp]

        @as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
               otypes=[theano.tensor.fmatrix],
               infer_shape=infer_shape_numpy_add_sub)
        def numpy_add(a, b):
            return numpy.add(a, b)

        @as_op(itypes=[theano.tensor.fmatrix, theano.tensor.fmatrix],
               otypes=[theano.tensor.fmatrix],
               infer_shape=infer_shape_numpy_add_sub)
        def numpy_sub(a, b):
            return numpy.subtract(a, b)


class T_introduction(unittest.TestCase):
...