Merged (solved conflict in theano/configdefaults.py)

79fda719 · Olivier Delalleau · 7a750ee1 · 5a755867 · 79fda719 · 79fda719
--- a/doc/install.txt
+++ b/doc/install.txt
@@ -55,11 +55,14 @@ The following libraries and software are optional:
    `nose <http://somethingaboutorange.com/mrl/projects/nose/>`_
        Recommended, to run Theano's test-suite.
    `Sphinx <http://sphinx.pocoo.org/>`_ >= 0.5.1, `pygments <http://pygments.org/>`_
        For building the documentation. LaTeX_ and dvipng_ are also necessary
        for math to show up as images.
    `Mercurial <http://mercurial.selenic.com/>`_
        To download bleeding-edge versions of Theano.
    `NVIDIA CUDA drivers and SDK`_
        Required for GPU code generation/execution. Only NVIDIA GPUs using
        32-bit floating point numbers are currently supported.
@@ -334,8 +337,8 @@ correctly (for example, for MKL this might be ``-lmkl -lguide -lpthread`` or
 Mac
 ---
- If the above required libraries are not already installed on your Mac, one option is first, to
+- If the above required libraries are not already installed on your Mac,
-  install `MacPorts <http://www.macports.org/>`__.
+  one option is to first install `MacPorts <http://www.macports.org/>`__.
 - Then, in order to install one or more of the required libraries, use "port install", e.g. as follows:
@@ -421,9 +424,10 @@ Mac
      mac_framework_link=True
 Please inform us if you have trouble installing and running Theano on your Mac.
-We would be especially interested in dependencies that we missed listing, as well as tests
+We would be especially interested in dependencies that we missed
-that fail on your platform (use the ``theano-users@googlegroups.com`` mailing list,
+listing, as well as tests that fail on your platform (use the
-but note that you must first register to it, by going to `theano-users`_).
+``theano-users@googlegroups.com`` mailing list, but note that you must
+first register to it, by going to `theano-users`_).
 Windows
@@ -706,7 +710,8 @@ Then
  4) Test some pre-compiled example of the sdk.
  5) Download Visual Studio 2008 Express (free, VS2010 not supported by nvcc 3.1,
-     VS2005 is not available for download but supported by nvcc, the non free version should work too).
+     VS2005 is not available for download but supported by nvcc, the non
+     free version should work too).
  6) Follow the instruction in the GettingStartedWindows.pdf file from the CUDA web
     site to compile CUDA code with VS2008. If that does not work, you will

--- a/doc/tutorial/aliasing.txt
+++ b/doc/tutorial/aliasing.txt
--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -6,6 +6,7 @@ from StringIO import StringIO
 import numpy
+import theano
 from theano import gof
 from theano.gof import Env, graph, utils, link
 from theano.gof.link import WrapLinkerMany, raise_with_op
@@ -536,6 +537,9 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes, clobber_dr_v
            # But this depend on the version of numpy!
            if getattr(out_var,'size',2)==1:
                continue
+            if isinstance(node.op, theano.compile.mode.OutputGuard):
+                # This class is not in the final graph.
+                continue
            if not _may_share_memory(out_var, in_var):
                #when a subtensor return a tensor of ndim==0, numpy seam to return a copy.
                #when have an empty ndarray(happen with output guard) it is not the same. why?

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -24,7 +24,10 @@ AddConfigVar('device',
        )
 AddConfigVar('init_gpu_device',
-        "Initialize the gpu device to use. This don't change the default behavior. We don't default to try to move the computation to it. We don't default to put shared variable of float32 on it. Useful to run the test on a specific gpu.",
+        ("Initialize the gpu device to use, works only if device=cpu. "
+         "Unlike 'device', setting this option will NOT move computations, "
+         "nor shared variables, to the specified GPU. "
+         "It can be used to run GPU-specific tests on a particular GPU."),
        EnumStr('', 'gpu0', 'gpu1', 'gpu2', 'gpu3',
                allow_override=False)
        )

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -104,7 +104,8 @@ if cuda_available:
        cuda_available = False
        cuda_initialization_error_message = e.message
-# We must do those import to be able to create the full doc when nvcc
+# We must do these imports to be able to build the full documentation
+# when nvcc is not available
 from theano.sandbox.cuda.var import (CudaNdarrayVariable,
                                     CudaNdarrayConstant,
                                     CudaNdarraySharedVariable,
@@ -115,7 +116,12 @@ if cuda_available:
    #check if their is an old cuda_ndarray that was loading instead of the one we compiled!
    import cuda_ndarray.cuda_ndarray
    if cuda_ndarray_so != cuda_ndarray.cuda_ndarray.__file__:
-        warning("WARNING: cuda_ndarray was loaded from",cuda_ndarray.cuda_ndarray.__file__,"This is not expected as theano should compile it automatically for you. Do you have a directory called cuda_ndarray in your LD_LIBRARY_PATH environment variable? If so, please remove it as it is outdated!")
+        warning("WARNING: cuda_ndarray was loaded from",
+                cuda_ndarray.cuda_ndarray.__file__,
+                """This is not expected as theano should compile it
+ automatically for you. Do you have a directory called cuda_ndarray in your
+LD_LIBRARY_PATH environment variable? If so, please remove it as it is
+outdated!""")
    shared_constructor = float32_shared_constructor
@@ -204,8 +210,14 @@ def handle_shared_float32(tf):
        raise NotImplementedError('removing our handler')
 if config.device.startswith('gpu'):
-    use(config.device, config.force_device)
+    use(device=config.device, force=config.force_device)
 elif config.init_gpu_device:
-    assert config.device=="cpu", "We can use the theano flags init_gpu_device only when the theano flags device=='cpu'"
+    assert config.device=="cpu", "We can use the Theano flag init_gpu_device only when the Theano flag device=='cpu'"
-    print "Will init the gpu to use a specific gpu device. This don't default tomove computation and allocate shared variable of float32 to this device. For that try the theano flags device."
+    warning(("GPU device %s will be initialized, and used if a GPU is needed. "
-    use(config.init_gpu_device, config.force_device, False, False)
+          "However, no computation, nor shared variables, will be implicitly "
+          "moved to that device. If you want that behavior, use the 'device' "
+          "flag instead.") % config.init_gpu_device)
+    use(device=config.init_gpu_device,
+        force=config.force_device,
+        default_to_move_computation_to_gpu=False,
+        move_shared_float32_to_gpu=False)
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -335,12 +335,14 @@ class GpuConv(Op):
            ^ hash(self.imshp)
    def __str__(self):
-        return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
+        return '%s{%s, %s, %s, %s, %s, %s, %s}' %(self.__class__.__name__,
                self.border_mode,
                str(self.subsample),
                str(self.logical_img_hw),
                str(self.logical_kern_hw),
-                str(self.logical_kern_align_top))
+                str(self.logical_kern_align_top),
+                str(self.imshp),
+                str(self.kshp))
    def make_node(self, img, kern):
        if img.type.ndim != 4:

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -88,9 +88,11 @@ int device_free(void *ptr)
    _outstanding_mallocs[0] -= (ptr != NULL);
 #if COMPUTE_GPU_MEM_USED
    int i=0;
+    size_t total_freed = 0;
    for(;i<TABLE_SIZE;i++)
      if(_alloc_size_table[i].ptr==ptr){
        _allocated_size -= _alloc_size_table[i].size;
+        total_freed += _alloc_size_table[i].size;
        _alloc_size_table[i].ptr=0;
        _alloc_size_table[i].size=0;
@@ -98,6 +100,7 @@ int device_free(void *ptr)
      }
    if(i==TABLE_SIZE)
      printf("Unallocated unknow size!\n");
+    //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
 #endif
    return 0;
 }
@@ -339,13 +342,13 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
    CudaNdarray* rval = (CudaNdarray*)CudaNdarray_new_null();
    if (!rval)
    {
-        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: call to new_null failed");
+        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to new_null failed");
        return NULL;
    }
    if (CudaNdarray_alloc_contiguous(rval, n, dims))
    {
-        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: allocation failed.");
+        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
        Py_DECREF(rval);
        return NULL;
    }
@@ -354,14 +357,14 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
    //fprintf(stdout, "Sizeof: %d\n", total_size);
    if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
    {
-        PyErr_Format(PyExc_MemoryError, "Error memsetting %d bytes of device memory.", total_size);
+        PyErr_Format(PyExc_MemoryError, "CudaNdarray_ZEROS: Error memsetting %d bytes of device memory.", total_size);
        Py_DECREF(rval);
        return NULL;
    }
    if (cnda_copy_structure_to_device(rval))
    {
-        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: syncing structure to device failed");
+        PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
        Py_DECREF(rval);
        return NULL;
    }
@@ -842,7 +845,7 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
    //standard elemwise size checks
    if (self->nd != other->nd)
    {
-        PyErr_SetString(PyExc_TypeError, "need same number of dims");
+        PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
        return NULL;
    }
    //standard elemwise dim checks
@@ -1002,7 +1005,7 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
    //standard elemwise size checks
    if (self->nd != other->nd)
    {
-        PyErr_SetString(PyExc_TypeError, "need same number of dims");
+        PyErr_Format(PyExc_TypeError, "CudaNdarray_inplace_add_div: need same number of dims. Got %d and %d", self->nd, other->nd);
        return NULL;
    }
    //standard elemwise dim checks
@@ -1214,7 +1217,8 @@ CudaNdarray_inplace_add_div(PyObject* py_self, PyObject * py_other, int fct_nb)
+// In-place addition entry point: forwards to CudaNdarray_inplace_add_div with
+// fct_nb=0 and then returns py_self.
+// NOTE(review): the value returned by CudaNdarray_inplace_add_div is ignored
+// here, although that function returns NULL on error (e.g. when the number of
+// dims differs) -- confirm that returning py_self in that case is intended.
 static PyObject *
 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){
  CudaNdarray_inplace_add_div(py_self, py_other, 0);
-  Py_INCREF(py_self);
+  //We should not increment the refcount as we are doing inplace operation
+  //And in this syntax, there is no additional reference created!
  return py_self;
 }
@@ -1225,7 +1229,8 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other){
+// In-place division entry point: forwards to CudaNdarray_inplace_add_div with
+// fct_nb=1 and then returns py_self.
+// NOTE(review): the value returned by CudaNdarray_inplace_add_div is ignored
+// here, although that function returns NULL on error (e.g. when the number of
+// dims differs) -- confirm that returning py_self in that case is intended.
 static PyObject *
 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other){
  CudaNdarray_inplace_add_div(py_self, py_other, 1);
-  Py_INCREF(py_self);
+  //We should not increment the refcount as we are doing inplace operation
+  //And in this syntax, there is no additional reference created!
  return py_self;
 }
@@ -1828,6 +1833,33 @@ static PyTypeObject CudaNdarrayType =
    CudaNdarray_new,           /* tp_new */
 };
+// Device kernel: writes the size in bytes of a float pointer, as seen by the
+// device compiler, into the first element of dst.
+static __global__ void get_gpu_ptr_size(int* dst)
+{
+  *dst = sizeof(float*);
+}
+// Python-callable helper returning the tuple
+//   (size of a gpu pointer, size of a cpu pointer, size of a cpu int)
+// with every size expressed in bytes. The gpu pointer size is obtained by
+// running the get_gpu_ptr_size kernel and copying its result back to the host.
+// On failure a Python exception is set, the temporary device buffer is
+// released and NULL is returned.
+PyObject *
+CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
+{
+  int *gpu_data = (int*)device_malloc(sizeof(int));
+  if(gpu_data == NULL){
+    return PyErr_Format(PyExc_MemoryError,
+                        "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
+  }
+  get_gpu_ptr_size<<<1,1>>>(gpu_data);
+  // A kernel launch reports its errors through the CUDA runtime, not through
+  // CUBLAS, so cudaGetLastError() is the call that can detect a failure here.
+  cudaError_t launch_err = cudaGetLastError();
+  if (cudaSuccess != launch_err){
+    // Do not leak the scratch buffer on the error path.
+    device_free(gpu_data);
+    return PyErr_Format(PyExc_RuntimeError,
+                        "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
+                        cudaGetErrorString(launch_err));
+  }
+  int gpu_ptr_size = -1;
+  cublasGetVector(1, sizeof(int), gpu_data, 1, &gpu_ptr_size, 1);
+  device_free(gpu_data);
+  if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
+    PyErr_SetString(PyExc_RuntimeError,
+                    "CudaNdarray_ptr_int_size: error copying data from device memory");
+    return NULL;
+  }
+  // The "i" format unit reads a C int; sizeof yields a size_t, which is wider
+  // than int on 64-bit hosts, so cast explicitly.
+  return Py_BuildValue("iii", gpu_ptr_size, (int)sizeof(float*), (int)sizeof(int));
+}
 // Initialize the gpu.
 // Takes one optional parameter, the device number.
@@ -2065,6 +2097,7 @@ static PyMethodDef module_methods[] = {
    {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
    {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
    {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
+    {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
    {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable.  strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
    {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
    {NULL, NULL, NULL, NULL}  /* Sentinel */

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -56,7 +56,7 @@ class InputToGpuOptimizer(Optimizer):
                    new_input = host_from_gpu(gpu_from_host(input))
                    if new_input.type==input.type:
-                        env.replace_validate(input, new_input, "To allow further optimisation to move Ops to gpu")
+                        env.replace_validate(input, new_input, "InputToGpuOptimizer")
                except Exception, e:
                    #as we currently only support float32, this can fail.
                    #Using try except make that we won't need
@@ -113,9 +113,12 @@ def local_gpu_elemwise_0(node):
                else:
                    return False
-                gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner).outputs[0]
+                gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
+                if not gpu_elemwise:
-                return [host_from_gpu(gpu_elemwise)]
+                    return False
+                if max_inputs_to_GpuElemwise(node)<len(gpu_elemwise.inputs):
+                    return False
+                return [host_from_gpu(gpu_elemwise.outputs[0])]
 @register_opt()
 @local_optimizer([])
 def local_gpu_elemwise_1(node):
@@ -130,8 +133,10 @@ def local_gpu_elemwise_1(node):
            new_op = GpuElemwise(elemwise_node.op.scalar_op)
            if all([i.dtype=='float32' for i in elemwise_node.inputs]):
                gpu_elemwise = new_op(*[gpu_from_host(i) for i in elemwise_node.inputs])
-                gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner).outputs[0]
+                gpu_elemwise = split_huge_add_or_mul(gpu_elemwise.owner)
-                return [gpu_elemwise]
+                if not gpu_elemwise:
+                    return False
+                return [gpu_elemwise.outputs[0]]
    return False
 @register_opt()
@@ -730,24 +735,35 @@ optdb.register('InplaceGpuBlasOpt',
            max_use_ratio=5),
               70.0, 'fast_run', 'inplace')
+gpu_ptr_size = 8
+cpu_ptr_size = 8
+int_size = 8
+try:
+    #RETURN (gpu ptr size, cpu ptr size, int sizes)
+    t = cuda_ndarray.cuda_ndarray.ptr_int_size()
+    gpu_ptr_size, cpu_ptr_size, int_size = t
+except Exception, e:
+    _logger.warning(("OPTIMIZATION WARNING: "
+        "Got the following error, but we can ignore it. "
+        "This could cause less GpuElemwise fused together.\n"
+        "%s") % e)
 def max_inputs_to_GpuElemwise(node):
    """
    return the maximum number of input this Apply node to an GpuElemwise can accept.
    This is needed as currently their is a limit of 256 bytes of paramter for the gpu function.
    This mesure the number of paramter we put in our gpu function and compute the maximum number of inputs that respect the 256 bytes limits.
    """
-    #TODO: detect the size of gpu pointeur and c int.
-    int_size = 8
-    ptr_size = 8
-    argument_limit = 256  # if was 240, with this note: 16 bytes are used for block and thread coords etc.
+    argument_limit = 232  # some bytes are used for block and thread coords etc.
+    ndim = node.inputs[0].type.ndim
    size_param_mandatory = int_size #for numels
-    size_param_mandatory += int_size *  node.inputs[0].type.ndim # for the shape#node.outputs[0].ndim+1+node.inputs[0].ndim+1
+    size_param_mandatory += int_size *  ndim # for the shape
-    size_param_mandatory += sum((ptr_size + int_size * i.type.ndim) for i in node.outputs)
+    size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) for i in node.outputs)
-    nb_bytes_avail = argument_limit-size_param_mandatory
+    nb_bytes_avail = argument_limit - size_param_mandatory
-    nb_bytes_per_inputs = (node.inputs[0].ndim*int_size)+ptr_size
+    nb_bytes_per_inputs = (ndim*int_size) + gpu_ptr_size
-    max_nb_inputs = nb_bytes_avail//nb_bytes_per_inputs
+    max_nb_inputs = nb_bytes_avail // nb_bytes_per_inputs
    return max_nb_inputs
 def split_huge_add_or_mul(node):
@@ -762,6 +778,8 @@ def split_huge_add_or_mul(node):
    """
    if node.op.scalar_op in (scal.add, scal.mul):
        max_nb_inputs = max_inputs_to_GpuElemwise(node)
+        if max_nb_inputs<=1 and len(node.inputs)>1:
+            return False
        while len(node.inputs)>max_nb_inputs:
            inner_op = []
            for i in range(0,len(node.inputs),max_nb_inputs):

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -161,8 +161,9 @@ def test_huge_elemwise_fusion():
        in case their is too many inputs and that would make it bust the 256
        bytes limits.
    """
-    shape = (3,4,5,6)
+    shape = (2,3,4,5,6)
-    vars = [tensor.tanh(tensor.ftensor4()) for x in range(10)]
+    ttype = tensor.tensor(dtype='float32',broadcastable=(False,)*len(shape))
+    vars = [tensor.tanh(ttype) for x in range(10)]
    f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    #theano.printing.debugprint(f)
@@ -170,12 +171,29 @@ def test_huge_elemwise_fusion():
    #    print >> sys.stdout, i, node
    assert len(topo)==10
    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==2
-    assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Composite)
+    assert isinstance(topo[7].op.scalar_op,theano.scalar.basic.Sub)
    assert isinstance(topo[8].op.scalar_op,theano.scalar.basic.Composite)
    #let debugmode catch errors
    gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
    f(gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen())
+    # Test the case where we can't put the computation on the gpu! their is too many
+    # dimensions to the input to have 2 inputs to the op!
+    shape = (1,2,3,4,5,6,7,2,2,3,2,1,2,2,2,)
+    ttype = tensor.tensor(dtype='float32',broadcastable=(False,)*len(shape))
+    vars = [tensor.tanh(ttype) for x in range(10)]
+    f = pfunc(vars, [vars[0]-vars[1]-vars[2]-vars[3]-vars[4]-vars[5]-vars[6]], mode=mode_with_gpu)
+    topo = f.maker.env.toposort()
+    #theano.printing.debugprint(f)
+    assert len(topo)==1
+    assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo])==0
+    assert sum([isinstance(node.op, tensor.Elemwise) for node in topo])==1
+    #let debugmode catch errors
+    gen = lambda : theano._asarray(numpy.random.rand(*shape), dtype='float32')
+    f(gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen(),gen())
 def test_elemwise_fusion():
    """ Test the the GpuElemwise fusion work correctly"""
    shape = (3,4)

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -10,6 +10,7 @@ from theano import scalar as scal
 try:
    # We must do those import to be able to create the full doc when nvcc
+    # is not available
    import cuda_ndarray.cuda_ndarray as cuda
    from theano.sandbox.cuda.nvcc_compiler import nvcc_module_compile_str
    import cuda_ndarray

--- a/theano/sandbox/cuda/var.py
+++ b/theano/sandbox/cuda/var.py
@@ -10,6 +10,7 @@ from theano.compile import SharedVariable
 from theano.sandbox.cuda.type import CudaNdarrayType
 try:
    # We must do those import to be able to create the full doc when nvcc
+    # is not available
    from theano.sandbox.cuda import filter as type_support_filter
    from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost
 except ImportError:

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -432,6 +432,10 @@ complexs128 = _multi(complex128)
 def upcast_out(*types):
    return Scalar(dtype = Scalar.upcast(*types)),
+def upcast_out_no_complex(*types):
+    """
+    Like upcast_out(), but refuse complex inputs.
+
+    Raise TypeError if any of `types` is in complex_types; otherwise return
+    a 1-tuple holding the Scalar type of the upcast dtype.
+    """
+    for candidate in types:
+        if candidate in complex_types:
+            raise TypeError('complex type are not supported')
+    upcasted = Scalar.upcast(*types)
+    return (Scalar(dtype = upcasted),)
 def same_out(type):
    return type,
 def same_out_float_only(type):
@@ -481,6 +485,14 @@ def upgrade_to_float(*types):
            int32: float64,
            int64: float64}
    return Scalar(Scalar.upcast(*[conv.get(type, type) for type in types])),
+def upgrade_to_float_no_complex(*types):
+    """
+    Reject complex types, then delegate to upgrade_to_float().
+
+    Raise TypeError if one of `types` is in complex_types.
+    """
+    if any(candidate in complex_types for candidate in types):
+        raise TypeError('complex argument not supported')
+    return upgrade_to_float(*types)
 def same_out_nocomplex(type):
    if type in complex_types:
        raise TypeError('complex argument not supported')
@@ -622,8 +634,11 @@ class LT(LogicalComparison):
    commutative = False
    associative = False
    def impl(self, x, y):
-        return x < y
+        # built-in < don't support complex
+        return numpy.less(x, y)
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s < %(y)s);" % locals()
 lt = LT()
@@ -632,8 +647,11 @@ class GT(LogicalComparison):
    commutative = False
    associative = False
    def impl(self, x, y):
-        return x > y
+        # built-in > don't support complex
+        return numpy.greater(x, y)
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s > %(y)s);" % locals()
 gt = GT()
@@ -642,8 +660,11 @@ class LE(LogicalComparison):
    commutative = False
    associative = False
    def impl(self, x, y):
-        return x <= y
+        # built-in <= don't support complex
+        return numpy.less_equal(x, y)
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s <= %(y)s);" % locals()
 le = LE()
@@ -652,8 +673,11 @@ class GE(LogicalComparison):
    commutative = False
    associative = False
    def impl(self, x, y):
-        return x >= y
+        # built-in >= don't support complex
+        return numpy.greater_equal(x, y)
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s >= %(y)s);" % locals()
 ge = GE()
@@ -664,6 +688,8 @@ class EQ(LogicalComparison):
    def impl(self, x, y):
        return x == y
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s == %(y)s);" % locals()
 eq = EQ()
@@ -674,6 +700,8 @@ class NEQ(LogicalComparison):
    def impl(self, x, y):
        return x != y
    def c_code(self, node, name, (x, y), (z, ), sub):
+        if node.inputs[0].type in complex_types:
+            raise NotImplementedError()
        return "%(z)s = (%(x)s != %(y)s);" % locals()
 neq = NEQ()
@@ -751,7 +779,7 @@ class UnaryBitOp(UnaryScalarOp):
    def output_types(self, *input_types):
        for i in input_types[0]:
            if i not in (int8, int32, int64):
-                raise TypeError('input to a BitOp must have type int8, int32 or int 64... not %s' % i)
+                raise TypeError('input to a BitOp must have type int8, int32 or int64... not %s' % i)
        return upcast_out(*input_types[0])
    def grad(self, inputs, output_gradients):
        return [None]
@@ -761,7 +789,7 @@ class BinaryBitOp(BinaryScalarOp):
        t0, t1 = input_types[0]
        for i in input_types[0]:
            if i not in (int8, int32, int64):
-                raise TypeError('input to a BitOp must have type int8, int32 or int 64... not %s' % i)
+                raise TypeError('input to a BitOp must have type int8, int32 or int64... not %s' % i)
        return upcast_out(*input_types[0])
    def grad(self, inputs, output_gradients):
        return [None, None]
@@ -806,8 +834,11 @@ class Maximum(BinaryScalarOp):
    commutative = True
    associative = True
    def impl(self, *inputs):
-        return max(inputs)
+        # The built-in max function don't support complex type
+        return numpy.maximum(*inputs)
    def c_code(self, node, name, (x,y), (z, ), sub):
+        if any([i.type in complex_types for i in node.inputs]):
+            raise NotImplementedError()
        return "%(z)s = ((%(y)s)>(%(x)s)? (%(y)s):(%(x)s));" %locals()
    def grad(self, (x, y), (gz, )):
@@ -826,8 +857,11 @@ class Minimum(BinaryScalarOp):
    commutative = True
    associative = True
    def impl(self, *inputs):
-        return min(inputs)
+        # The built-in min function don't support complex type
+        return numpy.minimum(*inputs)
    def c_code(self, node, name, (x,y), (z, ), sub):
+        if any([i.type in complex_types for i in node.inputs]):
+            raise NotImplementedError()
        return "%(z)s = ((%(y)s)<(%(x)s)? (%(y)s):(%(x)s));" %locals()
    def grad(self, (x, y), (gz, )):
@@ -949,6 +983,9 @@ class TrueDiv(BinaryScalarOp):
        else:
            return x / y
    def c_code(self, node, name, (x, y), (z, ), sub):
+        # Return the C statement storing x / y into z.
+        # Raises NotImplementedError when exactly one of the two inputs is
+        # complex: the generated C code is only correct when both inputs are
+        # complex or when neither is.
+        if sum([node.inputs[0].type in complex_types, node.inputs[1].type in complex_types])==1:
+            # Report the actual input types (not the builtin `type`).
+            raise NotImplementedError('type not supported',
+                                      node.inputs[0].type, node.inputs[1].type)
        if node.inputs[0].type in int_types and node.inputs[1].type in int_types:
            return "%(z)s = ((double)%(x)s) / %(y)s;" % locals()
        return "%(z)s = %(x)s / %(y)s;" % locals()
@@ -1028,6 +1065,8 @@ class Pow(BinaryScalarOp):
    def impl(self, x, y):
        return x ** y
    def c_code(self, node, name, (x, y), (z, ), sub):
+        # Return the C statement storing pow(x, y) into z.
+        # Raises NotImplementedError if either input is complex.
+        if node.inputs[0].type in complex_types or node.inputs[1].type in complex_types:
+            # Report the actual input types (not the builtin `type`).
+            raise NotImplementedError('type not supported',
+                                      node.inputs[0].type, node.inputs[1].type)
        return "%(z)s = pow(%(x)s, %(y)s);" % locals()
    def grad(self, (x, y), (gz, )):
        if gz.type in complex_types:
@@ -1865,5 +1904,3 @@ class Composite(ScalarOp):
        #we must call init to set env and _impls again.
        #otherwise self.perform won't work.
        self.__init__(self.inputs, self.outputs)
--- a/theano/scalar/basic_scipy.py
+++ b/theano/scalar/basic_scipy.py
 #Definitions of theano.scalar ops whose Python implementation is taken from scipy.
 #As scipy is not always available, we treat them separately.
-from theano.scalar.basic import UnaryScalarOp,exp,sqrt,upgrade_to_float,complex_types,float_types,upcast
 import numpy
+from theano.scalar.basic import UnaryScalarOp,exp,upgrade_to_float,float_types
+from theano.scalar.basic import upgrade_to_float_no_complex,complex_types,upcast
 imported_scipy_special = False
 try:
    import scipy.special
@@ -49,4 +50,6 @@ class Erfc(UnaryScalarOp):
        if node.inputs[0].type in complex_types:
            raise NotImplementedError('type not supported', type)
        return "%(z)s = erfc(%(x)s);" % locals()
-erfc = Erfc(upgrade_to_float, name = 'erfc')
+# scipy.special.erfc doesn't support complex. Why?
+erfc = Erfc(upgrade_to_float_no_complex, name = 'erfc')
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -4414,6 +4414,7 @@ class numeric_grad:
            x[i] += eps
            f_eps = f(*apt)
            gx[i] = numpy.asarray((f_eps - f_x)/eps)
        if packed_pt:
@@ -4594,6 +4595,7 @@ def verify_grad(fun, pt, n_tests=2, rng=None, eps=None, abs_tol=None, rel_tol=No
    for test_num in xrange(n_tests):
        num_grad = numeric_grad(cost_fn, [p.copy() for p in pt], eps)
        analytic_grad = grad_fn(*[p.copy() for p in pt])
        if not isinstance(analytic_grad, (list, tuple)):
@@ -4621,6 +4623,7 @@ class GradientError(Exception):
        self.abs_tol = abs_tol
        self.rel_tol = rel_tol
    def __str__(self):
        return """GradientError: numeric gradient and analytic gradient exceed tolerance:
        At position %i of argument %i,

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -49,12 +49,16 @@ class Conv3D(theano.Op):
    def __str__(self):
        return "Conv3D"
+    def c_code_cache_version(self):
+        return (1,)
    def make_node(self, V, W, b, d):
        """
-            :param V: Visible unit, input
+            :param V: Visible unit, input(batch,row,column,time,in channel)
-            :param W: Weights, filter
+            :param W: Weights, filter(out channel,row,column,time,in channel)
            :param b: bias, shape == (W.shape[0],)
-            :param d: strides when moving the filter over the input
+            :param d: strides when moving the filter over the input(dx,dy,dt)
        """
        V_ = T.as_tensor_variable(V)
@@ -82,22 +86,22 @@ class Conv3D(theano.Op):
        dCdb = T.sum(dCdH, axis=(0,1,2,3))
        dCdd = None #not differentiable, since d is not continuous
-        if 'name' in dir(dCdH) and dCdH.name != None:
+        if 'name' in dir(dCdH) and dCdH.name is not None:
            dCdH_name = dCdH.name
        else:
            dCdH_name = 'anon'
-        if 'name' in dir(V) and V.name != None:
+        if 'name' in dir(V) and V.name is not None:
            V_name = V.name
        else:
            V_name = 'anon'
-        if 'name' in dir(W) and W.name != None:
+        if 'name' in dir(W) and W.name is not None:
            W_name = W.name
        else:
            W_name = 'anon'
-        if 'name' in dir(b) and b.name != None:
+        if 'name' in dir(b) and b.name is not None:
            b_name = b.name
        else:
            b_name = 'anon'

--- a/theano/tensor/nnet/ConvGrad3D.py
+++ b/theano/tensor/nnet/ConvGrad3D.py
@@ -3,6 +3,10 @@ from theano.tensor import basic as T
 from theano.misc import strutil
 import numpy as N
+#TODO: speed up by reordering loops. Should pass through the videos once, incrementing all weight gradients, rather
+# than visiting each weight gradient element once and passing through whole video
 class ConvGrad3D(theano.Op):
    """ Gradient of Conv3D with respect to W """
    def __eq__(self,other):
@@ -11,6 +15,9 @@ class ConvGrad3D(theano.Op):
    def __hash__(self):
        return hash(type(self))
+    def c_code_cache_version(self):
+        return (1,)
    def make_node(self, V, d, WShape, dCdH):
        V_ = T.as_tensor_variable(V)
        d_ = T.as_tensor_variable(d)

--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
@@ -11,6 +11,9 @@ class ConvTransp3D(theano.Op):
    def __hash__(self):
 	    return hash(type(self))
+    def c_code_cache_version(self):
+        return (1,)
    def make_node(self, W, b, d, H, RShape = None):
        """
        :param W: Weights, filter
@@ -50,22 +53,22 @@ class ConvTransp3D(theano.Op):
        dCdRShape = None #not differentiable, since RShape is not continuous
-        if 'name' in dir(dCdR) and dCdR.name != None:
+        if 'name' in dir(dCdR) and dCdR.name is not None:
            dCdR_name = dCdR.name
        else:
            dCdR_name = 'anon'
-        if 'name' in dir(H) and H.name != None:
+        if 'name' in dir(H) and H.name is not None:
            H_name = H.name
        else:
            H_name = 'anon'
-        if 'name' in dir(W) and W.name != None:
+        if 'name' in dir(W) and W.name is not None:
            W_name = W.name
        else:
            W_name = 'anon'
-        if 'name' in dir(b) and b.name != None:
+        if 'name' in dir(b) and b.name is not None:
            b_name = b.name
        else:
            b_name = 'anon'
@@ -360,7 +363,7 @@ def computeR(W,b,d,H,Rshape = None):
 		videoWidth = (outputWidth-1) * dc + filterWidth
 		videoDur = (outputDur-1) * dt + filterDur
-		if Rshape != None and Rshape[0] != -1:
+		if Rshape is not None and Rshape[0] != -1:
 			if Rshape[0] < videoHeight:
 				print (Rshape[0], videoHeight)
 				assert False

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -290,7 +290,7 @@ class ConvOp(Op):
        :type dx: int
        :param dx: patch stride rows
        :type dy: int
-        :param dx: patch stride cols
+        :param dy: patch stride cols
        Params which select the version of code used:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -283,7 +283,6 @@ def local_dimshuffle_lift(node):
        else:
            return DimShuffle(iinput.type.broadcastable, new_order, inplace).make_node(iinput).outputs
-@register_specialize
 @gof.local_optimizer([])
 def dimshuffle_as_view(node):
    op = node.op
@@ -293,6 +292,7 @@ def dimshuffle_as_view(node):
    return [new_op(*node.inputs)]
+register_specialize(dimshuffle_as_view, 'inplace')
 register_canonicalize(local_dimshuffle_lift)
 register_specialize(local_dimshuffle_lift)
@@ -2313,15 +2313,21 @@ def local_add_specialize(node):
                y = get_constant_value(input)
            except TypeError:
                y = input
-            if N.all(y == 0.0):
+            if numpy.all(y == 0.0):
                continue
            new_inputs.append(input)
        if len(new_inputs) < len(node.inputs):
            if len(new_inputs) == 0:
                #we got rid of the entire expression!
-                return fill_chain(T.TensorConstant(T.TensorType(dtype=node.outputs[0].type.dtype,
+                ndim = node.outputs[0].type.ndim
-                    broadcastable = [True] * node.outputs[0].ndim), N.asarray(0)))
+                dtype = node.outputs[0].type.dtype
+                return fill_chain(
+                        T.TensorConstant(
+                            T.TensorType(
+                                dtype=dtype,
+                                broadcastable = [True] * ndim),
+                            numpy.zeros((1,)*ndim, dtype=dtype)))
            if len(new_inputs) == 1:
                return fill_chain(new_inputs[0])

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -876,8 +876,7 @@ class test_fusion(unittest.TestCase):
        self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
-    def test_gpu_fusion_3d(self):
+    def test_gpu_fusion_Xd(self):
-        shp=(5,5,5)
        #we need the optimisation enabled, debug do this.
        if theano.config.mode == "FAST_COMPILE":
            mode = theano.compile.mode.get_mode("FAST_RUN").including('local_elemwise_fusion','canonicalize','gpu')
@@ -886,7 +885,10 @@ class test_fusion(unittest.TestCase):
        import theano.sandbox.cuda as cuda
        if not cuda.cuda_available:
            raise SkipTest("cuda not available")
+        if cuda.opt.int_size == 4:
+            shp=(5,5,5,5)
+        else:
+            shp=(5,5,5)
        self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
    def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
@@ -2174,3 +2176,15 @@ def test_local_mul_to_neg():
    aval = numpy.random.randint(0,10,(2,2)).astype('int32')
    assert f1(aval).dtype == a.dtype
    assert f2(aval).dtype == 'float64'
+def test_local_add_specialize():
+    # test with an input that has a non-zero number of dimensions (a vector)
+    a = TT.vector()
+    s = TT.add(TT.zeros_like(a))
+    assert local_add_specialize.transform(s.owner)
+    # test with a 0-d input (a scalar)
+    a = TT.scalar()
+    s = TT.add(TT.zeros_like(a))
+    assert local_add_specialize.transform(s.owner)
--- a/theano/tensor/tests/test_sharedvar.py
+++ b/theano/tensor/tests/test_sharedvar.py
@@ -231,6 +231,7 @@ def makeSharedTester(shared_constructor_,
            total = self.theano_fct(x_shared)
            total_func = theano.function([],total)
+            total_func()
            values_to_div = .5
            if self.op_by_matrix:
@@ -418,6 +419,7 @@ def makeSharedTester(shared_constructor_,
            #Test that we can replace with values of the different shape
            # but that will raise an error in some case, but not all
+            specify_shape_fct()
            x1_shared.set_value(x2)
            self.assertRaises(AssertionError, specify_shape_fct)
@@ -450,6 +452,7 @@ def makeSharedTester(shared_constructor_,
            assert numpy.allclose(self.ref_fct(x1_shared.value), self.ref_fct( x1_2))
            shape_op_fct = theano.function([],x1_shared.shape)
            topo = shape_op_fct.maker.env.toposort()
+            shape_op_fct()
            if theano.config.mode!='FAST_COMPILE':
                assert len(topo)==3
                assert isinstance(topo[0].op,tensor.opt.Shape_i)
@@ -458,6 +461,7 @@ def makeSharedTester(shared_constructor_,
            #Test that we forward the input
            specify_shape_fct = theano.function([],x1_specify_shape)
+            specify_shape_fct()
            #theano.printing.debugprint(specify_shape_fct)
            assert numpy.all(self.ref_fct(specify_shape_fct())
                             ==self.ref_fct(x1_2))