Merged

c239bfec · Olivier Delalleau · 42386839 · 6a159276 · c239bfec · c239bfec
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -760,6 +760,10 @@ def function(inputs, outputs, mode=None, accept_inplace = False):

     - EXPENSIVE_OPTIMIZATION TODO: NotImplemented

+     - PROFILE_MODE : allows printing a profile summary with mode.print_summary()
+
+     - DEBUG_MODE : perform all the checks that we thought of (compare python and c, ...)
+
    :param accept_inplace:  True iff the graph can contain inplace operations prior to the
    optimization phase (default is False)


--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
-import time
+import time, atexit

 from ..gof.link import WrapLinkerMany
 from ..gof.cutils import run_cthunk
-from ..compile.mode import Mode, predefined_linkers
+from ..compile.mode import Mode, predefined_linkers, register_mode, predefined_modes
 from ..gof.cc import OpWiseCLinker

 class ProfileMode(Mode):
@@ -110,3 +110,19 @@ class ProfileMode(Mode):
                  sum(f for f, t, a in sotimes[n_ops_to_print:])*100,
                  sum(t for f, t, a in sotimes[n_ops_to_print:]))
        print '(*) Op is running a c implementation'
+
+
+register_mode('PROFILE_MODE',ProfileMode())
+
+def atexit_print_default_profile_mode():
+    """Print the summary of the predefined mode PROFILE_MODE if it was used.
+
+    This allows the summary to be printed at exit when running with
+    THEANO_DEFAULT_MODE=PROFILE_MODE
+    """
+    prof_mode=predefined_modes["PROFILE_MODE"]
+    # local_time[0] > 0 is taken to mean that the mode was actually used
+    # (presumably it holds the accumulated run time -- confirm in ProfileMode).
+    if prof_mode.local_time[0]>0: prof_mode.print_summary()
+
+#Register atexit_print_default_profile_mode so that the summary of the
+#predefined mode PROFILE_MODE, if it was used, is printed when the program terminates.
+atexit.register(atexit_print_default_profile_mode)
--- a/theano/compile/sandbox/shared_randomstreams.py
+++ b/theano/compile/sandbox/shared_randomstreams.py
+"""Define RandomStreams, providing random number variables for Theano graphs."""
+__docformat__ = "restructuredtext en"
+
+import sys
+import numpy
+
+from ...gof import Container
+from ...tensor import raw_random
+
+from sharedvalue import SharedVariable, shared_constructor, shared
+
+class RandomStateSharedVariable(SharedVariable):
+    """SharedVariable subclass returned by `randomstate_constructor` for RandomState values."""
+    pass
+
+@shared_constructor
+def randomstate_constructor(value, name=None, strict=False):
+    """SharedVariable constructor for `numpy.random.RandomState` values.
+
+    :param value: the RandomState instance to wrap
+    :param name: optional name given to the resulting variable
+    :param strict: forwarded unchanged to `RandomStateSharedVariable`
+
+    :rtype: RandomStateSharedVariable
+
+    :raises TypeError: if ``value`` is not a numpy RandomState
+    """
+    if not isinstance(value, numpy.random.RandomState):
+        # Same exception type as before, but with a message saying what was rejected.
+        raise TypeError(
+                'randomstate_constructor expects a numpy.random.RandomState, got %s'
+                % (type(value),))
+    return RandomStateSharedVariable(
+            type=raw_random.random_state_type,
+            value=value,
+            name=name,
+            strict=strict)
+
+class RandomStreams(object):
+    """Container of random streams with an interface similar to numpy.random
+    (numpy.random.RandomState).
+
+    Each stream created through `gen` (or one of its shortcuts) gets its own shared
+    RandomState variable; `updates` returns the pairs needed to advance those states.
+    """
+
+    random_state_variables = []
+    """A list of pairs of the form (input_r, output_r).  This will be over-ridden by the module
+    instance to contain stream generators.
+    """
+
+    default_instance_seed = None
+    """Instance variable should take None or integer value.  Used to seed the random number
+    generator that provides seeds for member streams"""
+
+    gen_seedgen = None
+    """numpy.RandomState instance that gen() uses to seed new streams.
+    """
+
+    def updates(self):
+        """Return a new list of the (random state variable, updated random state) pairs
+        recorded by `gen`, one pair per stream.
+        """
+        return list(self.random_state_variables)
+
+    def __init__(self, seed=None):
+        """
+        :type seed: None or int
+
+        :param seed: seeds the generator that `gen` uses to seed new streams; it is also the
+        value `seed` falls back to when called without an argument.
+        """
+        super(RandomStreams, self).__init__()
+        # Per-instance list, so that streams are not shared through the class attribute.
+        self.random_state_variables = []
+        self.default_instance_seed = seed
+        self.gen_seedgen = numpy.random.RandomState(seed)
+
+    def seed(self, seed=None):
+        """Re-initialize each random stream
+
+        :param seed: each random stream will be assigned a unique state that depends
+        deterministically on this value.  If None, the seed given to the constructor is used.
+
+        :type seed: None or integer in range 0 to 2**30
+
+        :rtype: None
+        """
+        seed = self.default_instance_seed if seed is None else seed
+        seedgen = numpy.random.RandomState(seed)
+        for old_r, new_r in self.random_state_variables:
+            # Draw one sub-seed per stream, in the order the streams were created.
+            old_r_seed = seedgen.randint(2**30)
+            old_r.value = numpy.random.RandomState(int(old_r_seed))
+
+    def __getitem__(self, item):
+        """Retrieve the numpy RandomState instance associated with a particular stream
+
+        :param item: the random state variable of a stream (the ``rng`` attribute that `gen`
+        sets on the variable it returns)
+
+        :rtype: numpy RandomState
+
+        :note: This is kept for compatibility with `tensor.randomstreams.RandomStreams`.  The
+        simpler syntax ``item.value`` is also valid.
+
+        """
+        return item.value
+
+    def __setitem__(self, item, val):
+        """Set the numpy RandomState instance associated with a particular stream
+
+        :param item: the random state variable of a stream (the ``rng`` attribute that `gen`
+        sets on the variable it returns)
+
+        :param val: the new value
+        :type val: numpy RandomState
+
+        :rtype:  None
+
+        :note: This is kept for compatibility with `tensor.randomstreams.RandomStreams`.  The
+        simpler syntax ``item.value = val`` is also valid.
+
+        """
+        item.value = val
+
+
+    def gen(self, op, *args, **kwargs):
+        """Create a new random stream in this container.
+
+        :param op: a RandomFunction instance, called as
+        ``op(random_state_variable, *args, **kwargs)``; it must return a pair
+        (updated random state, random draw)
+
+        :param args: interpreted by `op`
+
+        :param kwargs: interpreted by `op`
+
+        :returns: The symbolic random draw part of op()'s return value.  The pair
+        (random state variable, updated random state) is stored so that `updates` can
+        return it, and the random state variable is attached to the result as ``rng``.
+
+        :rtype: TensorVariable
+        """
+        # Every stream gets its own seed, drawn from the container-level generator.
+        seed = int(self.gen_seedgen.randint(2**30))
+        random_state_variable = shared(numpy.random.RandomState(seed))
+        new_r, out = op(random_state_variable, *args, **kwargs)
+        out.rng = random_state_variable
+        self.random_state_variables.append((random_state_variable, new_r))
+        return out
+
+    def binomial(self, *args, **kwargs):
+        """Return a symbolic binomial sample
+
+        This is a shortcut for a call to `self.gen`
+        """
+        return self.gen(raw_random.binomial, *args, **kwargs)
+
+    def uniform(self, *args, **kwargs):
+        """Return a symbolic uniform sample
+
+        This is a shortcut for a call to `self.gen`
+        """
+        return self.gen(raw_random.uniform, *args, **kwargs)
+
+    def normal(self, *args, **kwargs):
+        """Return a symbolic normal sample
+
+        This is a shortcut for a call to `self.gen`
+        """
+        return self.gen(raw_random.normal, *args, **kwargs)
+
+    def random_integers(self, *args, **kwargs):
+        """Return a symbolic random integer sample
+
+        This is a shortcut for a call to `self.gen`
+        """
+        return self.gen(raw_random.random_integers, *args, **kwargs)
+
--- a/theano/compile/sandbox/tests/test_shared_randomstreams.py
+++ b/theano/compile/sandbox/tests/test_shared_randomstreams.py
+__docformat__ = "restructuredtext en"
+
+import sys
+import unittest
+import numpy 
+
+from theano.tensor import raw_random
+from theano.compile.sandbox.shared_randomstreams import RandomStreams
+from theano.compile.sandbox.pfunc import pfunc
+
+from theano import tensor
+from theano import compile, gof
+
+
+class T_RandomStreams(unittest.TestCase):
+    # Each test compares draws from a compiled function against draws made directly
+    # with numpy from a RandomState that is expected to be in the same state.
+
+    def test_basics(self):
+        # The first stream must be seeded with the first randint(2**30) drawn from
+        # RandomState(234), and successive calls must advance its state.
+        random = RandomStreams(234)
+        fn = pfunc([], random.uniform((2,2)), updates=random.updates())
+        gn = pfunc([], random.normal((2,2)), updates=random.updates())
+
+        fn_val0 = fn()
+        fn_val1 = fn()
+
+        # Only checks that a second stream compiles and runs.
+        gn_val0 = gn()
+
+        rng_seed = numpy.random.RandomState(234).randint(2**30)
+        rng = numpy.random.RandomState(int(rng_seed)) #int() is for 32bit
+
+        numpy_val0 = rng.uniform(size=(2,2))
+        numpy_val1 = rng.uniform(size=(2,2))
+
+        assert numpy.all(fn_val0 == numpy_val0)
+        assert numpy.all(fn_val1 == numpy_val1)
+
+    def test_seed_fn(self):
+        # Re-seeding after compilation must change what the compiled function draws.
+        random = RandomStreams(234)
+        fn = pfunc([], random.uniform((2,2)), updates=random.updates())
+
+        random.seed(888)
+
+        fn_val0 = fn()
+        fn_val1 = fn()
+
+        rng_seed = numpy.random.RandomState(888).randint(2**30)
+        rng = numpy.random.RandomState(int(rng_seed))  #int() is for 32bit
+
+        numpy_val0 = rng.uniform(size=(2,2))
+        numpy_val1 = rng.uniform(size=(2,2))
+
+        assert numpy.all(fn_val0 == numpy_val0)
+        assert numpy.all(fn_val1 == numpy_val1)
+
+    def test_getitem(self):
+
+        random = RandomStreams(234)
+        out = random.uniform((2,2))
+        fn = pfunc([], out, updates=random.updates())
+
+        random.seed(888)
+
+        rng = numpy.random.RandomState()
+        rng.set_state(random[out.rng].get_state()) #tests getitem
+
+        fn_val0 = fn()
+        fn_val1 = fn()
+        numpy_val0 = rng.uniform(size=(2,2))
+        numpy_val1 = rng.uniform(size=(2,2))
+        assert numpy.all(fn_val0 == numpy_val0)
+        assert numpy.all(fn_val1 == numpy_val1)
+
+    def test_setitem(self):
+
+        random = RandomStreams(234)
+        out = random.uniform((2,2))
+        fn = pfunc([], out, updates=random.updates())
+
+        random.seed(888)
+
+        rng = numpy.random.RandomState(823874)
+        random[out.rng] = numpy.random.RandomState(823874) #tests setitem
+
+        fn_val0 = fn()
+        fn_val1 = fn()
+        numpy_val0 = rng.uniform(size=(2,2))
+        numpy_val1 = rng.uniform(size=(2,2))
+        assert numpy.all(fn_val0 == numpy_val0)
+        assert numpy.all(fn_val1 == numpy_val1)
+
+
+if __name__ == '__main__':
+    from theano.tests import main
+    # NOTE(review): main() is presumably given the name of the test module to load;
+    # that is this file, test_shared_randomstreams, not test_randomstreams.
+    main("test_shared_randomstreams")
--- a/theano/compile/tests/test_debugmode.py
+++ b/theano/compile/tests/test_debugmode.py
@@ -72,7 +72,7 @@ class BROKEN_ON_PURPOSE_StructuredDotCSC(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
            dims[1] = %(b)s->dimensions[1];
@@ -189,13 +189,13 @@ class WeirdBrokenOp(gof.Op):
    def c_code(self, node, name, (a,), (z,), sub):
        if "inplace" in self.behaviour:
            z_code = """
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            Py_INCREF(%(a)s);
            %(z)s = %(a)s;
            """
        else:
            z_code = """
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, %(a)s->dimensions, %(a)s->descr->type_num);
            """
        prep_vars = """

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -144,18 +144,18 @@ def struct_gen(args, struct_builders, blocks, sub):
            PyObject* err_msg = NULL;
            PyObject* err_traceback = NULL;
            PyErr_Fetch(&err_type, &err_msg, &err_traceback);
-            if (!err_type) {err_type = Py_None; Py_XINCREF(Py_None);}
-            if (!err_msg) {err_msg = Py_None; Py_XINCREF(Py_None);}
-            if (!err_traceback) {err_traceback = Py_None; Py_XINCREF(Py_None);}
+            if (!err_type) {err_type = Py_None;Py_INCREF(Py_None);}
+            if (!err_msg) {err_msg = Py_None; Py_INCREF(Py_None);}
+            if (!err_traceback) {err_traceback = Py_None; Py_INCREF(Py_None);}
            PyObject* old_err_type = PyList_GET_ITEM(__ERROR, 0);
            PyObject* old_err_msg = PyList_GET_ITEM(__ERROR, 1);
            PyObject* old_err_traceback = PyList_GET_ITEM(__ERROR, 2);
            PyList_SET_ITEM(__ERROR, 0, err_type);
            PyList_SET_ITEM(__ERROR, 1, err_msg);
            PyList_SET_ITEM(__ERROR, 2, err_traceback);
-            Py_XDECREF(old_err_type);
-            Py_XDECREF(old_err_msg);
-            Py_XDECREF(old_err_traceback);
+            {Py_XDECREF(old_err_type);}
+            {Py_XDECREF(old_err_msg);}
+            {Py_XDECREF(old_err_traceback);}
        }
        // The failure code is returned to index what code block failed.
        return %(failure_var)s;
@@ -222,7 +222,7 @@ def get_c_init(r, name, sub):
    """WRITEME"""
    pre = "" """
    py_%(name)s = Py_None;
-    Py_XINCREF(py_%(name)s);
+    {Py_XINCREF(py_%(name)s);}
    """ % locals()
    return pre + r.type.c_init(name, sub)

@@ -230,14 +230,14 @@ def get_c_extract(r, name, sub):
    """WRITEME"""
    pre = """
    py_%(name)s = PyList_GET_ITEM(storage_%(name)s, 0);
-    Py_XINCREF(py_%(name)s);
+    {Py_XINCREF(py_%(name)s);}
    """ % locals()
    return pre + r.type.c_extract(name, sub)

 def get_c_cleanup(r, name, sub):
    """WRITEME"""
    post = """
-    Py_XDECREF(py_%(name)s);
+    {Py_XDECREF(py_%(name)s);}
    """ % locals()
    return r.type.c_cleanup(name, sub) + post

@@ -247,9 +247,9 @@ def get_c_sync(r, name, sub):
    if (!%(failure_var)s) {
      %(sync)s
      PyObject* old = PyList_GET_ITEM(storage_%(name)s, 0);
-      Py_XINCREF(py_%(name)s);
+      {Py_XINCREF(py_%(name)s);}
      PyList_SET_ITEM(storage_%(name)s, 0, py_%(name)s);
-      Py_XDECREF(old);
+      {Py_XDECREF(old);}
    }
    """ % dict(sync = r.type.c_sync(name, sub), name = name, **sub)

@@ -526,11 +526,15 @@ class CLinker(link.Linker):

        This might contain duplicates.
        """
-        ret = ["-O3", "-w"]#-w means supress all warnings
-# this is the param the -ffast-math activate. I put the explicitly as FillMissing must disable "-ffinite-math-only". Putting -ffast-math would make it disable all other parameter at the same time.
+        ret = ["-O3"]
+# These are the flags that -ffast-math activates. They are listed explicitly because FillMissing must disable some of them; passing -ffast-math itself would force it to disable all the others at the same time.
        ret += ["-fno-math-errno", "-funsafe-math-optimizations",
                "-fno-signaling-nans", "-fcx-limited-range",
-                "-fno-rounding-math", "-ffinite-math-only"]
+                "-fno-rounding-math", "-ffinite-math-only",
+                "-Wno-unused-label",#the current code generate label event if they are not used. Could use gcc attribute for those label only
+                "-Wno-unused-variable",#idem as the precedent
+                "-Wno-write-strings",#generated by our code generator...
+                ]
        for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
            try: ret += x.c_compile_args()
            except utils.MethodNotDefined: pass

--- a/theano/gradient.py
+++ b/theano/gradient.py
+"""Driver for general gradient calculations."""
+
+__docformat__ = "restructuredtext en"
+
+import sys
 import gof #, gof.variable
 import numpy #for numeric_grad

 from gof.python25 import all
 import gof.utils

+def warning(msg):
+    """Print ``msg`` to stderr, prefixed with 'WARNING'."""
+    # replace this with logger.warning when adding logging support
+    print >> sys.stderr, 'WARNING', msg
+
 _msg_retType = 'op.grad(...) returned a non-list'
 _msg_badlen = 'op.grad(...) returned wrong number of gradients'

-def grad_sources_inputs(sources, graph_inputs):
+def grad_sources_inputs(sources, graph_inputs, warn_type=True):
    """
-    A gradient source is a pair (r, g_r), in which r is a variable, and g_r is a
-    variable that is a gradient wrt r.
+    A gradient source is a pair (``r``, ``g_r``), in which ``r`` is a `Variable`, and ``g_r`` is a
+    `Variable` that is a gradient wrt ``r``.
+
+    This function traverses the graph backward from the ``r`` sources,
+    calling ``op.grad(...)`` for all ops with some non-None gradient on an output.
+
+    The ``op.grad(...)`` functions are called like this:

-    This function traverses the graph backward from the 'r' sources,
-    calling L{Op.grad}(...) when it is provided by an L{Op}, and at least one of the
-    outputs of the L{Op} has an associated gradient.
+    .. code-block:: python
+
+        op.grad(op.inputs[:], [total_gradient(v) for v in op.outputs])

-    The L{Op.grad}(...) functions are called as such:
-        op.grad( op.inputs[0], grad(op.outputs[0]))
+    This call to ``op.grad`` should return a list or tuple: one symbolic gradient per input.
+    If ``op`` has a single input, then ``op.grad``  should return a list or tuple of length 1.

-    This function expects the L{Op.grad}(...) function to return the gradient
-    expression [variables] associated with the inputs of the L{Op}. The L{Op} should
-    return a list of variables corresponding to the gradients in the same order
-    as the inputs. If it has a single output it should return a list or tuple
-    of length 1.
+    For each input with respect to which ``op`` is not differentiable, it should return
+    ``None`` instead of a `Variable` instance.

-    For each input wrt to which an L{Op} is not differentiable, it should return
-    None instead of a variable instance.
+    If a source ``r`` receives a gradient from another source ``r2``, then the effective
+    gradient on ``r`` is the sum of both gradients.

-    @type sources: list
-    @param sources: gradient sources (explained below)
-    @type graph_inputs: list
-    @param graph_inputs: variables considered to be constant
+    :type sources: list of pairs of Variable: (v, gradient-on-v)
+    :param sources: gradients to back-propagate using chain rule
+    :type graph_inputs: list of Variable
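+    :param graph_inputs: variables considered to be constant (do not backpropagate through
+    them)
+    :type warn_type: bool
+    :param warn_type: if True, print a warning when ``op.grad`` returns a gradient whose
+    type differs from the type of the corresponding input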

-    @rtype: dictionary
-    @return: dictionary mapping each variable necessary for a source to its gradient.
+    :rtype: dictionary whose keys and values are of type `Variable`
+    :return: mapping from each Variable encountered in the backward traversal to its gradient.
    """
    gmap = {}
    for (r, g_r) in sources:
@@ -90,8 +101,9 @@ def grad_sources_inputs(sources, graph_inputs):
                    len(g_inputs),
                    len(node.inputs))
        for ii, (r, g_r) in enumerate(zip(node.inputs, g_inputs)):
-            if g_r and (r.type != g_r.type):
-                print 'WARNING: %s.grad returned a different type for input %i: %s vs. %s'%(node.op, ii, r.type, g_r.type)
+            if warn_type:
+                if g_r and (getattr(r,'type',0) != getattr(g_r,'type', 1)):
+                    warning('%s.grad returned a different type for input %i: %s vs. %s'%(node.op, ii, r, g_r))
            if g_r and len(sources) == 1 and sources[0][0].name and r.name:
                g_r.name = "(d%s/d%s)" % (sources[0][0].name, r.name)
            if g_r is not None: 

--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
@@ -28,8 +28,9 @@ class ConvOp(Op):

    #TODO: make the stacksize its own parameter, and make imshp a pair

-    def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid', unroll_batch=0,
-            unroll_kern=0,
+    def __init__(self, imshp, kshp, nkern, bsize, dx, dy, output_mode='valid',
+            unroll_batch=4,
+            unroll_kern=4,
            imshp_logical=None,
            kshp_logical=None,
            kshp_logical_top_aligned=True):
@@ -57,6 +58,10 @@ class ConvOp(Op):

        unroll_batch. If >0 will use a version that will unroll the batch loop by the value of the option. By default don't use this version of the code.
        unroll_nkern. idem as unroll_batch but unroll the kernel loop.
+
+        The default is unroll_batch=4 and unroll_kern=4 when possible (currently unrolling does not support logical shape != physical shape), as this is what gives the best performance in practice. This also means that, for best performance, the batch size and the number of kernels should be multiples of 4. In the article:
+        Anatomy of High-Performance Matrix Multiplication by Kazushige Goto and Robert A. Van De Geijn, ACM Transactions on Mathematical Software, vol 34, No. 3, article 12, May 2008.
+        Figure 12 gives the values mr x nr; those values are the optimum to use for unroll_batch and unroll_kern. For x86_64 computers it is 4x4. Other architectures can have different values (2x4 for x86, 8x8 for Itanium, ...).
        """
        imshp = tuple(imshp)
        if len(imshp)==2:
@@ -473,8 +478,8 @@ if(%(filtersflipped)s->nd==3){

 img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
 img2d_arr = (PyArrayObject*)img2d;
-if ((img2d_arr->strides[3] != sizeof(%(type)s)) 
-     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
+if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s))
+     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
    Py_DECREF(img2d);
    img2d = contig;
@@ -487,8 +492,8 @@ img2d_arr = (PyArrayObject*)img2d;

 filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
 filtersflipped_arr = (PyArrayObject*)filtersflipped;
-if ((filtersflipped_arr->strides[3] != sizeof(%(type)s)) 
-     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
+if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s)) 
+     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
    Py_DECREF(filtersflipped);
    filtersflipped = contig;
@@ -517,9 +522,8 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
-  if(!dims) %(fail)s;
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];
@@ -540,10 +544,10 @@ for(int b=0;b< %(self_bsize)s;b++){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){

    //assertions
-    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;

    %(type)s * __restrict__ out=(%(type)s *)(PyArray_GETPTR2(%(z)s,b,n_kern));
    for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out[i] = 0;
@@ -717,8 +721,8 @@ if (NKERN != kerns_dim[0])

 img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
 img2d_arr = (PyArrayObject*)img2d;
-if ((img2d_arr->strides[3] != sizeof(%(type)s)) 
-     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
+if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s)) 
+     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
    Py_DECREF(img2d);
    img2d = contig;
@@ -746,7 +750,7 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
@@ -764,7 +768,7 @@ Os[1] = dim_im[1]-dim_ker[1]+1;
 // allocate a temporary buffer for storing the inner product of each nth kernel row 
 // with each row of an image
 {
-%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* sizeof(%(type)s));
+%(type)s * kbuf = (%(type)s *)malloc((Os[0] * NKERN + PyArray_Size((PyObject*)%(filtersflipped)s))* (npy_intp)sizeof(%(type)s));
 int kbufstride = NKERN;
 %(type)s * myfilters = kbuf + Os[0] * NKERN;

@@ -809,7 +813,7 @@ for(int b=0;b< %(self_bsize)s;b++){
                int imgview_stride = dim_im[1];
                int filter_rows_stride =kerns_dim[1]*kerns_dim[2]*kerns_dim[3];
                //remember, Fortran wants a column-major interpretation
-                assert(img2d->strides[3] == sizeof(%(type)s));
+                assert(img2d->strides[3] == (npy_intp)sizeof(%(type)s));

                if (0){
                    std::cerr << "b " << b << " img_col " << img_col << " filterrow " << filter_row << " stackidx " <<stackidx << "\\n";
@@ -958,8 +962,8 @@ if(%(filtersflipped)s->nd==3){

 img2d = PyArray_Newshape(%(img2d)s,&img2d_shape, PyArray_CORDER);
 img2d_arr = (PyArrayObject*)img2d;
-if ((img2d_arr->strides[3] != sizeof(%(type)s)) 
-     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*sizeof(%(type)s))){
+if ((img2d_arr->strides[3] != (npy_intp)sizeof(%(type)s)) 
+     || (img2d_arr->strides[2] != img2d_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)img2d));
    Py_DECREF(img2d);
    img2d = contig;
@@ -972,8 +976,8 @@ img2d_arr = (PyArrayObject*)img2d;

 filtersflipped = PyArray_Newshape(%(filtersflipped)s,&kerns_shape, PyArray_CORDER);
 filtersflipped_arr = (PyArrayObject*)filtersflipped;
-if ((filtersflipped_arr->strides[3] != sizeof(%(type)s)) 
-     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*sizeof(%(type)s))){
+if ((filtersflipped_arr->strides[3] != (npy_intp)sizeof(%(type)s)) 
+     || (filtersflipped_arr->strides[2] != filtersflipped_arr->dimensions[3]*(npy_intp)sizeof(%(type)s))){
    contig = (PyObject*)(PyArray_GETCONTIGUOUS((PyArrayObject*)filtersflipped));
    Py_DECREF(filtersflipped);
    filtersflipped = contig;
@@ -1002,9 +1006,8 @@ if ((!%(z)s)
  || (%(z)s->dimensions[3] != dim_zz[1])
  )
 {
-  if (%(z)s) Py_DECREF(%(z)s);
+  {Py_XDECREF(%(z)s);}
  npy_intp dims[4] = {0,0,0,0};
-  if(!dims) %(fail)s;
  dims[0]=%(self_bsize)s;
  dims[1]=%(self_nkern)s;
  dims[2]=dim_zz[0];
@@ -1025,10 +1028,10 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){

    //assertions
-    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[2] != %(z)s->dimensions[3] * sizeof(%(type)s)) %(fail)s;
-    if (%(z)s->strides[3] != sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[0] != %(z)s->dimensions[1] *%(z)s->dimensions[2] *%(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[1] != %(z)s->dimensions[2] * %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[2] != %(z)s->dimensions[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
+    if (%(z)s->strides[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
 """%d
    ret+=my_dup2("%(type)s * __restrict__ out%(unroll_iter)s=(%(type)s *)(PyArray_GETPTR2(%(z)s,b+%(unroll_biter)s,n_kern+%(unroll_kiter)s));")
    ret+=my_dup("for (int i = 0; i < dim_zz[0]*dim_zz[1]; ++i) out%(unroll_iter)s[i] = 0;",unroll_bsize*unroll_ksize)

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -819,7 +819,7 @@ class StructuredDotCSC(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = ((npy_int32 *)%(a_nrows)s->data)[0];
            dims[1] = %(b)s->dimensions[1];
@@ -951,7 +951,7 @@ class StructuredDotCSR(gof.Op):
            || (%(z)s->dimensions[1] != %(b)s->dimensions[1])       //b's columns
            )
        {
-            if (%(z)s) Py_DECREF(%(z)s);
+            {Py_XDECREF(%(z)s);}
            npy_intp dims[] = {0,0};
            dims[0] = %(a_ptr)s->dimensions[0]-1;
            dims[1] = %(b)s->dimensions[1];

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -394,15 +394,15 @@ class TensorType(Type):
    def c_sync(self, name, sub):
        """Override `CLinkerOp.c_sync` """
        return """
-        Py_XDECREF(py_%(name)s);
+        {Py_XDECREF(py_%(name)s);}
        if (!%(name)s) {
-            Py_XINCREF(Py_None);
+            Py_INCREF(Py_None);
            py_%(name)s = Py_None;
        }
        else if ((void*)py_%(name)s != (void*)%(name)s) {
            py_%(name)s = (PyObject*)%(name)s;
        }
-        Py_XINCREF(py_%(name)s);
+        {Py_XINCREF(py_%(name)s);}
        """ % locals()

    def c_headers(self):
@@ -2389,20 +2389,24 @@ outer = Outer()

 def grad(cost, wrt, g_cost=None, consider_constant=[]):
    """
-    @type cost: L{Variable}
-    @type wrt: L{Variable} or list of L{Variable}s.
-    @type g_cost: L{Variable} broadcastable to size of I{cost}, or None
-    @param g_cost: an expression for the gradient through cost.  The default is
-        {{{ones_like(cost)}}}
-    @param consider_constant: a list of expressions not to backpropagate through
-
-    @rtype: L{Variable} or list of L{Variable}s (depending upon I{wrt})
-    @return: symbolic expression of gradient of I{cost} with respect to I{wrt}.
-    If I{wrt} is a list, then return a list containing the gradient of I{cost} wrt
-    each element of the list.  If an element of I{wrt} is not differentiable
-    with respect to the output, then a L{TensorConstant} with an appropriate
+    :type cost: `Variable`
+    :type wrt: `Variable` or list of `Variable`s.
+    :type g_cost: `Variable` broadcastable to size of `cost`, or None
+    :param g_cost: an expression for the gradient through cost.  The default is
+        ``ones_like(cost)``.
+    :param consider_constant: a list of expressions not to backpropagate through
+
+    :rtype: `Variable` or list of `Variable`s (depending upon `wrt`)
+
+    :return: symbolic expression of gradient of `cost` with respect to `wrt`.
+    If `wrt` is a list, then return a list containing the gradient of `cost` wrt
+    each element of the list.  If an element of `wrt` is not differentiable
+    with respect to the output, then a `TensorConstant` with an appropriate
    kind of zero is returned.

+    This function is a wrapper around the more general function
+    `theano.gradient.grad_sources_inputs`.
+
    """
    if not isinstance(cost, TensorVariable):
        raise TypeError('In tensor.grad(), cost argument should be a TensorVariable.', cost)

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -373,7 +373,36 @@ def test_mixeddiv():
 # #         print g
 # # #        print g.outputs[0].owner.c_code(['x', 'y', 'z'], ['e'], dict(fail = "FAIL;", id = 0))
 # #         print gof.OpWiseCLinker(g).make_function()(numpy.ones((5, 5)), numpy.ones((5, 5)), numpy.ones((5, 5)))
-        
+
+
+def test_const_type_in_mul_canonizer():
+    # Build the same expression twice, once dividing by (2 * a * a) with an integer
+    # constant and once by (2. * a * a) with a float constant, then check that the
+    # two compiled functions return numerically close results on the same inputs.
+    input = dmatrix()
+    w = dmatrix()
+    visb = dvector()
+    hidb = dvector()
+    betas = dvector()
+    a = dvector()
+
+    def sigm(x): return 1./(1+exp(-x))
+
+    hid = sigm( (dot(w,input) + hidb) * betas )
+
+    # The two graphs differ only in the type of the constant 2.
+    vis_gauss1 = (dot(w.T, hid) + visb) * betas / (2 * a * a)
+    vis_gauss2 = (dot(w.T, hid) + visb) * betas / (2. * a * a)
+
+    f1 = function([input,w,visb,hidb,betas,a],vis_gauss1)
+    f2 = function([input,w,visb,hidb,betas,a],vis_gauss2)
+
+    ival = numpy.random.rand(5,5)
+    wval = numpy.random.rand(5,5)
+    visbval = numpy.random.rand(5)
+    hidbval = numpy.random.rand(5)
+    betaval = numpy.random.rand(5)
+    aval = numpy.random.rand(5)
+
+    assert numpy.allclose(
+        f2(ival, wval, visbval, hidbval, betaval, aval),
+        f1(ival, wval, visbval, hidbval, betaval, aval))
    



--- a/theano/tests/test_gradient.py
+++ b/theano/tests/test_gradient.py
@@ -9,6 +9,11 @@ from theano import gof
 from theano.gradient import *
 from theano import gradient

+
+def _grad_sources_inputs(*args):
+    # warn_type was introduced after these tests were written; it warns throughout for nothing, so disable it here.
+    return grad_sources_inputs(warn_type=False, *args)
+
 class test_grad_sources_inputs(unittest.TestCase):
    def test_retNone1(self): 
        """Test that it is not ok to return None from op.grad()"""
@@ -21,7 +26,7 @@ class test_grad_sources_inputs(unittest.TestCase):
                pass
        a = retNone().make_node()
        try:
-            grad_sources_inputs([(a.out, 1)], None)
+            _grad_sources_inputs([(a.out, 1)], None)
        except ValueError, e:
            self.failUnless(e[0] is gradient._msg_retType)
            return
@@ -36,7 +41,7 @@ class test_grad_sources_inputs(unittest.TestCase):
                return [None]
        i = gof.generic()
        a = retNone().make_node(i)
-        g = grad_sources_inputs([(a.out, 1)], None)
+        g = _grad_sources_inputs([(a.out, 1)], None)
        self.failUnless(not i in g)

    def test_wrong_rval_len1(self): 
@@ -51,10 +56,10 @@ class test_grad_sources_inputs(unittest.TestCase):
        i = gof.generic()
        j = gof.generic()
        a1 = retNone().make_node(i)
-        g = grad_sources_inputs([(a1.out, 1)], None)
+        g = _grad_sources_inputs([(a1.out, 1)], None)
        a2 = retNone().make_node(i,j)
        try:
-            g = grad_sources_inputs([(a2.out, 1)], None)
+            g = _grad_sources_inputs([(a2.out, 1)], None)
        except ValueError, e:
            self.failUnless(e[0] is gradient._msg_badlen)
            return
@@ -74,7 +79,7 @@ class test_grad_sources_inputs(unittest.TestCase):

        i = gof.generic()
        a1 = retNone(self).make_node(i)
-        g = grad_sources_inputs([(a1.out, None)], None)
+        g = _grad_sources_inputs([(a1.out, None)], None)

    def test_1in_1out(self):
        """Test grad is called correctly for a 1-to-1 op"""
@@ -87,7 +92,7 @@ class test_grad_sources_inputs(unittest.TestCase):
            def grad(self, (x, ), (gz, )):
                return gval,
        a1 = O().make_node()
-        g = grad_sources_inputs([(a1.outputs[0], 1)], None)
+        g = _grad_sources_inputs([(a1.outputs[0], 1)], None)
        self.failUnless(g[a1.inputs[0]] is gval)

    def test_1in_Nout(self):
@@ -101,7 +106,7 @@ class test_grad_sources_inputs(unittest.TestCase):
            def grad(self, (x, ), (gz1, gz2)):
                return gval,
        a1 = O().make_node()
-        g = grad_sources_inputs([(a1.outputs[0], 1)], None)
+        g = _grad_sources_inputs([(a1.outputs[0], 1)], None)
        self.failUnless(g[a1.inputs[0]] is gval)
    def test_Nin_1out(self):
        """Test grad is called correctly for a many-to-1 op"""
@@ -115,7 +120,7 @@ class test_grad_sources_inputs(unittest.TestCase):
            def grad(self, (x0,x1), (gz, )):
                return (gval0, gval1)
        a1 = O().make_node()
-        g = grad_sources_inputs([(a1.outputs[0], 1)], None)
+        g = _grad_sources_inputs([(a1.outputs[0], 1)], None)
        self.failUnless(g[a1.inputs[0]] is gval0)
        self.failUnless(g[a1.inputs[1]] is gval1)
    def test_Nin_Nout(self):
@@ -130,7 +135,7 @@ class test_grad_sources_inputs(unittest.TestCase):
            def grad(self, (x0,x1), (gz0,gz1)):
                return gval0, gval1
        a1 = O().make_node()
-        g = grad_sources_inputs([(a1.outputs[0], 1)], None)
+        g = _grad_sources_inputs([(a1.outputs[0], 1)], None)
        self.failUnless(g[a1.inputs[0]] is gval0)
        self.failUnless(g[a1.inputs[1]] is gval1)
    def test_some_None_ograds(self):
@@ -145,7 +150,7 @@ class test_grad_sources_inputs(unittest.TestCase):
                return [1]
        i = gof.generic()
        a1 = O(self).make_node(i)
-        g = grad_sources_inputs([(a1.outputs[0], 1)], None)
+        g = grad_sources_inputs([(a1.outputs[0], 1)], None, warn_type=False)
        self.failUnless(g[i] is 1)

    def test_some_None_igrads(self):
@@ -167,12 +172,12 @@ class test_grad_sources_inputs(unittest.TestCase):
        k = gof.generic()
        a1 = O(self, True).make_node(i,j)
        a2 = O(self, True).make_node(a1.outputs[1], k)
-        g = grad_sources_inputs([(a2.outputs[0], 1)], None)
+        g = grad_sources_inputs([(a2.outputs[0], 1)], None, warn_type=False)
        self.failUnless(g[i] is 1 and j not in g and k not in g)

        a1 = O(self, True).make_node(i,j)
        a2 = O(self, True).make_node(k, a1.outputs[1])
-        g = grad_sources_inputs([(a2.outputs[0], 1)], None)
+        g = _grad_sources_inputs([(a2.outputs[0], 1)], None)
        self.failUnless(g[k] is 1 and i not in g and j not in g)

    def test_inputs(self):
@@ -197,7 +202,7 @@ class test_grad_sources_inputs(unittest.TestCase):
        k = gof.generic()
        a1 = O(self, True).make_node(i,j)
        a2 = O(self, True).make_node(k,a1.outputs[1])
-        g = grad_sources_inputs([(a2.outputs[0], 1), (a1.outputs[1],4),
+        g = _grad_sources_inputs([(a2.outputs[0], 1), (a1.outputs[1],4),
            (a1.outputs[0], 3), (a1.outputs[0], 3)], a1.outputs)
        self.failUnless(g[a2.inputs[0]] == 1)
        self.failUnless(g[a2.inputs[1]] == 5)
@@ -228,7 +233,7 @@ class test_grad_sources_inputs(unittest.TestCase):
        k = gof.generic()
        a1 = O(self,True).make_node(i,j)
        a2 = O(self,True).make_node(k,a1.outputs[1])
-        g = grad_sources_inputs([(a2.outputs[0], 1), (a1.outputs[1],4),
+        g = _grad_sources_inputs([(a2.outputs[0], 1), (a1.outputs[1],4),
            (a1.outputs[0], 3), (a1.outputs[0], 3)], None)
        self.failUnless(g[a2.inputs[0]] == 1)
        self.failUnless(g[a2.inputs[1]] == 5)