Merged (resolved conflict in theano/tensor/opt.py, keeping repo version)

0f89d7fa · Olivier Delalleau · 19d905a8 · d01b400e · 0f89d7fa · 0f89d7fa
--- a/doc/extending/type.txt
+++ b/doc/extending/type.txt
@@ -42,6 +42,16 @@ default values.
      have a default value of ``False``. The third argument must be called
      ``allow_downcast`` and must have a default value of ``None``.
+    .. method:: filter_inplace(value, storage, strict=False, allow_downcast=None)
+      If filter_inplace is defined, it will be called instead of
+      filter(). This allows reusing previously allocated memory. As
+      of this writing, this is used only when we transfer new data to a
+      shared variable on the GPU.
+      ``storage`` will be the old value, i.e. the old numpy array,
+      CudaNdarray, ...
    .. method:: is_valid_value(value)
      Returns True iff the value is compatible with the Type. If

--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -520,8 +520,14 @@ class Function(object):
                            allow_downcast=s.allow_downcast)
                except Exception, e:
-                    e.args = tuple(list(e.args)+["Bad input argument at index %d" % i])
+                    function_name="theano function"
+                    if self.name:
+                        function_name += 'with name "'+self.name+'" '
+                    #end if
+                    e.args = tuple(list(e.args)+["Bad input argument to "+function_name+" at index %d" % i])
                    raise
+                #end except
+            #end if
            s.provided += 1
            i+=1

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -1100,17 +1100,15 @@ def _execute(cthunk, init_tasks, tasks, error_storage):
                trace = ()
            try:
                exc_type, _exc_value, exc_trace = error_storage
+                if hasattr(task, "outputs"):
+                    exc_value = exc_type(_exc_value, task, task.outputs)
+                else:
                    exc_value = exc_type(_exc_value, task)
                exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared
            except:
                print >> sys.stderr, 'ERROR retrieving error_storage', error_storage
                raise
-            #TODO-- someone who understands how these exceptions work, please put this info into the exception message itself
-            # (exc_value.message seems to be ignored)
-            print "while computing "+str(task.outputs)
            raise exc_type, exc_value, exc_trace
    execute.cthunk = cthunk
    return execute

--- a/theano/gof/env.py
+++ b/theano/gof/env.py
@@ -98,6 +98,7 @@ class Env(utils.object2):
        for f in features:
            self.extend(f)
+        self.extend(toolbox.ReplaceValidate())
        for input in self.inputs:
            if input.owner is not None:

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -92,7 +92,9 @@ class FromFunctionOptimizer(Optimizer):
    def __init__(self, fn):
        self.apply = fn
    def add_requirements(self, env):
-        env.extend(toolbox.ReplaceValidate())
+        # ReplaceValidate is now added to every Env by default
+        #env.extend(toolbox.ReplaceValidate())
+        pass
    def print_summary(self, stream=sys.stdout, level=0):
        print >> stream, "%s%s id=%i" %(' '*level,
@@ -252,7 +254,9 @@ class MergeOptimizer(Optimizer):
        self.skip_const_merge = skip_const_merge
    def add_requirements(self, env):
-        env.extend(toolbox.ReplaceValidate())
+        # ReplaceValidate is now added to every Env by default
+        #env.extend(toolbox.ReplaceValidate())
+        pass
    def apply_constant_merge(self, env):
        seen_constants = set()
@@ -421,7 +425,9 @@ class LocalOptimizer(object):
    def add_requirements(self, env):
        """If this local optimization wants to add some requirements to the env,
        This is the place to do it."""
-        env.extend(toolbox.ReplaceValidate())
+        # ReplaceValidate is now added to every Env by default
+        #env.extend(toolbox.ReplaceValidate())
+        pass
    def print_summary(self, stream=sys.stdout, level=0):
        print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self))
@@ -908,7 +914,8 @@ class NavigatorOptimizer(Optimizer):
    def add_requirements(self, env):
        super(NavigatorOptimizer, self).add_requirements(env)
-        env.extend(toolbox.ReplaceValidate())
+        # ReplaceValidate is now added to every Env by default
+        #env.extend(toolbox.ReplaceValidate())
        if self.local_opt:
            self.local_opt.add_requirements(env)
@@ -989,7 +996,7 @@ class OpKeyOptimizer(NavigatorOptimizer):
        """
        Requires the following features:
          - NodeFinder
-          - ReplaceValidate
+          - ReplaceValidate (added by default)
        """
        super(OpKeyOptimizer, self).add_requirements(env)
        env.extend(toolbox.NodeFinder())

--- a/theano/gof/type.py
+++ b/theano/gof/type.py
@@ -224,6 +224,13 @@ class PureType(object):
        """
        raise MethodNotDefined("filter", type(self), self.__class__.__name__)
+    # If filter_inplace is defined, it will be called instead of
+    # filter(). This allows reusing previously allocated memory. As
+    # of this writing, this is used only when we transfer new data to a
+    # shared variable on the GPU.
+    #def filter_inplace(value, storage, strict=False, allow_downcast=None)
    def is_valid_value(self, a):
        """Required: Return True for any python object `a` that would be a legal value for a Variable of this Type"""
        try:

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -363,7 +363,7 @@ class GpuConv(Op):
        return ['cuda_ndarray.cuh','<stdio.h>']
    def c_code_cache_version(self):
-        return (0,13) # raise this whenever modifying any of the support_code_files
+        return (0,14) # raise this whenever modifying any of the support_code_files
    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of these files

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out,
 * 
 * nkern: the number of kernel, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
+ * dx: patch stride along rows (1 for normal convolution)
+ * dy: patch stride along columns (1 for normal convolution)
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template accumulate: if true, we add the result, else we override the result
 * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out,
 * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
 * template split: if true, each thread generate more then 1 output pixel, but use more registers.
 * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
+ * template subsample: if false, skip the extra computation that is only needed when dx != 1 or dy != 1.
 */
-template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern>
+template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
 __global__ void
 conv_patch_stack( float* img, float* kern, float* out,
 		  int img_len, int img_wid, int kern_len, int kern_wid,
+		  int out_len, int out_wid,
 		  int nkern, int nstack, int img_stride_col,int img_stride_row,
 		  int img_stride_stack, int img_stride_batch,
 		  int kern_stride_col, int kern_stride_row,
-		  int kern_stride_stack, int kern_stride_nkern)
+		  int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
 {
-  int __shared__ out_len, out_wid, nb_thread_id;
+  int __shared__ nb_thread_id;
-  out_len = img_len - kern_len + 1;
-  out_wid = img_wid - kern_wid + 1;
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
  extern __shared__ float s_data[];
@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out,
 	  const float* idx_kern;
 	  if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
 	  else idx_kern=d_kern;
-	  const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+	  const float* idx_in;
+	  if(subsample)
+	    idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
+	  else
+	    idx_in=&d_img[(row+out_row)*img_wid+out_col];
 	  convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
 	}
@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out,
      //TODO: inverse the out_row and stack loop to don't load the date as frequently!
      //TODO: do this happen elsewhere?
-      for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){
+      for(;out_row<out_len_max;out_row+=blockDim.y){
 	float sum = 0.0f;
 	for (int stack = 0;stack<nstack;stack++){
 	  //TODO: load only the part of the image needed or put the partial result in shared memory
@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out,
 	    const float* idx_kern;
 	    if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
 	    else idx_kern=d_kern;
-	    const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+	    const float* idx_in;
+	    if(subsample)
+	      idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
+	    else
+	      idx_in=&d_img[(row+out_row)*img_wid+out_col];
 	    //if needed as on Fermi as reading out of bound index from shared memory generate an error.
 	    //Not needed on generation before as they worked anyway. Removing the if generate the good code

--- a/theano/sandbox/cuda/tests/test_blas.py
+++ b/theano/sandbox/cuda/tests/test_blas.py
@@ -80,7 +80,7 @@ def test_gemm():
    c = tensor.fmatrix('c')
    f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
-    assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()])
+    assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
    a0 = a.get_value() * 1.0
    print a0

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -282,8 +282,7 @@ def get_valid_shapes():
    shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
    shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
-    #test subsample
+    #test subsample done in a separate fct
-    shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
    shapes += [
         #other test
@@ -502,8 +501,7 @@ def test_full():
    shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
    shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
-    #test subsample
+    #test subsample done in a separate fct
-    shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
    shapes += [
        #other test
@@ -553,21 +551,31 @@ def test_full():
 def test_subsample():
    # implement when
    shapes = [ 
-            ((1, 1, 1, 1), (1, 1, 1, 1), (1,1))
+            ((1, 1, 1, 1), (1, 1, 1, 1), (1,1), (1,1), (1,1))
-            , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2))
+            , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2), (1,1), (1,1))
-            , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3))
+            , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1,1), (1,1))
-            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3))
+            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
-            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1))
+            , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
            ]
-    all_good = True
+    shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
+    shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
+    shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
+    shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
-    _params_allgood_header()
+#We only list the versions that implement subsampling, to make the test faster.
-    for ishape, kshape, ds in shapes:
+    version_valid = [-2,-1,1,3,11,12]
-        if not _params_allgood(ishape, kshape, 'full', subsample=ds):
+    version_full = [-2,-1]
-            all_good = False
+    verbose = 0
-        if not _params_allgood(ishape, kshape, 'valid', subsample=ds):
+    random = True
-            all_good = False
+    print_ = False
-    assert all_good
+    ones = False
+    if ones:
+        random = False
+    #test
+    random = False
+    exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
+    exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
 ## See #616
 #def test_logical_shapes():

--- a/theano/scan_module/scan.py
+++ b/theano/scan_module/scan.py
@@ -104,11 +104,11 @@ def scan( fn
        .. code-block:: python
-            scan(fn, sequences = [ dict( Sequence1, taps = [-3,2,-1])
+            scan(fn, sequences = [ dict(input= Sequence1, taps = [-3,2,-1])
                                 , Sequence2
-                                 , dict( Sequence3, taps = 3) ]
+                                 , dict(input =  Sequence3, taps = 3) ]
-                   , outputs_info = [ dict( Output1, taps = [-3,-5])
+                   , outputs_info = [ dict(initial =  Output1, taps = [-3,-5])
-                                    , dict( Output2, taps = None)
+                                    , dict(initial = Output2, taps = None)
                                    , Output3 ]
                   , non_sequences = [ Argument1, Argument 2])
@@ -371,7 +371,7 @@ def scan( fn
                    # ^ explicitly provided a None for taps
                    warning (' Output %s ( index %d) has a initial state '
                             ' but taps is explicitly set to None ' % (
-                                 outs_info[i]['initial'].name
+                                 getattr(outs_info[i]['initial'],'name','None')
                                 , i) )
                outs_info[i]['taps'] = [-1]
        else:
@@ -416,12 +416,10 @@ def scan( fn
                nw_slice = seq['input'][0].type()
                actual_slice = seq['input'][k-mintap]
-                if not hasattr(seq['input'],'name'):
-                    raise TypeError('Expected object with a "name" field, got '+str(seq)+"['input'] = "+str(seq['input']))
                # Add names to slices for debugging and pretty printing ..
                # that is if the input already has a name
-                if seq['input'].name:
+                if getattr(seq['input'],'name', None) is not None:
                    if k > 0:
                        nw_name = seq['input'].name + '[t+%d]'%k
                    elif k == 0:
@@ -481,7 +479,7 @@ def scan( fn
    # Add names -- it helps a lot when debugging
    for (nw_seq, seq) in zip(scan_seqs, seqs):
-        if seq['input'].name:
+        if getattr(seq['input'],'name', None) is not None:
            nw_seq.name = seq['input'].name + '[%d:]'%k
    # Conventions :
@@ -534,7 +532,7 @@ def scan( fn
            actual_arg = init_out['initial']
            arg = safe_new(init_out['initial'])
-            if init_out['initial'].name:
+            if getattr(init_out['initial'],'name', None) is not None:
                arg.name = init_out['initial'].name+'[t-1]'
            # We need now to allocate space for storing the output and copy
            # the initial state over. We do this using the expand function
@@ -579,7 +577,7 @@ def scan( fn
                nw_slice = init_out['initial'][0].type()
                # give it a name or debugging and pretty printing
-                if init_out['initial'].name:
+                if getattr(init_out['initial'],'name', None) is not None:
                    if k > 0:
                        nw_slice.name = ( init_out['initial'].name +
                                            '[t+%d]'%k )
@@ -746,7 +744,7 @@ def scan( fn
    for input in dummy_f.maker.expanded_inputs:
        if isinstance(input.variable, SharedVariable) and input.update:
            new_var = safe_new(input.variable)
-            if input.variable.name:
+            if getattr(input.variable,'name', None) is not None:
                new_var.name = input.variable.name + '_copy'
            shared_inner_inputs.append( new_var )
            shared_scan_inputs.append( input.variable )
@@ -777,7 +775,7 @@ def scan( fn
    ## Step 5.6 all shared variables with no update rules
    def new_variable( v ):
        new_v = safe_new(v)
-        if v.name:
+        if getattr(v,'name', None) is not None:
            new_v.name = v.name + '_copy'
        return new_v
    other_inner_args += [ new_variable(arg) for arg in non_seqs

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -226,10 +226,10 @@ class Scan(Op):
        for idx in xrange(self.n_seqs):
            if inputs[1+idx].dtype != self.inputs[idx].dtype:
                raise ValueError(err_msg1%( 'Sequence'
-                                       , inputs[1+idx].name
+                                       , str(inputs[1+idx])
                                       , idx
                                       , inputs[1+idx].dtype
-                                       , self.inputs[idx].name
+                                       , str(self.inputs[idx])
                                       , self.inputs[idx].dtype) )
        # Check that this 3 things have the same dtype for mit_mot:
@@ -246,10 +246,10 @@ class Scan(Op):
            for k in self.tap_array[index-start]:
                if inputs[index].dtype != self.inputs[index_i].dtype:
                    raise ValueError(err_msg1%( 'Initial state'
-                                               , inputs[index].name
+                                               , str(inputs[index])
                                               , index
                                               , inputs[index].dtype
-                                               , self.inputs[index_i].name
+                                               , str(self.inputs[index_i])
                                               , self.inputs[index_i].dtype) )
                index_i += 1
            for k in self.mit_mot_out_slices[index-start]:
@@ -266,14 +266,14 @@ class Scan(Op):
            for k in self.tap_array[index-start]:
                if inputs[index].dtype != self.inputs[index_i].dtype:
                    raise ValueError(err_msg1%( 'Initial state'
-                                               , inputs[index].name
+                                               , str(inputs[index])
                                               , index
                                               , inputs[index].dtype
-                                               , self.inputs[index_i].name
+                                               , str(self.inputs[index_i])
                                               , self.inputs[index_i].dtype) )
                index_i += 1
            if inputs[index].dtype != self.outputs[index_o].dtype:
-                raise ValueError(err_msg2%( inputs[index].name
+                raise ValueError(err_msg2%( str(inputs[index])
                                           , index
                                           , inputs[index].dtype
                                           , self.outputs[index_o].dtype) )
@@ -287,7 +287,7 @@ class Scan(Op):
        while index < end:
            if (hasattr(inputs[index],'dtype') and
                inputs[index].dtype != self.outputs[index_o].dtype):
-                raise ValueError(err_msg2%( inputs[index].name
+                raise ValueError(err_msg2%( str(inputs[index])
                                           , index
                                           , inputs[index].dtype
                                           , self.outputs[index_o].dtype) )
@@ -610,11 +610,13 @@ class Scan(Op):
        t_call = time.time() - t0_call
        if hasattr(self.fn.maker.mode,'fct_call_time'):
-            self.fn.maker.mode.fct_call_time[self.fn] += t_call
+            self.fn.maker.mode.fct_call_time[self.fn] += t_fn
            self.fn.maker.mode.fct_call[self.fn] += n_steps
-        self.fn.maker.mode.call_time += t_call
+        self.fn.maker.mode.call_time += t_fn
        self.fn.maker.mode.fn_time += t_fn
+        self.t_call = t_call
+        self.t_fn = t_fn
    ### Infer Shape
@@ -792,7 +794,7 @@ class Scan(Op):
                g_out_slices.append(g_outs_no_shared[dx][0])
            else:
                g_out_slices.append(None)
-            if out.name:
+            if getattr(out,'name',None) is not None:
                inner_g_out.name = 'g_'+out.name
            else:
                inner_g_out.name = 'g_'+str(dx)
@@ -870,7 +872,7 @@ class Scan(Op):
                    nw_seq = seq[dim_offset +k -mintap: -(maxtap -k)]
                else:
                    nw_seq = seq[dim_offset +k -mintap: ]
-                if seq.name:
+                if getattr(seq,'name', None) is not None:
                    nw_seq.name = seq.name + '[%d:]'%k
                scan_seqs.append(nw_seq)

--- a/theano/scan_module/scan_views.py
+++ b/theano/scan_module/scan_views.py
@@ -118,10 +118,9 @@ def reduce( fn
            if not isinstance(out_info, dict):
                # Specifies that it should return only the last step.
                outs_info[i] = dict(
-                    initial = out_info,  return_steps = 1, store_steps = 1)
+                    initial = out_info,  return_steps = 1)
            else:
                # Specifies that it should return only the last step.
-                outs_info[i]['store_steps']  = 1
                outs_info[i]['return_steps'] = 1
                # NOTE : If the user asks for more then the last step,
                # it means he does not understand ``reduce``. We could
@@ -131,7 +130,7 @@ def reduce( fn
                , outputs_info      = outs_info
                , non_sequences     = non_sequences
                , go_backwards      = go_backwards
-                , truncate_gradient = 1
+                , truncate_gradient = -1
                , mode              = mode
                , name              = name )

--- a/theano/scan_module/tests/test_scan.py
+++ b/theano/scan_module/tests/test_scan.py
@@ -2036,6 +2036,71 @@ class T_Scan(unittest.TestCase):
        f2_vals = f2(x_val)
        assert numpy.allclose(f_vals, f2_vals)
+    def test_reduce_memory_consumption(self):
+        x = theano.shared( numpy.asarray(
+            numpy.random.uniform(size=(10,)),dtype = theano.config.floatX))
+        o,_ = theano.reduce(lambda v,acc : acc+v, x,
+                           theano.tensor.constant(numpy.asarray(0.,dtype=theano.config.floatX))
+                            )
+        #f1 = theano.function([],o)
+        # Get the scan node
+        #scan_node = [n for n in f1.maker.env.toposort()
+        #             if n.op.__class__.__name__=='Scan'][0]
+        # Check how much memory it uses
+        # Can actually do that since things are hidden by the infershape
+        # mechanism
+        #assert scan_node.inputs[2].value.shape == ()
+        gx = theano.tensor.grad(o, x)
+        f2 = theano.function([],gx)
+        assert numpy.allclose( f2(), numpy.ones((10,)))
+    def test_foldl_memory_consumption(self):
+        x = theano.shared( numpy.asarray(
+            numpy.random.uniform(size=(10,)),dtype = theano.config.floatX))
+        o,_ = theano.foldl(lambda v,acc : acc+v, x,
+                           theano.tensor.constant(numpy.asarray(0.,dtype=theano.config.floatX))
+                            )
+        #f1 = theano.function([],o)
+        # Get the scan node
+        #scan_node = [n for n in f1.maker.env.toposort()
+        #             if n.op.__class__.__name__=='Scan'][0]
+        # Check how much memory it uses
+        # Can actually do that since things are hidden by the infershape
+        # mechanism
+        #assert scan_node.inputs[2].value.shape == ()
+        gx = theano.tensor.grad(o, x)
+        f2 = theano.function([],gx)
+        assert numpy.allclose( f2(), numpy.ones((10,)))
+    def test_foldr_memory_consumption(self):
+        x = theano.shared( numpy.asarray(
+            numpy.random.uniform(size=(10,)),dtype = theano.config.floatX))
+        o,_ = theano.foldr(lambda v,acc : acc+v, x,
+                           theano.tensor.constant(numpy.asarray(0.,dtype=theano.config.floatX))
+                            )
+        #f1 = theano.function([],o)
+        # Get the scan node
+        #scan_node = [n for n in f1.maker.env.toposort()
+        #             if n.op.__class__.__name__=='Scan'][0]
+        # Check how much memory it uses
+        # Can actually do that since things are hidden by the infershape
+        # mechanism
+        #assert scan_node.inputs[2].value.shape == ()
+        gx = theano.tensor.grad(o, x)
+        f2 = theano.function([],gx)
+        assert numpy.allclose( f2(), numpy.ones((10,)))
 if __name__ == '__main__':
    #'''
    print ' Use nosetests to run these tests '

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -1056,6 +1056,9 @@ class CAReduce(Op):
                scal_name = 'maximum'
                if input.type.dtype in ["float32","float64"]:
                    identity = "-__builtin_inf()"
+                elif input.type.dtype.startswith("uint"):
+                    # numpy 1.5.1 doesn't define NPY_MIN_UINT*
+                    identity = "0"
                else:
                    identity = "NPY_MIN_"+str(input.type.dtype).upper()
            if self.scalar_op == scalar.minimum:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -9,7 +9,7 @@ _logger = logging.getLogger('theano.tensor.opt')
 import operator
 import itertools
 import sys
+import traceback
 import numpy
 import numpy as N #guys... please don't do this in the library :(
@@ -567,16 +567,18 @@ class ShapeFeature(object):
    sometimes Theano constants?? That would be confusing.
    """
-    def shape_i(self, i):
-        def op_deco(r):
+    def shape_ir(self, i, r):
+        #TODO: Write a doc string for this method
        if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
            return self.lscalar_one
        else:
-                return Shape_i(i)(r)
+            return Shape_i(i).make_node(r).outputs[0]
-        return op_deco
    def shape_tuple(self, r):
-        return tuple([self.shape_i(i)(r) for i in xrange(r.ndim)])
+        #TODO: Write a doc string for this method
+        return tuple([self.shape_ir(i,r) for i in xrange(r.ndim)])
    def default_infer_shape(self, node, i_shapes):
        rval = []
@@ -690,10 +692,13 @@ class ShapeFeature(object):
    def on_attach(self, env):
        assert not hasattr(env, 'shape_feature')
        env.shape_feature = self
-        self.shape_of = {} # Variable -> tuple(scalars) or None  (All tensor vars map to tuple)
+        # Must be local to the object as otherwise we reuse the same
-        self.scheduled = {} # Variable ->
+        # variable for multiple env!
        self.lscalar_one = T.constant(1, dtype='int64')
        assert self.lscalar_one.type == T.lscalar
+        self.shape_of = {} # Variable -> tuple(scalars) or None  (All tensor vars map to tuple)
+        self.scheduled = {} # Variable ->
        for node in env.toposort():
            self.on_import(env, node)
@@ -725,12 +730,10 @@ class ShapeFeature(object):
                    'supported, and one should now use tensor.ShapeError '
                    'instead. The original exception message is: %s' % e)
        except Exception, e:
-            _logger.error('Failed to infer_shape from Op %s (i_shapes=%s): %s %s'% (node.op,
+            _logger.error('Failed to infer_shape from Op %s.\nInput shapes:%s\nException encountered during infer_shape: %s\nException message: %s\nTraceback: %s'% (node.op,
                [self.shape_of[r] for r in node.inputs],
-                type(e), str(e)))
+                type(e), str(e), traceback.format_exc()))
-            # We raise the exception to make sure the user knows something bad
+            o_shapes = self.default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
-            # is going on.
-            raise
        # this is packed information
        # an element of o_shapes is either None or a tuple

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -195,7 +195,8 @@ class test_CAReduce(unittest.TestCase):
            if tosum is None: tosum = range(len(xsh))
            f = copy(linker).accept(Env([x], [e])).make_function()
            xv = numpy.asarray(numpy.random.rand(*xsh))
-            if dtype.startswith('float'):
+            if not "int" in dtype:
                xv = numpy.asarray(xv,dtype=dtype)
            else:
                xv = numpy.asarray(xv<0.5,dtype=dtype)
@@ -245,7 +246,8 @@ class test_CAReduce(unittest.TestCase):
                raise Exception("Test for CAReduce with scalar_op %s not implemented"%str(scalar_op))
            if scalar_op in [maximum,minimum] and numpy_raised:
                try:
-                    f(xv)
+                    out = f(xv)
+                    assert out.dtype == dtype
                except ValueError:
                    pass
                else:
@@ -254,7 +256,7 @@ class test_CAReduce(unittest.TestCase):
                #numpy.{all,any} return bool type.
                if scalar_op in [and_, or_]:
                    zv = numpy.asarray(zv, dtype=dtype)
-                self.assertTrue((numpy.abs(f(xv) - zv) < 1e-10).all())
+                self.assertTrue(numpy.allclose(f(xv), zv))
            #test CAReduce.infer_shape
@@ -268,22 +270,27 @@ class test_CAReduce(unittest.TestCase):
                    assert all(f(xv)== zv.shape)
    def test_perform(self):
-        self.with_linker(gof.PerformLinker(), add)
+        for dtype in ["floatX", "complex64", "complex128", "int8", "uint8"]:
-        self.with_linker(gof.PerformLinker(), mul)
+            self.with_linker(gof.PerformLinker(), add, dtype=dtype)
-        self.with_linker(gof.PerformLinker(), maximum)
+            self.with_linker(gof.PerformLinker(), mul, dtype=dtype)
-        self.with_linker(gof.PerformLinker(), minimum)
+            self.with_linker(gof.PerformLinker(), maximum, dtype=dtype)
-        self.with_linker(gof.PerformLinker(), or_, dtype='int8')
+            self.with_linker(gof.PerformLinker(), minimum, dtype=dtype)
-        self.with_linker(gof.PerformLinker(), and_, dtype='int8')
+        for dtype in ["int8", "uint8"]:
-        self.with_linker(gof.PerformLinker(), xor, dtype='int8')
+            self.with_linker(gof.PerformLinker(), or_, dtype=dtype)
+            self.with_linker(gof.PerformLinker(), and_, dtype=dtype)
+            self.with_linker(gof.PerformLinker(), xor, dtype=dtype)
    def test_c(self):
-        self.with_linker(gof.CLinker(), add)
+        for dtype in ["floatX", "complex64", "complex128", "int8", "uint8"]:
-        self.with_linker(gof.CLinker(), mul)
+            self.with_linker(gof.CLinker(), add, dtype=dtype)
-        self.with_linker(gof.CLinker(), maximum)
+            self.with_linker(gof.CLinker(), mul, dtype=dtype)
-        self.with_linker(gof.CLinker(), minimum)
+        for dtype in ["floatX", "int8", "uint8"]:
-        self.with_linker(gof.CLinker(), or_, dtype='int8')
+            self.with_linker(gof.CLinker(), minimum, dtype=dtype)
-        self.with_linker(gof.CLinker(), and_, dtype='int8')
+            self.with_linker(gof.CLinker(), maximum, dtype=dtype)
-        self.with_linker(gof.CLinker(), xor, dtype='int8')
+        for dtype in ["int8", "uint8"]:
+            self.with_linker(gof.CLinker(), or_, dtype=dtype)
+            self.with_linker(gof.CLinker(), and_, dtype=dtype)
+            self.with_linker(gof.CLinker(), xor, dtype=dtype)
 class test_Prod(unittest.TestCase):