Commit 168c502e authored by Pascal Lamblin

merge

......@@ -182,14 +182,14 @@ class ProfileMode(Mode):
print 'local_time %fs (Time spent running thunks)'% local_time
if print_apply:
print 'Apply-wise summary: <% of local_time spent at this position> <total of local_time spent at this position> <nb_call> <Apply position> <Apply Op name>'
print 'Apply-wise summary: <% of local_time spent at this position> <cumulative seconds> <apply time> <time per call> <nb_call> <Apply position> <Apply Op name>'
atimes = [(t/local_time, t, (a[0], str(a[1])), apply_call[a]) for a, t in apply_time.items()]
atimes.sort()
atimes.reverse()
tot=0
for f,t,a,nb_call in atimes[:n_apply_to_print]:
tot+=t
print ' %4.1f%% %.3fs %.3fs %i %i %s' % (f*100, tot, t, nb_call, a[0], a[1])
print ' %4.1f%% %.3fs %.3fs %.2es %i %i %s' % (f*100, tot, t, t/nb_call,nb_call, a[0], a[1])
print ' ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(atimes)-n_apply_to_print),
sum(f for f, t, a, nb_call in atimes[n_apply_to_print:])*100,
......@@ -199,7 +199,7 @@ class ProfileMode(Mode):
if op_flops:
flops_msg=' <MFlops/s>'
print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
print '\nOp-wise summary: < of local_time spent on this kind of Op> <cumulative seconds> <self seconds>%s <nb_call> <Op name>'%(flops_msg)
print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> %s <nb_call> <Op name>'%(flops_msg)
otimes = [(t/local_time, t, a, op_cimpl[a], op_call[a]) for a, t in op_time.items()]
otimes.sort()
......@@ -212,9 +212,9 @@ class ProfileMode(Mode):
else:
msg = ' '
if op_flops:
print ' %4.1f%% %.3fs %.3fs %s %7.1f %d %s' % (f*100, tot, t, msg, op_flops.get(a,-1), nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %7.1f %d %s' % (f*100, tot, t, t/nb_call, msg, op_flops.get(a,-1), nb_call, a)
else:
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in otimes[n_ops_to_print:])*100,
......@@ -231,7 +231,7 @@ class ProfileMode(Mode):
sop_c.setdefault(type(a),True)
sop_c[type(a)]=sop_c[type(a)] and op_cimpl[a]
sop_call[type(a)]=sop_call.get(type(a),0)+op_call[a]
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <nb_call> <Op name>'
print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> <nb_call> <Op name>'
sotimes = [(t/local_time, t, a, sop_c[a], sop_call[a]) for a, t in sop_time.items()]
sotimes.sort()
sotimes.reverse()
......@@ -242,7 +242,7 @@ class ProfileMode(Mode):
msg = '*'
else:
msg = ' '
print ' %4.1f%% %.3fs %.3fs %s %d %s' % (f*100, tot, t, msg, nb_call, a)
print ' %4.1f%% %.3fs %.3fs %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
print ' ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(sotimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call in sotimes[n_ops_to_print:])*100,
......
......@@ -8,6 +8,10 @@ default_={
'ProfileMode.n_ops_to_print':20,
'tensor_opt.local_elemwise_fusion':False,
'lib.amdlibm':False,
'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
'nvcc.fastmath':False,
'scalar.floatX':'float64',
'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
}
#default values taken from environment variables
......@@ -38,6 +42,8 @@ THEANO_DEBUGMODE_CHECK_PY = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1)))
THEANO_DEBUGMODE_CHECK_FINITE = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1)))
THEANO_DEBUGMODE_CHECK_STRIDES = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 1)))
THEANO_FLAGS=os.getenv("THEANO_FLAGS","")
class TheanoConfig(object):
"""Return the value for a key after parsing ~/.theano.cfg and
the THEANO_FLAGS environment variable.
......@@ -72,7 +78,7 @@ class TheanoConfig(object):
#user config file override the default value
self.config.read(['theano.cfg', os.path.expanduser('~/.theano.cfg')])
self.env_flags=os.getenv("THEANO_FLAGS","")
self.env_flags=THEANO_FLAGS
#The value in the env variable THEANO_FLAGS override the previous value
for flag in self.env_flags.split(','):
if not flag:
......@@ -88,16 +94,17 @@ class TheanoConfig(object):
self.config.set(sp[0],sp[1],val)
else:
found=0
sp=sp[0].lower()#the ConfigParser seam to use only lower letter.
for sec in self.config.sections():
for opt in self.config.options(sec):
if opt == sp[0]:
if opt == sp:
found+=1
section=sec
option=opt
if found==1:
self.config.set(section,option,val)
elif found>1:
raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp[0]))
raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp))
def __getitem__(self, key):
""":returns: a str with the value associated to the key"""
......@@ -142,3 +149,5 @@ class TheanoConfig(object):
config = TheanoConfig()
if config.get('scalar.floatX') not in ['float32', 'float64']:
raise Exception("the configuration scalar.floatX must have value float32 or float64")
import numpy as N
import theano
import theano.tensor as T
from theano import gof, Op, tensor
from theano import gof, Op, tensor, config
from theano.printing import Print
def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
......@@ -131,6 +131,8 @@ class ConvOp(Op):
"'valid' mode)")%(self.imshp_logical,self.kshp_logical))
self._rehash()
if config.config.getboolean('op.set_flops'):
self.set_flops()
def __eq__(self, other):
if type(self) != type(other):
......@@ -177,11 +179,12 @@ class ConvOp(Op):
col=-img_col
img_col+=col
while col < max_col: #loop over kern col
self.flops+=1
self.flops+=2
col+=1
self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
assert self.flops==self.bsize * self.nkern * self.imshp[0] * self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2
def make_node(self, inputs, kerns):
# TODO: find a way to make ConvOp work for N-D (after NIPS09)
......
......@@ -2,7 +2,7 @@ import StringIO, sys
import numpy
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar
from theano import tensor, scalar, config
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.type_support import filter as type_support_filter
......@@ -67,7 +67,7 @@ class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern):
def __init__(self, scalar_op, inplace_pattern, sync=None):
##
# TODO: implement inplace operations.
# It's ok that we set the DestroyMap to something but then don't actually destroy
......@@ -77,6 +77,7 @@ class GpuElemwise(Op):
# the amount of loading and storing to global memory that we would have to do.
# That's why it isn't implemented yet.
#
sync = config.config.getboolean('gpuelemwise.sync',sync)
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
......@@ -86,7 +87,8 @@ class GpuElemwise(Op):
self.ufunc = None
self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op)
self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
self.sync = sync
def __getstate__(self):
d = copy.copy(self.__dict__)
......
......@@ -210,12 +210,19 @@ class RecAlgo(object):
return self.c_src_kernel(node, nodename) + self.c_src_callkernel(node, nodename)
class NaiveAlgo(object):
verbose = 0 # 1 or 2 for more verbose output.
verbose = 0 # 1, 2 or 3 for more verbose output.
cache_version = ()
cache_version = ('debug', 6, verbose)
cache_version = ('debug', 7, verbose)
def __init__(self, scalar_op):
def __init__(self, scalar_op, sync=True):
"""
:param scalar_op: the scalar operation to execute on each element.
:param sync: if True, wait after the kernel launch and check for errors.
"""
self.scalar_op = scalar_op
self.sync = sync
if not self.sync:
self.cache_version+=('nosync',)
def c_src_kernel(self, node, nodename, nd):
sio = StringIO.StringIO()
......@@ -248,9 +255,6 @@ class NaiveAlgo(object):
if _logical_scalar(i):
print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)
#TODO: insert code to check for strides of 1, and use a different loop
#loop over the elements to be treated by this kernel call
print >> sio, " for (int i = idx; i < numEls; i += numThreads) {"
# calculate the data pointers for all arguments
......@@ -286,9 +290,6 @@ class NaiveAlgo(object):
print >> sio, " ", task_code
print >> sio, " }"
#TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7)
#for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
......@@ -635,8 +636,9 @@ class NaiveAlgo(object):
# like
# float *, int, int, int ...
#
# The second is to recognize when trailing (right-most in numpy) dimensions can be collapsed as
# being contiguous... (confusing... read code)
# The second is to recognize when any dimensions can be collapsed as
# being contiguous. That means we can merge those dimensions with another
# one for all inputs/outputs and get the same results (confusing... read code)
#
# The third is to make a special case for scalar elements. We allow collapsing them.
# In the ccontiguous and not contiguous case, we use registers to lower the number of memory access.
......@@ -644,6 +646,8 @@ class NaiveAlgo(object):
#TODO: make a special case for broadcasting, to store the data in shared memory.
nd = node.outputs[0].type.ndim
nb_inputs = len(node.inputs)
nb_outputs = len(node.outputs)
id_self = id(self)
d = dict()
#input_params and output_params go into the function declaration/definition
......@@ -668,11 +672,7 @@ class NaiveAlgo(object):
{
//can we collapse dims[i] and dims[i-1]
for(int i=nd-1;i>0;i--){
if(false && dims[i]==1 && strides[i]==0){//
collapse[i]=1;
}else if(false && dims[i-1]==1 && strides[i-1]==0){
collapse[i]=1;
}else if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
collapse[i]=1;
}else collapse[i]=0;
}
......@@ -704,9 +704,85 @@ class NaiveAlgo(object):
std::cerr << " %(ipos)s data strides" <<
""" %locals() + " << ' ' << ".join(["o%s_data"%ipos]
+ list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
# collapse dimensions that are broadcast in all inputs.
# This needs to be done before the contiguous collapse, as that would break it.
# Handle both the dimensions and the strides.
print >> sio, """
int local_dims[%(nd)s];
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_inputs)s][%(nd)s];
int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
"""%locals()
for ipos in xrange(len(node.inputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i];
}
"""%locals()
for ipos in xrange(len(node.outputs)):
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
}
"""%locals()
if self.verbose>2:
print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
print >> sio, """
for(int id=0;id<nd_collapse;id++){
bool all_broadcast=true;
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
if(all_broadcast){
for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_str[input_id][j-1]=local_str[input_id][j];
}
}
for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_ostr[output_id][j-1]=local_ostr[output_id][j];
}
}
nd_collapse--; id--;
}
}
"""%locals()
if self.verbose>2:
print >>sio, 'std::cerr <<"after broadcast collapse\\n";'
print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
print >> sio, 'std::cerr << "local_dims";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# this is a good idea because [we assume that] the output has been allocated c_contiguous
# this is a good idea because we make less index calculation in the gpu.
print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() +','.join(['1' for x in range(nd)]) +"};"
for ipos in xrange(len(node.inputs)):
......@@ -714,8 +790,8 @@ class NaiveAlgo(object):
print >> sio, """
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() +','.join(['1' for x in range(nd)]) +"};"
print >> sio, """
can_collapse_%(nodename)s(%(nd)s, dims, i%(ipos)s_str, nd_collapse_%(ipos)s);
for(int i=0;i<%(nd)s;i++){
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
......@@ -731,79 +807,69 @@ nd_collapse_[i]=0;
"""%locals()
print >>sio, ' << " " << '.join(["nd_collapse_["%locals()+str(i)+"]" for i in range(nd)])
print >>sio, '<< "\\n";'
print >> sio, """
int nd_collapse=%(nd)s;
for(int i=1;i<%(nd)s;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse==1 && """%locals()
print >> sio, " && ".join([ "i%(ipos)s_str[%(nd)s-1]==1 "%locals()for x in range(len(node.inputs))])
print >> sio,"""){nd_collapse=0;} """
if self.verbose:
print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ %locals()
# set the new dims.
print >> sio, "int local_dims[%(nd)s];"%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
for(int i=%(nd)s-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<%(nd)s;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
"""%locals()
if self.verbose>1:
for d in xrange(nd):
print >> sio, 'std::cerr << "local_dims %(d)s " << local_dims[%(d)s] << "\\n"; '%locals()
# set the new stride.
# update the local stride.
for ipos in xrange(len(node.inputs)):
print >> sio, """
int local_i%(ipos)s_str[%(nd)s];
"""%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_i%(ipos)s_str[i]=i%(ipos)s_str[i];
}
for(int i=%(nd)s-1;i>0;i--){
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_i%(ipos)s_str[i-1]=local_i%(ipos)s_str[i];//set new strides
for(int j=i+1;j<%(nd)s;j++)//remove stride i from the array
local_i%(ipos)s_str[j-1]=local_i%(ipos)s_str[j];
local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
}
}
"""%locals()
for ipos in xrange(len(node.outputs)):
print >> sio, "int local_o%(ipos)s_str[%(nd)s];"%locals()
print >> sio, """
for(int i=0;i<%(nd)s;i++){//init new strides
local_o%(ipos)s_str[i]=o%(ipos)s_str[i];
}
for(int i=%(nd)s-1;i>0;i--){
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_o%(ipos)s_str[i-1]=local_o%(ipos)s_str[i];//set new strides
for(int j=i+1;j<%(nd)s;j++)//remove stride i from the array
local_o%(ipos)s_str[j-1]=local_o%(ipos)s_str[j];
local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
}
}
"""%locals()
# update the local dims.
print >> sio, """
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
"""%locals()
#update the new number of dim
print >> sio, """
for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse == 1 """%locals()
l=["local_str[%(ipos)s][nd_collapse-1]==1 "%locals()for ipos in range(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])]
l+=["local_ostr[%(ipos)s][nd_collapse-1]==1 "%locals()for ipos in range(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])]
if len(l)>0:
print >> sio," && "," && ".join(l)
print >> sio,"""){nd_collapse=0;} """
if self.verbose:
print >> sio, 'std::cerr <<"after can_collapse\\n";'
print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ %locals()
if self.verbose>1:
for ipos in ["i"+ str(x) for x in xrange(len(node.inputs))]+["o"+ str(x) for x in xrange(len(node.outputs))]:
print >> sio, 'std::cerr << " local_%(ipos)s_str " <<'%locals()+' << " " << '.join(["local_%(ipos)s_str[%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for d in xrange(nd):
print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
print >> sio, 'std::cerr << "\\n";'
for ipos in xrange(len(node.inputs)):
print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
for ipos in xrange(len(node.outputs)):
print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'
def launch_Ccontiguous(nodename, id_self, scalar_op):
def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos)
......@@ -819,6 +885,9 @@ nd_collapse_[i]=0;
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n";
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -830,18 +899,20 @@ nd_collapse_[i]=0;
%(verb)s
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
def launch_General(nodename, id_self, scalar_op, force_nd):
def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
# kernel_call_args are used to invoke the cuda kernel
local="local_"
kernel_call_args = ["numEls"]
kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd))
for ipos in xrange(len(node.inputs)):
kernel_call_args+=["i%i_data"%ipos] + list(local+"i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
kernel_call_args+=["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd))
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
kernel_call_args+=["o%i_data"%ipos] + list(local+"o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
kernel_call_args+=["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd))
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
if self.verbose:
......@@ -857,6 +928,9 @@ nd_collapse_[i]=0;
int threads_per_block = std::min(numEls, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" %locals()
if sync:
print >> sio, """
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -867,14 +941,15 @@ nd_collapse_[i]=0;
}
return 0;
""" %locals()
else:
print >> sio, " return 0; " %locals()
print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
print >> sio, "case 0: {"
launch_Ccontiguous(nodename, id_self, scalar_op)
launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
print >> sio, " } break;"
for i in range(1, nd+1):
print >> sio, "case "+str(i)+": {"
launch_General(nodename, id_self, scalar_op, i)
launch_General(nodename, id_self, scalar_op, i, self.sync)
print >> sio, " } break;"
print >> sio, "}"#end case
......
......@@ -51,7 +51,6 @@ def local_gpu_elemwise_0(node):
if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
if numpy.any([o.type.dtype == 'float64' for o in node.outputs]):
print 'WARNING: THERE ARE STILL float64s in your graph local_gpu_elemwise_0', node
import pdb; pdb.set_trace()
else:
# move the add to a GpuAdd
new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)
......
......@@ -172,7 +172,7 @@ def speed_elemwise_collapse2():
t2=time.time()
def test_elemwise_collapse():
""" used to test if the case where all inputs are broadcast """
""" Test when all inputs have one(and the same) broadcastable dimension """
shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -186,14 +186,16 @@ def test_elemwise_collapse():
v = numpy.asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse of all dimensions"
def test_elemwise_collapse2():
""" used to test if the case where one inputs have a broadcast """
""" Test when only one inputs have one broadcastable dimension """
shape = (4,5,60)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -207,14 +209,16 @@ def test_elemwise_collapse2():
v = numpy.asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse3():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends."""
""" Test when only one inputs have two broadcastable dimension at each ends """
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -228,14 +232,16 @@ def test_elemwise_collapse3():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse4():
""" used to test if the case where one inputs have 2 broadcast dimensions at each ends and a scalar"""
""" Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
......@@ -249,8 +255,74 @@ def test_elemwise_collapse4():
v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
for id,n in enumerate(f.maker.env.toposort()):
print id, n
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
f(v)
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2)
print "Expected collapse to 3 dimensions"
def test_elemwise_collapse5():
""" Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b+2)
f = pfunc([b], [c])
v = numpy.asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
print "Expected collapse to 2 dimensions"
def test_elemwise_collapse6():
""" Test when all inputs have two broadcastable dimension at the beginning"""
shape = (4,5)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1)
b = tcn.CudaNdarrayType((True, True, False, False))()
f = pfunc([b], [a3+b])
v = numpy.asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
v=cuda_ndarray.CudaNdarray(v)
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
print "Expected collapse to c contiguous"
def test_elemwise_collapse7(atol=1e-6):
""" Test when one input have one broadcastable dimension and the other is a scalar"""
shape = (5,4,1)
a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
a2 = tcn.shared_constructor(a.copy(), 'a')
a3 = a2.dimshuffle(0, 'x', 1, 2)
f = pfunc([], [a3+2])
if False:
for id,n in enumerate(f.maker.env.toposort()):
print id, n
#let debugmode catch errors
out=f()[0]
ans=(a+2).reshape(shape[0],1,shape[1],shape[2])
assert numpy.allclose(out,ans, atol=atol)
print "Expected collapse to c contiguous"
......@@ -228,6 +228,11 @@ class Kouh2008(object):
class Config(object):
use_gpu = True
dtype='float32'
dtype2=dtype
if dtype2=='floatX':
import theano.config as c
dtype2 = c.config.get('scalar.floatX')
rng_seed = 23498
n_hid = 300
......@@ -296,7 +301,7 @@ def test_bench_elemwise(n_iter=1000, **kwargs):
xval = numpy.asarray(
rng.uniform(size=(conf.ft_batchsize, x.type.shape[1])),
dtype=conf.dtype,
dtype=conf.dtype2,
)
yval = numpy.arange(conf.ft_batchsize)
for i in xrange(n_iter):
......
......@@ -261,11 +261,17 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self):
#return ()
#no need to put nvcc.fastmath in the tuple as the c_compile_args is put in the key.
return (2,) # with assertion about refcounts
def c_compiler(self):
return nvcc_module_compile_str
def c_compile_args(self):
ret = []
if config.config.getboolean('nvcc.fastmath'):
ret.append('-use_fast_math')
return ret
# THIS WORKS
# But CudaNdarray instances don't compare equal to one another, and what about __hash__ ?
......
......@@ -56,6 +56,8 @@ def constant(x):
class Scalar(Type):
def __init__(self, dtype):
if dtype=='floatX':
dtype=config.config.get('floatX')
self.dtype = dtype
self.dtype_specs() # error checking
......@@ -238,8 +240,8 @@ class Scalar(Type):
def c_code_cache_version(self):
#return ()
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (4,) #explicit T given in specialization of operator= lines. This makes it compile with open64
#2,
int8 = Scalar('int8')
......@@ -252,6 +254,7 @@ uint32 = Scalar('uint32')
uint64 = Scalar('uint64')
float32 = Scalar('float32')
float64 = Scalar('float64')
floatX = Scalar(config.config.get('scalar.floatX'))
complex64 = Scalar('complex64')
complex128 = Scalar('complex128')
......@@ -934,6 +937,7 @@ convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
convert_to_float32 = Cast(float32, name='convert_to_float32')
convert_to_float64 = Cast(float64, name='convert_to_float64')
convert_to_floatX = Cast(floatX, name='convert_to_floatX')
convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
convert_to_complex128 = Cast(complex128, name='convert_to_complex128')
......@@ -948,10 +952,13 @@ _cast_mapping = {
'uint64': convert_to_uint64,
'float32': convert_to_float32,
'float64': convert_to_float64,
'floatX': convert_to_floatX,
'complex64': convert_to_complex64,
'complex128': convert_to_complex128}
def cast(x, dtype):
"""Symbolically cast `x` to a Scalar of given `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_scalar(x)
if _x.type.dtype == dtype:
return _x
......
......@@ -217,10 +217,17 @@ def _wrap_tensor_into_member(x):
return compile.module.Member(constant(x))
compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member)
if int(config.THEANO_CMP_SLOPPY):
if int(config.THEANO_CMP_SLOPPY)>1:
# This environment variable is a quick-and-dirty way to get low-precision comparisons.
# For a more precise setting of these tolerances set them explicitly in your user code by
# assigning, for example, "theano.tensor.basic.float32_atol = ..."
#when THEANO_CMP_SLOPPY>1 we are even more sloppy. This is usefull to test the gpu as they don't use extended precision and this cause some difference bigger then the normal sloppy.
float32_atol = 5e-4
float32_rtol = 1e-3
float64_rtol = 1e-4
float64_atol = 1e-3
elif int(config.THEANO_CMP_SLOPPY):
float32_atol = 1e-4
float32_rtol = 1e-3
float64_rtol = 1e-4
......@@ -275,6 +282,8 @@ class TensorType(Type):
Optional name for this type.
"""
self.dtype = str(dtype)
if self.dtype=='floatX':
self.dtype=config.config.get('scalar.floatX')
self.broadcastable = tuple(broadcastable)
self.dtype_specs() # error checking is done there
self.name = name
......@@ -601,6 +610,7 @@ cscalar = TensorType('complex64', ())
# 0-d (scalar) tensor types.  Prefix encodes dtype: z=complex128,
# f=float32, d=float64, x=floatX (resolved by TensorType.__init__ to the
# configured 'scalar.floatX' dtype), b=int8, w=int16, i=int32.
zscalar = TensorType('complex128', ())
fscalar = TensorType('float32', ())
dscalar = TensorType('float64', ())
xscalar = TensorType('floatX',())
bscalar = TensorType('int8', ())
wscalar = TensorType('int16', ())
iscalar = TensorType('int32', ())
......@@ -621,6 +631,7 @@ cvector = TensorType('complex64', (False, ))
# 1-d (vector) tensor types: one non-broadcastable dimension.
zvector = TensorType('complex128', (False, ))
fvector = TensorType('float32', (False, ))
dvector = TensorType('float64', (False, ))
xvector = TensorType('floatX', (False, ))
bvector = TensorType('int8', (False,))
wvector = TensorType('int16', (False,))
ivector = TensorType('int32', (False, ))
......@@ -638,6 +649,7 @@ cmatrix = TensorType('complex64', (False, False))
# 2-d (matrix) tensor types: two non-broadcastable dimensions.
zmatrix = TensorType('complex128', (False, False))
fmatrix = TensorType('float32', (False, False))
dmatrix = TensorType('float64', (False, False))
xmatrix = TensorType('floatX', (False, False))
bmatrix = TensorType('int8', (False, False))
wmatrix = TensorType('int16', (False, False))
imatrix = TensorType('int32', (False, False))
......@@ -655,6 +667,7 @@ crow = TensorType('complex64', (True, False))
# Row types: 2-d with the first dimension broadcastable (shape (1, n)).
zrow = TensorType('complex128', (True, False))
frow = TensorType('float32', (True, False))
drow = TensorType('float64', (True, False))
xrow = TensorType('floatX', (True, False))
brow = TensorType('int8', (True, False))
wrow = TensorType('int16', (True, False))
irow = TensorType('int32', (True, False))
......@@ -668,6 +681,7 @@ ccol = TensorType('complex64', (False, True))
# Column types: 2-d with the second dimension broadcastable (shape (n, 1)).
zcol = TensorType('complex128', (False, True))
fcol = TensorType('float32', (False, True))
dcol = TensorType('float64', (False, True))
xcol = TensorType('floatX', (False, True))
bcol = TensorType('int8', (False, True))
wcol = TensorType('int16', (False, True))
icol = TensorType('int32', (False, True))
......@@ -681,6 +695,7 @@ ctensor3 = TensorType('complex64', (False,)*3)
# 3-d tensor types: three non-broadcastable dimensions.
ztensor3 = TensorType('complex128', (False,)*3)
ftensor3 = TensorType('float32', (False,)*3)
dtensor3 = TensorType('float64', (False,)*3)
xtensor3 = TensorType('floatX', (False,)*3)
btensor3 = TensorType('int8', (False,)*3)
wtensor3 = TensorType('int16', (False,)*3)
itensor3 = TensorType('int32', (False,)*3)
......@@ -690,6 +705,7 @@ ctensor4 = TensorType('complex64', (False,)*4)
# 4-d tensor types: four non-broadcastable dimensions.
ztensor4 = TensorType('complex128', (False,)*4)
ftensor4 = TensorType('float32', (False,)*4)
dtensor4 = TensorType('float64', (False,)*4)
xtensor4 = TensorType('floatX', (False,)*4)
btensor4 = TensorType('int8', (False,)*4)
wtensor4 = TensorType('int16', (False,)*4)
itensor4 = TensorType('int32', (False,)*4)
......@@ -1086,6 +1102,9 @@ _convert_to_float32 = _conversion(elemwise.Elemwise(scal.convert_to_float32), 'f
# Elemwise (tensor-level) wrappers around the scalar convert_to_* ops,
# built through the module's _conversion helper.
_convert_to_float64 = _conversion(elemwise.Elemwise(scal.convert_to_float64), 'float64')
"""Cast to double-precision floating point"""
_convert_to_floatX = _conversion(elemwise.Elemwise(scal.convert_to_floatX), 'floatX')
"""Cast to floatX floating point"""
_convert_to_complex64 = _conversion(elemwise.Elemwise(scal.convert_to_complex64), 'complex64')
"""Cast to single-precision complex"""
......@@ -1103,11 +1122,14 @@ _cast_mapping = {
'uint64': _convert_to_uint64,
'float32': _convert_to_float32,
'float64': _convert_to_float64,
'floatX': _convert_to_floatX,
'complex64': _convert_to_complex64,
'complex128': _convert_to_complex128}
@constructor
def cast(x, dtype):
"""Symbolically cast `x` to a Tensor of type `dtype`."""
if dtype=='floatX': dtype = config.config.get('scalar.floatX')
_x = as_tensor_variable(x)
if _x.type.dtype == dtype:
return _x
......@@ -2462,7 +2484,7 @@ def get_vector_length(v):
return join.vec_length(v)
except ValueError:
pass
if v.owner and v.owner.op == _shape:
if v.owner and isinstance(v.owner.op, Shape):
return v.owner.inputs[0].type.ndim
raise ValueError("length not known")
......
......@@ -1929,6 +1929,63 @@ def test_default_state():
assert f(1) == 4.8
assert f(2.2) == 7
def test_cast_floatX():
    """Check that cast() emits a node only when the dtypes really differ.

    Casting a variable to its own dtype (including floatX when it
    resolves to the variable's dtype) must optimize away entirely;
    otherwise exactly one node must remain in the compiled graph.
    """
    floatx = config.config.get('scalar.floatX')

    def n_nodes(make_input, target_dtype):
        # Number of nodes left after compiling a cast of one input.
        var = make_input('x')
        fn = function([var], [cast(var, target_dtype)])
        # print fn.maker.env.toposort()
        return len(fn.maker.env.toposort())

    # Casting to the variable's own concrete dtype is a no-op.
    assert n_nodes(dvector, 'float64') == 0
    assert n_nodes(fvector, 'float32') == 0

    # floatX input cast to a concrete dtype: no-op iff floatX matches it.
    assert n_nodes(xvector, 'float64') == (0 if floatx == 'float64' else 1)
    assert n_nodes(xvector, 'float32') == (0 if floatx == 'float32' else 1)

    # Concrete input cast to 'floatX': no-op iff floatX matches it.
    assert n_nodes(dvector, 'floatX') == (0 if floatx == 'float64' else 1)
    assert n_nodes(fvector, 'floatX') == (0 if floatx == 'float32' else 1)

    # floatX cast to floatX is always a no-op.
    assert n_nodes(xvector, 'floatX') == 0
if __name__ == '__main__':
if len(sys.argv) >= 2 and sys.argv[1] == 'OPT':
default_mode = compile.Mode(linker = 'c&py',
......
......@@ -1059,34 +1059,42 @@ class test_fusion(unittest.TestCase):
print "time", self.do(mode, shared, shp=(1000,1000),gpu=False, assert_len_topo=False,slice=s, nb_repeat=100)
def tes_memory_leak(self, mode=compile.mode.predefined_modes['FAST_RUN'], shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'), shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
"""
param shared_fn: if None, will use compile.function
verify that the elemwise fusion work
Test with and without DimShuffle
"""
#TODO: disable the canonizer?
fx, fy = fmatrices('xy')
fx = fmatrices('x')
fxv = numpy.zeros(shp, dtype='float32')+ 2
fyv = numpy.zeros(shp, dtype='float32')+ 3
cases = [
(fx+fy,(fx,fy),(fxv,fyv),1,fxv+fyv,'float32'),#1
(fx,(fx),(fxv),'float32'),#1
]
import gc, pdb, objgraph, weakref
d={}
dl=[]
v1=None
for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
mode=compile.mode.Mode('c', 'merge')
for id, [g, sym_inputs, val_inputs, out_dtype] in enumerate(cases):
for zzzz in range(nb_repeat):
v=numpy.zeros(shp, dtype=out_dtype)
gc.collect();gc.collect();gc.collect()
print 'v1',v1
v1=weakref.ref(v)
# print 'v1',v1
# v1=weakref.ref(v)
out=shared_fn(v,'out')
f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
pdb.set_trace()
# f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
# f = pfunc([fx],[],updates=[(out,out+fx)],mode=mode)
# f = pfunc([fx],out+fx,mode=mode)
# f = compile.function([fx,out],[out+fx],mode=mode)#no memory leak.
f = compile.function([fx,compile.In(variable=out, value=out.container, mutable=None)],
[out+fx],mode=mode)#if mutable is True or False, their is a memory leak
del v
gc.collect();gc.collect();gc.collect()
pdb.set_trace()
if True:
if False:
gc.collect();gc.collect();gc.collect()
nd=objgraph.typestats()
print 'key, old val, new val, diff'
......@@ -1097,7 +1105,7 @@ class test_fusion(unittest.TestCase):
d=nd
# pdb.set_trace()
if True:
if False:
gc.collect();gc.collect();gc.collect()
ndl=objgraph.by_type('list')
ll=[]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论