merge

168c502e · Pascal Lamblin · 9e674cb6 · 82403109 · 168c502e · 168c502e
--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
@@ -182,14 +182,14 @@ class ProfileMode(Mode):
        print 'local_time %fs (Time spent running thunks)'% local_time

        if print_apply:
-            print 'Apply-wise summary: <% of local_time spent at this position> <total of local_time spent at this position> <nb_call> <Apply position> <Apply Op name>'
+            print 'Apply-wise summary: <% of local_time spent at this position> <cumulative seconds> <apply time> <time per call> <nb_call> <Apply position> <Apply Op name>'
            atimes = [(t/local_time, t, (a[0], str(a[1])), apply_call[a]) for a, t in apply_time.items()]
            atimes.sort()
            atimes.reverse()
            tot=0
            for f,t,a,nb_call in atimes[:n_apply_to_print]:
                tot+=t
-                print '   %4.1f%%  %.3fs  %.3fs  %i  %i %s' % (f*100, tot, t, nb_call, a[0], a[1])
+                print '   %4.1f%%  %.3fs  %.3fs %.2es  %i  %i %s' % (f*100, tot, t, t/nb_call,nb_call, a[0], a[1])
            print '   ... (remaining %i Apply instances account for %.2f%%(%.2fs) of the runtime)'\
                    %(max(0, len(atimes)-n_apply_to_print),
                      sum(f for f, t, a, nb_call in atimes[n_apply_to_print:])*100,
@@ -199,7 +199,7 @@ class ProfileMode(Mode):
        if op_flops:
            flops_msg=' <MFlops/s>'
            print '\nHACK WARNING: we print the flops for some OP, but the logic don\' always work. You need to know the internal of Theano to make it work correctly. Otherwise don\'t use!'
-        print '\nOp-wise summary: < of local_time spent on this kind of Op> <cumulative seconds> <self seconds>%s <nb_call> <Op name>'%(flops_msg)
+        print '\nOp-wise summary: <%% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> %s <nb_call> <Op name>'%(flops_msg)

        otimes = [(t/local_time, t, a, op_cimpl[a], op_call[a]) for a, t in op_time.items()]
        otimes.sort()
@@ -212,9 +212,9 @@ class ProfileMode(Mode):
            else:
              msg = ' '
            if op_flops:
-                print '   %4.1f%%  %.3fs  %.3fs  %s %7.1f %d %s' % (f*100, tot, t, msg, op_flops.get(a,-1), nb_call, a)
+                print '   %4.1f%%  %.3fs  %.3fs  %.2es %s %7.1f %d %s' % (f*100, tot, t, t/nb_call, msg, op_flops.get(a,-1), nb_call, a)
            else:
-                print '   %4.1f%%  %.3fs  %.3fs  %s %d %s' % (f*100, tot, t, msg, nb_call, a)
+                print '   %4.1f%%  %.3fs  %.3fs  %.2es  %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
        print '   ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
                %(max(0, len(otimes)-n_ops_to_print),
                  sum(f for f, t, a, ci, nb_call in otimes[n_ops_to_print:])*100,
@@ -231,7 +231,7 @@ class ProfileMode(Mode):
            sop_c.setdefault(type(a),True)
            sop_c[type(a)]=sop_c[type(a)] and op_cimpl[a]
            sop_call[type(a)]=sop_call.get(type(a),0)+op_call[a]
-        print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <nb_call> <Op name>'
+        print '\nSingle Op-wise summary: <% of local_time spent on this kind of Op> <cumulative seconds> <self seconds> <time per call> <nb_call> <Op name>'
        sotimes = [(t/local_time, t, a, sop_c[a], sop_call[a]) for a, t in sop_time.items()]
        sotimes.sort()
        sotimes.reverse()
@@ -242,7 +242,7 @@ class ProfileMode(Mode):
              msg = '*'
            else:
              msg = ' '
-            print '   %4.1f%%  %.3fs  %.3fs  %s %d %s' % (f*100, tot, t, msg, nb_call, a)
+            print '   %4.1f%%  %.3fs  %.3fs  %.2es %s %d %s' % (f*100, tot, t, t/nb_call, msg, nb_call, a)
        print '   ... (remaining %i Ops account for %.2f%%(%.2fs) of the runtime)'\
                %(max(0, len(sotimes)-n_ops_to_print),
                  sum(f for f, t, a, ci, nb_call in sotimes[n_ops_to_print:])*100,

--- a/theano/config.py
+++ b/theano/config.py
@@ -8,6 +8,10 @@ default_={
 'ProfileMode.n_ops_to_print':20,
 'tensor_opt.local_elemwise_fusion':False,
 'lib.amdlibm':False,
+'op.set_flops':False,#currently used only in ConvOp. The profile mode will print the flops/s for the op.
+'nvcc.fastmath':False,
+'scalar.floatX':'float64',
+'gpuelemwise.sync':True, #when true, wait that the gpu fct finished and check it error code.
 }

 #default value taked from env variable
@@ -38,6 +42,8 @@ THEANO_DEBUGMODE_CHECK_PY = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_PY', 1)))
 THEANO_DEBUGMODE_CHECK_FINITE = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_FINITE', 1)))
 THEANO_DEBUGMODE_CHECK_STRIDES = bool(int(os.getenv('THEANO_DEBUGMODE_CHECK_STRIDES', 1)))

+THEANO_FLAGS=os.getenv("THEANO_FLAGS","")
+
 class TheanoConfig(object):
    """Return the value for a key after parsing ~/.theano.cfg and 
    the THEANO_FLAGS environment variable.
@@ -72,7 +78,7 @@ class TheanoConfig(object):
        #user config file override the default value
        self.config.read(['theano.cfg', os.path.expanduser('~/.theano.cfg')])

-        self.env_flags=os.getenv("THEANO_FLAGS","")
+        self.env_flags=THEANO_FLAGS
        #The value in the env variable THEANO_FLAGS override the previous value
        for flag in self.env_flags.split(','):
            if not flag:
@@ -88,16 +94,17 @@ class TheanoConfig(object):
                self.config.set(sp[0],sp[1],val)
            else:
                found=0
+                sp=sp[0].lower()#the ConfigParser seam to use only lower letter.
                for sec in self.config.sections():
                    for opt in self.config.options(sec):
-                        if opt == sp[0]:
+                        if opt == sp:
                            found+=1
                            section=sec
                            option=opt
                if found==1:
                    self.config.set(section,option,val)
                elif found>1:
-                    raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp[0]))
+                    raise Exception("Ambiguous option (%s) in THEANO_FLAGS"%(sp))
                
    def __getitem__(self, key):
        """:returns: a str with the value associated to the key"""
@@ -142,3 +149,5 @@ class TheanoConfig(object):

 config = TheanoConfig()

+if config.get('scalar.floatX') not in ['float32', 'float64']:
+    raise Exception("the configuration scalar.floatX must have value float32 or float64")
--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
 import numpy as N
 import theano
 import theano.tensor as T
-from theano import gof, Op, tensor
+from theano import gof, Op, tensor, config
 from theano.printing import Print

 def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
@@ -131,6 +131,8 @@ class ConvOp(Op):
                "'valid' mode)")%(self.imshp_logical,self.kshp_logical))

        self._rehash()
+        if config.config.getboolean('op.set_flops'):
+            self.set_flops()

    def __eq__(self, other):
        if type(self) != type(other):
@@ -177,11 +179,12 @@ class ConvOp(Op):
                            col=-img_col
                            img_col+=col
                        while col < max_col: #loop over kern col
-                            self.flops+=1
+                            self.flops+=2
                            col+=1
            
            self.flops*=self.imshp[0]*self.nkern*self.bsize#for all outputs images#n_stack==self.imshp[0]
-
+            
+            assert self.flops==self.bsize * self.nkern * self.imshp[0] * self.kshp[0] * self.kshp[1] * self.imshp[1] * self.imshp[2] * 2

    def make_node(self, inputs, kerns):
        # TODO: find a way to make ConvOp work for N-D (after NIPS09)

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2,7 +2,7 @@ import StringIO, sys
 import numpy

 from theano import Op, Type, Apply, Variable, Constant
-from theano import tensor, scalar
+from theano import tensor, scalar, config

 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.type_support import filter as type_support_filter
@@ -67,7 +67,7 @@ class GpuElemwise(Op):
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

-    def __init__(self, scalar_op, inplace_pattern):
+    def __init__(self, scalar_op, inplace_pattern, sync=None):
        ##
        # TODO: implement inplace operations.  
        #       It's ok that we set the DestroyMap to something but then don't actually destroy
@@ -77,6 +77,7 @@ class GpuElemwise(Op):
        #       the amount of loading and storing to global memory that we would have to do.
        #       That's why it isn't implemented yet.
        #
+        sync = config.config.getboolean('gpuelemwise.sync',sync)
        self.scalar_op = scalar_op
        self.inplace_pattern = inplace_pattern
        self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
@@ -86,7 +87,8 @@ class GpuElemwise(Op):
            self.ufunc = None
        self._rehash()

-        self.src_generator = NaiveAlgo(self.scalar_op)
+        self.src_generator = NaiveAlgo(self.scalar_op, sync=sync)
+        self.sync = sync

    def __getstate__(self):
        d = copy.copy(self.__dict__)

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -51,7 +51,6 @@ def local_gpu_elemwise_0(node):
        if numpy.any([hasattr(i.owner, 'op') and isinstance(i.owner.op, HostFromGpu) for i in node.inputs]):
            if numpy.any([o.type.dtype == 'float64' for o in node.outputs]):
                print 'WARNING: THERE ARE STILL float64s in your graph local_gpu_elemwise_0', node
-                import pdb; pdb.set_trace()
            else:
                # move the add to a GpuAdd
                new_op = GpuElemwise(node.op.scalar_op, node.op.inplace_pattern)

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -172,7 +172,7 @@ def speed_elemwise_collapse2():
    t2=time.time()

 def test_elemwise_collapse():
-    """ used to test if the case where all inputs are broadcast """
+    """ Test when all inputs have one(and the same) broadcastable dimension """
    
    shape = (4,5,60)
    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
@@ -186,14 +186,16 @@ def test_elemwise_collapse():

    v = numpy.asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
-    for id,n in enumerate(f.maker.env.toposort()):
-        print id, n
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
    #let debugmode catch errors
-    f(v)
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
    print "Expected collapse of all dimensions"

 def test_elemwise_collapse2():
-    """ used to test if the case where one inputs have a broadcast """
+    """ Test when only one inputs have one broadcastable dimension """
    
    shape = (4,5,60)
    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
@@ -207,14 +209,16 @@ def test_elemwise_collapse2():

    v = numpy.asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
-    for id,n in enumerate(f.maker.env.toposort()):
-        print id, n
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
    #let debugmode catch errors
-    f(v)
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
    print "Expected collapse to 3 dimensions"

 def test_elemwise_collapse3():
-    """ used to test if the case where one inputs have 2 broadcast dimensions at each ends."""
+    """ Test when only one inputs have two broadcastable dimension at each ends """
    
    shape = (4,5)
    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
@@ -228,14 +232,16 @@ def test_elemwise_collapse3():

    v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
-    for id,n in enumerate(f.maker.env.toposort()):
-        print id, n
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
    #let debugmode catch errors
-    f(v)
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v)
    print "Expected collapse to 3 dimensions"

 def test_elemwise_collapse4():
-    """ used to test if the case where one inputs have 2 broadcast dimensions at each ends and a scalar"""
+    """ Test when only one inputs have two broadcastable dimension at each ends and we add a scalar"""
    
    shape = (4,5)
    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
@@ -249,8 +255,74 @@ def test_elemwise_collapse4():

    v = numpy.asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
-    for id,n in enumerate(f.maker.env.toposort()):
-        print id, n
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
    #let debugmode catch errors
-    f(v)
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2)
    print "Expected collapse to 3 dimensions"
+
+def test_elemwise_collapse5():
+    """ Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""
+    
+    shape = (4,5)
+    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
+    a2 = tcn.shared_constructor(a, 'a')
+    a3 = a2.dimshuffle('x','x',0,1)
+    b = tcn.CudaNdarrayType((False, False, False, False))()
+    c = (a3+b+2)
+    f = pfunc([b], [c])
+
+
+    v = numpy.asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
+    v=cuda_ndarray.CudaNdarray(v)
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
+    #let debugmode catch errors
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
+    print "Expected collapse to 2 dimensions"
+
+def test_elemwise_collapse6():
+    """ Test when all inputs have two broadcastable dimension at the beginning"""
+    
+    shape = (4,5)
+    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
+    a2 = tcn.shared_constructor(a, 'a')
+    a3 = a2.dimshuffle('x','x',0,1)
+    b = tcn.CudaNdarrayType((True, True, False, False))()
+    f = pfunc([b], [a3+b])
+
+    v = numpy.asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32')
+    v=cuda_ndarray.CudaNdarray(v)
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
+    #let debugmode catch errors
+    out=f(v)[0]
+    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v)
+    print "Expected collapse to c contiguous"
+
+
+def test_elemwise_collapse7(atol=1e-6):
+    """ Test when one input have one broadcastable dimension and the other is a scalar"""
+    
+    shape = (5,4,1)
+    a = cuda_ndarray.CudaNdarray(numpy.asarray(numpy.random.rand(*shape),dtype='float32'))
+    a = numpy.asarray(numpy.random.rand(*shape),dtype='float32')
+    a2 = tcn.shared_constructor(a.copy(), 'a')
+    a3 = a2.dimshuffle(0, 'x', 1, 2)
+    f = pfunc([], [a3+2])
+
+    if False:
+        for id,n in enumerate(f.maker.env.toposort()):
+            print id, n
+    #let debugmode catch errors
+    out=f()[0]
+    ans=(a+2).reshape(shape[0],1,shape[1],shape[2])
+    assert numpy.allclose(out,ans, atol=atol)
+    print "Expected collapse to c contiguous"
--- a/theano/sandbox/cuda/tests/test_bench_loopfusion.py
+++ b/theano/sandbox/cuda/tests/test_bench_loopfusion.py
@@ -228,6 +228,11 @@ class Kouh2008(object):
 class Config(object):
    use_gpu = True
    dtype='float32'
+    dtype2=dtype
+    if dtype2=='floatX':
+        import theano.config as c
+        dtype2 = c.config.get('scalar.floatX')
+        
    rng_seed = 23498

    n_hid = 300
@@ -296,7 +301,7 @@ def test_bench_elemwise(n_iter=1000, **kwargs):

    xval = numpy.asarray(
        rng.uniform(size=(conf.ft_batchsize, x.type.shape[1])),
-        dtype=conf.dtype,
+        dtype=conf.dtype2,
        )
    yval = numpy.arange(conf.ft_batchsize)
    for i in xrange(n_iter):

--- a/theano/sandbox/cuda/type.py
+++ b/theano/sandbox/cuda/type.py
@@ -261,11 +261,17 @@ class CudaNdarrayType(Type):

    def c_code_cache_version(self):
        #return ()
+        #no need to put nvcc.fastmath in the tuple as the c_compile_args is put in the key.
        return (2,) # with assertion about refcounts

    def c_compiler(self):
        return nvcc_module_compile_str

+    def c_compile_args(self):
+        ret = []
+        if config.config.getboolean('nvcc.fastmath'):
+            ret.append('-use_fast_math')
+        return ret

 # THIS WORKS
 # But CudaNdarray instances don't compare equal to one another, and what about __hash__ ?

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -56,6 +56,8 @@ def constant(x):
 class Scalar(Type):

    def __init__(self, dtype):
+        if dtype=='floatX':
+            dtype=config.config.get('floatX')
        self.dtype = dtype
        self.dtype_specs() # error checking
    
@@ -238,8 +240,8 @@ class Scalar(Type):

    def c_code_cache_version(self):
        #return ()
+        # no need to put lib.amdlibm here as c_compile_args() are put in the key.
        return (4,)  #explicit T given in specialization of operator= lines.  This makes it compile with open64
-        #2,


 int8 = Scalar('int8')
@@ -252,6 +254,7 @@ uint32 = Scalar('uint32')
 uint64 = Scalar('uint64')
 float32 = Scalar('float32')
 float64 = Scalar('float64')
+floatX = Scalar(config.config.get('scalar.floatX'))
 complex64 = Scalar('complex64')
 complex128 = Scalar('complex128')

@@ -934,6 +937,7 @@ convert_to_uint32 = Cast(uint32, name='convert_to_uint32')
 convert_to_uint64 = Cast(uint64, name='convert_to_uint64')
 convert_to_float32 = Cast(float32, name='convert_to_float32')
 convert_to_float64 = Cast(float64, name='convert_to_float64')
+convert_to_floatX = Cast(floatX, name='convert_to_floatX')
 convert_to_complex64 = Cast(complex64, name='convert_to_complex64')
 convert_to_complex128 = Cast(complex128, name='convert_to_complex128')

@@ -948,10 +952,13 @@ _cast_mapping = {
           'uint64': convert_to_uint64,
           'float32': convert_to_float32,
           'float64': convert_to_float64,
+           'floatX': convert_to_floatX,
           'complex64': convert_to_complex64,
           'complex128': convert_to_complex128}
 def cast(x, dtype):
    """Symbolically cast `x` to a Scalar of given `dtype`.""" 
+    if dtype=='floatX': dtype = config.config.get('scalar.floatX')
+
    _x = as_scalar(x)
    if _x.type.dtype == dtype:
        return _x

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -217,10 +217,17 @@ def _wrap_tensor_into_member(x):
    return compile.module.Member(constant(x))
 compile.module.register_wrapper(_obj_is_wrappable_as_tensor, _wrap_tensor_into_member)

-if int(config.THEANO_CMP_SLOPPY):
+if int(config.THEANO_CMP_SLOPPY)>1:
    # This environment variable is a quick-and-dirty way to get low-precision comparisons.
    # For a more precise setting of these tolerances set them explicitly in your user code by
    # assigning, for example, "theano.tensor.basic.float32_atol = ..."
+
+    #when THEANO_CMP_SLOPPY>1 we are even more sloppy. This is usefull to test the gpu as they don't use extended precision and this cause some difference bigger then the normal sloppy.
+    float32_atol = 5e-4
+    float32_rtol = 1e-3 
+    float64_rtol = 1e-4
+    float64_atol = 1e-3
+elif int(config.THEANO_CMP_SLOPPY):
    float32_atol = 1e-4
    float32_rtol = 1e-3 
    float64_rtol = 1e-4
@@ -275,6 +282,8 @@ class TensorType(Type):
           Optional name for this type.
        """
        self.dtype = str(dtype)
+        if self.dtype=='floatX':
+          self.dtype=config.config.get('scalar.floatX')
        self.broadcastable = tuple(broadcastable)
        self.dtype_specs() # error checking is done there
        self.name = name
@@ -601,6 +610,7 @@ cscalar = TensorType('complex64', ())
 zscalar = TensorType('complex128', ())
 fscalar = TensorType('float32', ())
 dscalar = TensorType('float64', ())
+xscalar = TensorType('floatX',())
 bscalar = TensorType('int8', ())
 wscalar = TensorType('int16', ())
 iscalar = TensorType('int32', ())
@@ -621,6 +631,7 @@ cvector = TensorType('complex64', (False, ))
 zvector = TensorType('complex128', (False, ))
 fvector = TensorType('float32', (False, ))
 dvector = TensorType('float64', (False, ))
+xvector = TensorType('floatX', (False, ))
 bvector = TensorType('int8', (False,))
 wvector = TensorType('int16', (False,))
 ivector = TensorType('int32', (False, ))
@@ -638,6 +649,7 @@ cmatrix = TensorType('complex64', (False, False))
 zmatrix = TensorType('complex128', (False, False))
 fmatrix = TensorType('float32', (False, False))
 dmatrix = TensorType('float64', (False, False))
+xmatrix = TensorType('floatX', (False, False))
 bmatrix = TensorType('int8', (False, False))
 wmatrix = TensorType('int16', (False, False))
 imatrix = TensorType('int32', (False, False))
@@ -655,6 +667,7 @@ crow = TensorType('complex64', (True, False))
 zrow = TensorType('complex128', (True, False))
 frow = TensorType('float32', (True, False))
 drow = TensorType('float64', (True, False))
+xrow = TensorType('floatX', (True, False))
 brow = TensorType('int8', (True, False))
 wrow = TensorType('int16', (True, False))
 irow = TensorType('int32', (True, False))
@@ -668,6 +681,7 @@ ccol = TensorType('complex64', (False, True))
 zcol = TensorType('complex128', (False, True))
 fcol = TensorType('float32', (False, True))
 dcol = TensorType('float64', (False, True))
+xcol = TensorType('floatX', (False, True))
 bcol = TensorType('int8', (False, True))
 wcol = TensorType('int16', (False, True))
 icol = TensorType('int32', (False, True))
@@ -681,6 +695,7 @@ ctensor3 = TensorType('complex64', (False,)*3)
 ztensor3 = TensorType('complex128', (False,)*3)
 ftensor3 = TensorType('float32', (False,)*3)
 dtensor3 = TensorType('float64', (False,)*3)
+xtensor3 = TensorType('floatX', (False,)*3)
 btensor3 = TensorType('int8', (False,)*3)
 wtensor3 = TensorType('int16', (False,)*3)
 itensor3 = TensorType('int32', (False,)*3)
@@ -690,6 +705,7 @@ ctensor4 = TensorType('complex64', (False,)*4)
 ztensor4 = TensorType('complex128', (False,)*4)
 ftensor4 = TensorType('float32', (False,)*4)
 dtensor4 = TensorType('float64', (False,)*4)
+xtensor4 = TensorType('floatX', (False,)*4)
 btensor4 = TensorType('int8', (False,)*4)
 wtensor4 = TensorType('int16', (False,)*4)
 itensor4 = TensorType('int32', (False,)*4)
@@ -1086,6 +1102,9 @@ _convert_to_float32 = _conversion(elemwise.Elemwise(scal.convert_to_float32), 'f
 _convert_to_float64 = _conversion(elemwise.Elemwise(scal.convert_to_float64), 'float64')
 """Cast to double-precision floating point"""

+_convert_to_floatX = _conversion(elemwise.Elemwise(scal.convert_to_floatX), 'floatX')
+"""Cast to floatX floating point"""
+
 _convert_to_complex64  = _conversion(elemwise.Elemwise(scal.convert_to_complex64), 'complex64')
 """Cast to single-precision complex"""

@@ -1103,11 +1122,14 @@ _cast_mapping = {
           'uint64': _convert_to_uint64,
           'float32': _convert_to_float32,
           'float64': _convert_to_float64,
+           'floatX': _convert_to_floatX,
           'complex64': _convert_to_complex64,
           'complex128': _convert_to_complex128}
 @constructor
 def cast(x, dtype):
    """Symbolically cast `x` to a Tensor of type `dtype`.""" 
+    if dtype=='floatX': dtype = config.config.get('scalar.floatX')
+    
    _x = as_tensor_variable(x)
    if _x.type.dtype == dtype:
        return _x
@@ -2462,7 +2484,7 @@ def get_vector_length(v):
            return join.vec_length(v)
        except ValueError:
            pass
-    if v.owner and v.owner.op == _shape:
+    if v.owner and isinstance(v.owner.op, Shape):
        return v.owner.inputs[0].type.ndim
    raise ValueError("length not known")


--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -1929,6 +1929,63 @@ def test_default_state():
    assert f(1) == 4.8
    assert f(2.2) == 7

+def test_cast_floatX():
+    floatx=config.config.get('scalar.floatX')
+
+    #float64 cast to float64 should not generate an op
+    x = dvector('x')
+    f = function([x],[cast(x,'float64')])
+#    print f.maker.env.toposort()
+    assert len(f.maker.env.toposort())==0
+
+    #float32 cast to float32 should not generate an op
+    x = fvector('x')
+    f = function([x],[cast(x,'float32')])
+#    print f.maker.env.toposort()
+    assert len(f.maker.env.toposort())==0
+
+    #floatX cast to float64
+    x = xvector('x')
+    f = function([x],[cast(x,'float64')])
+#    print f.maker.env.toposort()
+    if floatx=='float64':
+        assert len(f.maker.env.toposort()) == 0 
+    else:
+        assert len(f.maker.env.toposort()) == 1
+
+    #floatX cast to float32
+    x = xvector('x')
+    f = function([x],[cast(x,'float32')])
+#    print f.maker.env.toposort()
+    if floatx=='float32':
+        assert len(f.maker.env.toposort()) == 0 
+    else:
+        assert len(f.maker.env.toposort()) == 1
+
+    #float64 cast to floatX
+    x = dvector('x')
+    f = function([x],[cast(x,'floatX')])
+#    print f.maker.env.toposort()
+    if floatx=='float64':
+        assert len(f.maker.env.toposort()) == 0 
+    else:
+        assert len(f.maker.env.toposort()) == 1
+
+    #float32 cast to floatX
+    x = fvector('x')
+    f = function([x],[cast(x,'floatX')])
+#    print f.maker.env.toposort()
+    if floatx=='float32':
+        assert len(f.maker.env.toposort()) == 0 
+    else:
+        assert len(f.maker.env.toposort()) == 1
+
+    #floatX cast to floatX
+    x = xvector('x')
+    f = function([x],[cast(x,'floatX')])
+#    print f.maker.env.toposort()
+    assert len(f.maker.env.toposort()) == 0 
+
 if __name__ == '__main__':
    if len(sys.argv) >= 2 and sys.argv[1] == 'OPT':
        default_mode = compile.Mode(linker = 'c&py',

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -1059,34 +1059,42 @@ class test_fusion(unittest.TestCase):
        print "time", self.do(mode, shared, shp=(1000,1000),gpu=False, assert_len_topo=False,slice=s, nb_repeat=100)


-    def tes_memory_leak(self, mode=compile.mode.predefined_modes['FAST_RUN'], shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
+    def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'), shared_fn=shared, shp=(3000,3000), gpu=False, nb_repeat=30, assert_len_topo=True, slice=None):
        """
        param shared_fn: if None, will use compile.function
        verify that the elemwise fusion work
        Test with and without DimShuffle
        """
        #TODO: disable the canonizer?
-        fx, fy = fmatrices('xy')
+        fx = fmatrices('x')
        fxv = numpy.zeros(shp, dtype='float32')+ 2
-        fyv = numpy.zeros(shp, dtype='float32')+ 3
        cases = [
-            (fx+fy,(fx,fy),(fxv,fyv),1,fxv+fyv,'float32'),#1
+            (fx,(fx),(fxv),'float32'),#1
            ]
        import gc, pdb, objgraph, weakref
        d={}
        dl=[]
        v1=None
-        for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
+        mode=compile.mode.Mode('c', 'merge')
+        for id, [g, sym_inputs, val_inputs, out_dtype] in enumerate(cases):
            for zzzz in range(nb_repeat):
                v=numpy.zeros(shp, dtype=out_dtype)
                gc.collect();gc.collect();gc.collect()
-                print 'v1',v1
-                v1=weakref.ref(v)
+#                print 'v1',v1
+#                v1=weakref.ref(v)
                out=shared_fn(v,'out')
-                f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
+                pdb.set_trace()
+#                f = pfunc(sym_inputs,[],updates=[(out,out+g)],mode=mode)
+#                f = pfunc([fx],[],updates=[(out,out+fx)],mode=mode)
+#                f = pfunc([fx],out+fx,mode=mode)
+#                f = compile.function([fx,out],[out+fx],mode=mode)#no memory leak.
+                f = compile.function([fx,compile.In(variable=out, value=out.container, mutable=None)],
+                                     [out+fx],mode=mode)#if mutable is True or False, their is a memory leak
                del v
+                gc.collect();gc.collect();gc.collect()
+                pdb.set_trace()

-                if True:
+                if False:
                    gc.collect();gc.collect();gc.collect()
                    nd=objgraph.typestats()
                    print 'key, old val, new val, diff'
@@ -1097,7 +1105,7 @@ class test_fusion(unittest.TestCase):
                    d=nd
            
 #                pdb.set_trace()
-                if True:
+                if False:
                    gc.collect();gc.collect();gc.collect()
                    ndl=objgraph.by_type('list')
                    ll=[]