提交 56267692 authored 作者: Simon Lemieux's avatar Simon Lemieux

merge

...@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None): ...@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None):
assert len(colors)>=len(times) assert len(colors)>=len(times)
fig = pylab.figure() fig = pylab.figure()
for idx,(time,expr) in enumerate(zip(times,str_expr)): for idx,(time,expr) in enumerate(zip(times,str_expr)):
###
###
###
# Creating each subplot
###
###
###
###
pylab.subplot(220+idx+1) pylab.subplot(220+idx+1)
pylab.subplots_adjust(wspace=0.25, hspace=0.25) pylab.subplots_adjust(wspace=0.25, hspace=0.25)
#legend=[] #legend=[]
#plot = fig.add_subplot(1,len(exprs),idx) #plot = fig.add_subplot(1,len(exprs),idx)
speedup = [t[0].min()/t[1].min() for t in time] speedup = [t[0].min()/t[1].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r') pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
speedup = [t[0].min()/t[2].min() for t in time] speedup = [t[0].min()/t[2].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b') pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
pylab.grid(True) pylab.grid(True)
if (idx == 2) or (idx == 3): if (idx == 2) or (idx == 3):
pylab.xlabel('Dimension of vectors a and b') pylab.xlabel('Dimension of vectors a and b', fontsize = 15)
if (idx == 0) or (idx == 2): if (idx == 0) or (idx == 2):
pylab.ylabel('Speed up vs NumPy') pylab.ylabel('Speed up vs NumPy', fontsize = 15)
pylab.axhline(y=1, linewidth=1.0, color='black') pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlim(1e3,1e7) pylab.xlim(1e3,1e7)
pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7']) pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
pylab.title(expr) pylab.title(expr)
#for time,expr,color in zip(times,str_expr,colors):
# speedup = [t[0].min()/t[1].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
# speedup = [t[0].min()/t[2].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
#legend += ["Numexpr "+expr,"Theano "+expr]
#pylab.title('Speed up Numexpr and Theano vs NumPy')
#pylab.grid(True)
#pylab.xlabel('Nb element')
#pylab.ylabel('Speed up vs NumPy')
#pylab.legend(legend,loc='upper left')
# fig.legend(legend,loc='upper left')
if fname: if fname:
fig.savefig(fname) fig.savefig(fname)
......
...@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer): ...@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer):
super(AddDestroyHandler, self).add_requirements(env) super(AddDestroyHandler, self).add_requirements(env)
env.extend(gof.DestroyHandler()) env.extend(gof.DestroyHandler())
class PrintCurrentEnv(gof.Optimizer):
"""This optimizer is for debugging.
Toss it into the optimization pipeline to see the state of things at any given point.
"""
def __init__(self, header):
self.header =header
def apply(self, env):
import theano.printing
print "PrintCurrentEnv:", self.header
theano.printing.debugprint(env.outputs)
optdb = gof.SequenceDB() optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(), optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile') 0, 'fast_run', 'fast_compile')
...@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis ...@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis
1, 'fast_run', 'fast_compile') 1, 'fast_run', 'fast_compile')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False), optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile') 1.2, 'fast_run', 'fast_compile')
optdb.register('Print1.21', PrintCurrentEnv('Post-canonicalize'),
1.21,)# 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run') 1.5, 'fast_run')
optdb.register('Print1.51', PrintCurrentEnv('Post-stabilize'),
1.51,) #'fast_run', 'fast_compile')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run') 2, 'fast_run')
optdb.register('Print2.01', PrintCurrentEnv('Post-specialize'),
2.01, )#'fast_run', 'fast_compile')
optdb.register('specialize_device', gof.EquilibriumDB(), # misc special cases for speed that are dependent on the device.
48.6, 'fast_run')#must be after gpu stuff at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run') 49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(), optdb.register('add_destroy_handler', AddDestroyHandler(),
......
...@@ -341,6 +341,8 @@ class Value(Variable): ...@@ -341,6 +341,8 @@ class Value(Variable):
if value is not None: if value is not None:
raise ValueError("Value instances cannot have an owner.") raise ValueError("Value instances cannot have an owner.")
owner = property(lambda self: None, __set_owner) owner = property(lambda self: None, __set_owner)
value = property(lambda self: self.data,
doc='read-only data access method')
# index is not defined, because the `owner` attribute must necessarily be None # index is not defined, because the `owner` attribute must necessarily be None
......
...@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer): ...@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer):
(scrabble, 'x')) (scrabble, 'x'))
""" """
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False): def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False,
skip_identities_fn=None):
""" """
Creates a PatternSub that replaces occurrences of Creates a PatternSub that replaces occurrences of
in_pattern by occurrences of out_pattern. in_pattern by occurrences of out_pattern.
...@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer): ...@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer):
raise TypeError("The pattern to search for must start with a specific Op instance.") raise TypeError("The pattern to search for must start with a specific Op instance.")
self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n" self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n"
self.allow_multiple_clients = allow_multiple_clients self.allow_multiple_clients = allow_multiple_clients
self.skip_identities_fn = skip_identities_fn
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self): def op_key(self):
return self.op return self.op
...@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer): ...@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer):
if node.op != self.op: if node.op != self.op:
return False return False
def match(pattern, expr, u, allow_multiple_clients = False): def match(pattern, expr, u, allow_multiple_clients = False):
def retry_with_equiv():
expr_equiv = self.skip_identities(expr)
if expr_equiv is None:
return False
#TODO: Not sure how to handle multiple_clients flag
###print 'retrying match', pattern, expr_equiv
return match(pattern, expr_equiv, u,
allow_multiple_clients=allow_multiple_clients)
if isinstance(pattern, (list, tuple)): if isinstance(pattern, (list, tuple)):
if expr.owner is None: if expr.owner is None:
return False return False
if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1): if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1):
return False return retry_with_equiv()
if len(pattern) - 1 != len(expr.owner.inputs): if len(pattern) - 1 != len(expr.owner.inputs):
return False return retry_with_equiv()
for p, v in zip(pattern[1:], expr.owner.inputs): for p, v in zip(pattern[1:], expr.owner.inputs):
u = match(p, v, u, self.allow_multiple_clients) u = match(p, v, u, self.allow_multiple_clients)
if not u: if not u:
...@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer): ...@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer):
if constraint(expr): if constraint(expr):
return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False)) return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False))
else: else:
return False return retry_with_equiv()
elif isinstance(pattern, str): elif isinstance(pattern, str):
v = unify.Var(pattern) v = unify.Var(pattern)
if u[v] is not v and u[v] is not expr: if u[v] is not v and u[v] is not expr:
return False return retry_with_equiv()
else: else:
u = u.merge(expr, v) u = u.merge(expr, v)
elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr): elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr):
return u return u
else: else:
return False return retry_with_equiv()
return u return u
def build(pattern, u): def build(pattern, u):
...@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer): ...@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer):
if u: if u:
p = self.out_pattern p = self.out_pattern
new = build(p, u) new = build(p, u)
####print "PatternSub matched:", new
return [new] return [new]
else: else:
return False return False
......
...@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n ...@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n
pp = pprint pp = pprint
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'), compact=True, mode=None, format='png'): def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
compact=True, mode=None, format='png', with_ids=False):
""" """
print to a file in png format the graph of op of a compile theano fct. print to a file in png format the graph of op of a compile theano fct.
...@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
g=pd.Dot() g=pd.Dot()
var_str={} var_str={}
all_strings = set()
def var_name(var): def var_name(var):
if var in var_str: if var in var_str:
return var_str[var] return var_str[var]
if var.name is not None: if var.name is not None:
varstr = var.name+" "+str(var.type) varstr = 'name='+var.name+" "+str(var.type)
elif isinstance(var,gof.Constant): elif isinstance(var,gof.Constant):
dstr = str(var.data) dstr = 'val='+str(var.data)
if '\n' in dstr: if '\n' in dstr:
dstr = dstr[:dstr.index('\n')] dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30: if len(dstr) > 30:
...@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
else: else:
#a var id is needed as otherwise var with the same type will be merged in the graph. #a var id is needed as otherwise var with the same type will be merged in the graph.
varstr = str(var.type) varstr = str(var.type)
varstr += ' ' + str(len(var_str)) if (varstr in all_strings) or with_ids:
varstr += ' id=' + str(len(var_str))
var_str[var]=varstr var_str[var]=varstr
all_strings.add(varstr)
return varstr return varstr
topo = fct.maker.env.toposort() topo = fct.maker.env.toposort()
apply_name_cache = {}
def apply_name(node): def apply_name(node):
if node in apply_name_cache:
return apply_name_cache[node]
prof_str='' prof_str=''
if mode: if mode:
time = mode.apply_time.get((topo.index(node),node),0) time = mode.apply_time.get((topo.index(node),node),0)
...@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
pf=0 pf=0
else: pf = time*100/mode.fct_call_time[fct] else: pf = time*100/mode.fct_call_time[fct]
prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf) prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf)
return str(node.op).replace(':','_')+' '+str(topo.index(node))+prof_str applystr = str(node.op).replace(':','_')
if (applystr in all_strings) or with_ids:
applystr = applystr+' id='+str(topo.index(node))+prof_str
all_strings.add(applystr)
apply_name_cache[node] = applystr
return applystr
# Update the inputs that have an update function # Update the inputs that have an update function
input_update={} input_update={}
...@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
if i.update is not None: if i.update is not None:
input_update[outputs.pop()] = i input_update[outputs.pop()] = i
apply_shape='ellipse'
var_shape='box'
for node_idx,node in enumerate(topo): for node_idx,node in enumerate(topo):
astr=apply_name(node) astr=apply_name(node)
g.add_node(pd.Node(astr,shape='box')) g.add_node(pd.Node(astr,shape=apply_shape))
for id,var in enumerate(node.inputs): for id,var in enumerate(node.inputs):
varstr=var_name(var) varstr=var_name(var)
label='' label=''
if len(node.inputs)>1: if len(node.inputs)>1:
label=str(id) label=str(id)
if var.owner is None: if var.owner is None:
g.add_node(pd.Node(varstr,color='green')) g.add_node(pd.Node(varstr,color='green',shape=var_shape))
g.add_edge(pd.Edge(varstr,astr, label=label)) g.add_edge(pd.Edge(varstr,astr, label=label))
elif var.name or not compact: elif var.name or not compact:
g.add_edge(pd.Edge(varstr,astr, label=label)) g.add_edge(pd.Edge(varstr,astr, label=label))
...@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
label=str(id) label=str(id)
if out: if out:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='blue')) g.add_node(pd.Node(varstr,color='blue',shape=var_shape))
elif len(var.clients)==0: elif len(var.clients)==0:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='grey')) g.add_node(pd.Node(varstr,color='grey',shape=var_shape))
elif var.name or not compact: elif var.name or not compact:
g.add_edge(pd.Edge(astr, varstr, label=label)) g.add_edge(pd.Edge(astr, varstr, label=label))
# else: # else:
...@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn ...@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
return var_str[var] return var_str[var]
if var.name is not None: if var.name is not None:
varstr = var.name varstr = 'name='+var.name
elif isinstance(var,gof.Constant): elif isinstance(var,gof.Constant):
dstr = str(var.data) dstr = 'val='+str(var.data)
if '\n' in dstr: if '\n' in dstr:
dstr = dstr[:dstr.index('\n')] dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30: if len(dstr) > 30:
......
...@@ -21,8 +21,8 @@ def debug(*msg): ...@@ -21,8 +21,8 @@ def debug(*msg):
# printed and this module will not be working properly (we set `cuda_available` # printed and this module will not be working properly (we set `cuda_available`
# to False). # to False).
# This variable is True by default, and set to False if something goes wrong # This variable is True by default, and set to False if nvcc is not available or
# when trying to initialize cuda. # their is no cuda card or something goes wrong when trying to initialize cuda.
cuda_available = True cuda_available = True
# Global variable to avoid displaying the same warning multiple times. # Global variable to avoid displaying the same warning multiple times.
...@@ -89,6 +89,9 @@ except Exception, e: ...@@ -89,6 +89,9 @@ except Exception, e:
error( "Failed to compile cuda_ndarray.cu: %s" % str(e)) error( "Failed to compile cuda_ndarray.cu: %s" % str(e))
set_cuda_disabled() set_cuda_disabled()
if cuda_available:
cuda_available=device_available()
if cuda_available: if cuda_available:
#check if their is an old cuda_ndarray that was loading instead of the one we compiled! #check if their is an old cuda_ndarray that was loading instead of the one we compiled!
import cuda_ndarray.cuda_ndarray import cuda_ndarray.cuda_ndarray
......
...@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor): ...@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor):
cdata = tuple(map(convert, self.idx_list)) cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1: if len(cdata) == 1:
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata)
# some numpy installations don't expose the __index__() methods for if 0:
# numpy.int8/16/32/64. Casting to python's int instead # JSB: commenting this out because it breaks code and does not look right
start = int(cdata.start) if cdata.start!=None else None # Dumi could you try to run the examples in the DeepLearningBenchmarks
stop = int(cdata.stop) if cdata.stop!=None else None # for example? This logic doesn't seem right -- we just
step = int(cdata.step) if cdata.step!=None else None # cast cdata to a tuple, so it doesn't have a .start field.
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice) # some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
class GpuIncSubtensor(tensor.IncSubtensor): class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
......
...@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out, ...@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out,
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
} }
......
...@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType = ...@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */ CudaNdarray_new, /* tp_new */
}; };
//This fct return True it is able to find a cuda card and query its properti
//Otherwise we return False
PyObject *
device_available(PyObject* _unsed, PyObject * args)
{
int deviceCount;
cudaError err = cudaGetDeviceCount(&deviceCount);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if (deviceCount <= 0) {
Py_RETURN_FALSE;
}
cudaDeviceProp deviceProp;
err=cudaGetDeviceProperties(&deviceProp, 0);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
}
PyObject * PyObject *
CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args) CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args)
{ {
...@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s ...@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
static PyMethodDef module_methods[] = { static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."}, {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"device_available", device_available, METH_VARARGS, "Return Py_True if a cuda card is available."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."}, {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."}, {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"}, {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
......
...@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on ...@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
ret = _params_allgood(ishape, kshape, mode, ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride, subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones) version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
except: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride)
print e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
......
...@@ -11,6 +11,7 @@ if cuda_available: ...@@ -11,6 +11,7 @@ if cuda_available:
import unittest import unittest
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
#TODO: test gpu #TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel} # Done in test_consistency_GPU_{serial,parallel}
...@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt ...@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt
#TODO: make tests work when no flags gived. Now need: THEANO_FLAGS=device=gpu0,floatX=float32 #TODO: make tests work when no flags gived. Now need: THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel} # Partly done, in test_consistency_GPU_{serial,parallel}
#TODO: bug fix test_normal0, in normal() fct, n_samples currently need to be numpy.prod(size) not self.n_streams(size)
mode = config.mode mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
...@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[], ...@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[],
for i in xrange(steps): for i in xrange(steps):
t0 = time.time() t0 = time.time()
ival = f(*inputs) ival = f(*inputs)
assert ival.shape==sample_size
dt += time.time() - t0 dt += time.time() - t0
ival = numpy.asarray(ival) ival = numpy.asarray(ival)
if i == 0: if i == 0:
...@@ -324,7 +325,7 @@ def test_uniform(): ...@@ -324,7 +325,7 @@ def test_uniform():
sample_size = (10,100) sample_size = (10,100)
steps = 50 steps = 50
else: else:
sample_size = (500,100) sample_size = (500,50)
steps = int(1e3) steps = int(1e3)
x = tensor.matrix() x = tensor.matrix()
...@@ -381,9 +382,9 @@ def test_binomial(): ...@@ -381,9 +382,9 @@ def test_binomial():
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (10,50) sample_size = (10,50)
steps = 70 steps = 50
else: else:
sample_size = (500,100) sample_size = (500,50)
steps = int(1e3) steps = int(1e3)
x = tensor.matrix() x = tensor.matrix()
...@@ -430,9 +431,9 @@ def test_normal0(): ...@@ -430,9 +431,9 @@ def test_normal0():
steps = 50 steps = 50
if mode in ['DEBUG_MODE','FAST_COMPILE']: if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (99,100) sample_size = (99,30)
else: else:
sample_size = (999,100) sample_size = (999,50)
print '' print ''
print 'ON CPU:' print 'ON CPU:'
...@@ -464,8 +465,8 @@ def test_normal0(): ...@@ -464,8 +465,8 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10] print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----' print '----'
sys.stdout.flush() sys.stdout.flush()
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True) basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print '' print ''
print 'ON CPU w NUMPY:' print 'ON CPU w NUMPY:'
...@@ -476,7 +477,7 @@ def test_normal0(): ...@@ -476,7 +477,7 @@ def test_normal0():
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True) basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):
dt = 0.0 dt = 0.0
avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX) avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
...@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04): ...@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
for i in xrange(steps): for i in xrange(steps):
t0 = time.time() t0 = time.time()
ival = f() ival = f()
assert ival.shape==sample_size
dt += time.time() - t0 dt += time.time() - t0
#ival = numpy.asarray(ival) #ival = numpy.asarray(ival)
avg_pvals += ival avg_pvals += ival
...@@ -518,7 +520,7 @@ def test_multinomial(): ...@@ -518,7 +520,7 @@ def test_multinomial():
f = theano.function([], m, mode=mode_) f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f) theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ') basic_multinomialtest(f, steps, sample_size, pvals, prefix='mrg ')
sys.stdout.flush() sys.stdout.flush()
...@@ -535,4 +537,4 @@ def test_multinomial(): ...@@ -535,4 +537,4 @@ def test_multinomial():
theano.printing.debugprint(f) theano.printing.debugprint(f)
sys.stdout.flush() sys.stdout.flush()
basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ') basic_multinomialtest(f, steps, sample_size, pvals, prefix='gpu mrg ')
...@@ -302,7 +302,7 @@ class Scalar(Type): ...@@ -302,7 +302,7 @@ class Scalar(Type):
return "" return ""
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) # put const around operators and added unary '-' operator return (8, numpy.__version__) # put const around operators and added unary '-' operator
# no need to put lib.amdlibm here as c_compile_args() are put in the key. # no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (7,) # make complex c code optional return (7,) # make complex c code optional
return (6,) # added implemeentations of operators that work with scalar arguments return (6,) # added implemeentations of operators that work with scalar arguments
...@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp): ...@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp):
return [None] * len(inputs) return [None] * len(inputs)
int_div = IntDiv(upcast_out, name = 'int_div') int_div = IntDiv(upcast_out, name = 'int_div')
floor_div = int_div
class Mod(BinaryScalarOp): class Mod(BinaryScalarOp):
def impl(self, x, y): def impl(self, x, y):
......
...@@ -887,6 +887,11 @@ class _tensor_py_operators: ...@@ -887,6 +887,11 @@ class _tensor_py_operators:
except Exception, e: except Exception, e:
return NotImplemented return NotImplemented
def __truediv__(self,other): return true_div(self, other)
def __floordiv__(self,other): return floor_div(self, other)
def __rtruediv__(self,other): return true_div(other, self)
def __rfloordiv__(self,other): return floor_div(other, self)
# ##### DON"T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY # ##### DON"T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY
# #ARITHMETIC - INPLACE # #ARITHMETIC - INPLACE
# def __iadd__(self,other): return _add_inplace(self,other) # def __iadd__(self,other): return _add_inplace(self,other)
...@@ -2066,6 +2071,11 @@ def true_div(a, b): ...@@ -2066,6 +2071,11 @@ def true_div(a, b):
"""elementwise [true] division (inverse of multiplication)""" """elementwise [true] division (inverse of multiplication)"""
# see decorator for function body # see decorator for function body
@_scal_elemwise
def floor_div(a, b):
"""elementwise [floor] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise @_scal_elemwise
def int_div(a, b): def int_div(a, b):
"""elementwise integer-division""" """elementwise integer-division"""
...@@ -3607,8 +3617,12 @@ class Dot(Op): ...@@ -3607,8 +3617,12 @@ class Dot(Op):
nx = x.type.ndim nx = x.type.ndim
ny = y.type.ndim ny = y.type.ndim
if nx not in (1,2): raise TypeError('not matrix or vector', x) if nx not in (1,2):
if ny not in (1,2): raise TypeError('not matrix or vector', y) raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), x)
if ny not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), y)
if nx == 2 and ny == 2: if nx == 2 and ny == 2:
bz = [x.type.broadcastable[0], y.type.broadcastable[1]] bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
......
...@@ -7,12 +7,13 @@ from theano.configparser import config, AddConfigVar, StrParam ...@@ -7,12 +7,13 @@ from theano.configparser import config, AddConfigVar, StrParam
from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler, from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler,
SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer, SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer,
InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer) InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
from theano.printing import pprint, FunctionPrinter from theano.printing import pprint, FunctionPrinter, debugprint
from theano.compile.mode import optdb from theano.compile.mode import optdb
from theano.gof.python25 import any from theano.gof.python25 import any
import theano.scalar import theano.scalar
import basic as T import basic as T
from theano.tensor.tsor_apply import Apply from theano.tensor.tsor_apply import Apply
#NB: this clobbers the builtin 'compile' symbol #NB: this clobbers the builtin 'compile' symbol
...@@ -28,6 +29,74 @@ def warn(*msg): _logger.warn(' '.join(str(m) for m in msg)) ...@@ -28,6 +29,74 @@ def warn(*msg): _logger.warn(' '.join(str(m) for m in msg))
def warning(*msg): _logger.warning(' '.join(str(m) for m in msg)) def warning(*msg): _logger.warning(' '.join(str(m) for m in msg))
def error(*msg): _logger.error(' '.join(str(m) for m in msg)) def error(*msg): _logger.error(' '.join(str(m) for m in msg))
try:
import scipy.linalg.blas
_have_fblas = True
_blas_gemv_fns = {
numpy.dtype('float32'):scipy.linalg.blas.fblas.sgemv,
numpy.dtype('float64'):scipy.linalg.blas.fblas.dgemv,
numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgemv,
numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgemv,
}
except ImportError, e:
_have_fblas = False
warning('Failed to import scipy.linalg.blas.fblas. Falling back on slower implementations (%s)' % str(e))
class Gemv(Op):
"""
expression is beta * y + alpha * A x
A is matrix
x, y are vectors
alpha, beta are scalars
"""
def __init__(self, inplace):
self.inplace=inplace
if inplace:
self.destroy_map={0:[0]}
def __eq__(self, other):
return type(self)==type(other) and self.inplace == other.inplace
def __str__(self):
if self.inplace:
return 'Gemv{inplace}'
else:
return 'Gemv{no_inplace}'
def __hash__(self):
return hash(type(self)) ^ hash(self.inplace)
def make_node(self, y, alpha, A, x, beta):
y = T.as_tensor_variable(y)
x = T.as_tensor_variable(x)
A = T.as_tensor_variable(A)
alpha = T.as_tensor_variable(alpha)
beta = T.as_tensor_variable(beta)
if y.dtype != A.dtype or y.dtype != x.dtype:
raise TypeError('Gemv requires matching dtypes', (y.dtype, A.dtype, x.dtype))
if A.ndim != 2: raise TypeError('gemv requires matrix for A', A.type)
if x.ndim != 1: raise TypeError('gemv requires vector for x', x.type)
if y.ndim != 1: raise TypeError('gemv requires vector for y', y.type)
if y.broadcastable[0] != A.broadcastable[0]:
raise TypeError('broadcastable mismatch between y and A', (y.type, A.type))
# The following is not grounds for error
# because as long as sizes are 1 at time of perform() there is no problem
#if x.broadcastable[0] != A.broadcastable[1]:
#raise TypeError('broadcastable mismatch between x and A', (x.type, A.type))
return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs
if _have_fblas:
if not self.inplace:
y = y.copy()
gemv = _blas_gemv_fns[y.dtype]
out_storage[0][0] = gemv(alpha, A, x, beta, y, overwrite_y=self.inplace)
else:
out_storage[0][0] = numpy.asarray(
beta * y + alpha * numpy.dot(A, x)
, dtype=y.dtype)
gemv_no_inplace = Gemv(inplace=False)
gemv_inplace = Gemv(inplace=True)
def default_blas_ldflags(): def default_blas_ldflags():
try: try:
return ' '.join( return ' '.join(
...@@ -520,6 +589,9 @@ class Gemm(GemmRelated): ...@@ -520,6 +589,9 @@ class Gemm(GemmRelated):
""" """
def c_code(self, node, name, (_z, _a, _x, _y, _b), (_zout, ), sub): #DEBUG def c_code(self, node, name, (_z, _a, _x, _y, _b), (_zout, ), sub): #DEBUG
if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__)
if not config.blas.ldflags: if not config.blas.ldflags:
return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub) return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
...@@ -571,6 +643,10 @@ def _is_real_matrix(res): ...@@ -571,6 +643,10 @@ def _is_real_matrix(res):
and res.type.ndim == 2 \ and res.type.ndim == 2 \
and res.type.broadcastable[0] == False \ and res.type.broadcastable[0] == False \
and res.type.broadcastable[1] == False #cope with tuple vs. list and res.type.broadcastable[1] == False #cope with tuple vs. list
def _is_real_vector(res):
return res.type.dtype in ('float32', 'float64') \
and res.type.ndim == 1 \
and res.type.broadcastable[0] == False
def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
#print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip #print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip
...@@ -579,9 +655,41 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -579,9 +655,41 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
# we've already checked the client counts, now just make the type check. # we've already checked the client counts, now just make the type check.
####if res_is_a(M, _dot22, 1): ####if res_is_a(M, _dot22, 1):
if M.owner and M.owner.op == _dot22: if M.owner and M.owner.op == _dot22:
if M.broadcastable == L.broadcastable:
Ml, Mr = M.owner.inputs
rval = [gemm_no_inplace(L, alpha, Ml, Mr, beta)]
#print 'GEMM 0', rval, beta, L, alpha, M
return rval
if M.owner and M.owner.op == T.dot\
and L.broadcastable==(False,) \
and M.broadcastable==(False,):
Ml, Mr = M.owner.inputs Ml, Mr = M.owner.inputs
rval = [gemm_no_inplace(L, alpha, Ml, Mr, beta)] rval = None
#print 'GEMM 0', rval, beta, L, alpha, M if Ml.ndim == 1:
if Mr.ndim == 1:
#TODO: insert a BLAS ddot Op
pass
if Mr.ndim == 2:
#print "RETURNING GEMV (case 2)"
if Mr.dtype == Ml.dtype:
rval = [gemv_no_inplace(L, alpha, Mr.T, Ml, beta)]
assert L.type == rval[0].type, (L.type, rval[0].type)
else:
# TODO
pass
if Ml.ndim == 2:
if Mr.ndim == 1:
#print "RETURNING GEMV (case 3)"
if Mr.dtype == Ml.dtype:
rval = [gemv_no_inplace(L, alpha, Ml, Mr, beta)]
assert L.type == rval[0].type, (L.type, rval[0].type)
else:
# TODO
pass
if Mr.ndim == 2:
# should have already got this case with a _dot22
pass
return rval return rval
# this is False'd out because of inadequate testing. # this is False'd out because of inadequate testing.
...@@ -616,7 +724,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -616,7 +724,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
def _gemm_canonicalize(r, scale, rval, maxclients): def _gemm_canonicalize(r, scale, rval, maxclients):
# Tries to interpret node as a sum of scalars * matrices # Tries to interpret node as a sum of scalars * (vectors or matrices)
def scaled(thing): def scaled(thing):
if scale == 1: if scale == 1:
return thing return thing
...@@ -629,7 +737,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -629,7 +737,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
except: except:
return None return None
if (tuple(r.type.broadcastable) != (False, False) or if ((r.type.ndim not in (1, 2)) or
r.type.dtype not in ('float32', 'float64', 'complex64', 'complex128')): r.type.dtype not in ('float32', 'float64', 'complex64', 'complex128')):
rval.append(scaled(r)) rval.append(scaled(r))
return rval return rval
...@@ -651,6 +759,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -651,6 +759,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
elif r.owner and r.owner.op == T.mul: elif r.owner and r.owner.op == T.mul:
scalars = [] scalars = []
vectors = []
matrices = [] matrices = []
for i in r.owner.inputs: for i in r.owner.inputs:
if numpy.all(i.type.broadcastable): if numpy.all(i.type.broadcastable):
...@@ -660,6 +769,8 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -660,6 +769,8 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
scalars.append(i.dimshuffle()) scalars.append(i.dimshuffle())
else: else:
scalars.append(i) scalars.append(i)
elif _is_real_vector(i):
vectors.append(i)
elif _is_real_matrix(i): elif _is_real_matrix(i):
matrices.append(i) matrices.append(i)
else: else:
...@@ -667,6 +778,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -667,6 +778,7 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
rval.append((scale,r)) rval.append((scale,r))
return rval return rval
if len(matrices)==1: if len(matrices)==1:
assert len(vectors)==0
m = matrices[0] m = matrices[0]
if len(scalars) == 0: if len(scalars) == 0:
_gemm_canonicalize(m, scale, rval, 1) _gemm_canonicalize(m, scale, rval, 1)
...@@ -674,7 +786,16 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -674,7 +786,16 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
_gemm_canonicalize(m, scaled(scalars[0]), rval, 1) _gemm_canonicalize(m, scaled(scalars[0]), rval, 1)
else: else:
_gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1) _gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1)
else: #there are many matrices... lets not open this up elif len(vectors)==1:
assert len(matrices)==0
v = vectors[0]
if len(scalars) == 0:
_gemm_canonicalize(v, scale, rval, 1)
elif len(scalars) == 1:
_gemm_canonicalize(v, scaled(scalars[0]), rval, 1)
else:
_gemm_canonicalize(v, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1)
else: #lets not open this up
rval.append((scale,r)) rval.append((scale,r))
else: else:
rval.append((scale,r)) rval.append((scale,r))
...@@ -735,8 +856,8 @@ def _gemm_from_factored_list(lst): ...@@ -735,8 +856,8 @@ def _gemm_from_factored_list(lst):
#print 'TRYING', (s_i, M_i, s_j, M_j) #print 'TRYING', (s_i, M_i, s_j, M_j)
gemm_of_sM_list = _beta_L_plus_alpha_M(s_i, M_i, s_j, M_j) gemm_of_sM_list = _beta_L_plus_alpha_M(s_i, M_i, s_j, M_j)
#print 'GOT IT', gemm_of_sM_list
if gemm_of_sM_list: if gemm_of_sM_list:
#print 'GOT IT', gemm_of_sM_list
def item_to_var(t): def item_to_var(t):
try: s,M = t try: s,M = t
except: return t except: return t
...@@ -749,9 +870,11 @@ def _gemm_from_factored_list(lst): ...@@ -749,9 +870,11 @@ def _gemm_from_factored_list(lst):
for k, input in enumerate(lst) if k not in (i,j)] for k, input in enumerate(lst) if k not in (i,j)]
add_inputs.extend(gemm_of_sM_list) add_inputs.extend(gemm_of_sM_list)
if len(add_inputs) > 1: if len(add_inputs) > 1:
return [T.add(*add_inputs)] rval = [T.add(*add_inputs)]
else: else:
return add_inputs rval = add_inputs
#print "RETURNING GEMM THIGN", rval
return rval
def _gemm_from_node2(node): def _gemm_from_node2(node):
""" """
...@@ -762,9 +885,13 @@ def _gemm_from_node2(node): ...@@ -762,9 +885,13 @@ def _gemm_from_node2(node):
""" """
lst = [] lst = []
_gemm_canonicalize(node.outputs[0], 1.0, lst, 0) _gemm_canonicalize(node.outputs[0], 1.0, lst, 0)
#print "GEMM CANON", lst
if len(lst) > 1: if len(lst) > 1:
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
#print "RVAL", rval
if rval:
assert rval[0].type == node.outputs[0].type, (rval[0].type, node.outputs[0].type)
return rval return rval
class GemmOptimizer(Optimizer): class GemmOptimizer(Optimizer):
...@@ -783,7 +910,6 @@ class GemmOptimizer(Optimizer): ...@@ -783,7 +910,6 @@ class GemmOptimizer(Optimizer):
did_something = False did_something = False
nodelist.reverse() nodelist.reverse()
for node in nodelist: for node in nodelist:
#new_outputs = _gemm_from_node(node)
try: try:
new_outputs = _gemm_from_node2(node) new_outputs = _gemm_from_node2(node)
except InconsistencyError, e: except InconsistencyError, e:
...@@ -805,13 +931,13 @@ class Dot22(GemmRelated): ...@@ -805,13 +931,13 @@ class Dot22(GemmRelated):
This is a specialization of the more general Dot() This is a specialization of the more general Dot()
""" """
def make_node(self, x, y): def make_node(self, x, y):
if not _is_real_matrix(x): if x.type.ndim != 2 or x.type.dtype not in ('float32', 'float64'):
raise TypeError(x) raise TypeError(x)
if not _is_real_matrix(x): if y.type.ndim != 2 or y.type.dtype not in ('float32', 'float64'):
raise TypeError(y) raise TypeError(y)
if y.type.dtype != x.type.dtype: if y.type.dtype != x.type.dtype:
raise TypeError('dtype mismatch to Dot22') raise TypeError('dtype mismatch to Dot22')
bz = [False, False] bz = (x.type.broadcastable[0], y.type.broadcastable[1])
outputs = [T.tensor(x.type.dtype, bz)] outputs = [T.tensor(x.type.dtype, bz)]
return Apply(self, [x,y], outputs) return Apply(self, [x,y], outputs)
...@@ -855,6 +981,9 @@ class Dot22(GemmRelated): ...@@ -855,6 +981,9 @@ class Dot22(GemmRelated):
double b = 0.0; double b = 0.0;
""" """
def c_code(self, node, name, (_x, _y), (_zout, ), sub): #DEBUG def c_code(self, node, name, (_x, _y), (_zout, ), sub): #DEBUG
if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__)
if len(self.c_libraries())<=0: if len(self.c_libraries())<=0:
return super(Dot22, self).c_code(node, name, (_x, _y), (_zout, ), sub) return super(Dot22, self).c_code(node, name, (_x, _y), (_zout, ), sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
...@@ -870,19 +999,35 @@ _dot22 = Dot22() ...@@ -870,19 +999,35 @@ _dot22 = Dot22()
@local_optimizer([T.dot]) @local_optimizer([T.dot])
def local_dot_to_dot22(node): def local_dot_to_dot22(node):
if node.op == T.dot: if node.op != T.dot:
x,y = node.inputs return
if _is_real_matrix(x) and _is_real_matrix(y) and y.type.dtype == x.type.dtype:
x,y = node.inputs
if y.type.dtype != x.type.dtype:
# TODO: upcast one so the types match
info('Not optimizing dot with inputs', x, y, x.type, y.type)
return
if y.type.dtype.startswith('float'):
if _is_real_matrix(x) and _is_real_matrix(y):
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
else: if 0:
info('Not optimizing dot with inputs', x, y, x.type, y.type) if _is_real_matrix(x) and _is_real_vector(y):
else: return [_dot22(x, y.dimshuffle(0,'x')).dimshuffle(0)]
return False if _is_real_vector(x) and _is_real_matrix(y):
return [_dot22(x.dimshuffle('x',0), y).dimshuffle(1)]
if _is_real_vector(x) and _is_real_vector(x):
return [_dot22(x.dimshuffle('x',0), y.dimshuffle(0,'x')).dimshuffle()]
info('Not optimizing dot with inputs', x, y, x.type, y.type)
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
def local_inplace_gemm(node): def local_inplace_gemm(node):
if node.op == gemm_no_inplace: if node.op == gemm_no_inplace:
return [gemm_inplace(*node.inputs)] return [gemm_inplace(*node.inputs)]
@local_optimizer([gemv_no_inplace])
def local_inplace_gemv(node):
if node.op == gemv_no_inplace:
return [gemv_inplace(*node.inputs)]
################################# #################################
# #
...@@ -906,7 +1051,7 @@ blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run') ...@@ -906,7 +1051,7 @@ blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
# Try to make gemm inplace # Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71) # Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
optdb.register('InplaceBlasOpt', optdb.register('InplaceBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace, EquilibriumOptimizer([local_inplace_gemm, local_inplace_gemv], failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace') 70.0, 'fast_run', 'inplace')
...@@ -1048,3 +1193,10 @@ blas_optdb.register('local_dot22_to_dot22scalar', ...@@ -1048,3 +1193,10 @@ blas_optdb.register('local_dot22_to_dot22scalar',
11, 'fast_run') 11, 'fast_run')
from opt import register_specialize, register_canonicalize
#@register_specialize
@local_optimizer([])
def local_print_as_we_go_along(node):
if node.op in (T.sub, T.add):
debugprint(node)
...@@ -397,41 +397,41 @@ def local_softmax_with_bias(node): ...@@ -397,41 +397,41 @@ def local_softmax_with_bias(node):
return return
return [sm_bias] return [sm_bias]
if 0: def softmax_simplifier(numerators, denominators):
def softmax_simplifier(numerators, denominators): for numerator in list(numerators):
for numerator in list(numerators): #TODO: a single softmax'd vector??
#TODO: a single softmax'd vector?? if not numerator.type.dtype.startswith('float'):
if not numerator.type.dtype.startswith('float'): continue
continue
if not numerator.type.broadcastable == (False, False):
if not numerator.type.broadcastable == (False, False): continue
continue if numerator.owner and numerator.owner.op == tensor.exp:
if numerator.owner and numerator.owner.op == tensor.exp: x = numerator.owner.inputs[0]
x = numerator.owner.inputs[0] else:
else: continue
continue
matching_denom = None
matching_denom = None
for denominator in denominators:
for denominator in denominators: if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle): if denominator.owner.op.new_order == (0,'x'):
if denominator.owner.op.new_order == (0,'x'): z = denominator.owner.inputs[0] # thing getting dimshuffled
z = denominator.owner.inputs[0] # thing getting dimshuffled if z.owner and isinstance(z.owner.op, tensor.Sum):
if z.owner and isinstance(z.owner.op, tensor.Sum): #print 'ASDF', denominator.owner.op.new_order
#print 'ASDF', denominator.owner.op.new_order #print z.owner.op.axis
#print z.owner.op.axis if z.owner.op.axis == (1,):
if z.owner.op.axis == (1,): #print "almost there.. softmax", x, z.owner.inputs[0]
#print "almost there.. softmax", x, z.owner.inputs[0] if z.owner.inputs[0] is numerator:
if z.owner.inputs[0] is numerator: matching_denom = denominator
matching_denom = denominator break
break if matching_denom:
if matching_denom: numerators.remove(numerator)
numerators.remove(numerator) denominators.remove(matching_denom)
denominators.remove(matching_denom) numerators.append(softmax(x))
numerators.append(softmax(x)) return numerators, denominators
return numerators, denominators opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
if 0:
def softmax_grad_simplifier(numerators, denominators): def softmax_grad_simplifier(numerators, denominators):
print "mul simplify numerators" print "mul simplify numerators"
printing.debugprint(numerators) printing.debugprint(numerators)
......
...@@ -8,7 +8,7 @@ from theano import gof ...@@ -8,7 +8,7 @@ from theano import gof
from theano import scalar from theano import scalar
from theano import printing from theano import printing
from theano.tensor import basic as tensor from theano.tensor import basic as tensor
from theano.printing import pprint from theano.printing import pprint, debugprint
from theano.tensor import elemwise from theano.tensor import elemwise
from theano.tensor import opt from theano.tensor import opt
from theano.compile import optdb from theano.compile import optdb
...@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus') ...@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus')) pprint.assign(softplus, printing.FunctionPrinter('softplus'))
def _skip_mul_1(r):
if r.owner and r.owner.op == tensor.mul:
not_is_1 = [i for i in r.owner.inputs if not _is_1(i) ]
if len(not_is_1)==1:
return not_is_1[0]
logsigm_to_softplus = gof.PatternSub( logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')), (tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))), (tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True) allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
def _is_1(expr): def _is_1(expr):
"""rtype bool. True iff expr is a constant close to 1 """rtype bool. True iff expr is a constant close to 1
...@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub( ...@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub(
dict(pattern='y', constraint = _is_1), dict(pattern='y', constraint = _is_1),
(sigmoid, 'x'))), (sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')), (tensor.neg, (softplus, 'x')),
allow_multiple_clients = True) allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
log1pexp_to_softplus = gof.PatternSub( log1pexp_to_softplus = gof.PatternSub(
(tensor.log1p, (tensor.log1p,
...@@ -329,3 +337,48 @@ register_local_1msigmoid = False ...@@ -329,3 +337,48 @@ register_local_1msigmoid = False
if register_local_1msigmoid: if register_local_1msigmoid:
opt.register_canonicalize(local_1msigmoid) opt.register_canonicalize(local_1msigmoid)
if 0:
# This code is if'd out because it is not complete,
# and it isn't obviously a good idea anyway.
# The motivation here was to identify the last exp() node
# in the SciPy2010 article, which was not optimized away at the time of publication,
# so the example is actually not numerically stable, even though it should be.
@opt.register_stabilize
@gof.local_optimizer([tensor.mul])
def local_sigm_gest(node):
print "CANONICALIZE"
print sigm_canonicalize(node)
def sigm_canonicalize(node):
add = tensor.add
mul = tensor.mul
div = tensor.true_div
if node.op == tensor.add:
rval = []
for i in node.inputs:
rval += sigm_canonicalize(i)
return rval
if node.op == tensor.mul:
rval = sigm_canonicalize(node.inputs[0])
for i in node.inputs[1:]:
old_rval = rval
rval = []
for t1 in sigm_canonicalize(i):
for t0 in old_rval:
assert t1.owner.op == div
assert t0.owner.op == div
t0top, t0bot = t0.owner.inputs
t1top, t1bot = t1.owner.inputs
rval.append(div(mul(*(t0top+t1top)), mul(*(t0bot+t1bot))))
if len(rval) > 100:
# This loop can be exponentially long.
# aborting
return []
elif len(node.outputs)>1:
return []
else:
return [node.outputs[0]]
...@@ -924,6 +924,10 @@ class Test_softmax_opt(): ...@@ -924,6 +924,10 @@ class Test_softmax_opt():
assert softmax in f_ops assert softmax in f_ops
f(self.rng.rand(3,4)) f(self.rng.rand(3,4))
def test_grad(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
# test that function contains softmax and no div. # test that function contains softmax and no div.
w = T.matrix() w = T.matrix()
g = theano.function([c,w],T.grad((p_y*w).sum(), c)) g = theano.function([c,w],T.grad((p_y*w).sum(), c))
......
...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs): ...@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags) compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt return lopt
def register_specialize_device(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_stabilize(lopt, *tags, **kwargs): def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__ name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags) compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
...@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node): ...@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node):
register_canonicalize(local_dimshuffle_lift) register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift) register_specialize(local_dimshuffle_lift)
@register_canonicalize
@gof.local_optimizer([])
def local_dimshuffle_no_inplace_at_canonicalize(node):
if isinstance(node.op, T.DimShuffle) and node.op.inplace:
return [T.DimShuffle(node.op.input_broadcastable, node.op.new_order, inplace=False)(node.inputs[0])]
##################################### #####################################
...@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node): ...@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node):
@register_canonicalize @register_canonicalize
@gof.local_optimizer([]) @gof.local_optimizer([])
def local_sum_div_dimshuffle(node): def local_sum_div_dimshuffle(node):
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis=l) / b, '''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
if dimension l of the DimShuffle is 'x'.''' if dimension l of the DimShuffle is 'x'.'''
# TODO: extend it to product, and quotient of products # TODO: extend it to product, and quotient of products
if isinstance(node.op, T.Sum): if isinstance(node.op, T.Sum):
axis = node.op.axis axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis #print 'axis =', axis
thing_summed = node.inputs[0] thing_summed = node.inputs[0]
dimshuffled = None dimshuffled = None
if thing_summed.owner and thing_summed.owner.op == T.true_div: if thing_summed.owner and thing_summed.owner.op == T.true_div:
numerator, denominator = thing_summed.owner.inputs numerator, denominator = thing_summed.owner.inputs
if isinstance(numerator.owner.op, T.DimShuffle): if numerator.owner and isinstance(numerator.owner.op, T.DimShuffle):
new_order = numerator.owner.op.new_order new_order = numerator.owner.op.new_order
#print 'new_order =', new_order #print 'new_order =', new_order
# check compatibility # check compatibility
...@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node): ...@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node):
#else: #else:
# print 'incompatible dims:', axis, new_order # print 'incompatible dims:', axis, new_order
if isinstance(denominator.owner.op, T.DimShuffle): if denominator.owner and isinstance(denominator.owner.op, T.DimShuffle):
new_order = denominator.owner.op.new_order new_order = denominator.owner.op.new_order
#print 'new_order =', new_order #print 'new_order =', new_order
# check compatibility # check compatibility
...@@ -1827,9 +1839,31 @@ def local_pow_specialize(node): ...@@ -1827,9 +1839,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)] rval = [T.inv(xsym)]
if N.all(y == -2): if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))] rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE] @register_specialize_device
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu! @gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all device. We do it only on cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym] pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()] pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y) y_to_do = abs(y)
...@@ -1859,14 +1893,7 @@ def local_pow_specialize(node): ...@@ -1859,14 +1893,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype) rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs) assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
@gof.local_optimizer([T.mul]) @gof.local_optimizer([T.mul])
def local_mul_specialize(node): def local_mul_specialize(node):
"""Remove special-case constants from mul arguments """Remove special-case constants from mul arguments
...@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize) ...@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize)
mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink)) mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink))
def check_for_x_over_absX(numerators, denominators): def check_for_x_over_absX(numerators, denominators):
"""Convert x/abs(x) into sign(x). """
# TODO: this function should dig/search through dimshuffles # TODO: this function should dig/search through dimshuffles
# This won't catch a dimshuffled absolute value # This won't catch a dimshuffled absolute value
for den in list(denominators): for den in list(denominators):
if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators: if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators:
denominators.remove(den) if den.owner.inputs[0].type.dtype.startswith('complex'):
numerators.remove(den.owner.inputs[0]) #TODO: Make an Op that projects a complex number to have unit length
numerators.append(T.sgn(den.owner.inputs[0])) # but projects 0 to 0. That would be a weird Op, but consistent with the
# special case below. I heard there's some convention in Matlab that is
# similar to this... but not sure.
pass
else:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
return numerators, denominators return numerators, denominators
local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest') local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest')
@register_stabilize @register_stabilize
@gof.local_optimizer([T.log]) @gof.local_optimizer([T.log])
def local_log1p(node): def local_log1p(node):
# log(1+exp(x)) -> log1p(x) # log(1+x) -> log1p(x)
if node.op == T.log: if node.op == T.log:
log_arg, = node.inputs log_arg, = node.inputs
if log_arg.owner and log_arg.owner.op == T.add: if log_arg.owner and log_arg.owner.op == T.add:
...@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP):
""" """
def local_fuse(node): def local_fuse(node):
""" """
As part of specialisation, we fusion two consecutif elemwise op of the same shape. As part of specialisation, we fuse two consecutive elemwise op of the same shape.
For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast. For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast.
The number of dimension is validated at call time by theano itself. The number of dimension is validated at call time by theano itself.
...@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP):
for i in node.inputs: for i in node.inputs:
do_fusion = False do_fusion = False
catch = False catch = False
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1: if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1:
#if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops. #if the scalar_op don't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True do_fusion=True
try: try:
...@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP): ...@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP):
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function. # There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much. # Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
if True: if OP != T.Elemwise:
argument_limit = 240 # 16 bytes are used for block and thread coords etc. argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8 #TODO: read in from architecture to make this 4 or 8
int_size = 8 int_size = 8
......
...@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase): ...@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase):
#utt.verify_grad(dot, [self.rand(), self.rand(2)]) #utt.verify_grad(dot, [self.rand(), self.rand(2)])
#utt.verify_grad(dot, [self.rand(), self.rand(2,5)]) #utt.verify_grad(dot, [self.rand(), self.rand(2,5)])
def test_broadcastable_patterns(self):
    """Exercise dot() over every dtype pair and 1-d/2-d broadcastable
    pattern.

    All inputs have size 1 in every dimension, so regardless of the
    broadcastable flags the products are well defined and the compiled
    function must run (debugmode validates the numerical result).
    Complex dtype combinations are skipped and recorded in `failures`.
    """
    def val_for(r):
        # Build a size-1 numpy value matching r's ndim and dtype.
        if r.ndim == 0:
            return numpy.asarray(1.1, dtype=r.dtype)
        if r.ndim == 1:
            return numpy.asarray([1.2], dtype=r.dtype)
        elif r.ndim == 2:
            return numpy.asarray([[1.3]], dtype=r.dtype)
        raise ValueError()

    dtypes = ('float32', 'float64', 'complex64', 'complex128')
    patterns = ((True,), (False,), (True, True), (True, False),
                (False, True), (False, False))
    failures = []
    for dtype0 in dtypes:
        for dtype1 in dtypes:
            for bc0 in patterns:
                for bc1 in patterns:
                    x = TensorType(dtype=dtype0, broadcastable=bc0)()
                    y = TensorType(dtype=dtype1, broadcastable=bc1)()
                    z = dot(x, y)
                    t = TensorType(dtype=dtype0,
                                   broadcastable=z.broadcastable)()
                    rval = z * 3 + 2 * t
                    if rval.type.dtype.startswith('complex'):
                        # There is a problem with complex numbers right
                        # now: Elemwise code doesn't compile when both
                        # precisions of complex numbers are used in the
                        # same file because the operators aren't
                        # declared properly.
                        failures.append((dtype0, dtype1, bc0, bc1))
                        continue
                    f = function([x, y, t], rval)
                    f(val_for(x), val_for(y), val_for(t))  # debugmode checks result
    #if failures:
    #    print failures
    assert not failures
class T_tensorfromscalar(unittest.TestCase): class T_tensorfromscalar(unittest.TestCase):
def test0(self): def test0(self):
s = scal.constant(56) s = scal.constant(56)
......
...@@ -416,7 +416,7 @@ def test_gemm_canonicalize(): ...@@ -416,7 +416,7 @@ def test_gemm_canonicalize():
can = [] can = []
_gemm_canonicalize(X + Y + u, 1.0, can, 0) _gemm_canonicalize(X + Y + u, 1.0, can, 0)
assert can == [(1.0, X), (1.0, Y), u] assert can == [(1.0, X), (1.0, Y), u], can
can = [] can = []
_gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0) _gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0)
......
...@@ -14,7 +14,6 @@ from theano.gof import Env ...@@ -14,7 +14,6 @@ from theano.gof import Env
from theano.tensor.elemwise import DimShuffle from theano.tensor.elemwise import DimShuffle
from theano import pprint, shared from theano import pprint, shared
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import scalar as scal
from theano import function, compile from theano import function, compile
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -467,6 +466,22 @@ class test_canonize(unittest.TestCase): ...@@ -467,6 +466,22 @@ class test_canonize(unittest.TestCase):
topo=f.maker.env.toposort() topo=f.maker.env.toposort()
assert len(topo)==0 assert len(topo)==0
assert(out_dtype==out.dtype) assert(out_dtype==out.dtype)
#test x / abs(x) -> sign(x)
# NOTE(review): fragment of a larger test method (enclosing def not visible
# here). dx/fx/dv/fv and dxv/fxv/dvv/fvv are presumably symbolic variables
# and their numeric values defined earlier in the method — verify in context.
# Each case: (graph, symbolic inputs, numeric inputs, expected output dtype).
for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
(dx/abs(dx),[dx],[0.5-dxv],'float64'),
(fx/abs(fx),[fx],[0.5-fxv],'float32'),
(dx/abs(dx),[dx],[0.0*dxv],'float64'),
(fx/abs(fx),[fx],[0.0*fxv],'float32'),
(dv/abs(dv),[dv],[0.5-dvv],'float64'),
(fv/abs(fv),[fv],[0.5-fvv],'float32'),
]):
f = compile.function(list(sym_inputs), g,
mode=mode)
out = f(*val_inputs)
# The optimizer should have rewritten x/abs(x) into sign(x): output must
# be finite (no 0/0 NaN) and match numpy.sign of the input.
assert numpy.all(numpy.isfinite(out))
assert numpy.allclose(out,numpy.sign(val_inputs[0]))
assert(out_dtype==out.dtype)
finally: finally:
mode._optimizer = old_optimizer mode._optimizer = old_optimizer
...@@ -599,34 +614,34 @@ class test_fusion(unittest.TestCase): ...@@ -599,34 +614,34 @@ class test_fusion(unittest.TestCase):
izv = theano._asarray(my_init(shp,num=70),dtype='int32') izv = theano._asarray(my_init(shp,num=70),dtype='int32')
fwx=fw+fx fwx=fw+fx
cases = [ cases = [
(fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#1 (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#0
(fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'), (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),#1
(fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'), (fx+fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv,'float32'),
(fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'), (fx*fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv,'float32'),
(fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5 (fw+fx+fy+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#5
(((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), (((fw+fx)+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+(fx+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+(fx+fy)+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
(fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10 (fw+(fx+(fy+fz)),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),
((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'), ((fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv,'float32'),#10
(fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'), (fw*fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv*fxv*fyv*fzv,'float32'),
(fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'), (fw+fx*fy*fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv*fyv*fzv,'float32'),
(fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'), (fx+fy*fz*fx,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv*fzv*fxv,'float32'),
(fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),#15 (fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),
(fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),2,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),#expect 2 as their is limit to the fusion on the gpu. (fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),#15
#test with constant #test with constant
((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), ((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20 ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
(fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
(2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'), (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
#mix float32 and float64 #mix float32 and float64
(2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'), (2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
(2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'), (2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
(2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),#25 (2+(fw+fx)+(dw+fz),(fw,fx,dw,fz),(fwv,fxv,dwv,fzv),1,fwv+fxv+dwv+fzv+2,'float64'),
(2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'), (2+(fw+fx)+(fy+dw),(fw,fx,fy,dw),(fwv,fxv,fyv,dwv),1,fwv+fxv+fyv+dwv+2,'float64'),#25
#test when their is other op then elemwise. #test when their is other op then elemwise.
#the good output for the next test. #the good output for the next test.
# (Pdb) p f.maker.env.toposort() # (Pdb) p f.maker.env.toposort()
...@@ -642,33 +657,33 @@ class test_fusion(unittest.TestCase): ...@@ -642,33 +657,33 @@ class test_fusion(unittest.TestCase):
#test other elemwise op #test other elemwise op
(fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'), (fx+fy+cos(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cos(fzv),'float32'),
(fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'), (fx+fy+cosh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.cosh(fzv),'float32'),
(fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),#30 (fx+fy+abs(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.absolute(fzv),'float32'),
(ix+iy+abs(iz),(ix,iy,iz),(ixv,iyv,izv),1,ixv+iyv+numpy.absolute(izv),'int32'), (ix+iy+abs(iz),(ix,iy,iz),(ixv,iyv,izv),1,ixv+iyv+numpy.absolute(izv),'int32'),#30
(fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'), (fx+fy+theano.tensor.log(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log(fzv),'float32'),
(fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'), (fx+fy+theano.tensor.log2(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log2(fzv),'float32'),
(fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'), (fx+fy+theano.tensor.log10(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.log10(fzv),'float32'),
(fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow #35 (fx+fy**fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv**fzv,'float32'),#pow
(fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'), (fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),#35
(fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'), (fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
(fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'), (fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
(fx-theano.tensor.true_div(fy,2),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/2),'float32'), (fx-theano.tensor.true_div(fy,2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
(fx-theano.tensor.true_div(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),#40 (fx-theano.tensor.true_div(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
(fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)),'float64'),#int32 - float32 = float64 #No c_code for int_div (fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)),'float64'),#int32 - float32 = float64 #No c_code for int_div#40
(fx-(fy/2),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/2),'float32'), (fx-(fy/2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
(fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'), (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),
(fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'), (fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
(fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),#45 (fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
(fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'), (fx-(fy<fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<fzv),'float32'),#45
(fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),#TODO: bugged on the gpu (fx-(fy<=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv<=fzv),'float32'),
(fx-(fy==fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),#TODO: bugged (fx-T.eq(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv==fzv),'float32'),
(fx-(fy!=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'), (fx-T.neq(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv!=fzv),'float32'),
(fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),#50 (fx-fy+tan(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tan(fzv),'float32'),
(fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'), (fx-fy+tanh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.tanh(fzv),'float32'),#50
(fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'), (fx-fy+sin(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sin(fzv),'float32'),
(fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'), (fx-fy+sinh(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sinh(fzv),'float32'),
(fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'), (fx-fy+theano.tensor.sqr(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(fzv*fzv),'float32'),
(fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),#55 (fx-fy+theano.tensor.sqrt(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.sqrt(fzv),'float32'),
(fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'), (fx-fy+theano.tensor.inv(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(1/fzv),'float32'),#55
(fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'), (fx-fy+theano.tensor.neg(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+(-fzv),'float32'),
# (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this? # (fx-fy+theano.tensor.iround(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),#TODO: trouble with the output type. To my understanding, numpy and c round fct return the same type as the input. Why we don't do this?
...@@ -714,10 +729,9 @@ class test_fusion(unittest.TestCase): ...@@ -714,10 +729,9 @@ class test_fusion(unittest.TestCase):
fail1.append(id) fail1.append(id)
topo=f.maker.env.toposort() topo=f.maker.env.toposort()
if gpu: if gpu:
import theano_cuda_ndarray as tcn import theano.sandbox.cuda as cuda
topo_ = [x for x in topo if not isinstance(x.op,cuda.basic_ops.GpuFromHost) and not isinstance(x.op,cuda.basic_ops.HostFromGpu)]
topo_ = [x for x in topo if not isinstance(x.op,tcn.basic_ops.GpuFromHost)] gpu_ = [x for x in topo if isinstance(x.op,cuda.basic_ops.GpuFromHost)]
gpu_ = [x for x in topo if isinstance(x.op,tcn.basic_ops.GpuFromHost)]
if not len(gpu_)==len(sym_inputs): if not len(gpu_)==len(sym_inputs):
fail2.append((id,gpu_,sym_inputs)) fail2.append((id,gpu_,sym_inputs))
else: topo_=topo else: topo_=topo
...@@ -727,9 +741,6 @@ class test_fusion(unittest.TestCase): ...@@ -727,9 +741,6 @@ class test_fusion(unittest.TestCase):
if not out_dtype==out.dtype: if not out_dtype==out.dtype:
fail4.append((id,out_dtype,out.dtype)) fail4.append((id,out_dtype,out.dtype))
# cases[id]=None #to remove g, that link to out that link to the ndarray!
#g.owner.inputs[0] is out... make owner a weakref?
failed=len(fail1+fail2+fail3+fail4) failed=len(fail1+fail2+fail3+fail4)
print "Executed",len(cases),"cases", "failed", failed print "Executed",len(cases),"cases", "failed", failed
if failed>0: if failed>0:
...@@ -751,8 +762,9 @@ class test_fusion(unittest.TestCase): ...@@ -751,8 +762,9 @@ class test_fusion(unittest.TestCase):
mode=compile.mode.predefined_modes['FAST_COMPILE'] mode=compile.mode.predefined_modes['FAST_COMPILE']
mode=compile.mode.predefined_modes['FAST_RUN'] mode=compile.mode.predefined_modes['FAST_RUN']
mode=compile.mode.predefined_modes['DEBUG_MODE'] mode=compile.mode.predefined_modes['DEBUG_MODE']
import theano.sandbox.cuda as tcn mode = theano.compile.mode.get_mode(mode).including('gpu')
self.do(mode, tcn.shared_constructor, shp, gpu=True) import theano.sandbox.cuda as cuda
self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
def speed_fusion(self, shared_fn = shared, gpu = False, s=None): def speed_fusion(self, shared_fn = shared, gpu = False, s=None):
""" """
...@@ -788,8 +800,8 @@ class test_fusion(unittest.TestCase): ...@@ -788,8 +800,8 @@ class test_fusion(unittest.TestCase):
print "min", d.min(), "argmin", d.argmin(), "max", d.max(), "mean", d.mean(), "std", d.std() print "min", d.min(), "argmin", d.argmin(), "max", d.max(), "mean", d.mean(), "std", d.std()
def speed_fusion_gpu(self): def speed_fusion_gpu(self):
import theano_cuda_ndarray as tcn import theano.sandbox.cuda as cuda
self.speed_fusion(shared_fn=tcn.shared_constructor, gpu=True, s=slice(0,15)) self.speed_fusion(shared_fn=tcn.float32_shared_constructor, gpu=True, s=slice(0,15))
def speed_log_exp(self): def speed_log_exp(self):
s=slice(31,36) s=slice(31,36)
...@@ -1260,6 +1272,7 @@ def test_local_pow_specialize(): ...@@ -1260,6 +1272,7 @@ def test_local_pow_specialize():
v = T.vector() v = T.vector()
val = numpy.arange(10,dtype=theano.config.floatX) val = numpy.arange(10,dtype=theano.config.floatX)
val_no0 = numpy.arange(1,10,dtype=theano.config.floatX) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
f = function([v], v**0, mode=mode) f = function([v], v**0, mode=mode)
nodes = [node.op for node in f.maker.env.toposort()] nodes = [node.op for node in f.maker.env.toposort()]
assert nodes == [Shape_i(0), T.alloc] assert nodes == [Shape_i(0), T.alloc]
...@@ -1301,33 +1314,44 @@ def test_local_pow_specialize(): ...@@ -1301,33 +1314,44 @@ def test_local_pow_specialize():
# assert nodes == [T.sqrt,T.inv]#Why this don't work? # assert nodes == [T.sqrt,T.inv]#Why this don't work?
assert numpy.allclose(f(val_no0),val_no0**(-.5)) assert numpy.allclose(f(val_no0),val_no0**(-.5))
if config.experimental.pow: def test_local_pow_specialize_device():
print "Test experimental.pow=True"
f = function([v], v**(15), mode=mode) # test that on cpu we use more agressive optimization
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==1 mode = theano.config.mode
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) if mode == 'FAST_COMPILE':
assert numpy.allclose(f(val),val**15) mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode)
f = function([v], v**(-15), mode=mode) mode = mode.excluding('fusion').excluding('gpu')
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes)==2 v = T.vector()
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) val = numpy.arange(10,dtype=theano.config.floatX)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv) val_no0 = numpy.arange(1,10,dtype=theano.config.floatX)
assert numpy.allclose(f(val_no0),val_no0**(-15)) f = function([v], v**(15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(16), mode=mode) assert len(nodes)==1
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 1 assert numpy.allclose(f(val),val**15)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16) f = function([v], v**(-15), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
f = function([v], v**(-16), mode=mode) assert len(nodes)==2
nodes = [node.op for node in f.maker.env.toposort()] assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert len(nodes) == 2 assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite) assert numpy.allclose(f(val_no0),val_no0**(-15))
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16)) f = function([v], v**(16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 1
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert numpy.allclose(f(val),val**16)
f = function([v], v**(-16), mode=mode)
nodes = [node.op for node in f.maker.env.toposort()]
assert len(nodes) == 2
assert isinstance(nodes[0].scalar_op,theano.scalar.Composite)
assert isinstance(nodes[-1].scalar_op,theano.scalar.basic.Inv)
assert numpy.allclose(f(val_no0),val_no0**(-16))
class T_Rebroadcast(unittest.TestCase): class T_Rebroadcast(unittest.TestCase):
......
from nose.plugins.skip import SkipTest
import unittest
import theano
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
'''
Miscellaneous tests that are not tied to any particular Op or other Theano
functionality. For example, code that we publish in papers goes here, so we
can ensure that it remains operational.
'''
class T_diverse(unittest.TestCase):
    """Smoke tests for example code published in papers.

    These examples are compiled and run to make sure the published
    snippets stay operational.
    """

    def setUp(self):
        # Seed the shared RNGs so the examples are reproducible.
        utt.seed_rng()

    def scipy_paper_example1(self):
        """Compile and evaluate b = a + a**10 (first SciPy-paper example)."""
        a = theano.tensor.vector('a')      # declare variable
        b = a + a ** 10                    # build expression
        f = theano.function([a], b)        # compile function
        assert numpy.all(f([0, 1, 2]) == numpy.array([0, 2, 1026]))

    def scipy_papaer_example2(self):
        """This just sees if things compile well and if they run."""
        # NOTE(review): method name typo ("papaer") kept so existing
        # callers/test runners keep finding it.
        # Bind the names the snippet uses; the original referenced
        # T/function/shared/rng without defining them (NameError).
        T = theano.tensor
        function = theano.function
        shared = theano.shared
        rng = numpy.random  # seeded via utt.seed_rng() in setUp

        x = T.matrix()
        y = T.vector()
        w = shared(rng.randn(100))
        b = shared(numpy.zeros(()))
        # Construct Theano expression graph
        p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b))           # logistic output
        xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1) # cross-entropy
        prediction = p_1 > 0.5
        cost = xent.mean() + 0.01 * (w ** 2).sum()        # L2-regularized
        gw, gb = T.grad(cost, [w, b])
        # Compile expressions to functions
        train = function(
            inputs=[x, y],
            outputs=[prediction, xent],
            updates={w: w - 0.1 * gw, b: b - 0.1 * gb})
        predict = function(inputs=[x], outputs=prediction)

        N = 4
        feats = 100
        D = (rng.randn(N, feats), rng.randint(size=4, low=0, high=2))
        training_steps = 10
        for i in range(training_steps):
            pred, err = train(D[0], D[1])
# Allow running this test module directly from the command line.
if __name__ == '__main__':
    unittest.main()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论