Commit 56267692 authored by Simon Lemieux

merge

......@@ -150,38 +150,35 @@ def execs_timeit_2vector(exprs, fname=None):
assert len(colors)>=len(times)
fig = pylab.figure()
for idx,(time,expr) in enumerate(zip(times,str_expr)):
###
# Creating each subplot
###
pylab.subplot(220+idx+1)
pylab.subplots_adjust(wspace=0.25, hspace=0.25)
#legend=[]
#plot = fig.add_subplot(1,len(exprs),idx)
speedup = [t[0].min()/t[1].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, linestyle = '--', color='r')
speedup = [t[0].min()/t[2].min() for t in time]
pylab.semilogx(nb_calls, speedup, linewidth=1.0, color = 'b')
pylab.grid(True)
if (idx == 2) or (idx == 3):
pylab.xlabel('Dimension of vectors a and b')
pylab.xlabel('Dimension of vectors a and b', fontsize = 15)
if (idx == 0) or (idx == 2):
pylab.ylabel('Speed up vs NumPy')
pylab.ylabel('Speed up vs NumPy', fontsize = 15)
pylab.axhline(y=1, linewidth=1.0, color='black')
pylab.xlim(1e3,1e7)
pylab.xticks([1e3,1e5,1e7],['1e3','1e5','1e7'])
pylab.title(expr)
#for time,expr,color in zip(times,str_expr,colors):
# speedup = [t[0].min()/t[1].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, linestyle='--', color=color)
# speedup = [t[0].min()/t[2].min() for t in time]
# plot.semilogx(nb_calls, speedup, linewidth=1.0, color=color)
#legend += ["Numexpr "+expr,"Theano "+expr]
#pylab.title('Speed up Numexpr and Theano vs NumPy')
#pylab.grid(True)
#pylab.xlabel('Nb element')
#pylab.ylabel('Speed up vs NumPy')
#pylab.legend(legend,loc='upper left')
# fig.legend(legend,loc='upper left')
if fname:
fig.savefig(fname)
......
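For context, the speedup curves above divide the best NumPy time by the best Numexpr/Theano time at each problem size. A minimal sketch of how such a `times` structure could be gathered with the standard-library timeit module (the three benchmark callables are hypothetical; taking min() over repeats is the usual way to suppress timing noise):

import timeit
import numpy

def gather_times(numpy_fn, numexpr_fn, theano_fn, sizes, repeat=3):
    times = []
    for n in sizes:
        a = numpy.random.rand(n)
        b = numpy.random.rand(n)
        row = []
        for fn in (numpy_fn, numexpr_fn, theano_fn):
            # best-of-`repeat` wall time for 10 calls of this backend
            row.append(numpy.array(timeit.repeat(lambda: fn(a, b),
                                                 number=10, repeat=repeat)))
        times.append(row)   # row[0]=numpy, row[1]=numexpr, row[2]=theano
    return times            # then: speedup = t[0].min() / t[1].min()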
......@@ -126,6 +126,18 @@ class AddDestroyHandler(gof.Optimizer):
super(AddDestroyHandler, self).add_requirements(env)
env.extend(gof.DestroyHandler())
class PrintCurrentEnv(gof.Optimizer):
"""This optimizer is for debugging.
Toss it into the optimization pipeline to see the state of things at any given point.
"""
def __init__(self, header):
self.header =header
def apply(self, env):
import theano.printing
print "PrintCurrentEnv:", self.header
theano.printing.debugprint(env.outputs)
optdb = gof.SequenceDB()
optdb.register('merge1', gof.MergeOptimizer(),
0, 'fast_run', 'fast_compile')
......@@ -133,10 +145,19 @@ optdb.register('canonicalize', gof.EquilibriumDB(), # rearranges elemwis
1, 'fast_run', 'fast_compile')
optdb.register('merge1.2', gof.MergeOptimizer(skip_const_merge=False),
1.2, 'fast_run', 'fast_compile')
optdb.register('Print1.21', PrintCurrentEnv('Post-canonicalize'),
1.21,)# 'fast_run', 'fast_compile')
optdb.register('stabilize', gof.EquilibriumDB(), # replace unstable subgraphs
1.5, 'fast_run')
optdb.register('Print1.51', PrintCurrentEnv('Post-stabilize'),
1.51,) #'fast_run', 'fast_compile')
optdb.register('specialize', gof.EquilibriumDB(), # misc special cases for speed
2, 'fast_run')
optdb.register('Print2.01', PrintCurrentEnv('Post-specialize'),
2.01, )#'fast_run', 'fast_compile')
optdb.register('specialize_device', gof.EquilibriumDB(), # misc special cases for speed that are dependent on the device.
48.6, 'fast_run')#must be after gpu stuff at 48.5
optdb.register('merge2', gof.MergeOptimizer(), # especially constant merge
49, 'fast_run')
optdb.register('add_destroy_handler', AddDestroyHandler(),
......
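The PrintCurrentEnv registrations above use fractional priorities (1.21, 1.51, 2.01) so each dump lands just after the stage it is named for; with the 'fast_run'/'fast_compile' tags commented out they only fire when enabled explicitly. A hedged sketch of dropping one between two other stages:

# Hypothetical: dump the graph between canonicalize (1) and merge1.2 (1.2).
optdb.register('PrintPostCanonicalize',
               PrintCurrentEnv('Post-canonicalize, pre-merge'),
               1.1)  # no tags: not part of any default optimization mode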
......@@ -341,6 +341,8 @@ class Value(Variable):
if value is not None:
raise ValueError("Value instances cannot have an owner.")
owner = property(lambda self: None, __set_owner)
value = property(lambda self: self.data,
doc='read-only data access method')
# index is not defined, because the `owner` attribute must necessarily be None
......
......@@ -525,7 +525,8 @@ class PatternSub(LocalOptimizer):
(scrabble, 'x'))
"""
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False):
def __init__(self, in_pattern, out_pattern, allow_multiple_clients = False,
skip_identities_fn=None):
"""
Creates a PatternSub that replaces occurrences of
in_pattern by occurrences of out_pattern.
......@@ -543,7 +544,12 @@ class PatternSub(LocalOptimizer):
raise TypeError("The pattern to search for must start with a specific Op instance.")
self.__doc__ = self.__class__.__doc__ + "\n\nThis instance does: " + str(self) + "\n"
self.allow_multiple_clients = allow_multiple_clients
self.skip_identities_fn = skip_identities_fn
def skip_identities(self, expr):
if self.skip_identities_fn:
return self.skip_identities_fn(expr)
def op_key(self):
return self.op
......@@ -568,13 +574,22 @@ class PatternSub(LocalOptimizer):
if node.op != self.op:
return False
def match(pattern, expr, u, allow_multiple_clients = False):
def retry_with_equiv():
expr_equiv = self.skip_identities(expr)
if expr_equiv is None:
return False
#TODO: Not sure how to handle multiple_clients flag
###print 'retrying match', pattern, expr_equiv
return match(pattern, expr_equiv, u,
allow_multiple_clients=allow_multiple_clients)
if isinstance(pattern, (list, tuple)):
if expr.owner is None:
return False
if not (expr.owner.op == pattern[0]) or (not allow_multiple_clients and len(expr.clients) > 1):
return False
return retry_with_equiv()
if len(pattern) - 1 != len(expr.owner.inputs):
return False
return retry_with_equiv()
for p, v in zip(pattern[1:], expr.owner.inputs):
u = match(p, v, u, self.allow_multiple_clients)
if not u:
......@@ -588,17 +603,17 @@ class PatternSub(LocalOptimizer):
if constraint(expr):
return match(real_pattern, expr, u, pattern.get('allow_multiple_clients', False))
else:
return False
return retry_with_equiv()
elif isinstance(pattern, str):
v = unify.Var(pattern)
if u[v] is not v and u[v] is not expr:
return False
return retry_with_equiv()
else:
u = u.merge(expr, v)
elif isinstance(pattern, graph.Constant) and isinstance(expr, graph.Constant) and pattern.equals(expr):
return u
else:
return False
return retry_with_equiv()
return u
def build(pattern, u):
......@@ -614,6 +629,7 @@ class PatternSub(LocalOptimizer):
if u:
p = self.out_pattern
new = build(p, u)
####print "PatternSub matched:", new
return [new]
else:
return False
......
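The new skip_identities_fn hook lets a failed structural match retry on an equivalent sub-expression before giving up, e.g. seeing through a multiplication by 1. A sketch of wiring one up, mirroring the _skip_mul_1 helper this commit adds in nnet/sigm.py (tensor.mul, sigmoid, softplus and the _is_1 predicate are taken from that file):

def skip_mul_by_one(expr):
    # If expr is mul(x, 1), return x so matching is retried on it;
    # returning None keeps the normal failure path.
    if expr.owner and expr.owner.op == tensor.mul:
        rest = [i for i in expr.owner.inputs if not _is_1(i)]
        if len(rest) == 1:
            return rest[0]

sub = PatternSub((tensor.log, (sigmoid, 'x')),
                 (tensor.neg, (softplus, (tensor.neg, 'x'))),
                 allow_multiple_clients=True,
                 skip_identities_fn=skip_mul_by_one)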
......@@ -359,7 +359,8 @@ pprint.assign(lambda pstate, r: hasattr(pstate, 'target') and pstate.target is n
pp = pprint
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'), compact=True, mode=None, format='png'):
def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.png'),
compact=True, mode=None, format='png', with_ids=False):
"""
Print to a file, in png format, the graph of the ops of a compiled theano fct.
......@@ -390,14 +391,15 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
g=pd.Dot()
var_str={}
all_strings = set()
def var_name(var):
if var in var_str:
return var_str[var]
if var.name is not None:
varstr = var.name+" "+str(var.type)
varstr = 'name='+var.name+" "+str(var.type)
elif isinstance(var,gof.Constant):
dstr = str(var.data)
dstr = 'val='+str(var.data)
if '\n' in dstr:
dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30:
......@@ -408,12 +410,17 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
else:
#a var id is needed, as otherwise vars with the same type would be merged in the graph.
varstr = str(var.type)
varstr += ' ' + str(len(var_str))
if (varstr in all_strings) or with_ids:
varstr += ' id=' + str(len(var_str))
var_str[var]=varstr
all_strings.add(varstr)
return varstr
topo = fct.maker.env.toposort()
apply_name_cache = {}
def apply_name(node):
if node in apply_name_cache:
return apply_name_cache[node]
prof_str=''
if mode:
time = mode.apply_time.get((topo.index(node),node),0)
......@@ -425,7 +432,12 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
pf=0
else: pf = time*100/mode.fct_call_time[fct]
prof_str=' (%.3fs,%.3f%%,%.3f%%)'%(time,pt,pf)
return str(node.op).replace(':','_')+' '+str(topo.index(node))+prof_str
applystr = str(node.op).replace(':','_')
if (applystr in all_strings) or with_ids:
applystr = applystr+' id='+str(topo.index(node))+prof_str
all_strings.add(applystr)
apply_name_cache[node] = applystr
return applystr
# Update the inputs that have an update function
input_update={}
......@@ -434,16 +446,18 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
if i.update is not None:
input_update[outputs.pop()] = i
apply_shape='ellipse'
var_shape='box'
for node_idx,node in enumerate(topo):
astr=apply_name(node)
g.add_node(pd.Node(astr,shape='box'))
g.add_node(pd.Node(astr,shape=apply_shape))
for id,var in enumerate(node.inputs):
varstr=var_name(var)
label=''
if len(node.inputs)>1:
label=str(id)
if var.owner is None:
g.add_node(pd.Node(varstr,color='green'))
g.add_node(pd.Node(varstr,color='green',shape=var_shape))
g.add_edge(pd.Edge(varstr,astr, label=label))
elif var.name or not compact:
g.add_edge(pd.Edge(varstr,astr, label=label))
......@@ -460,10 +474,10 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
label=str(id)
if out:
g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='blue'))
g.add_node(pd.Node(varstr,color='blue',shape=var_shape))
elif len(var.clients)==0:
g.add_edge(pd.Edge(astr, varstr, label=label))
g.add_node(pd.Node(varstr,color='grey'))
g.add_node(pd.Node(varstr,color='grey',shape=var_shape))
elif var.name or not compact:
g.add_edge(pd.Edge(astr, varstr, label=label))
# else:
......@@ -495,9 +509,9 @@ def pydot_var(vars, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
return var_str[var]
if var.name is not None:
varstr = var.name
varstr = 'name='+var.name
elif isinstance(var,gof.Constant):
dstr = str(var.data)
dstr = 'val='+str(var.data)
if '\n' in dstr:
dstr = dstr[:dstr.index('\n')]
if len(dstr) > 30:
......
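Callers do not need to change anything; passing the new flag forces an ' id=N' suffix on every label, so distinct variables or applies can never collapse into a single pydot node even when their labels collide. A usage sketch (assuming this module is importable as theano.printing):

import theano
import theano.tensor as T

x = T.vector('x')
f = theano.function([x], x + x)
theano.printing.pydotprint(f, outfile='graph.png', with_ids=True)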
......@@ -21,8 +21,8 @@ def debug(*msg):
# printed and this module will not be working properly (we set `cuda_available`
# to False).
# This variable is True by default, and set to False if something goes wrong
# when trying to initialize cuda.
# This variable is True by default, and set to False if nvcc is not available or
# there is no cuda card or something goes wrong when trying to initialize cuda.
cuda_available = True
# Global variable to avoid displaying the same warning multiple times.
......@@ -89,6 +89,9 @@ except Exception, e:
error( "Failed to compile cuda_ndarray.cu: %s" % str(e))
set_cuda_disabled()
if cuda_available:
cuda_available=device_available()
if cuda_available:
#check if there is an old cuda_ndarray that was loaded instead of the one we compiled!
import cuda_ndarray.cuda_ndarray
......
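With the extra device_available() check, cuda_available is False not only when compilation fails but also when no usable card is present, so callers can rely on it as a single guard (run_gpu_path is a hypothetical caller-side function; import path assumed):

from theano.sandbox.cuda import cuda_available

if cuda_available:
    run_gpu_path()   # nvcc worked, a device exists and its properties were queried
else:
    print "skipping GPU path: no usable cuda device"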
......@@ -1715,14 +1715,21 @@ class GpuSubtensor(tensor.Subtensor):
cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1:
cdata = cdata[0]
out[0] = x.__getitem__(cdata)
# some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
if 0:
# JSB: commenting this out because it breaks code and does not look right
# Dumi could you try to run the examples in the DeepLearningBenchmarks
# for example? This logic doesn't seem right -- we just
# cast cdata to a tuple, so it doesn't have a .start field.
# some numpy installations don't expose the __index__() methods for
# numpy.int8/16/32/64. Casting to python's int instead
start = int(cdata.start) if cdata.start!=None else None
stop = int(cdata.stop) if cdata.stop!=None else None
step = int(cdata.step) if cdata.step!=None else None
newslice = slice(start,stop,step)
out[0] = x.__getitem__(newslice)
class GpuIncSubtensor(tensor.IncSubtensor):
def make_node(self, x, y, *inputs):
......
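The replacement code works around NumPy builds whose integer scalars (numpy.int8/16/32/64) do not implement __index__, which makes them unusable directly in a slice. A standalone illustration of the cast (whether the failure occurs depends on the NumPy build):

import numpy

s = slice(numpy.int64(1), numpy.int64(5), None)
# On affected builds, x[s] raises "slice indices must be integers".
# Rebuilding the slice from Python ints is safe everywhere:
safe = slice(int(s.start) if s.start is not None else None,
             int(s.stop) if s.stop is not None else None,
             int(s.step) if s.step is not None else None)
x = numpy.arange(10)
print x[safe]   # [1 2 3 4]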
......@@ -722,7 +722,9 @@ conv_rows_stack2( float* img, float* kern, float* out,
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We go through an intermediate variable to get more precision.
}
}
}
......
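Accumulating each row's contribution into a fresh local before folding it into the running total keeps rounding error from compounding over the whole convolution. The effect is easy to reproduce in float32 on the CPU (a sketch of the idea, not the kernel itself):

import numpy

x = numpy.random.rand(2 ** 20).astype('float32')
exact = x.astype('float64').sum()

naive = numpy.float32(0)            # one long float32 accumulation
for v in x:
    naive += v

total = numpy.float32(0)            # partial sums first, then fold them together
for block in x.reshape(1024, 1024):
    part = numpy.float32(0)
    for v in block:
        part += v
    total += part

print abs(naive - exact), abs(total - exact)   # total is typically much closer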
......@@ -1604,6 +1604,35 @@ static PyTypeObject CudaNdarrayType =
CudaNdarray_new, /* tp_new */
};
//This function returns True if it is able to find a cuda card and query its properties.
//Otherwise it returns False.
PyObject *
device_available(PyObject* _unsed, PyObject * args)
{
int deviceCount;
cudaError err = cudaGetDeviceCount(&deviceCount);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if (deviceCount <= 0) {
Py_RETURN_FALSE;
}
cudaDeviceProp deviceProp;
err=cudaGetDeviceProperties(&deviceProp, 0);
if( cudaSuccess != err) {
Py_RETURN_FALSE;
}
if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
Py_RETURN_FALSE;
}
Py_RETURN_TRUE;
}
PyObject *
CudaNdarray_gpu_init(PyObject* _unsed, PyObject * args)
{
......@@ -1810,6 +1839,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
static PyMethodDef module_methods[] = {
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"device_available", device_available, METH_VARARGS, "Return Py_True if a cuda card is available."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
{"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
......
......@@ -165,7 +165,9 @@ def exec_conv(version, shapes, verbose, random, mode, print_=None, rtol=1e-5, on
ret = _params_allgood(ishape, kshape, mode,
subsample=subshape, img_stride=istride, kern_stride=kstride,
version=ver, verbose=verbose, random=random, id=id,print_=print_,rtol=rtol,ones=ones)
except:
except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride)
print e
pass
if not ret:
failed_version.add(ver)
......
......@@ -11,6 +11,7 @@ if cuda_available:
import unittest
from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
#TODO: test gpu
# Done in test_consistency_GPU_{serial,parallel}
......@@ -22,7 +23,6 @@ from theano.tests import unittest_tools as utt
#TODO: make tests work when no flags are given. For now they need: THEANO_FLAGS=device=gpu0,floatX=float32
# Partly done, in test_consistency_GPU_{serial,parallel}
#TODO: bug fix test_normal0; in the normal() fct, n_samples currently needs to be numpy.prod(size), not self.n_streams(size)
mode = config.mode
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
......@@ -287,6 +287,7 @@ def basictest(f, steps, sample_size, prefix="", allow_01=False, inputs=[],
for i in xrange(steps):
t0 = time.time()
ival = f(*inputs)
assert ival.shape==sample_size
dt += time.time() - t0
ival = numpy.asarray(ival)
if i == 0:
......@@ -324,7 +325,7 @@ def test_uniform():
sample_size = (10,100)
steps = 50
else:
sample_size = (500,100)
sample_size = (500,50)
steps = int(1e3)
x = tensor.matrix()
......@@ -381,9 +382,9 @@ def test_binomial():
if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (10,50)
steps = 70
steps = 50
else:
sample_size = (500,100)
sample_size = (500,50)
steps = int(1e3)
x = tensor.matrix()
......@@ -430,9 +431,9 @@ def test_normal0():
steps = 50
if mode in ['DEBUG_MODE','FAST_COMPILE']:
sample_size = (99,100)
sample_size = (99,30)
else:
sample_size = (999,100)
sample_size = (999,50)
print ''
print 'ON CPU:'
......@@ -464,8 +465,8 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print ''
print 'ON CPU w NUMPY:'
......@@ -476,7 +477,7 @@ def test_normal0():
basictest(ff, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='numpy ', allow_01=True)
def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
def basic_multinomialtest(f, steps, sample_size, target_pvals, prefix="", mean_rtol=0.04):
dt = 0.0
avg_pvals = numpy.zeros(target_pvals.shape, dtype=config.floatX)
......@@ -484,6 +485,7 @@ def basic_multinomialtest(f, steps, target_pvals, prefix="", mean_rtol=0.04):
for i in xrange(steps):
t0 = time.time()
ival = f()
assert ival.shape==sample_size
dt += time.time() - t0
#ival = numpy.asarray(ival)
avg_pvals += ival
......@@ -518,7 +520,7 @@ def test_multinomial():
f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f)
basic_multinomialtest(f, steps, pvals, prefix='mrg ')
basic_multinomialtest(f, steps, sample_size, pvals, prefix='mrg ')
sys.stdout.flush()
......@@ -535,4 +537,4 @@ def test_multinomial():
theano.printing.debugprint(f)
sys.stdout.flush()
basic_multinomialtest(f, steps, pvals, prefix='gpu mrg ')
basic_multinomialtest(f, steps, sample_size, pvals, prefix='gpu mrg ')
......@@ -302,7 +302,7 @@ class Scalar(Type):
return ""
def c_code_cache_version(self):
return (8,) # put const around operators and added unary '-' operator
return (8, numpy.__version__) # put const around operators and added unary '-' operator
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (7,) # make complex c code optional
return (6,) # added implementations of operators that work with scalar arguments
......@@ -932,6 +932,7 @@ class IntDiv(BinaryScalarOp):
return [None] * len(inputs)
int_div = IntDiv(upcast_out, name = 'int_div')
floor_div = int_div
class Mod(BinaryScalarOp):
def impl(self, x, y):
......
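Folding numpy.__version__ into c_code_cache_version means C code compiled against one NumPy's headers is recompiled after a NumPy upgrade, instead of being reloaded from the cache with a possibly incompatible binary layout. Any Op whose generated C depends on NumPy internals can adopt the same pattern (MyScalarOp is a hypothetical stub for the sketch):

import numpy

class MyScalarOp(BinaryScalarOp):
    def c_code_cache_version(self):
        # bump the leading integer whenever the C code changes; the numpy
        # version string invalidates cached binaries on numpy upgrades
        return (1, numpy.__version__)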
......@@ -887,6 +887,11 @@ class _tensor_py_operators:
except Exception, e:
return NotImplemented
def __truediv__(self,other): return true_div(self, other)
def __floordiv__(self,other): return floor_div(self, other)
def __rtruediv__(self,other): return true_div(other, self)
def __rfloordiv__(self,other): return floor_div(other, self)
# ##### DON'T USE THESE BECAUSE INPLACE OPS SHOULD BE INSERTED BY OPTIMIZATION ONLY
# #ARITHMETIC - INPLACE
# def __iadd__(self,other): return _add_inplace(self,other)
......@@ -2066,6 +2071,11 @@ def true_div(a, b):
"""elementwise [true] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise
def floor_div(a, b):
"""elementwise [floor] division (inverse of multiplication)"""
# see decorator for function body
@_scal_elemwise
def int_div(a, b):
"""elementwise integer-division"""
......@@ -3607,8 +3617,12 @@ class Dot(Op):
nx = x.type.ndim
ny = y.type.ndim
if nx not in (1,2): raise TypeError('not matrix or vector', x)
if ny not in (1,2): raise TypeError('not matrix or vector', y)
if nx not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), x)
if ny not in (1,2):
raise TypeError(('dot supports matrix and vector args: email theano-dev about'
' enabling numpy dot semantics if you want them'), y)
if nx == 2 and ny == 2:
bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
......
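With __floordiv__/__rfloordiv__ in place, the Python // operator on tensor variables now builds a floor_div node (an alias of int_div, per the scalar change above). A quick check, kept to non-negative operands since the exact behaviour for negatives follows the underlying int_div implementation:

import theano
import theano.tensor as T

a = T.lvector('a')
b = T.lvector('b')
f = theano.function([a, b], a // b)   # __floordiv__ builds floor_div(a, b)
print f([7, 9], [2, 4])               # [3 2]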
(diff collapsed)
......@@ -397,41 +397,41 @@ def local_softmax_with_bias(node):
return
return [sm_bias]
if 0:
def softmax_simplifier(numerators, denominators):
for numerator in list(numerators):
#TODO: a single softmax'd vector??
if not numerator.type.dtype.startswith('float'):
continue
if not numerator.type.broadcastable == (False, False):
continue
if numerator.owner and numerator.owner.op == tensor.exp:
x = numerator.owner.inputs[0]
else:
continue
matching_denom = None
for denominator in denominators:
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner.op.new_order == (0,'x'):
z = denominator.owner.inputs[0] # thing getting dimshuffled
if z.owner and isinstance(z.owner.op, tensor.Sum):
#print 'ASDF', denominator.owner.op.new_order
#print z.owner.op.axis
if z.owner.op.axis == (1,):
#print "almost there.. softmax", x, z.owner.inputs[0]
if z.owner.inputs[0] is numerator:
matching_denom = denominator
break
if matching_denom:
numerators.remove(numerator)
denominators.remove(matching_denom)
numerators.append(softmax(x))
return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
def softmax_simplifier(numerators, denominators):
for numerator in list(numerators):
#TODO: a single softmax'd vector??
if not numerator.type.dtype.startswith('float'):
continue
if not numerator.type.broadcastable == (False, False):
continue
if numerator.owner and numerator.owner.op == tensor.exp:
x = numerator.owner.inputs[0]
else:
continue
matching_denom = None
for denominator in denominators:
if denominator.owner and isinstance(denominator.owner.op, tensor.DimShuffle):
if denominator.owner.op.new_order == (0,'x'):
z = denominator.owner.inputs[0] # thing getting dimshuffled
if z.owner and isinstance(z.owner.op, tensor.Sum):
#print 'ASDF', denominator.owner.op.new_order
#print z.owner.op.axis
if z.owner.op.axis == (1,):
#print "almost there.. softmax", x, z.owner.inputs[0]
if z.owner.inputs[0] is numerator:
matching_denom = denominator
break
if matching_denom:
numerators.remove(numerator)
denominators.remove(matching_denom)
numerators.append(softmax(x))
return numerators, denominators
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, 'softmax_simplifier')
if 0:
def softmax_grad_simplifier(numerators, denominators):
print "mul simplify numerators"
printing.debugprint(numerators)
......
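The simplifier runs inside the mul canonizer: it pairs a numerator exp(x) with a denominator sum(exp(x), axis=1).dimshuffle(0, 'x') and replaces the pair with softmax(x). A sketch of verifying that the rewrite fired, along the lines of the Test_softmax_opt test later in this commit (the softmax import path is assumed, and the default mode is assumed to include the canonicalizer):

import theano
import theano.tensor as T
from theano.tensor.nnet import softmax

c = T.matrix('c')
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0, 'x')
f = theano.function([c], p_y)
ops = [node.op for node in f.maker.env.toposort()]
assert softmax in ops   # the exp/sum/div subgraph was collapsed into softmax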
......@@ -8,7 +8,7 @@ from theano import gof
from theano import scalar
from theano import printing
from theano.tensor import basic as tensor
from theano.printing import pprint
from theano.printing import pprint, debugprint
from theano.tensor import elemwise
from theano.tensor import opt
from theano.compile import optdb
......@@ -95,10 +95,17 @@ softplus = elemwise.Elemwise(scalar_softplus, name='softplus')
pprint.assign(softplus, printing.FunctionPrinter('softplus'))
def _skip_mul_1(r):
if r.owner and r.owner.op == tensor.mul:
not_is_1 = [i for i in r.owner.inputs if not _is_1(i) ]
if len(not_is_1)==1:
return not_is_1[0]
logsigm_to_softplus = gof.PatternSub(
(tensor.log, (sigmoid, 'x')),
(tensor.neg, (softplus, (tensor.neg, 'x'))),
allow_multiple_clients = True)
allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
def _is_1(expr):
"""rtype bool. True iff expr is a constant close to 1
......@@ -115,7 +122,8 @@ log1msigm_to_softplus = gof.PatternSub(
dict(pattern='y', constraint = _is_1),
(sigmoid, 'x'))),
(tensor.neg, (softplus, 'x')),
allow_multiple_clients = True)
allow_multiple_clients = True,
skip_identities_fn=_skip_mul_1)
log1pexp_to_softplus = gof.PatternSub(
(tensor.log1p,
......@@ -329,3 +337,48 @@ register_local_1msigmoid = False
if register_local_1msigmoid:
opt.register_canonicalize(local_1msigmoid)
if 0:
# This code is if'd out because it is not complete,
# and it isn't obviously a good idea anyway.
# The motivation here was to identify the last exp() node
# in the SciPy2010 article, which was not optimized away at the time of publication,
# so the example is actually not numerically stable, even though it should be.
@opt.register_stabilize
@gof.local_optimizer([tensor.mul])
def local_sigm_gest(node):
print "CANONICALIZE"
print sigm_canonicalize(node)
def sigm_canonicalize(node):
add = tensor.add
mul = tensor.mul
div = tensor.true_div
if node.op == tensor.add:
rval = []
for i in node.inputs:
rval += sigm_canonicalize(i)
return rval
if node.op == tensor.mul:
rval = sigm_canonicalize(node.inputs[0])
for i in node.inputs[1:]:
old_rval = rval
rval = []
for t1 in sigm_canonicalize(i):
for t0 in old_rval:
assert t1.owner.op == div
assert t0.owner.op == div
t0top, t0bot = t0.owner.inputs
t1top, t1bot = t1.owner.inputs
rval.append(div(mul(*(t0top+t1top)), mul(*(t0bot+t1bot))))
if len(rval) > 100:
# This loop can be exponentially long.
# aborting
return []
elif len(node.outputs)>1:
return []
else:
return [node.outputs[0]]
......@@ -924,6 +924,10 @@ class Test_softmax_opt():
assert softmax in f_ops
f(self.rng.rand(3,4))
def test_grad(self):
c = T.matrix()
p_y = T.exp(c) / T.exp(c).sum(axis=1).dimshuffle(0,'x')
# test that function contains softmax and no div.
w = T.matrix()
g = theano.function([c,w],T.grad((p_y*w).sum(), c))
......
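The reason logsigm_to_softplus (now with skip_identities_fn, so a stray mul-by-1 no longer blocks the match) matters is numerical stability: evaluating log(sigmoid(x)) literally underflows to log(0) = -inf for very negative x, while the rewritten -softplus(-x) stays finite. A plain-NumPy illustration; the large-x shortcut below stands in for whatever stable branch the scalar_softplus C code uses:

import numpy

def softplus(x):
    if x > 30.0:
        return x                      # exp(-x) is negligible here
    return numpy.log1p(numpy.exp(x))

x = -745.0
# NumPy emits overflow warnings on the naive path:
print numpy.log(1.0 / (1.0 + numpy.exp(-x)))   # -inf: sigmoid underflowed to 0
print -softplus(-x)                            # -745.0, the correct value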
......@@ -144,6 +144,11 @@ def register_specialize(lopt, *tags, **kwargs):
compile.optdb['specialize'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_specialize_device(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['specialize_device'].register(name, lopt, 'fast_run', *tags)
return lopt
def register_stabilize(lopt, *tags, **kwargs):
name = (kwargs and kwargs.pop('name')) or lopt.__name__
compile.optdb['stabilize'].register(name, lopt, 'fast_run', *tags)
......@@ -189,6 +194,11 @@ def local_dimshuffle_lift(node):
register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift)
@register_canonicalize
@gof.local_optimizer([])
def local_dimshuffle_no_inplace_at_canonicalize(node):
if isinstance(node.op, T.DimShuffle) and node.op.inplace:
return [T.DimShuffle(node.op.input_broadcastable, node.op.new_order, inplace=False)(node.inputs[0])]
#####################################
......@@ -1603,18 +1613,20 @@ def local_sum_mul_by_scalar(node):
@register_canonicalize
@gof.local_optimizer([])
def local_sum_div_dimshuffle(node):
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis=l) / b,
'''sum(a / dimshuffle{...}(b), axis=l) -> sum(a, axis={...}) / b,
if dimension l of the DimShuffle is 'x'.'''
# TODO: extend it to product, and quotient of products
if isinstance(node.op, T.Sum):
axis = node.op.axis
if axis is None:
axis = range(node.inputs[0].ndim)
#print 'axis =', axis
thing_summed = node.inputs[0]
dimshuffled = None
if thing_summed.owner and thing_summed.owner.op == T.true_div:
numerator, denominator = thing_summed.owner.inputs
if isinstance(numerator.owner.op, T.DimShuffle):
if numerator.owner and isinstance(numerator.owner.op, T.DimShuffle):
new_order = numerator.owner.op.new_order
#print 'new_order =', new_order
# check compatibility
......@@ -1630,7 +1642,7 @@ def local_sum_div_dimshuffle(node):
#else:
# print 'incompatible dims:', axis, new_order
if isinstance(denominator.owner.op, T.DimShuffle):
if denominator.owner and isinstance(denominator.owner.op, T.DimShuffle):
new_order = denominator.owner.op.new_order
#print 'new_order =', new_order
# check compatibility
......@@ -1827,9 +1839,31 @@ def local_pow_specialize(node):
rval = [T.inv(xsym)]
if N.all(y == -2):
rval = [T.inv(T.sqr(xsym))]
if rval:
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
# Optimize all integral powers in [-RANGE, RANGE]
if config.experimental.pow and rval is None and abs(y)==int(abs(y)) and abs(y) <= 512:# 512 is too small for the cpu and too big for some gpu!
@register_specialize_device
@gof.local_optimizer([T.pow])
def local_pow_specialize_device(node):
"""
This optimization is not the same on all devices. We do it only on the cpu here.
"""
if node.op == T.pow:
#the idea here is that we have pow(x, y)
odtype = node.outputs[0].dtype
xsym = node.inputs[0]
ysym = node.inputs[1]
y = local_mul_canonizer.get_constant(ysym)
if (y is not None) \
and encompasses_broadcastable(xsym.type.broadcastable, ysym.type.broadcastable):
rval = None
# 512 is too small for the cpu and too big for some gpu!
if abs(y)==int(abs(y)) and abs(y) <= 512:
pow2 = [xsym]
pow2_scal = [theano.scalar.Scalar(xsym.dtype)()]
y_to_do = abs(y)
......@@ -1859,14 +1893,7 @@ def local_pow_specialize(node):
rval[0] = T.cast(rval[0], odtype)
assert rval[0].type == node.outputs[0].type, (rval, node.outputs)
return rval
else:
return False
register_specialize(local_pow_specialize)
theano.configparser.AddConfigVar('experimental.pow',
"Transform a pow to a constant integer to a graph of mul. Fast on cpu, but more work needed for gpu.",
theano.configparser.BoolParam(False),
)
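local_pow_specialize_device expands pow(x, k) for a constant integer |k| <= 512 into a chain of multiplications built by repeated squaring (the pow2/pow2_scal lists above), using O(log k) muls instead of a pow call; registering it under 'specialize_device' (priority 48.6) lets the cpu and gpu make different choices. The underlying idea in plain Python (a sketch, not the optimizer itself):

def pow_by_squaring(x, k):
    """x**k for integer k >= 1 using O(log k) multiplications."""
    result = None
    square = x
    while k:
        if k & 1:
            result = square if result is None else result * square
        square = square * square
        k >>= 1
    return result

assert pow_by_squaring(3, 13) == 3 ** 13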
@gof.local_optimizer([T.mul])
def local_mul_specialize(node):
"""Remove special-case constants from mul arguments
......@@ -1965,20 +1992,28 @@ register_specialize(local_add_specialize)
mul_canonizer = in2out(gof.LocalOptGroup(local_mul_canonizer, local_fill_cut, local_fill_sink))
def check_for_x_over_absX(numerators, denominators):
"""Convert x/abs(x) into sign(x). """
# TODO: this function should dig/search through dimshuffles
# This won't catch a dimshuffled absolute value
for den in list(denominators):
if den.owner and den.owner.op == T.abs_ and den.owner.inputs[0] in numerators:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
if den.owner.inputs[0].type.dtype.startswith('complex'):
#TODO: Make an Op that projects a complex number to have unit length
# but projects 0 to 0. That would be a weird Op, but consistent with the
# special case below. I heard there's some convention in Matlab that is
# similar to this... but not sure.
pass
else:
denominators.remove(den)
numerators.remove(den.owner.inputs[0])
numerators.append(T.sgn(den.owner.inputs[0]))
return numerators, denominators
local_mul_canonizer.add_simplifier(check_for_x_over_absX, 'teststest')
@register_stabilize
@gof.local_optimizer([T.log])
def local_log1p(node):
# log(1+exp(x)) -> log1p(x)
# log(1+x) -> log1p(x)
if node.op == T.log:
log_arg, = node.inputs
if log_arg.owner and log_arg.owner.op == T.add:
......@@ -2207,7 +2242,7 @@ def local_elemwise_fusion_op(OP):
"""
def local_fuse(node):
"""
As part of specialisation, we fusion two consecutif elemwise op of the same shape.
As part of specialisation, we fuse two consecutive elemwise ops of the same shape.
For mixed dtypes, we let the Composite op do the cast; this lets the C compiler do the cast.
The number of dimension is validated at call time by theano itself.
......@@ -2240,7 +2275,7 @@ def local_elemwise_fusion_op(OP):
for i in node.inputs:
do_fusion = False
catch = False
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)<=1:
if i.owner and isinstance(i.owner.op, OP) and len(i.clients)==1:
#if the scalar_op doesn't have a c implementation, we skip its fusion to allow the fusion of the other ops.
do_fusion=True
try:
......@@ -2296,7 +2331,7 @@ def local_elemwise_fusion_op(OP):
# There is a hard limit of 256 bytes for the formal argument list to a GPU kernel function.
# Here, we estimate how many bytes the new Op will need, and abort if it needs too much.
if True:
if OP != T.Elemwise:
argument_limit = 240 # 16 bytes are used for block and thread coords etc.
#TODO: read in from architecture to make this 4 or 8
int_size = 8
......
......@@ -1604,6 +1604,54 @@ class t_dot(unittest.TestCase):
#utt.verify_grad(dot, [self.rand(), self.rand(2)])
#utt.verify_grad(dot, [self.rand(), self.rand(2,5)])
def test_broadcastable_patterns(self):
#
# These examples should all work because, broadcastable or not, all dimensions of all
# results have size 1.
#
def val_for(r):
if r.ndim == 0:
return numpy.asarray(1.1, dtype=r.dtype)
if r.ndim == 1:
return numpy.asarray([1.2], dtype=r.dtype)
elif r.ndim == 2:
return numpy.asarray([[1.3]], dtype=r.dtype)
raise ValueError()
failures = []
for dtype0 in ('float32', 'float64', 'complex64', 'complex128'):
for dtype1 in ('float32', 'float64', 'complex64', 'complex128'):
for bc0 in ((True,), (False,), (True, True), (True, False), (False, True),
(False,False)):
for bc1 in ((True,), (False,), (True, True), (True, False), (False, True),
(False,False)):
x = TensorType(dtype=dtype0, broadcastable=bc0)()
y = TensorType(dtype=dtype1, broadcastable=bc1)()
z = dot(x,y)
t = TensorType(dtype=dtype0, broadcastable=z.broadcastable)()
rval = z * 3 + 2*t
if rval.type.dtype.startswith('complex'):
# there is a problem with complex numbers right now
# Elemwise code doesn't compile when both precisions of complex
# numbers are used in the same file because the operators
# aren't declared properly.
failures.append((dtype0, dtype1, bc0, bc1))
continue
f = function([x,y,t], rval)
xval = val_for(x)
yval = val_for(y)
tval = val_for(t)
f(xval, yval, tval) #debugmode checks result
#if failures:
#print failures
assert not failures
class T_tensorfromscalar(unittest.TestCase):
def test0(self):
s = scal.constant(56)
......
......@@ -416,7 +416,7 @@ def test_gemm_canonicalize():
can = []
_gemm_canonicalize(X + Y + u, 1.0, can, 0)
assert can == [(1.0, X), (1.0, Y), u]
assert can == [(1.0, X), (1.0, Y), u], can
can = []
_gemm_canonicalize(a*X + Y - b*Z*c, 1.0, can, 0)
......
from nose.plugins.skip import SkipTest
import unittest
import theano
import theano.tensor as T
from theano import function, shared
import numpy
import random
import numpy.random
from theano.tests import unittest_tools as utt
'''
Different tests that are not connected to any particular Op or functionality of
Theano. For example, code that we publish in papers goes here, so we can
ensure that it remains operational.
'''
class T_diverse(unittest.TestCase):
def setUp(self):
utt.seed_rng()
def scipy_paper_example1(self):
a = theano.tensor.vector('a') # declare variable
b = a + a**10 # build expression
f = theano.function([a], b) # compile function
assert numpy.all(f([0,1,2]) == numpy.array([0,2,1026]))
def scipy_paper_example2(self):
''' This just sees if things compile well and if they run '''
rng = numpy.random.RandomState(42)  # rng was previously undefined; any fixed seed works
x = T.matrix()
y = T.vector()
w = shared(rng.randn(100))
b = shared(numpy.zeros(()))
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b))
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1)
prediction = p_1 > 0.5
cost = xent.mean() + 0.01*(w**2).sum()
gw,gb = T.grad(cost, [w,b])
# Compile expressions to functions
train = function(
inputs=[x,y],
outputs=[prediction, xent],
updates={w:w-0.1*gw, b:b-0.1*gb})
predict = function(inputs=[x], outputs=prediction)
N = 4
feats = 100
D = (rng.randn(N, feats), rng.randint(size=4,low=0, high=2))
training_steps = 10
for i in range(training_steps):
pred, err = train(D[0], D[1])
if __name__ == '__main__':
unittest.main()