Merged

d1c3d45e · Olivier Delalleau · 57cf5930 · 7e118143 · d1c3d45e · d1c3d45e
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -2,7 +2,7 @@ Trunk sin last release
 ------
 * Sparse type is now supported by the shape op and the ShapeFeature optimizer work correctly with them.
 * fuse GpuElemwise more often(in the case where their is too many inputs that fusing all of them would bust the 256 bytes limits of parameter to gpu function)
-
+ * Speed up gemv by a work around scipy gemv slowness when the matrix is in c order(the default)

 Theano 0.3 (2010-11-23)
 -----------------------

--- a/doc/extending/fibby.txt
+++ b/doc/extending/fibby.txt
@@ -99,7 +99,7 @@ The ``make_node`` method creates a node to be included in the expression graph.
 It runs when we apply our Op (``fibby``) to Variable (``x``), as in ``fibby(tensor.vector())``.
 When an Op has multiple inputs, their order in the inputs argument to ``Apply``
 is important:  Theano will call ``make_node(*inputs)`` to copy the graph, 
-so it is important to not change the semantics of the expression by doing changing the argument order.
+so it is important not to change the semantics of the expression by changing the argument order.




--- a/doc/extending/op.txt
+++ b/doc/extending/op.txt
@@ -138,7 +138,7 @@ following methods:
  other criterion C with respect to the Op's input.

  If the outputs of your op are :math:`[ f_1, ... f_n]`, then
-  ``output_derivatives`` gives
+  ``output_gradients`` is
  :math:`[ grad_{f_1}(C), grad_{f_2}(C), ... , grad_{f_n}(C) ]`.
  If the inputs of your op are :math:`[x_1, ..., x_m]`, then your Op.grad
  should return :math:`[ grad_{x_1}(C), grad_{x_2}(C), ..., grad_{x_m}(C) ]`,

--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -128,16 +128,26 @@ Config Attributes

    Default 'Mode'

-    This set the default compilation mode for theano functions. By default the
+    This sets the default compilation mode for theano functions. By default the
    mode Mode is equivalent to FAST_RUN. See Config attribute linker and optimizer.

+.. attribute:: config.lib.amdlibm
+
+    Bool value: either True or False
+
+    Default False
+
+    This makes the compilation use the
+    `amdlibm <http://developer.amd.com/cpu/libraries/libm/>`__
+    library, which is faster than the standard libm.
+
 .. attribute:: linker

    String value: 'c|py', 'py', 'c', 'c|py_nogc', 'c&py'

    Default: 'c|py'

-    When the mode is Mode, it set the default linker used.
+    When the mode is Mode, it sets the default linker used.

 .. attribute:: optimizer

@@ -145,7 +155,7 @@ Config Attributes

    Default: 'fast_run'

-    When the mode is Mode, it set the default optimizer used.
+    When the mode is Mode, it sets the default optimizer used.

 .. attribute:: warn.ignore_bug_before


--- a/theano/compile/debugmode.py
+++ b/theano/compile/debugmode.py
@@ -46,7 +46,7 @@ AddConfigVar('DebugMode.check_strides',
        IntParam(1, lambda i: i in (0,1,2)))

 AddConfigVar('DebugMode.warn_input_not_reused',
-        ("Generate a warning when the destroy_map tell that an op work inplace, but the op did not reuse the input for its output."
+        ("Generate a warning when the destroy_map or view_map tell that an op work inplace, but the op did not reuse the input for its output."
         ),
        BoolParam(True))

@@ -519,6 +519,18 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes, clobber_dr_v
            if storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
                warning("input idx %d marked as destroyed was not changed for node '%s'"%(ii[0],str(node)))

+    if warn_input_not_reused:
+        vmap=getattr(node.op,'view_map',{})
+        for oo,ii in vmap.iteritems():
+            if hasattr(node.outputs[0].type,"may_share_memory"):
+                if not node.outputs[0].type.may_share_memory(storage_map[node.outputs[oo]][0],storage_map[node.inputs[ii[0]]][0]):
+                    #when a subtensor return a tensor ofndim==0, numpy seam to return a copy.
+                    #when have an empty ndarray(happen with output guard) it is not the same. why?
+                    if storage_map[node.outputs[oo]][0].ndim>0 and storage_map[node.outputs[oo]][0].size>0:
+                        warning("input idx %d marked as viewed but new memory allocated by node '%s'"%(ii[0],str(node)))
+            elif storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
+                warning("input idx %d marked as viewed but new memory allocated by node '%s'"%(ii[0],str(node)))
+
    for r_idx, r in enumerate(node.inputs):
        if not r.type.values_eq(r_vals[r], storage_map[r][0]):
            # some input node 'r' got changed by running the node

--- a/theano/misc/hooks/check_whitespace.py
+++ b/theano/misc/hooks/check_whitespace.py
@@ -14,6 +14,8 @@ import tokenize
 import argparse
 import reindent

+SKIP_WHITESPACE_CHECK_FILENAME = ".hg/skip_whitespace_check"
+
 def get_parse_error(code):
    """
    Checks code for ambiguous tabs or other basic parsing issues.
@@ -128,6 +130,20 @@ def save_diffs(diffs, filename):
    diff_file.write(diff)
    diff_file.close()

+def should_skip_commit():
+    if not os.path.exists(SKIP_WHITESPACE_CHECK_FILENAME):
+        return False
+    whitespace_check_file = open(SKIP_WHITESPACE_CHECK_FILENAME, "r")
+    whitespace_check_changeset = whitespace_check_file.read()
+    whitespace_check_file.close()
+    return whitespace_check_changeset == parent_commit()
+
+def save_skip_next_commit():
+    whitespace_check_file = open(SKIP_WHITESPACE_CHECK_FILENAME, "w")
+    whitespace_check_file.write(parent_commit())
+    whitespace_check_file.close()
+
+
 def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]
@@ -145,12 +161,32 @@ def main(argv=None):
                        const=True,
                        help="only check indentation if the file was previously correctly indented (or is new)"
                       )
+    parser.add_argument("-s", "--skip-after-failure",
+                        action="store_const",
+                        default=False,
+                        const=True,
+                        help="when this pre-commit hook fails, don't run it on the next commit; "
+                             "this lets you check in your changes and then check in "
+                             "any necessary whitespace changes in the subsequent commit"
+                       )
    args = parser.parse_args(argv)

+    # -i and -s are incompatible; if you skip checking, you end up with a not-correctly-indented
+    # file, which -i then causes you to ignore!
+    if args.skip_after_failure and args.incremental:
+        print >> sys.stderr, "*** check whitespace hook misconfigured! -i and -s are incompatible."
+        return 1
+
    if is_merge():
        # don't inspect merges: (a) they're complex and (b) they don't really introduce new code
        return 0

+    if args.skip_after_failure and should_skip_commit():
+        # we're set up to skip this one, so skip it, but
+        # first, make sure we don't skip the next one as well :)
+        os.remove(SKIP_WHITESPACE_CHECK_FILENAME)
+        return 0
+
    block_commit = False

    diffs = []
@@ -185,12 +221,15 @@ def main(argv=None):
        save_diffs(diffs, diffs_filename)
        print >> sys.stderr, "*** To fix all indentation issues, run: cd `hg root` && patch -p0 < %s" % diffs_filename

-
    if block_commit:
        save_filename = ".hg/commit_message.saved"
        save_commit_message(save_filename)
        print >> sys.stderr, "*** Commit message saved to %s" % save_filename

+        if args.skip_after_failure:
+            save_skip_next_commit()
+            print >> sys.stderr, "*** Next commit attempt will not be checked. To change this, rm %s" % SKIP_WHITESPACE_CHECK_FILENAME
+
    return int(block_commit)



--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
-import atexit, os, stat
+import atexit, gc, os, stat
 from theano.compile import optdb
 from theano import config

@@ -96,6 +96,9 @@ if cuda_available:
        cuda_initialization_error_message = ""
        # actively closing our gpu session presents segfault-on-exit on some systems
        atexit.register(gpu_shutdown)
+        # do garbage collection before releasing the gpu to avoid releasing invalid pointers later
+        # note that atexit-registered calls are called in LIFO order
+        atexit.register(gc.collect)
    except EnvironmentError, e:
        cuda_available = False
        cuda_initialization_error_message = e.message

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -12,43 +12,12 @@
 //If true, we fill with NAN allocated device memory.
 #define ALLOC_MEMSET 0

-#define DEBUG_GPU_CONTEXT_REFCOUNT 0
-// g_gpu_context_refcount starts at one b/c the gpu context will be implicitly created
-// on the first successful cuda call. the matching decref is in CudaNdarray_gpu_shutdown.
-static int g_gpu_context_refcount = 1;
-
-///////////////////////////
-// cuda context management
-///////////////////////////
-
-void gpu_context_incref() {
-  g_gpu_context_refcount++;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-  fprintf(stderr, "gpu_context_incref, to %d\n", g_gpu_context_refcount);
-#endif
-}
-
-void gpu_context_decref() {
-  g_gpu_context_refcount--;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-  fprintf(stderr, "gpu_context_decref, to %d\n", g_gpu_context_refcount);
-#endif
-  if(g_gpu_context_refcount == 0) {
-    // we're now free to close the cuda context; if we don't explicitly
-    // exit our cuda context, some systems segfault on process exit
-    // for as-yet unknown reasons; see
-    // http://groups.google.com/group/theano-users/browse_thread/thread/c351846e5cebe35f
-    cudaThreadExit();
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-    fprintf(stderr, "gpu_context_decref at 0, calling cudaThreadExit\n");
-#endif
-  }
-}
-
 /////////////////////////
 // Alloc and Free
 /////////////////////////

+static int g_gpu_context_active = 0;
+
 /**
 *
 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
@@ -80,9 +49,6 @@ void * device_malloc(size_t size)
        return NULL;
    }
    _outstanding_mallocs[0] += (rval != NULL);
-    if(rval != NULL) {
-        gpu_context_incref(); // keep the gpu context around until we've free this memory
-    }
 #if COMPUTE_GPU_MEM_USED
    for(int i=0;i<TABLE_SIZE;i++){
      if(NULL==_alloc_size_table[i].ptr){
@@ -104,6 +70,10 @@ void * device_malloc(size_t size)
 }
 int device_free(void *ptr)
 {
+    // if there is no gpu context, the call to cudaFree will fail; skip it entirely
+    if(!g_gpu_context_active) {
+        return 0;
+    }
    cudaError_t err =  cudaFree(ptr);
    if (cudaSuccess != err)
    {
@@ -116,9 +86,6 @@ int device_free(void *ptr)
        return -1;
    }
    _outstanding_mallocs[0] -= (ptr != NULL);
-    if(ptr != NULL) {
-        gpu_context_decref();
-    }
 #if COMPUTE_GPU_MEM_USED
    int i=0;
    for(;i<TABLE_SIZE;i++)
@@ -1883,6 +1850,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
                        "Unable to get the number of gpus available: %s",
                        cudaGetErrorString(cudaGetLastError()));
  }
+  
+  // as soon as the first successful call to a cuda* function is made, a
+  // gpu context has been created
+  g_gpu_context_active = 1;
+  
  if(deviceCount <= 0) {
    return PyErr_Format(PyExc_EnvironmentError,
                        "Can't use the GPU, no devices support CUDA");
@@ -1926,7 +1898,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)

 PyObject *
 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
-    gpu_context_decref();
+    cudaThreadExit();
+    g_gpu_context_active = 0; // context has now been closed down
    Py_INCREF(Py_None);
    return Py_None;
 }

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -46,6 +46,9 @@ class Images2Neibs(Op):

        return Apply(self, [ten4, neib_shape,neib_step], [T.matrix(dtype=ten4.type.dtype)])

+    def grad(self, (x, neib_shape, neib_step), (gz,)):
+        return [neibs2images(gz, neib_shape, x.shape), None, None]
+
    def c_code_cache_version(self):
        return (3,)
                
@@ -97,8 +100,8 @@ class Images2Neibs(Op):
            }
            if ( (%(ten4)s->dimensions)[2] < c || (%(ten4)s->dimensions)[3] < d)
            {
-                PyErr_Format(PyExc_TypeError, "Images2Neibs: in wrap_centered mode, don't support image shapes smaller then the patch shapes: neib_shape=(%%d,%%d), ten4[2:]=[%%d,%%d]",
-                             c, d,%(ten4)s->dimensions[2], %(ten4)s->dimensions[3]);
+                PyErr_Format(PyExc_TypeError, "Images2Neibs: in wrap_centered mode, don't support image shapes smaller then the patch shapes: neib_shape=(%%ld,%%ld), ten4[2:]=[%%ld,%%ld]",
+                             (long int)c, (long int)d, (long int)(%(ten4)s->dimensions[2]), (long int)(%(ten4)s->dimensions[3]));
                %(fail)s;
            }
            //grid_c = CEIL_INTDIV(((%(ten4)s->dimensions)[2]),step_x)
@@ -108,14 +111,14 @@ class Images2Neibs(Op):
        }else if ( "%(mode)s" == "valid") {
            if ( ((%(ten4)s->dimensions)[2] < c) ||( (((%(ten4)s->dimensions)[2]-c) %% step_x)!=0))
            {
-                PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%d, neib_step[0]=%%d and ten4.shape[2]=%%d not consistent",
-                             c, step_x, %(ten4)s->dimensions[2]);
+                PyErr_Format(PyExc_TypeError, "neib_shape[0]=%%ld, neib_step[0]=%%ld and ten4.shape[2]=%%ld not consistent",
+                             (long int)c, (long int)step_x, (long int)(%(ten4)s->dimensions[2]));
                %(fail)s;
            }
            if ( ((%(ten4)s->dimensions)[3] < d) ||( (((%(ten4)s->dimensions)[3]-d) %% step_y)!=0))
            {
-                PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%d, neib_step[1]=%%d and ten4.shape[3]=%%d not consistent",
-                             d, step_y, %(ten4)s->dimensions[3]);
+                PyErr_Format(PyExc_TypeError, "neib_shape[1]=%%ld, neib_step[1]=%%ld and ten4.shape[3]=%%ld not consistent",
+                             (long int)d, (long int)step_y, (long int)(%(ten4)s->dimensions[3]));
                %(fail)s;
            }
            grid_c = 1+(((%(ten4)s->dimensions)[2]-c)/step_x); //number of patch in height
@@ -211,36 +214,16 @@ class Images2Neibs(Op):
 def images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    return Images2Neibs(mode)(ten4, neib_shape, neib_step)

-def neibs2images(neibs, neib_shape, original_shape, neib_step=None, mode='valid'):
+def neibs2images(neibs, neib_shape, original_shape):
    """
-    Inverse of images2neib. Don't implement neib_step and mode.
+    Inverse of images2neib.
+    
+    neibs : matrix like the one obtained by images2neib
+    neib_shape : neib_shape that was used in images2neib
+    original_shape : original shape of the 4d tensor given to images2neib
    
-    :type neibs: Theano variable
-    :param neibs: matrix like the one obtained by images2neib
-
-    :type neib_shape: Theano variable
-    :param neib_shape: neib_shape that was used in images2neib
-
-    :type original_shape: Theano variable
-    :param original_shape: original shape of the 4d tensor given to images2neib.
-
-    :type neib_step: Theano variable or None
-    :param neib_step: neib_step that was used in images2neib Implement only None.
-                      None is non overlapping patches and not-adjacent patches.
-
-    :type mode: str
-    :param mode: The mode that was used in images2neib. Implement only valid.
-
    Return a 4d tensor of shape `original_shape`.
    """
-    # TODO: handle the case where patches either overlap
-    # TODO: handle the case where patches are not directly adjacent
-    # TODO: at least separate these cases so that the following code does not incorrectly
-    # handle them by accident.
-    if neib_step != None:
-        raise NotImplementedError('neibs2images do not implement overlapping patches or non-adjacent patches.')
-    if mode != 'valid':
-        raise NotImplementedError('neibs2images do not implement the mode %s. It currently only implement `valid`.'%mode)
    neibs = T.as_tensor_variable(neibs)
    neib_shape = T.as_tensor_variable(neib_shape)
    original_shape = T.as_tensor_variable(original_shape)

--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -7,6 +7,8 @@ from neighbours import images2neibs, neibs2images, Images2Neibs, GpuImages2Neibs
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda

+from theano.tests import unittest_tools
+
 if theano.config.mode=='FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
@@ -328,8 +330,67 @@ def speed_neibs_wrap_centered():
    for i in range(1000):
        f()
        
+def test_neibs_grad():
+    shape = (2,3,4,4)
+    images = T.shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
+
+    cost = T.sum(T.sqr(images2neibs(images, (2,2))), axis=[0,1])
+
+    grad = T.grad(cost, images)
+
+    f = theano.function([], [cost, grad], mode=mode_without_gpu)
+
+    got = f()
+
+    should_get = [numpy.asarray(290320.0, dtype=numpy.float32),
+        numpy.asarray([[[[   0.,    2.,    4.,    6.],
+         [   8.,   10.,   12.,   14.],
+         [  16.,   18.,   20.,   22.],
+         [  24.,   26.,   28.,   30.]],
+
+        [[  32.,   34.,   36.,   38.],
+         [  40.,   42.,   44.,   46.],
+         [  48.,   50.,   52.,   54.],
+         [  56.,   58.,   60.,   62.]],
+
+        [[  64.,   66.,   68.,   70.],
+         [  72.,   74.,   76.,   78.],
+         [  80.,   82.,   84.,   86.],
+         [  88.,   90.,   92.,   94.]]],
+

+       [[[  96.,   98.,  100.,  102.],
+         [ 104.,  106.,  108.,  110.],
+         [ 112.,  114.,  116.,  118.],
+         [ 120.,  122.,  124.,  126.]],
+
+        [[ 128.,  130.,  132.,  134.],
+         [ 136.,  138.,  140.,  142.],
+         [ 144.,  146.,  148.,  150.],
+         [ 152.,  154.,  156.,  158.]],
+
+        [[ 160.,  162.,  164.,  166.],
+         [ 168.,  170.,  172.,  174.],
+         [ 176.,  178.,  180.,  182.],
+         [ 184.,  186.,  188.,  190.]]]], dtype=numpy.float32)]
+
+    assert numpy.allclose(got[0], should_get[0])
+    assert numpy.allclose(got[1], should_get[1])
+
+def test_neibs_grad_verify_grad():
+    shape = (2,3,4,4)
+    images = T.dtensor4()
+    images_val = numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape)
+
+    def fn(images):
+        return T.sum(T.sqr(images2neibs(images, (2,2))), axis=[0,1])
+
+    unittest_tools.verify_grad(fn, [images_val], mode=mode_without_gpu)
+    if cuda.cuda_available:
+        unittest_tools.verify_grad(fn, [images_val], mode=mode_with_gpu)

 if __name__ == '__main__':
-    test_neibs_gpu()
-    test_neibs()
+    #test_neibs_gpu()
+    #test_neibs()
+    test_neibs_grad_verify_grad()
+
--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -213,7 +213,8 @@ class SparseType(gof.Type):
        # a FAST_RUN computation..
        return scipy.sparse.issparse(a) \
                and scipy.sparse.issparse(b) \
-                and abs(a-b).sum() < (1e-6 * a.nnz)
+                and ((abs(a-b).sum() < (1e-6 * a.nnz))
+                     or (a.nnz==0 and b.nnz==0))#in case a and b are empty

    def values_eq(self, a, b):
        #WARNING: equality comparison of sparse matrices is not fast or easy
@@ -248,9 +249,18 @@ class _sparse_py_operators:
    def __dot__(left, right): return structured_dot(left, right)
    def __rdot__(right, left): return structured_dot(left, right)

+    #N.B. THIS IS COMMENTED OUT ON PURPOSE!!!
+    #     Discussion with Fred & James (at least, and maybe others before) 
+    #     we decided that casting from a sparse to dense should be explicit
+    #     because it's usually something you want to be pretty careful about,
+    #     and not to do by accident.
    #def _as_TensorVariable(self):
    #    return dense_from_sparse(self)
-    shape = property(lambda self: tensor.shape(self))
+
+    shape = property(lambda self: tensor.shape(dense_from_sparse(self))) # don't worry!
+    # ... the plan is that the ShapeFeature in tensor.opt will do shape propagation
+    # ... and remove the dense_from_sparse from the graph.  This will *NOT* actually expand
+    # ... your sparse matrix just to get the shape.
    ndim = property(lambda self: self.type.ndim)
    dtype = property(lambda self: self.type.dtype)

@@ -513,6 +523,8 @@ class DenseFromSparse(gof.op.Op):
            return [sp_ones_like(x) * gz]
        else:
            return [SparseFromDense(x.type.format)(gz)]
+    def infer_shape(self, node, (ishape,)):
+        return [ishape]
 dense_from_sparse = DenseFromSparse()

 class SparseFromDense(gof.op.Op):
@@ -535,6 +547,8 @@ class SparseFromDense(gof.op.Op):
        out[0] = SparseType.format_cls[self.format](x)
    def grad(self, (x, ), (gz, )):
        return dense_from_sparse(gz),
+    def infer_shape(self, node, (ishape,)):
+        return [ishape]
 csr_from_dense = SparseFromDense('csr')
 csc_from_dense = SparseFromDense('csc')

@@ -776,7 +790,11 @@ class StructuredDot(gof.Op):
        dtype_out = scalar.upcast(a.type.dtype, b.type.dtype)
        if b.type.ndim != 2:
            raise NotImplementedError('non-matrix b')
-        return gof.Apply(self, [a,b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
+
+        if _is_sparse_variable(b):
+            return gof.Apply(self, [a,b], [SparseType(a.type.format,dtype_out)()])
+        else:
+            return gof.Apply(self, [a,b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])

    def perform(self, node, (a,b), (out,)):
        if a.shape[1] != b.shape[0]:
@@ -784,6 +802,11 @@ class StructuredDot(gof.Op):

        #variable = a.dot(b)  # deprecated
        variable = a * b
+        if isinstance(node.outputs[0].type,SparseType):
+            assert _is_sparse(variable)
+            out[0] = variable
+            return
+
        assert _is_dense(variable) # scipy 0.7 automatically converts to dense

        # dot of an NxM sparse matrix, with a Mx1 dense matrix, returns vector not matrix

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -344,6 +344,28 @@ class test_structureddot(unittest.TestCase):
        outvals = f(kernvals,imvals)
        print outvals

+    def test_dot_sparse_sparse(self):
+        #test dot for 2 input sparse matrix
+        sparse_dtype = 'float64'
+        for sparse_format in ['csc','csr']:
+            a = SparseType(sparse_format, dtype=sparse_dtype)()
+            b = SparseType(sparse_format, dtype=sparse_dtype)()
+            d = theano.dot(a,b)
+            f = theano.function([a,b], theano.Out(d, borrow=True))
+            topo = f.maker.env.toposort()
+            for M,N,K,nnz in [(4,3,2,3),
+                              (40,30,20,3),
+                              (40,30,20,30),
+                              (400,3000,200,6000),
+                              ]:
+                if sparse_format == 'csc':
+                    spmat = sp.csc_matrix(random_lil((M,N), sparse_dtype, nnz))
+                    spmat2 = sp.csc_matrix(random_lil((N,K), sparse_dtype, nnz))
+                elif sparse_format == 'csr':
+                    spmat = sp.csr_matrix(random_lil((M,N), sparse_dtype, nnz))
+                    spmat2 = sp.csr_matrix(random_lil((N,K), sparse_dtype, nnz))
+                f(spmat,spmat2)
+
    def test_csc_correct_output_faster_than_scipy(self):
        sparse_dtype = 'float64'
        dense_dtype = 'float64'
@@ -449,6 +471,8 @@ def test_shape_i():
    assert f(sp.csr_matrix(random_lil((100,10), sparse_dtype, 3)))==(10)

 def test_shape():
+    # Test that getting the shape of a sparse variable
+    # does not actually create a dense tensor in the process.
    sparse_dtype = 'float32'

    a = SparseType('csr', dtype=sparse_dtype)()

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -367,8 +367,27 @@ def get_constant_value(v):
            ret = [[None]]
            v.owner.op.perform(v.owner, [const], ret)
            return ret[0][0]
-        if isinstance(v.owner.op, Subtensor) and v.ndim==0 and isinstance(v.owner.inputs[0], TensorConstant):
-            return v.owner.inputs[0].data[v.owner.op.idx_list[0]]
+        if isinstance(v.owner.op, Subtensor) and v.ndim==0:
+            if isinstance(v.owner.inputs[0], TensorConstant):
+                return v.owner.inputs[0].data[v.owner.op.idx_list[0]]
+            #Needed to make better graph in this test.
+            #theano/tensor/tests/test_sharedvar.py:test_shared_options.test_specify_shape_partial
+            if (v.owner.inputs[0].owner and
+                isinstance(v.owner.inputs[0].owner.op, Join) and
+                # Ensure the Join is joining only scalar variables (so that
+                # the constant value can be found at the same index as the one
+                # used in the sub-tensor).
+                all(var.ndim==0 for var in v.owner.inputs[0].owner.inputs)):
+
+                # The index list 'idx_list' should have length one
+                # since joining scalar variables results in a 1D vector.
+                assert len(v.owner.op.idx_list) == 1
+                # Note the '+ 1' is because the first argument to Join is the
+                # axis.
+                ret = v.owner.inputs[0].owner.inputs[v.owner.op.idx_list[0]+1]
+                ret = get_constant_value(ret)
+                #join can cast implicitly its input in some case.
+                return theano._asarray(ret, dtype=v.type.dtype)
    raise TypeError(v)


@@ -531,7 +550,11 @@ class TensorType(Type):

    @staticmethod
    def may_share_memory(a,b):
-        return numpy.may_share_memory(a,b)
+        #when this is called with a an ndarray and b
+        #a sparce matrix, numpy.may_share_memory fail.
+        if a.__class__ is b.__class__:
+            return numpy.may_share_memory(a,b)
+        else: return False

    @staticmethod
    def values_eq(a, b):
@@ -1444,11 +1467,13 @@ class Shape(Op):
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x):
-        if not isinstance(x, Variable):
-            raise TypeError('x must be Variable whose value have a shape attribute', x)
        #Must work for all type that have a shape attribute.
        #This will fail at execution time.
-        #x = as_tensor_variable(x)
+        x = as_tensor_variable(x)
+        #Each type variable should implement their .shape attribute
+        #and have the fct infer_shape() implemented in the op that convert
+        #the type to TensorVariable to have the optimization working
+        #correctly.
        return Apply(self, [x], [lvector()])
    def perform(self, node, (x, ), (out, )):
        out[0] = theano._asarray(x.shape, dtype = 'int64')
@@ -1475,6 +1500,54 @@ shape = Shape()
 _shape = shape #was used in the past, now use shape directly.
 pprint.assign(_shape, printing.MemberPrinter('shape'))

+class SpecifyShape(Op):
+    """
+    L{Op} put into the graph the user provided shape
+
+    In the case where this op stay in the final graph, we assert the shape.
+    For this the output of this op must be used in the graph. This is not 
+    the case most of the time if we only take the shape of the output.
+    Maybe there is other optimization that will mess with this.
+
+    @note:     Maybe in the futur we will never do the assert!
+    @note:     We currently don't support specifying partial shape information.
+    """
+    view_map = {0: [0]}
+    def __hash__(self):
+        return hash(type(self))
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __str__(self):
+        return self.__class__.__name__
+    def make_node(self, x, shape):
+        if not isinstance(x,Variable):
+            x = as_tensor_variable(x)
+        shape = as_tensor_variable(shape)
+        return Apply(self, [x, shape], [x.type()])
+    
+    def perform(self, node, (x,shape ), (out, )):
+        assert numpy.all(x.shape==shape), ("got shape", x.shape,
+                                           "expected", shape)
+        out[0] = x
+        
+    def infer_shape(self, node, (xshape, sshape)):
+        new_shape=[]
+        for dim in range(node.inputs[0].ndim):
+            try:
+                s=get_constant_value(node.inputs[1][dim])
+                s=as_tensor_variable(s)
+                new_shape.append(s)
+            except TypeError, e:
+                new_shape.append(node.inputs[1][dim])
+
+        assert len(new_shape)==len(xshape)
+        return [new_shape]
+
+    def grad(self, (x,), (gz,)):
+        return [gz]
+
+specify_shape = SpecifyShape()
+
 class MaxAndArgmax(Op):
    """Calculate the max and argmax over a given axis.

@@ -1618,10 +1691,10 @@ def min(x, axis='DEFAULT'):
        axis = 0
    elif axis=='DEFAULT':
        axis = x.type.ndim - 1
-        warnings.warn("The default axis of min will change! Now we return the min over the last dimensions. It will change to be the same as numpy: the min over all dimensions. To hide this warning and be compatible with the future behavior, set axis to -1 to have the current behavior. To have the futur behavior set axis to range(nb dim), but this don't support the grad. To have the grad, you must flatten the tensor before calling min().")
+        warnings.warn("The default axis of min will change! Now we return the min over the last dimensions. It will change to be the same as numpy: the min over all dimensions. To hide this warning and be compatible with the future behavior, set axis to -1 to have the current behavior. To have the future behavior, set axis to range(x.ndim), but this does not support the grad. To be able to get the grad, you must flatten the tensor before calling min().")
    elif axis is None:
        axis = x.type.ndim - 1
-        warnings.warn("The behavior of min when axis==None will change! Now we return the min over the last dimensions. It will change to the min over all dimensions as numpy. To hide this warning and be compatible with the future behavior, set axis to -1 to have the current behavior. To have the futur behavior set axis to range(nb dim), but this don't support the grad. To have the grad, you must flatten the tensor before calling min().")
+        warnings.warn("The behavior of min when axis is None will change! Now we return the min over the last dimensions. It will change to the min over all dimensions as numpy. To hide this warning and be compatible with the future behavior, set axis to -1 to have the current behavior. To have the future behavior, set axis to range(x.ndim), but this does not support the grad. To be able to get the grad, you must flatten the tensor before calling min().")
    str_x_type = str(x.dtype)
    if str_x_type.startswith('float') or str_x_type.startswith('int'):
        return -max(-x, axis=axis)
@@ -1889,6 +1962,22 @@ def zeros_like(model):
    #return Zeros(model.type.ndim)(shape(model))
    return fill(model, constant(0.0, dtype=model.type.dtype))

+
+def zeros(shape, dtype=config.floatX):
+    """
+    Create a Tensor filled with zeros, closer to Numpy's syntax than ``alloc``.
+    """
+    return alloc(numpy.array(0, dtype=dtype), *shape)
+
+
+def ones(shape, dtype=config.floatX):
+    """
+    Create a Tensor filled with ones, closer to Numpy's syntax than ``alloc``.
+    """
+    return alloc(numpy.array(1, dtype=dtype), *shape)
+
+
+
 class Eye(gof.Op):
    def __init__(self, dtype=config.floatX):
        self.dtype = dtype
@@ -2054,6 +2143,7 @@ class Alloc(gof.Op):
 alloc = Alloc()
 pprint.assign(alloc, printing.FunctionPrinter('alloc'))

+
 @_redefine(elemwise.Elemwise(scal.identity))
 def tensor_copy(a):
    """Create a duplicate of `a` (with duplicated storage)"""
@@ -2140,9 +2230,10 @@ def mean(input, axis = None, op = False):

 @constructor
 def var(input, axis = None):
-    """Compute the variance along the given axis of a tensor `input`
+    """Compute the variance along the given axis of a tensor `input`.

-    :param axis: compute the variance along this axis of the tensor.  None means trailing axis.
+    :param axis: Compute the variance along this axis of the tensor.
+                 None means all axes (like numpy).
    :type axis: None or int or (list of int) (see `Sum`)

    """
@@ -2176,6 +2267,16 @@ def var(input, axis = None):
    #return the mean sqr
    return mean(centered_input**2, axis)

+@constructor
+def std(input, axis=None):
+    """Compute the standard deviation along the given axis of a tensor `input`.
+
+    :param axis: Compute the standard deviation along this axis of the tensor.
+                 None means all axes (like numpy).
+    :type axis: None or int or (list of int) (see `Sum`)
+    """
+    return sqrt(var(input=input, axis=axis))
+ 
 if 0:
    ## COMMENTED OUT FEB 17 2010
    ## TODO (DOCUMENT AND WRITE TESTS) OR DELETE

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -4,8 +4,8 @@ import sys, traceback, logging, copy, os
 import numpy
 import numpy.distutils
 from theano.configparser import config, AddConfigVar, StrParam
-from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler, 
-        SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer, 
+from theano.gof import (utils, Op, view_roots, PatternSub, DestroyHandler,
+        SeqOptimizer, local_optimizer, Optimizer, LocalOptimizer, OpKeyOptimizer,
        InconsistencyError, toolbox, SequenceDB, EquilibriumOptimizer)
 from theano.printing import pprint, FunctionPrinter, debugprint
 from theano.compile.mode import optdb
@@ -17,7 +17,7 @@ import basic as T
 from theano.tensor.tsor_apply import Apply

 #NB: this clobbers the builtin 'compile' symbol
-from theano import compile  #to register the optimizer built by this file 
+from theano import compile  #to register the optimizer built by this file

 from theano.tensor.blas_headers import cblas_header_text, blas_header_text

@@ -85,10 +85,15 @@ class Gemv(Op):
    def perform(self, node, inputs, out_storage):
        y, alpha, A, x, beta = inputs
        if _have_fblas:
-            if not self.inplace:
-                y = y.copy()
            gemv = _blas_gemv_fns[y.dtype]
-            out_storage[0][0] = gemv(alpha, A, x, beta, y, overwrite_y=self.inplace)
+
+            #Here I suppose that A is in c order. If we don't make it explicitly
+            #  as fortran order, scipy 0.7.2 seam to create a copy in fortran
+            #  order instead of just reshaping it and using the trans flag.
+            #If A is already in fortran order, make it in c order and using the
+            #  trans flag don't seam to cause slowdown.
+            #out_storage[0][0] = gemv(alpha, A, x, beta, y, overwrite_y=self.inplace)
+            out_storage[0][0] = gemv(alpha, A.T, x, beta, y, overwrite_y=self.inplace, trans=True)
        else:
            out_storage[0][0] = numpy.asarray(
                    beta * y + alpha * numpy.dot(A, x)
@@ -103,11 +108,11 @@ def default_blas_ldflags():
        if all(not os.path.exists(dir) for dir in numpy.distutils.__config__.blas_opt_info['library_dirs']):
            return "-lblas"
        return ' '.join(
-			#TODO: the Gemm op below should separate the -L and -l arguments into the two callbacks that CLinker uses for that stuff.
+                        #TODO: the Gemm op below should separate the -L and -l arguments into the two callbacks that CLinker uses for that stuff.
                        # for now, we just pass the whole ldflags as the -l options part.
-			['-L%s'%l for l in numpy.distutils.__config__.blas_opt_info['library_dirs']] +
-			['-l%s'%l for l in numpy.distutils.__config__.blas_opt_info['libraries']])
-#			['-I%s'%l for l in numpy.distutils.__config__.blas_opt_info['include_dirs']])
+                        ['-L%s'%l for l in numpy.distutils.__config__.blas_opt_info['library_dirs']] +
+                        ['-l%s'%l for l in numpy.distutils.__config__.blas_opt_info['libraries']])
+#                       ['-I%s'%l for l in numpy.distutils.__config__.blas_opt_info['include_dirs']])
    except KeyError:
        return "-lblas"

@@ -119,7 +124,7 @@ AddConfigVar('blas.ldflags',
 def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):
    """Return a list of libraries against which an Op's object file should be
    linked to benefit from a BLAS implementation.
-    
+
    Default: ['blas'], but configuration variable config.blas.ldflags overrides this.
    """
    rval = []
@@ -134,7 +139,7 @@ def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):
                        found_dyn=True
        if not found_dyn and dirs:
            warning("We did not found a dynamic library into the library_dir of the library we use for blas. If you use ATLAS, make sure to compile it with dynamics library.")
-            
+
    for t in config.blas.ldflags.split():
        try:
            t0, t1, t2 = t[0:3]
@@ -157,7 +162,7 @@ def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):

 class GemmRelated(Op):
    """Base class for Gemm and Dot22
-    
+
    This class provides a kind of templated gemm Op.
    """
    def __eq__(self, other):
@@ -181,14 +186,14 @@ class GemmRelated(Op):
        """
        return blas_header_text() + mod_str
    def c_headers(self):
-        # std.cout doesn't require the '%' symbol to print stuff... 
+        # std.cout doesn't require the '%' symbol to print stuff...
        # so it works much better with python's string-substitution stuff.
-        return ['<iostream>', '<time.h>', '<sys/time.h>'] 
-    
+        return ['<iostream>', '<time.h>', '<sys/time.h>']
+
    def c_libraries(self):
        return ldflags()

-    # code_cache_version is built by subclasses from 
+    # code_cache_version is built by subclasses from
    #  build_gemm_version

    def c_compile_args(self):
@@ -196,10 +201,10 @@ class GemmRelated(Op):

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)
-    
+
    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)
-        
+
    declare_NS = """
        int unit = 0;

@@ -226,15 +231,15 @@ class GemmRelated(Op):
        if (%(_zout)s->nd != 2) {PyErr_SetString(PyExc_NotImplementedError, "rank(z) != 2"); %(fail)s;}
        """
    check_xyz_double_or_float = """
-        if ((%(_x)s->descr->type_num != PyArray_DOUBLE) 
+        if ((%(_x)s->descr->type_num != PyArray_DOUBLE)
            && (%(_x)s->descr->type_num != PyArray_FLOAT))
        {PyErr_SetString(PyExc_NotImplementedError, "type(x) is not double or float"); %(fail)s;}

-        if ((%(_y)s->descr->type_num != PyArray_DOUBLE) 
+        if ((%(_y)s->descr->type_num != PyArray_DOUBLE)
            && (%(_y)s->descr->type_num != PyArray_FLOAT))
        {PyErr_SetString(PyExc_NotImplementedError, "type(y) is not double or float"); %(fail)s;}

-        if ((%(_zout)s->descr->type_num != PyArray_DOUBLE) 
+        if ((%(_zout)s->descr->type_num != PyArray_DOUBLE)
            && (%(_zout)s->descr->type_num != PyArray_FLOAT))
        {PyErr_SetString(PyExc_NotImplementedError, "type(z) is not double or float"); %(fail)s;}

@@ -257,21 +262,21 @@ class GemmRelated(Op):
    check_dims_strides = """
        if (Nx[0] != Nz[0])
        {
-            PyErr_Format(PyExc_ValueError, 
+            PyErr_Format(PyExc_ValueError,
                "Shape mismatch: x has %%ld rows but z has %%ld rows",
                (long int)Nx[0], (long int)Nz[0]);
            %(fail)s;
        }
        if (Nx[1] != Ny[0])
        {
-            PyErr_Format(PyExc_ValueError, 
+            PyErr_Format(PyExc_ValueError,
                "Shape mismatch: x has %%ld cols but y has %%ld rows",
                (long int)Nx[1], (long int)Ny[0]);
            %(fail)s;
        }
        if (Ny[1] != Nz[1])
        {
-            PyErr_Format(PyExc_ValueError, 
+            PyErr_Format(PyExc_ValueError,
                "Shape mismatch: y has %%ld cols but z has %%ld cols",
                (long int)Ny[1], (long int)Nz[1]);
            %(fail)s;
@@ -408,11 +413,11 @@ class Gemm(GemmRelated):

    When a and b are scalars and x, y, and z are matrices, then

-        gemm(z,a,x,y,b) 
+        gemm(z,a,x,y,b)

-    is similar to 
+    is similar to

-        b*z + a*dot(x,y) 
+        b*z + a*dot(x,y)

    The difference between the two is that the top form is destructive on z,
    whereas the bottom form is not.  Gemm works in-place on the storage
@@ -445,7 +450,7 @@ class Gemm(GemmRelated):
    def __setstate__(self, dct):
        inplace = dct.get('inplace', True)
        if inplace:
-            self.destroy_map = {0: [0]} 
+            self.destroy_map = {0: [0]}
            self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_inplace
        else:
            self.setup_z_Nz_Sz = self.setup_z_Nz_Sz_outplace
@@ -572,7 +577,7 @@ class Gemm(GemmRelated):

    case_float_ab_constants = """
        #define REAL float
-        float a = (%(_a)s->descr->type_num == PyArray_FLOAT) 
+        float a = (%(_a)s->descr->type_num == PyArray_FLOAT)
        ? (REAL)(((float*)%(_a)s->data)[0])
        : (REAL)(((double*)%(_a)s->data)[0]);
        float b = (%(_b)s->descr->type_num == PyArray_FLOAT) ?
@@ -582,7 +587,7 @@ class Gemm(GemmRelated):
        """
    case_double_ab_constants = """
        #define REAL double
-        double a = (%(_a)s->descr->type_num == PyArray_FLOAT) 
+        double a = (%(_a)s->descr->type_num == PyArray_FLOAT)
        ? (REAL)(((float*)%(_a)s->data)[0])
        : (REAL)(((double*)%(_a)s->data)[0]);
        double b = (%(_b)s->descr->type_num == PyArray_FLOAT) ?
@@ -613,14 +618,14 @@ pprint.assign(gemm_inplace, FunctionPrinter('gemm_inplace'))
 pprint.assign(gemm_no_inplace, FunctionPrinter('gemm_no_inplace'))

 def res_is_a(node, op, maxclients=None):
-  if maxclients is not None:
-    retval = (len(node.clients) <= maxclients)
-  else:
-    retval = True
+    if maxclients is not None:
+        retval = (len(node.clients) <= maxclients)
+    else:
+        retval = True

-  return node.owner \
-            and node.owner.op == op \
-            and retval
+    return node.owner \
+              and node.owner.op == op \
+              and retval


 def _as_scalar(res):
@@ -649,7 +654,7 @@ def _is_real_matrix(res):
 def _is_real_vector(res):
    return res.type.dtype in ('float32', 'float64') \
            and res.type.ndim == 1 \
-            and res.type.broadcastable[0] == False 
+            and res.type.broadcastable[0] == False

 def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
    #print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip
@@ -675,7 +680,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
                pass
            if Mr.ndim == 2:
                #print "RETURNING GEMV (case 2)"
-                if Mr.dtype == Ml.dtype: 
+                if Mr.dtype == Ml.dtype:
                    rval = [gemv_no_inplace(L, alpha, Mr.T, Ml, beta)]
                    assert L.type == rval[0].type, (L.type, rval[0].type)
                else:
@@ -695,7 +700,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
                pass
        return rval

-    # this is False'd out because of inadequate testing.  
+    # this is False'd out because of inadequate testing.
    # TODO see ticket #237
    if False and res_is_a(M, gemm_no_inplace, 1):
        #EXPRESSION: (beta * L) + (alpha * (gemm_no_inplace(G, a, u, v, b)))
@@ -855,7 +860,7 @@ def _gemm_from_factored_list(lst):
                s_j, M_j = lst[j]
            except:
                continue
-            
+
            #print 'TRYING', (s_i, M_i, s_j, M_j)

            gemm_of_sM_list = _beta_L_plus_alpha_M(s_i, M_i, s_j, M_j)
@@ -869,7 +874,7 @@ def _gemm_from_factored_list(lst):
                    return s*M

                assert len(gemm_of_sM_list) == 1
-                add_inputs = [item_to_var(input) 
+                add_inputs = [item_to_var(input)
                        for k, input in enumerate(lst) if k not in (i,j)]
                add_inputs.extend(gemm_of_sM_list)
                if len(add_inputs) > 1:
@@ -1045,7 +1050,7 @@ optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
 # run before specialize (2.0) because specialize is basically a free-for-all that makes the
 # graph crazy.

-blas_optdb.register('local_dot_to_dot22', 
+blas_optdb.register('local_dot_to_dot22',
        EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
        0, 'fast_run')
 blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
@@ -1053,9 +1058,9 @@ blas_optdb.register('local_dot_to_gemm', GemmOptimizer(), 10, 'fast_run')
 # After destroyhandler is in but before we try to make elemwise things inplace
 # Try to make gemm inplace
 # Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71)
-optdb.register('InplaceBlasOpt', 
+optdb.register('InplaceBlasOpt',
        EquilibriumOptimizer([local_inplace_gemm, local_inplace_gemv], failure_callback=EquilibriumOptimizer.warn_inplace,
-            max_use_ratio=5), 
+            max_use_ratio=5),
        70.0, 'fast_run', 'inplace')

 class Dot22Scalar(GemmRelated):
@@ -1098,7 +1103,7 @@ class Dot22Scalar(GemmRelated):
        """
    case_float_ab_constants = """
        #define REAL float
-        float a = (%(_a)s->descr->type_num == PyArray_FLOAT) 
+        float a = (%(_a)s->descr->type_num == PyArray_FLOAT)
        ? (REAL)(((float*)%(_a)s->data)[0])
        : (REAL)(((double*)%(_a)s->data)[0]);
        #undef REAL
@@ -1106,7 +1111,7 @@ class Dot22Scalar(GemmRelated):
        """
    case_double_ab_constants = """
        #define REAL double
-        double a = (%(_a)s->descr->type_num == PyArray_FLOAT) 
+        double a = (%(_a)s->descr->type_num == PyArray_FLOAT)
        ? (REAL)(((float*)%(_a)s->data)[0])
        : (REAL)(((double*)%(_a)s->data)[0]);
        #undef REAL
@@ -1133,7 +1138,7 @@ def local_dot22_to_dot22scalar(node):
    .. note:
        We execute this optimizer after the gemm optimizer. This allow to give more priority to gemm that give more speed up then this optimizer, but allow the gemm optimizer to ignore this op.

-    
+
    TODO: support when we can reorder the mul to generate a dot22scalar or fix the canonizer to merge them(1 mul with multiple inputs)
    """
    if node.op != T.mul:
@@ -1149,7 +1154,7 @@ def local_dot22_to_dot22scalar(node):
        #no scalar in input and no multiplication
        #if their was a multiplication we couls reorder the graph by the associativity of the graph.
        return False
-    
+
    if not any(i_scalar):
        #maybe we can reorder the graph as this mul have a mul in input.
        #The canonizer should have merged those mul together.
@@ -1202,4 +1207,3 @@ from opt import register_specialize, register_canonicalize
 def local_print_as_we_go_along(node):
    if node.op in (T.sub, T.add):
        debugprint(node)
-
--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -240,10 +240,10 @@ class DimShuffle(Op):

        shape_statements = ['npy_intp dimensions[%i]'%nd_out]
        for i, o in enumerate(self.new_order):
-          if o != 'x':
-            shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')]
-          else:
-            shape_statements += [('dimensions['+str(i)+'] = 1')]
+            if o != 'x':
+                shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')]
+            else:
+                shape_statements += [('dimensions['+str(i)+'] = 1')]
        #backport
        #shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')
        #    if o != 'x' else
@@ -255,10 +255,10 @@ class DimShuffle(Op):

        #set the strides of the non-broadcasted dimensions
        for i, o in enumerate(self.new_order):
-          if o != 'x':
-             strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')]
-          else:
-             strides_statements += [('strides['+str(i)+'] = 0')]
+            if o != 'x':
+                strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')]
+            else:
+                strides_statements += [('strides['+str(i)+'] = 0')]
        #backport
        #strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')
        #    if o != 'x' else
@@ -276,7 +276,7 @@ class DimShuffle(Op):
        #                       npy_intp* strides, void* data, int itemsize, int flags, PyObject* obj)
        #
        close_bracket = [
-                #create a new array, 
+                #create a new array,
                ('%(res)s = (PyArrayObject*)PyArray_New(&PyArray_Type, '
                            '' + str(nd_out) + ', dimensions, '
                            'PyArray_TYPE(%(basename)s), strides, '
@@ -287,13 +287,13 @@ class DimShuffle(Op):
                #recalculate flags: CONTIGUOUS, FORTRAN, ALIGNED
                'PyArray_UpdateFlags(%(res)s, NPY_UPDATE_ALL)',
                #we are making a view in both inplace and non-inplace cases
-                '%(res)s->base = (PyObject*)%(basename)s', 
+                '%(res)s->base = (PyObject*)%(basename)s',
                '}']

-        full_code = statements(check_input_nd 
+        full_code = statements(check_input_nd
                + clear_output
                + get_base
-                + shape_statements 
+                + shape_statements
                + strides_statements
                + close_bracket)

@@ -345,7 +345,7 @@ class DimShufflePrinter:
            raise TypeError("Can only print DimShuffle.")
        elif isinstance(r.owner.op, DimShuffle):
            ord = r.owner.op.new_order
-            return self.__p(ord, pstate, r.owner.inputs[0])            
+            return self.__p(ord, pstate, r.owner.inputs[0])
        else:
            raise TypeError("Can only print DimShuffle.")

@@ -411,7 +411,7 @@ class Elemwise(Op):
        d.pop('__epydoc_asRoutine', None)
        d.pop('_hashval')
        return d
-    
+
    def __setstate__(self, d):
        self.__dict__.update(d)
        if self.scalar_op.nin > 0:
@@ -441,7 +441,7 @@ class Elemwise(Op):
            else:
                # TODO: use LComplete instead
                args.append(DimShuffle(
-                    input.type.broadcastable, 
+                    input.type.broadcastable,
                    ['x']*difference + range(length),
                    inplace = True)(input))
        inputs = args
@@ -463,7 +463,7 @@ class Elemwise(Op):
                        raise ValueError("Operation cannot be done inplace on an input with broadcasted dimensions.")
        out_dtypes = [o.type.dtype for o in shadow.outputs]
        if any(inputs[i].type.dtype != out_dtypes[o] for o, i in inplace_pattern.items()):
-            raise TypeError("Cannot do an inplace operation on incompatible data types.", 
+            raise TypeError("Cannot do an inplace operation on incompatible data types.",
                    ([i.type.dtype for i in inputs], out_dtypes, inplace_pattern))
        outputs = [TensorType(dtype = dtype, broadcastable = broadcastable)() for dtype, broadcastable in zip(out_dtypes, out_broadcastables)]
        return Apply(self, inputs, outputs)
@@ -484,10 +484,10 @@ class Elemwise(Op):
        first_part = [k for k,v in items]
        second_part = []
        for k,v in items:
-          if isinstance(v, (tuple, list)):
-            second_part += [tuple(v)]
-          else:
-            second_part += [v]
+            if isinstance(v, (tuple, list)):
+                second_part += [tuple(v)]
+            else:
+                second_part += [v]
        tuple_items = tuple(first_part + second_part)
        #backport
        #tuple_items = tuple([k for k,v in items] + [(tuple(v) if isinstance(v, (tuple, list)) else v) for k,v in items])
@@ -511,7 +511,7 @@ class Elemwise(Op):

    def grad(self, inputs, ograds):
        # Gradients (especially on the final costs) don't have to be symbolic
-        ograds = map(as_tensor_variable, ograds) 
+        ograds = map(as_tensor_variable, ograds)
        scalar_inputs = [Scalar(dtype = t.type.dtype)() for t in inputs]
        scalar_ograds = [Scalar(dtype = ograd.type.dtype)() for ograd in ograds]
        scalar_igrads = self.scalar_op.grad(scalar_inputs, scalar_ograds)
@@ -575,7 +575,7 @@ class Elemwise(Op):
                    msg2 = []
                    for d, b in zip(input.shape, sinput.type.broadcastable):
                        if b:
-                            msg2 += ['*'] 
+                            msg2 += ['*']
                        else:
                            msg2 += [str(d)]
                    msg.append('(%s)' % ", ".join(msg2))
@@ -616,7 +616,7 @@ class Elemwise(Op):
        # the first (faster) version leads to segfaults
        ufunc_args = inputs # + output_storage
        ufunc = self.ufunc or numpy.frompyfunc(self.scalar_op.impl, len(inputs), self.scalar_op.nout)
-        
+
        try:
            variables = ufunc(*ufunc_args)
        except Exception, e:
@@ -655,7 +655,7 @@ class Elemwise(Op):
                # b_dim might still be None, if every input's shape was unknown in dimension 'dim'
                oshp.append(b_dim)
                # TODO: it would be interesting to return the constraining information that if
-                # one of the inputs shape[dim] is known and another input's shape[dim] is not, 
+                # one of the inputs shape[dim] is known and another input's shape[dim] is not,
                # that we can now assume that the other input's shape[dim] is the same as the
                # first.
            rval.append(tuple(oshp))
@@ -841,9 +841,9 @@ class CAReduce(Op):
    Examples:
     CAReduce(add) -> sum
     CAReduce(mul) -> product
-     CAReduce(maximum) -> sum
-     CAReduce(_or) -> any # not lazy
-     CAReduce(_and) -> all # not lazy
+     CAReduce(maximum) -> max
+     CAReduce(or_) -> any # not lazy
+     CAReduce(and_) -> all # not lazy

    In order to (eventually) optimize memory usage patterns,
    L{CAReduce} makes zero guarantees on the order in which it
@@ -899,7 +899,7 @@ class CAReduce(Op):
            assert len(axis)==len(axis2)
            axis = tuple(axis2)
            op = self.__class__(self.scalar_op, axis)
-        else: 
+        else:
            op = self
        output = TensorType(dtype = self._output_dtype(input.type.dtype),
                             broadcastable = [x for i, x in enumerate(input.type.broadcastable) if i not in axis])()
@@ -910,7 +910,7 @@ class CAReduce(Op):
        d = copy(self.__dict__)
        d.pop('ufunc')
        return d
-    
+
    def __setstate__(self, d):
        self.__dict__.update(d)
        self.ufunc = numpy.frompyfunc(self.scalar_op.impl, 2, 1)
@@ -1155,8 +1155,28 @@ class Prod(CAReduce):
    def grad(self, (x, ), (gz, )):
        if x.dtype[0:3] in ('int','uin'):
            return [None]
-        else:
-            raise NotImplementedError('Will be implemented shortly')
+
+        prod_out = self(x)
+
+        gz = as_tensor_variable(gz)
+        axis = self.axis
+        if axis is None:
+            axis = range(x.type.ndim)
+        if axis == ():
+            return gz,
+        new_dims = []
+        i = 0
+        for j, _ in enumerate(x.type.broadcastable):
+            if j in axis:
+                new_dims.append('x')
+            else:
+                new_dims.append(i)
+                i += 1
+
+        p_gz = theano.tensor.mul(prod_out, gz)
+        p_gz = DimShuffle(p_gz.type.broadcastable, new_dims)(p_gz)
+
+        return [Elemwise(scalar.true_div)(p_gz, x)]

    def __str__(self):
        if self.axis is None:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -317,8 +317,10 @@ class MakeVector(T.Op):
        inputs = map(T.as_tensor_variable, inputs)
        if not all(a.type == inputs[0].type for a in inputs) or (len(inputs)>0 and inputs[0].dtype != self.dtype):
            dtype=theano.scalar.upcast(self.dtype,*[i.dtype for i in inputs])
-            #upcast the input to the determined dtype, but don't upcast downcast anything
-            assert dtype==self.dtype, "Upcast the input of MakeVector to dtype gived in init without precissino loss only."
+            #upcast the input to the determined dtype, but don't downcast anything
+            assert dtype==self.dtype, (
+                    "The upcast of the inputs to MakeVector should match the "
+                    "dtype given in __init__.")
            if not all(self.dtype == T.cast(i,dtype=dtype).dtype for a in inputs):
                raise TypeError("MakeVector.make_node expected inputs upcastable to %s. got %s"%(
                        self.dtype,
@@ -348,6 +350,9 @@ class MakeVector(T.Op):
            # assume that out has correct dtype.  there is no cheap way to check
            out[0][...] = inputs

+    def grad(self, inputs, output_gradients):
+        return [output_gradients[0][i] for i in xrange(len(inputs))]
+
 make_vector = MakeVector()

 class MakeVectorPrinter:
@@ -407,6 +412,9 @@ class Shape_i(T.Op):
            ((npy_int64*)PyArray_DATA(%(out)s))[0]=CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];
            """%locals()
        else:
+            #TODO: if your type is not listed here, make a damn registry of shape_i ops for
+            #      various types of variables.
+            #      Do not continue this madness.
            return super(Shape_i, self).c_code(node, name, (x,), (out,), sub)
    def grad(self, (x,), (gz,)):
        return [None]

--- a/theano/tensor/tests/test_blas.py
+++ b/theano/tensor/tests/test_blas.py
@@ -687,27 +687,56 @@ def test_dot_mv():
 def test_gemv1():
    ''' test vector1+dot(matrix,vector2) '''
    v1 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
-    v2 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
+    v2_orig = numpy.array(numpy.random.rand(2), dtype='float32')
+    v2 = theano.shared( v2_orig )
    m  = theano.shared( numpy.array(numpy.random.rand(2,2), dtype='float32'))

    f = theano.function([], v2+theano.dot(m,v1), mode = mode_blas_opt)

    # Assert they produce the same output
-    assert numpy.allclose(f(), numpy.dot(m.value,v1.value)+v2.value)
+    assert numpy.allclose(f(), numpy.dot(m.value,v1.value)+v2_orig)
+    topo = f.maker.env.toposort()
+    assert len(topo)==1
+    assert isinstance(topo[0].op, Gemv)
+    assert topo[0].op.inplace==False

-    assert sum([isinstance(node.op, Gemv) for node in
-                f.maker.env.toposort() ]) == 1
+    #test the inplace version
+    f = theano.function([], [], updates={v2:v2+theano.dot(m,v1)}
+                        , mode = mode_blas_opt)
+
+    # Assert they produce the same output
+    f()
+    assert numpy.allclose(v2.value, numpy.dot(m.value,v1.value)+v2_orig)
+    topo = f.maker.env.toposort()
+    assert len(topo)==1
+    assert isinstance(topo[0].op, Gemv)
+    if config.mode != 'FAST_COMPILE':
+        assert topo[0].op.inplace==True
+    

 def test_gemv2():
    ''' test vector1+dot(vector2,matrix) '''
    v1 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
-    v2 = theano.shared( numpy.array(numpy.random.rand(2)  , dtype='float32'))
+    v2_orig = numpy.array(numpy.random.rand(2), dtype='float32')
+    v2 = theano.shared( v2_orig )
    m  = theano.shared( numpy.array(numpy.random.rand(2,2), dtype='float32'))

    f = theano.function([], v2+theano.dot(v1,m), mode = mode_blas_opt)

    # Assert they produce the same output
    assert numpy.allclose(f(), numpy.dot(v1.value,m.value)+v2.value)
+    topo = f.maker.env.toposort()
+    assert sum(isinstance(node.op, Gemv) for node in topo)==1
+    assert topo[-1].op.inplace==False

-    assert sum([isinstance(node.op, Gemv) for node in
-                f.maker.env.toposort() ]) == 1
+    #test the inplace version
+    f = theano.function([], [], updates={v2:v2+theano.dot(v1,m)}
+                        , mode = mode_blas_opt)
+
+    # Assert they produce the same output
+    f()
+    assert numpy.allclose(v2.value, numpy.dot(v1.value, m.value)+v2_orig)
+    topo = f.maker.env.toposort()
+    assert sum(isinstance(node.op, Gemv) for node in topo)==1
+    if config.mode != 'FAST_COMPILE':
+        assert topo[-1].op.inplace==True
--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -254,5 +254,53 @@ class test_CAReduce(unittest.TestCase):
        #self.with_linker(gof.CLinker(), and_)


+class test_Prod(unittest.TestCase):
+    def setUp(self):
+        unittest_tools.seed_rng()
+
+    def test_prod_grad(self):
+        x_val = numpy.asarray([[1,2,3],[4,5,6],[7,8,9]], dtype='float32')
+        x = theano.tensor.dmatrix()
+        p = Prod(axis=0)(x)
+
+        # sanity check
+        fn = theano.function([x], [p])
+        assert numpy.allclose(fn(x_val), numpy.array([  28.,  80.,  162.]))
+
+        # very basic case for the product; no broadcasting in x
+        g = theano.tensor.grad(p.sum(), x)
+        g_fn = theano.function([x], g)
+        assert numpy.allclose(g_fn(x_val),
+                numpy.asarray([[28.,40.,54.],[7.,16.,27.],[4.,10.,18.]]))
+
+        # now with some tranposition in input
+        x_bc = x.dimshuffle(1, 0)
+        p_bc = Prod(axis=0)(x_bc)
+        p_bc_sum = p_bc.sum()
+        g_bc = theano.tensor.grad(p_bc_sum, x)
+        g_fn_bc = theano.function([x], [p_bc,g_bc])
+        p_bc_ret, g_bc_ret =  g_fn_bc(x_val)
+
+        assert numpy.allclose(p_bc_ret, numpy.array([  6.,  120.,  504.]))
+        assert numpy.allclose(g_bc_ret,
+                numpy.asarray([[6.,3.,2.],[30.,24.,20.],[72.,63.,56.]]))
+
+    def test_verify_grad(self):
+        x_val = numpy.asarray([[1,2,3],[4,5,6],[7,8,9]], dtype='float32')
+        x = theano.tensor.dmatrix()
+        # now with verify_grad
+        unittest_tools.verify_grad(Prod(axis=0), [x_val])
+
+        # second time, with some added complexity
+        # verify_grad takes the sum of the matrices anyway
+        def fn(x2):
+            return theano.tensor.sqr(Prod(axis=0)(x2))
+
+        unittest_tools.verify_grad(fn, [x_val])
+
 if __name__ == '__main__':
    unittest.main()
+    #suite = unittest.TestSuite([test_Prod('test_prod_grad')])
+    #suite.addTest(test_Prod('test_verify_grad'))
+    #unittest.TextTestRunner().run(suite)
+
--- a/theano/tensor/tests/test_sharedvar.py
+++ b/theano/tensor/tests/test_sharedvar.py
@@ -80,6 +80,49 @@ def makeSharedTester(shared_constructor_,
            else:
                assert numpy.allclose(x_ref, total_func())

+        def test_shape(self):
+            dtype = self.dtype
+            if dtype is None:
+                dtype = theano.config.floatX
+
+            rng = numpy.random.RandomState([3,5,17])
+            x = numpy.asarray(rng.uniform(0,1,[2,4]),dtype=dtype)
+            x = self.cast_value(x)
+
+            x_ref = self.ref_fct(x)
+            x_shared = self.shared_constructor(x, borrow = False)
+            total = self.theano_fct(x_shared)
+
+            f = theano.function([],x_shared.shape)
+            topo = f.maker.env.toposort()
+
+            assert numpy.all(f()==(2,4))
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo)==3
+                assert isinstance(topo[0].op,tensor.opt.Shape_i)
+                assert isinstance(topo[1].op,tensor.opt.Shape_i)
+                assert isinstance(topo[2].op,tensor.opt.MakeVector)
+
+        def test_shape_i(self):
+            dtype = self.dtype
+            if dtype is None:
+                dtype = theano.config.floatX
+
+            rng = numpy.random.RandomState([3,5,17])
+            x = numpy.asarray(rng.uniform(0,1,[2,4]),dtype=dtype)
+            x = self.cast_value(x)
+
+            x_ref = self.ref_fct(x)
+            x_shared = self.shared_constructor(x, borrow = False)
+            total = self.theano_fct(x_shared)
+
+            f = theano.function([],x_shared.shape[1])
+            topo = f.maker.env.toposort()
+
+            assert numpy.all(f()==(4))
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo)==1
+                assert isinstance(topo[0].op,tensor.opt.Shape_i)

        def test_return_internal_type(self):
            dtype = self.dtype
@@ -191,6 +234,174 @@ def makeSharedTester(shared_constructor_,
            else:
                assert numpy.allclose(x_ref, total_func())

+        def test_specify_shape(self):
+            dtype = self.dtype
+            if dtype is None:
+                dtype = theano.config.floatX
+
+            rng = numpy.random.RandomState([2,4,16])
+            x1_1 = numpy.asarray(rng.uniform(1,2,[4,2]),dtype=dtype)
+            x1_1 = self.cast_value(x1_1)
+            x1_2 = numpy.asarray(rng.uniform(1,2,[4,2]),dtype=dtype)
+            x1_2 = self.cast_value(x1_2)
+            x2 = numpy.asarray(rng.uniform(1,2,[4,3]),dtype=dtype)
+            x2 = self.cast_value(x2)
+
+            #Test that we can replace with values of the same shape
+            x1_shared = self.shared_constructor(x1_1)
+            x1_specify_shape = tensor.specify_shape(x1_shared,x1_1.shape)
+            x1_shared.set_value(x1_2)
+            assert numpy.allclose(self.ref_fct(x1_shared.value), self.ref_fct( x1_2))
+            shape_op_fct = theano.function([],x1_shared.shape)
+            topo = shape_op_fct.maker.env.toposort()
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo)==3
+                assert isinstance(topo[0].op,tensor.opt.Shape_i)
+                assert isinstance(topo[1].op,tensor.opt.Shape_i)
+                assert isinstance(topo[2].op,tensor.opt.MakeVector)
+
+            #Test that we forward the input
+            specify_shape_fct = theano.function([],x1_specify_shape)
+            assert numpy.all(self.ref_fct(specify_shape_fct())==
+                             self.ref_fct(x1_2))
+            topo_specify = specify_shape_fct.maker.env.toposort()
+            assert len(topo_specify)==2
+
+            #Test that we put the shape info into the graph
+            shape_constant_fct = theano.function([],x1_specify_shape.shape)
+            assert numpy.all(shape_constant_fct()==shape_op_fct())
+            topo_cst = shape_constant_fct.maker.env.toposort()
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo_cst)==0
+
+            #Test that we can replace with values of the different shape
+            # but that will raise an error in some case, but not all
+            x1_shared.set_value(x2)
+            self.assertRaises(AssertionError, specify_shape_fct)
+
+            #No assertion will be raised as the Op is removed from the graph
+            #when their is optimization
+            if theano.config.mode not in ['FAST_COMPILE','DebugMode','DEBUG_MODE']:
+                shape_constant_fct()
+            else:
+                self.assertRaises(AssertionError, shape_constant_fct)
+
+        def test_specify_shape_partial(self):
+            dtype = self.dtype
+            if dtype is None:
+                dtype = theano.config.floatX
+
+            rng = numpy.random.RandomState([2,4,16])
+            x1_1 = numpy.asarray(rng.uniform(1,2,[4,2]),dtype=dtype)
+            x1_1 = self.cast_value(x1_1)
+            x1_2 = numpy.asarray(rng.uniform(1,2,[4,2]),dtype=dtype)
+            x1_2 = self.cast_value(x1_2)
+            x2 = numpy.asarray(rng.uniform(1,2,[5,2]),dtype=dtype)
+            x2 = self.cast_value(x2)
+
+            #Test that we can replace with values of the same shape
+            x1_shared = self.shared_constructor(x1_1)
+            x1_specify_shape = tensor.specify_shape(x1_shared,
+                                                    (tensor.as_tensor_variable(x1_1.shape[0]),
+                                                     x1_shared.shape[1]))
+            x1_shared.set_value(x1_2)
+            assert numpy.allclose(self.ref_fct(x1_shared.value), self.ref_fct( x1_2))
+            shape_op_fct = theano.function([],x1_shared.shape)
+            topo = shape_op_fct.maker.env.toposort()
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo)==3
+                assert isinstance(topo[0].op,tensor.opt.Shape_i)
+                assert isinstance(topo[1].op,tensor.opt.Shape_i)
+                assert isinstance(topo[2].op,tensor.opt.MakeVector)
+
+            #Test that we forward the input
+            specify_shape_fct = theano.function([],x1_specify_shape)
+            theano.printing.debugprint(specify_shape_fct)
+            assert numpy.all(self.ref_fct(specify_shape_fct())
+                             ==self.ref_fct(x1_2))
+            topo_specify = specify_shape_fct.maker.env.toposort()
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo_specify)==6
+
+            #Test that we put the shape info into the graph
+            shape_constant_fct = theano.function([],x1_specify_shape.shape)
+            theano.printing.debugprint(shape_constant_fct)
+            assert numpy.all(shape_constant_fct()==shape_op_fct())
+            topo_cst = shape_constant_fct.maker.env.toposort()
+            if theano.config.mode!='FAST_COMPILE':
+                assert len(topo_cst)==6
+
+            #Test that we can replace with values of the different shape
+            # but that will raise an error in some case, but not all
+            x1_shared.set_value(x2)
+            self.assertRaises(AssertionError, specify_shape_fct)
+
+            #No assertion will be raised as the Op is removed from the graph
+            if theano.config.mode not in ['FAST_COMPILE','DebugMode','DEBUG_MODE']:
+                shape_constant_fct()
+            else:
+                self.assertRaises(AssertionError, shape_constant_fct)
+
+        def test_specify_shape_inplace(self):
+            #test that specify_shape don't break inserting inplace op
+
+            dtype = self.dtype
+            if dtype is None:
+                dtype = theano.config.floatX
+
+            rng = numpy.random.RandomState([2,4,16])
+            a = numpy.asarray(rng.uniform(1,2,[40,40]),dtype=dtype)
+            a = self.cast_value(a)
+            a_shared = self.shared_constructor(a)
+            b = numpy.asarray(rng.uniform(1,2,[40,40]),dtype=dtype)
+            b = self.cast_value(b)
+            b_shared = self.shared_constructor(b)
+            s = numpy.zeros((40,40),dtype=dtype)
+            s = self.cast_value(s)
+            s_shared = self.shared_constructor(s)
+            f = theano.function([],
+                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                         +s_shared})
+            topo=f.maker.env.toposort()
+            f()
+            #[Gemm{inplace}(<TensorType(float64, matrix)>, 0.01, <TensorType(float64, matrix)>, <TensorType(float64, matrix)>, 2e-06)]
+            #print topo
+            if theano.config.mode!='FAST_COMPILE':
+                assert sum([node.op.__class__.__name__ in ["Gemm","GpuGemm","StructuredDot"] for node in topo])==1
+                assert all(node.op == tensor.blas.gemm_inplace for node in topo if isinstance(node.op,tensor.blas.Gemm))
+                assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "GpuGemm")
+            #Their is no inplace gemm for sparse
+            #assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "StructuredDot")
+            s_shared_specify = tensor.specify_shape(s_shared,s_shared.value.shape)
+
+            #now test with the specify shape op in the output
+            f = theano.function([], s_shared.shape,
+                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                         +s_shared_specify})
+            topo=f.maker.env.toposort()
+            print topo
+            shp=f()
+            assert numpy.all(shp == (40,40))
+            if theano.config.mode!='FAST_COMPILE':
+                assert sum([node.op.__class__.__name__ in ["Gemm","GpuGemm","StructuredDot"] for node in topo])==1
+                assert all(node.op == tensor.blas.gemm_inplace for node in topo if isinstance(node.op,tensor.blas.Gemm))
+                assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "GpuGemm")
+            #now test with the specify shape op in the inputs and outputs
+            a_shared = tensor.specify_shape(a_shared,a_shared.value.shape)
+            b_shared = tensor.specify_shape(b_shared,b_shared.value.shape)
+
+            f = theano.function([], s_shared.shape,
+                                updates={s_shared:theano.dot(a_shared,b_shared)
+                                         +s_shared_specify})
+            topo=f.maker.env.toposort()
+            print topo
+            shp=f()
+            assert numpy.all(shp == (40,40))
+            if theano.config.mode!='FAST_COMPILE':
+                assert sum([node.op.__class__.__name__ in ["Gemm","GpuGemm","StructuredDot"] for node in topo])==1
+                assert all(node.op == tensor.blas.gemm_inplace for node in topo if isinstance(node.op,tensor.blas.Gemm))
+                assert all(node.op.inplace for node in topo if node.op.__class__.__name__ == "GpuGemm")
+
    return SharedTester

 test_shared_options=makeSharedTester(tensor.shared, 'float64',
@@ -199,4 +410,3 @@ test_shared_options=makeSharedTester(tensor.shared, 'float64',
                                     lambda a: isinstance(a,numpy.ndarray),
                                     theano.tensor.sum,
                                     numpy.sum)
-