Merge branch 'master' of git://github.com/Theano/Theano into scan

9ad99d0f · Laurent Dinh · ff371509 · 99a994cb · 9ad99d0f · 9ad99d0f
--- a/doc/library/sandbox/index.txt
+++ b/doc/library/sandbox/index.txt
@@ -16,6 +16,4 @@
    cuda/index
    linalg
    neighbours
+    rng_mrg
--- a/doc/library/sandbox/rng_mrg.txt
+++ b/doc/library/sandbox/rng_mrg.txt
+.. _libdoc_rng_mrg:
+===================================================================
+:mod:`sandbox.rng_mrg` --  MRG random number generator
+===================================================================
+.. module:: sandbox.rng_mrg
+   :platform: Unix, Windows
+   :synopsis: MRG random number generator
+.. moduleauthor:: LISA
+API
+===
+.. automodule:: theano.sandbox.rng_mrg
+    :members:
--- a/doc/library/sparse/index.txt
+++ b/doc/library/sparse/index.txt
@@ -11,20 +11,20 @@ In the tutorial section, you can find a :ref:`sparse tutorial
 The sparse submodule is not loaded when we import Theano. You must
 import ``theano.sparse`` to enable it.
-The sparse module provide the same functionalities as the tensor
+The sparse module provides the same functionality as the tensor
-module. The difference lies under the cover because sparse matrices
+module. The difference lies under the covers because sparse matrices
-does not store data in a contiguous array. Note that there are no GPU
+do not store data in a contiguous array. Note that there are no GPU
-implementations for sparse matrices implemented in Theano. The sparse
+implementations for sparse matrices in Theano. The sparse module has
-module has been used in:
+been used in:
 - NLP: Dense linear transformations of sparse vectors.
- Audio: Filterbank in Fourier domain.
+- Audio: Filterbank in the Fourier domain.
 Compressed Sparse Format
 ========================
-This section tries to explain how information is store for the two
+This section tries to explain how information is stored for the two
-sparse formats of SciPy supported by Theano. There is more formats
+sparse formats of SciPy supported by Theano. There are more formats
 that can be used with SciPy and some documentation about them may be
 found `here
 <http://deeplearning.net/software/theano/sandbox/sparse.html>`_.
@@ -50,14 +50,14 @@ attributes: ``data``, ``indices``, ``indptr`` and ``shape``.
 CSC Matrix
 ----------
-In the *Compressed Sparse Column* format, ``indices`` stands for index
+In the *Compressed Sparse Column* format, ``indices`` stands for
-inside the column vectors of the matrix and ``indptr`` tells where the
+indexes inside the column vectors of the matrix and ``indptr`` tells
-column starts in the ``data`` and in the ``indices``
+where the column starts in the ``data`` and in the ``indices``
-attributes. ``indptr`` can be tought as giving the slice which must be
+attributes. ``indptr`` can be thought of as giving the slice which
-applied to the other attribute in order to get each column of the
+must be applied to the other attribute in order to get each column of
-matrix. In other words, ``slice(indptr[i], indptr[i+1])`` correspond
+the matrix. In other words, ``slice(indptr[i], indptr[i+1])``
-to the slice needed to find the i-th column of the matrix in the
+corresponds to the slice needed to find the i-th column of the matrix
-``data`` and in the ``indices`` fields.
+in the ``data`` and ``indices`` fields.
 The following example builds a matrix and returns its columns. It
 prints the i-th column, i.e. a list of indices in the column and their
@@ -84,18 +84,18 @@ corresponding value in the second list.
 CSR Matrix
 ----------
-In the *Compressed Sparse Row* format, ``indices`` stands for index
+In the *Compressed Sparse Row* format, ``indices`` stands for indexes
 inside the row vectors of the matrix and ``indptr`` tells where the
 row starts in the ``data`` and in the ``indices``
-attributes. ``indptr`` can be tought as giving the slice which must be
+attributes. ``indptr`` can be thought of as giving the slice which
-applied to the other attribute in order to get each row of the
+must be applied to the other attribute in order to get each row of the
-matrix. In other words, ``slice(indptr[i], indptr[i+1])`` correspond
+matrix. In other words, ``slice(indptr[i], indptr[i+1])`` corresponds
 to the slice needed to find the i-th row of the matrix in the ``data``
-and in the ``indices`` fields.
+and ``indices`` fields.
 The following example builds a matrix and returns its rows. It prints
-the i-th row, i.e. a list of indices in the row and their corresponding value
+the i-th row, i.e. a list of indices in the row and their
-in the second list.
+corresponding value in the second list.
 >>> data = np.asarray([7, 8, 9])
 >>> indices = np.asarray([0, 1, 2])
@@ -120,7 +120,7 @@ List of Implemented Operations
 - Moving from and to sparse
    - :class:`DenseFromSparse <theano.sparse.basic.DenseFromSparse>` and ``dense_from_sparse``.
-      Both grad are implemented. Structured by default.
+      Both grads are implemented. Structured by default.
    - :class:`SparseFromDense <theano.sparse.basic.SparseFromDense>` and ``csr_from_dense``, ``csc_from_dense``.
      The grad implemented is structured.
    - Theano SparseVariable object have a method ``toarray()`` that is the same as ``dense_from_sparse``.
@@ -201,51 +201,55 @@ List of Implemented Operations
        - One of the inputs must be sparse, the other sparse or dense.
        - The grad implemented is regular.
        - No C code for perform and no C code for grad.
-        - Return a dense for perform and a dense for grad.
+        - Returns a dense for perform and a dense for grad.
    - :class:`StructuredDot <theano.sparse.basic.StructuredDot>`
      and :func:`structured_dot <theano.sparse.basic.structured_dot>`.
        - The first input is sparse, the second can be sparse or dense.
        - The grad implemented is structured.
        - C code for perform and grad.
-        - Return a dense for perforn and a sparse for grad.
+        - It returns a sparse output if both inputs are sparse and
+          a dense one if one of the inputs is dense.
+        - Returns a sparse grad for sparse inputs and dense grad for
+          dense inputs.
    - :class:`TrueDot <theano.sparse.basic.TrueDot>` and
      :func:`true_dot <theano.sparse.basic.true_dot>`.
        - The first input is sparse, the second can be sparse or dense.
        - The grad implemented is regular.
        - No C code for perform and no C code for grad.
-        - Return a Sparse for perform and a Sparse for grad.
+        - Returns a Sparse.
-        - Flags trough constructor can change the output of
+        - The gradient returns a Sparse for sparse inputs and by
-          grad to be dense if the second input of the op is dense.
+          default a dense for dense inputs. The parameter
+          ``grad_preserves_dense`` can be set to False to return a
+          sparse grad for dense inputs.
    - :class:`SamplingDot <theano.sparse.basic.SamplingDot>` and
      ``sampling_dot``.
-        - Both input must be dense.
+        - Both inputs must be dense.
        - The grad implemented is structured for `p`.
        - Sample of the dot and sample of the gradient.
        - C code for perform but not for grad.
-        - Return sparse for perform and grad.
+        - Returns sparse for perform and grad.
    - :class:`Usmm <theano.sparse.basic.Usmm>` and ``usmm``.
        - You *shouldn't* insert this op yourself!
-           - There is optimization that transform a
+           - There is an optimization that transforms a
             :class:`Dot <theano.sparse.basic.Dot>` to ``Usmm`` when possible.
        - This op is the equivalent of gemm for sparse dot.
-        - There is no grad implemented for this op and this is not needed as
+        - There is no grad implemented for this op.
-          you don't insert it yourself.
        - One of the inputs must be sparse, the other sparse or dense.
-        - Return a dense for perform
+        - Returns a dense from perform.
 - Slice Operations
-    - sparse_variable[N, N], return a tensor scalar.
+    - sparse_variable[N, N], returns a tensor scalar.
      There is no grad implemented for this operation.
-    - sparse_variable[M:N, O:P], return a sparse matrix
+    - sparse_variable[M:N, O:P], returns a sparse matrix.
      There is no grad implemented for this operation.
-    - Sparse variable don't support [M, N:O] and [M:N, O] as we don't support sparse vector
+    - Sparse variables don't support [M, N:O] and [M:N, O] as we don't
-      and returning a sparse matrix would break the numpy interface.
+      support sparse vectors and returning a sparse matrix would break
-      Use [M:M+1, N:O] and [M:N, O:O+1] instead.
+      the numpy interface.  Use [M:M+1, N:O] and [M:N, O:O+1] instead.
    - :class:`Diag <theano.sparse.basic.Diag>` and ``diag``.
      The grad implemented is regular.

--- a/doc/library/tensor/basic.txt
+++ b/doc/library/tensor/basic.txt
@@ -1452,7 +1452,7 @@ Linear Algebra
        print(b.shape) #(5,6,4,3)
        print(c.shape) #(2,3,4,5,6,4,3)
-    See the documentation of numpy.tensordot for more examples.
+    :note: See the documentation of `numpy.tensordot <http://docs.scipy.org/doc/numpy/reference/generated/numpy.tensordot.html>`_ for more examples.
 .. function:: batched_dot(X, Y)
@@ -1478,6 +1478,40 @@ Linear Algebra
    :return: tensor of products
+.. function:: batched_tensordot(X, Y, axes=2)
+    :param x: A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2)
+    :param y: A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
+    :param axes: an integer or array. If an integer, the number of axes
+                 to sum over. If an array, it must have two array
+                 elements containing the axes to sum over in each tensor.
+                 If an integer i, it is converted to an array containing
+                 the last i dimensions of the first tensor and the first
+                 i dimensions of the second tensor (excluding the first 
+                 (batch) dimension):
+                     axes = [range(a.ndim - i, a.ndim), range(1,i+1)]
+                 If an array, its two elements must contain compatible axes
+                 of the two tensors. For example, [[1, 2], [2, 4]] means sum
+                 over the 2nd and 3rd axes of a and the 3rd and 5th axes of b.
+                 (Remember axes are zero-indexed!) The 2nd axis of a and the
+                 3rd axis of b must have the same shape; the same is true for
+                 the 3rd axis of a and the 5th axis of b.
+    :type axes: int or array-like of length 2
+    :returns: a tensor with shape equal to the concatenation of a's shape
+              (less any dimensions that were summed over) and b's shape
+              (less first dimension and any dimensions that were summed over).
+    :rtype: tensor of tensordots
+    A hybrid of batched_dot and tensordot, this function computes the
+    tensordot product between the two tensors, by iterating over the 
+    first dimension using scan to perform a sequence of tensordots.
+    :note: See :func:`tensordot` and :func:`batched_dot` for 
+        supplementary documentation.
 Gradient / Differentiation

--- a/doc/library/tensor/raw_random.txt
+++ b/doc/library/tensor/raw_random.txt
--- a/doc/tutorial/examples.txt
+++ b/doc/tutorial/examples.txt
@@ -5,13 +5,13 @@
 More Examples
 =============
-At this point it would be wise to begin familiarizing yourself 
+At this point it would be wise to begin familiarizing yourself more
-more systematically with Theano's fundamental objects and operations by browsing
+systematically with Theano's fundamental objects and operations by
-this section of the library: :ref:`libdoc_basic_tensor`.
+browsing this section of the library: :ref:`libdoc_basic_tensor`.
-As the tutorial unfolds, you should also gradually acquaint yourself with the other
+As the tutorial unfolds, you should also gradually acquaint yourself
-relevant areas of the library and with the relevant subjects of the documentation
+with the other relevant areas of the library and with the relevant
-entrance page.
+subjects of the documentation entrance page.
 Logistic Function
@@ -30,13 +30,13 @@ the logistic curve, which is given by:
    A plot of the logistic function, with x on the x-axis and s(x) on the
    y-axis.
-You want to compute the function :ref:`elementwise <libdoc_tensor_elementwise>` on matrices of
+You want to compute the function :ref:`elementwise
-doubles, which means that you want to apply this function to each
+<libdoc_tensor_elementwise>` on matrices of doubles, which means that
-individual element of the matrix.
+you want to apply this function to each individual element of the
+matrix.
 Well, what you do is this:
 .. If you modify this code, also change :
 .. theano/tests/test_tutorial.py:T_examples.test_examples_1
@@ -450,6 +450,10 @@ Other Random Distributions
 There are :ref:`other distributions implemented <libdoc_tensor_raw_random>`. 
+Other Implementations
+---------------------
+There are 2 other implementations based on :class:`CURAND <theano.sandbox.cuda.rng_curand>` and :ref:`MRG31k3p <libdoc_rng_mrg>`.
 .. _logistic_regression:
@@ -457,7 +461,8 @@ There are :ref:`other distributions implemented <libdoc_tensor_raw_random>`.
 A Real Example: Logistic Regression
 ===================================
-The preceding elements are featured in this more realistic example.  It will be used repeatedly.  
+The preceding elements are featured in this more realistic example.
+It will be used repeatedly.
 .. code-block:: python

--- a/doc/tutorial/multi_cores.txt
+++ b/doc/tutorial/multi_cores.txt
@@ -2,30 +2,49 @@
 Multi cores support in Theano
 =============================
-Parallel element wise op with openmp
+BLAS operation
-====================================
+==============
-Beacuse element wise ops work on every tensor entry indipedently they can be
+BLAS is an interface for some mathematical operations between two
-easly parallelized using openmp.
+vectors, a vector and a matrix or two matrices (e.g. the dot product
+between vector/matrix and matrix/matrix). Many different
+implementations of that interface exist and some of them are
+parallelized.
-To use openmp you must set the openmp flag in Theano configuration.
+Theano tries to use that interface as frequently as possible for
+performance reasons. So if Theano links to a parallel implementation,
+those operations will run in parallel in Theano.
-Yuo can use the flag openmp_elemwise_minsize to set the minimum tensor size
+The most frequent way to control the number of threads used is via the
-for which the operation is parallelized because for short tensor using opemp
+``OMP_NUM_THREADS`` environment variable. Set it to the number of
-can slow down the operation.
+threads you want to use before starting the python process. Some BLAS
+implementations support other environment variables.
-If it is no specified the default value (200000) is used.
-For simple(fast) operation you can obtain a speed up for very long tensor
+Parallel element wise ops with OpenMP
-while for more complex operation you ca obtain a good speed up also for not
+=====================================
-too long tensor. 
-There is a script (elemwise_openmp_speedup.py in theano/misc/) which you can
-use to choose that value for your machine.
-The script run two elemwise operation (a fast and a slow one) for a vector of
-size openmp_elemwise_minsize with and without openmp and show the time
-difference between the two cases.
+Because element wise ops work on every tensor entry independently they
+can be easily parallelized using OpenMP.
+To use OpenMP you must set the ``openmp`` :ref:`flag <libdoc_config>`
+to ``True``.
+You can use the flag ``openmp_elemwise_minsize`` to set the minimum
+tensor size for which the operation is parallelized because for short
+tensors using OpenMP can slow down the operation. The default value is
+``200000``.
+For simple (fast) operations you can obtain a speed up with very large
+tensors while for more complex operations you can also obtain a good
+speed up for smaller tensors.
+There is a script ``elemwise_openmp_speedup.py`` in ``theano/misc/``
+which you can use to tune the value of ``openmp_elemwise_minsize`` for
+your machine.  The script runs two elemwise operations (a fast one and
+a slow one) for a vector of size ``openmp_elemwise_minsize`` with and
+without OpenMP and shows the time difference between the cases.
+The only way to control the number of threads used is via the
+``OMP_NUM_THREADS`` environment variable. Set it to the number of threads
+you want to use before starting the python process.
--- a/theano/gof/__init__.py
+++ b/theano/gof/__init__.py
@@ -62,7 +62,6 @@ from theano.gof.opt import (Optimizer, optimizer, SeqOptimizer,
    LocalOptimizer, local_optimizer, LocalOptGroup,
    OpSub, OpRemove, PatternSub,
    NavigatorOptimizer, TopoOptimizer, EquilibriumOptimizer,
-    InplaceOptimizer, PureThenInplaceOptimizer,
    OpKeyOptimizer)
 from theano.gof.optdb import \

--- a/theano/gof/graph.py
+++ b/theano/gof/graph.py
@@ -415,18 +415,21 @@ class Variable(Node):
        if inputs_to_values is None:
            inputs_to_values = {}
-        if not hasattr(self, '_fn'):
+        if not hasattr(self, '_fn_cache'):
-            self._fn_inputs = inputs_to_values.keys()
+            self._fn_cache = dict()
-            self._fn = theano.function(self._fn_inputs, self)
-        args = [inputs_to_values[param] for param in self._fn_inputs]
-        rval = self._fn(*args)
+        inputs = tuple(sorted(inputs_to_values.keys(), key=id))
+        if not inputs in self._fn_cache:
+            self._fn_cache[inputs] = theano.function(inputs, self)
+        args = [inputs_to_values[param] for param in inputs]
+        rval = self._fn_cache[inputs](*args)
        return rval
    def __getstate__(self):
        d = self.__dict__.copy()
-        d.pop("_fn", None)
+        d.pop("_fn_cache", None)
        return d
    env = property(env_getter, env_setter, env_deleter)

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -131,6 +131,9 @@ class FromFunctionOptimizer(Optimizer):
    def __call__(self, *args, **kwargs):
        return self.fn(*args, **kwargs)
+    def __str__(self):
+        return self.__name__
 def optimizer(f):
    """decorator for FromFunctionOptimizer"""
@@ -626,7 +629,10 @@ class MergeOptimizer(Optimizer):
        print >> stream, blanc, "  replace_time", replace_time
        print >> stream, blanc, "  validate_time", validate_time
        print >> stream, blanc, "  callback_time", callback_time
-        print >> stream, blanc, "  callback_times", callbacks_time
+        print >> stream, blanc, "  callbacks_time"
+        for i in sorted(callbacks_time.iteritems(), key=lambda a: a[1]):
+            if i[1] > 0:
+                print i
        print >> stream, blanc, "  nb_merged", nb_merged
        print >> stream, blanc, "  nb_constant", nb_constant
@@ -1490,7 +1496,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
    def __init__(self,
                 optimizers,
                 failure_callback=None,
-                 max_depth=None,
                 max_use_ratio=None):
        """
        :param optimizers:  list or set of local or global optimizations to
@@ -1499,8 +1504,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
        :param max_use_ratio: each optimizer can be applied at most
            (size of graph * this number) times
-        :param max_depth: TODO what does this do? (EquilibriumDB sets it to 5)
        """
        super(EquilibriumOptimizer, self).__init__(
@@ -1520,7 +1523,6 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                        self.local_optimizers_map.setdefault(c, []).append(opt)
            else:
                self.global_optimizers.append(opt)
-        self.max_depth = max_depth
        self.max_use_ratio = max_use_ratio
        assert self.max_use_ratio is not None, (
                'max_use_ratio has to be a number')
@@ -1723,10 +1725,12 @@ class EquilibriumOptimizer(NavigatorOptimizer):
            for (t, count, opt) in count_opt[::-1]:
                print >> stream, blanc, '  %.3fs - %d - %s' % (
                    t, count, opt)
-            print >> stream, blanc, '  %.3fs - in %d optimization that where not used' % (
+            print >> stream, blanc, '  %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used))
            not_used.sort()
            for (t, opt) in not_used[::-1]:
+                if t > 0:
+                    # Skip opts that took 0 time; they probably weren't even tried.
                    print >> stream, blanc + "  ", '  %.3fs - %s' % (t, opt)
            print >> stream
@@ -1899,31 +1903,3 @@ def pre_greedy_local_optimizer(list_optimizations, out):
    final_outs, optimized_nodes = local_recursive_function(
        list_optimizations, out, {}, 0)
    return final_outs[out_index]
-############
-### Misc ###
-############
-class InplaceOptimizer(Optimizer):
-    def __init__(self, inplace):
-        self.inplace = inplace
-    def apply(self, fgraph):
-        self.inplace(fgraph)
-    def add_requirements(self, fgraph):
-        fgraph.attach_feature(dh.DestroyHandler())
-class PureThenInplaceOptimizer(Optimizer):
-    def __init__(self, pure, inplace):
-        self.pure = pure
-        self.inplace = inplace
-    def apply(self, fgraph):
-        self.pure(fgraph)
-        fgraph.attach_feature(dh.DestroyHandler())
-        self.inplace(fgraph)
--- a/theano/gof/optdb.py
+++ b/theano/gof/optdb.py
@@ -194,7 +194,6 @@ class EquilibriumDB(DB):
    def query(self, *tags, **kwtags):
        opts = super(EquilibriumDB, self).query(*tags, **kwtags)
        return opt.EquilibriumOptimizer(opts,
-                max_depth=5,
                max_use_ratio=config.optdb.max_use_ratio,
                failure_callback=opt.NavigatorOptimizer.warn_inplace)

--- a/theano/gof/tests/test_graph.py
+++ b/theano/gof/tests/test_graph.py
@@ -297,14 +297,17 @@ class TestIsSameGraph(unittest.TestCase):
 # eval         #
 ################
-def test_eval():
+class TestEval(unittest.TestCase):
-    x = tensor.scalar()
-    y = tensor.scalar()
+    def setUp(self):
-    z = x + y
+        self.x, self.y = tensor.scalars('x', 'y')
+        self.z = self.x + self.y
-    result = z.eval({x : 1., y : 2.})
+        self.w = 2 * self.z
-    assert result == 3.
+    def test_eval(self):
+        self.assertEquals(self.w.eval({self.x : 1., self.y : 2.}), 6.)
-    # We don't want to pickle the tmp function.
+        self.assertEquals(self.w.eval({self.z : 3}), 6.)
-    assert not hasattr(pickle.loads(pickle.dumps(z)), '_fn')
+        self.assertTrue(hasattr(self.w, "_fn_cache"),
+                "variable must have cache after eval")
+        self.assertFalse(hasattr(pickle.loads(pickle.dumps(self.w)), '_fn_cache'),
+                "temporary functions must not be serialized")
--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -205,6 +205,7 @@ if __name__ == "__main__":
        gpu
        K20m/ECC                 0.07s
        K20/NOECC                0.07s
+        M2090             0.19s
        C2075                           0.25s
        M2075                    0.25s
        M2070                    0.25s         0.27s         0.32s

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -671,7 +671,7 @@ class GpuConv(GpuOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 20)
+        return (0, 21)
    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -1018,6 +1018,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        (version==3||version==4||version==5||version==-1) &&
        out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
        (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
+        (kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
        !work_complete) //conv_full_patch_stack_padded
    {
      //version 3 without split

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -14,7 +14,7 @@ import theano.ifelse
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
-                        Optimizer, toolbox, DestroyHandler)
+                        Optimizer, toolbox)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import (
    device_properties, gpu_eye,
@@ -62,7 +62,7 @@ optdb.register('gpu_opt',
 # inside the elemwise. When there is no float64 op, this is working.
 optdb.register('gpu_after_fusion',
               ProxyDB(gpu_seqopt),
-               optdb.__position__.get('elemwise_fusion', 71) + .1,
+               optdb.__position__.get('elemwise_fusion', 49) + .1,
               'gpu')
@@ -88,7 +88,6 @@ class InputToGpuOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        for input in fgraph.inputs:
@@ -1339,9 +1338,10 @@ gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
        max_inputs_to_GpuElemwise)
 if config.gpu.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
+    #Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5
    optdb.register('gpu_elemwise_fusion',
                   tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
-                   71.00, 'fast_run', 'fusion',
+                   49, 'fast_run', 'fusion',
                   'local_elemwise_fusion', 'gpu')
 else:
    _logger.debug(("not enabling optimization fusion of gpu elemwise in "

--- a/theano/sandbox/cuda/tests/CudaNdarray_py3.pkl
+++ b/theano/sandbox/cuda/tests/CudaNdarray_py3.pkl
--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -679,6 +679,7 @@ def test_full():
            #Test more than maxThreadsDim0
            , ((2,4,13,1050), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
            , ((2,4,1050,13), (3,4,10, 11), (1, 1), (1, 1), (1, 1))
+            , ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
            ]
 #    shapes=shapes[:277]

--- a/theano/sandbox/cuda/tests/test_type.py
+++ b/theano/sandbox/cuda/tests/test_type.py
 import cPickle
+import os.path
+import sys
 from nose.tools import assert_raises
 import numpy
-import os.path
 from theano import config
 from theano.sandbox.cuda import cuda_available
@@ -12,10 +15,13 @@ if cuda_available:
 # >>> with open('CudaNdarray.pkl', 'wb') as fp:
 # >>> cPickle.dump(theano.sandbox.cuda.CudaNdarray(np.array([-42.0], dtype=np.float32)), fp)
 def test_unpickle_flag_is_false_by_default():
-    assert not config.experimental.unpickle_gpu_on_cpu, "Config flag experimental.unpickle_gpu_on_cpu is " \
+    assert not config.experimental.unpickle_gpu_on_cpu, (
-                                                      + "set to true. Make sure the default value stays false " \
+        "Config flag experimental.unpickle_gpu_on_cpu is "
-                                                      + "and that you have not set the flag manually."
+        "set to true. Make sure the default value stays false "
+        "and that you have not set the flag manually.")
 def test_unpickle_cudandarray_as_numpy_ndarray_flag0():
    oldflag = config.experimental.unpickle_gpu_on_cpu
@@ -23,7 +29,11 @@ def test_unpickle_cudandarray_as_numpy_ndarray_flag0():
    try:
        testfile_dir = os.path.dirname(os.path.realpath(__file__))
-        with open(os.path.join(testfile_dir, 'CudaNdarray.pkl')) as fp:
+        fname = 'CudaNdarray.pkl'
+        if sys.version_info[0] == 3:
+            fname = 'CudaNdarray_py3.pkl'
+        with open(os.path.join(testfile_dir, fname), 'rb') as fp:
            if cuda_available:
                mat = cPickle.load(fp)
            else:

--- a/theano/sandbox/gpuarray/blas.py
+++ b/theano/sandbox/gpuarray/blas.py
@@ -61,7 +61,7 @@ class GpuGemv(BlasOp, Gemv):
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(x)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
-                             %(out)s) == NULL) {
+                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """ % vars
@@ -72,7 +72,7 @@ class GpuGemv(BlasOp, Gemv):
        return code
    def c_code_cache_version(self):
-        return (0,)
+        return (1,)
 gpugemv_no_inplace = GpuGemv(inplace=False)
 gpugemv_inplace = GpuGemv(inplace=True)
@@ -117,7 +117,7 @@ class GpuGemm(BlasOp, Gemm):
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
-                             %(out)s) == NULL) {
+                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """ % vars
@@ -128,7 +128,7 @@ class GpuGemm(BlasOp, Gemm):
        return code
    def c_code_cache_version(self):
-        return (0,)
+        return (1,)
 gpugemm_no_inplace = GpuGemm(inplace=False)
@@ -176,7 +176,7 @@ class GpuDot22(BlasOp, Dot22):
                             one,
                             %(A)s, %(B)s,
                             zero,
-                             %(out)s) == NULL) {
+                             %(out)s, 0) == -1) {
            %(fail)s
        }
        """ % vars
@@ -187,7 +187,7 @@ class GpuDot22(BlasOp, Dot22):
        return code
    def c_code_cache_version(self):
-        return (0,)
+        return (1,)
    def c_headers(self):
        ret = super(GpuDot22, self).c_headers()

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -5,7 +5,7 @@ from theano import tensor, scalar
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB,
                        SequenceDB, ProxyDB,
-                        Optimizer, toolbox, DestroyHandler,
+                        Optimizer, toolbox,
                        InconsistencyError, EquilibriumOptimizer)
 from theano.gof.python25 import all, any
@@ -90,7 +90,6 @@ class InputToGpuOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        for input in fgraph.inputs:

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -734,9 +734,11 @@ class MRG_RandomStreams(object):
        :param low: Lower bound of the interval on which values are sampled.
        If the ``dtype`` arg is provided, ``low`` will be cast into dtype.
+        This bound is excluded.
        :param high: Higher bound of the interval on which values are sampled.
        If the ``dtype`` arg is provided, ``high`` will be cast into dtype.
+        This bound is excluded.
        :param size: Can be a list of integer or Theano variable
                (ex: the shape of other Theano Variable)

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -869,7 +869,8 @@ class ScalarOp(Op):
            return self.name
        else:
            param = [(k, v) for k, v in self.__dict__.items()
-                     if k not in ["name", "_op_use_c_code"]]
+                     if k not in ["name", "_op_use_c_code",
+                                  "output_types_preference"]]
            if param:
                return "%s{%s}" % (self.__class__.__name__,
                                   ", ".join("%s=%s" % (k, v)

--- a/theano/scan_module/scan_opt.py
+++ b/theano/scan_module/scan_opt.py
@@ -1509,7 +1509,6 @@ class PushOutDot1(gof.Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -2623,11 +2623,14 @@ class TrueDot(gof.op.Op):
        self.grad_preserves_dense = grad_preserves_dense
    def __eq__(self, other):
-        return (type(self) == type(other) and
+        # The grad_preserves_dense attribute doesn't change the
-                self.grad_preserves_dense == other.grad_preserves_dense)
+        # execution behavior.  To let the optimizer merge nodes with
+        # different values of this attribute we shouldn't compare it
+        # here.
+        return type(self) == type(other)
    def __hash__(self):
-        return hash(type(self)) ^ hash(self.grad_preserves_dense)
+        return hash(type(self))
    def __ne__(self, other):
        return not (self == other)
@@ -2712,15 +2715,17 @@ class TrueDot(gof.op.Op):
 def true_dot(x, y, grad_preserves_dense=True):
    """
    Operation for efficiently calculating the dot product when
-    one or all operands is sparse. Supported format are CSC and CSR.
+    one or all operands are sparse. Supported formats are CSC and CSR.
    The output of the operation is sparse.
-    :param x: Matrix variable.
+    :param x: Sparse matrix or 2d tensor variable.
-    :param y: Matrix variable.
+    :param y: Sparse matrix or 2d tensor variable.
-    :param grad_preserves_dense: if True and one on the input is dense,
+    :param grad_preserves_dense: if True (default), makes the grad of
-        make the output dense.
+        dense inputs dense.  Otherwise the grad is always sparse.
    :return: The dot product `x`.`y` in a sparse format.
+    :note: one of ``x`` or ``y`` must be sparse.
    """
    # TODO
    # Maybe the triple-transposition formulation

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -557,7 +557,7 @@ def get_scalar_constant_value(v):
            data = v.data
        return numpy_scalar(data)
-    if v.owner:
+    if getattr(v, 'owner', None):
        if isinstance(v.owner.op, (Alloc, DimShuffle, Rebroadcast,
                                   compile.ops.OutputGuard,
                                   compile.DeepCopyOp)):
@@ -590,14 +590,10 @@ def get_scalar_constant_value(v):
            v.owner.op.perform(v.owner, const, ret)
            return ret[0][0]
        if isinstance(v.owner.op, theano.tensor.subtensor.Subtensor) and v.ndim == 0:
-            # This condition depends on Subtensor always embedding constant
+            if isinstance(v.owner.inputs[0], TensorConstant):
-            # indices in the Op rather than making them inputs to the Apply
+                cdata = tuple(v.owner.op.get_constant_idx(v.owner.inputs))
-            # node.
-            if isinstance(v.owner.inputs[0], TensorConstant) and \
-                len(v.owner.inputs) == 1:
                try:
-                    return v.owner.inputs[0].data.__getitem__(
+                    return v.owner.inputs[0].data.__getitem__(cdata)
-                    tuple(v.owner.op.idx_list))
                except IndexError:
                    raise IndexError(
                            str(tuple(v.owner.op.idx_list)) +
@@ -620,10 +616,12 @@ def get_scalar_constant_value(v):
                           v.owner.inputs[0].owner.inputs) and
                len(v.owner.op.idx_list) == 1):
+                idx = v.owner.op.idx_list[0]
+                if isinstance(idx, gof.Type):
+                    idx = get_scalar_constant_value(v.owner.inputs[1])
                # Note the '+ 1' is because the first argument to Join is the
                # axis.
-                ret = v.owner.inputs[0].owner.inputs[
+                ret = v.owner.inputs[0].owner.inputs[idx + 1]
-                    v.owner.op.idx_list[0] + 1]
                ret = get_scalar_constant_value(ret)
                # join can cast implicitly its input in some case.
                return theano._asarray(ret, dtype=v.type.dtype)
@@ -635,14 +633,13 @@ def get_scalar_constant_value(v):
                # We put this check in case there is change in the future
                python_all(var.ndim == 0 for var in
                           v.owner.inputs[0].owner.inputs) and
-                len(v.owner.op.idx_list) == 1 and
+                len(v.owner.op.idx_list) == 1):
-                #idx_list can contain Scalar Type object.
+                idx = v.owner.op.idx_list[0]
-                isinstance(v.owner.op.idx_list[0], (int, long,
+                if isinstance(idx, gof.Type):
-                                                    numpy.integer))):
+                    idx = get_scalar_constant_value(v.owner.inputs[1])
                # Python 2.4 does not support indexing with numpy.integer
                # So we cast it.
-                idx = int(v.owner.op.idx_list[0])
+                idx = int(idx)
                ret = v.owner.inputs[0].owner.inputs[idx]
                ret = get_scalar_constant_value(ret)
                # MakeVector can cast implicitly its input in some case.
@@ -658,6 +655,8 @@ def get_scalar_constant_value(v):
                op = owner.op
                idx_list = op.idx_list
                idx = idx_list[0]
+                if isinstance(idx, gof.Type):
+                    idx = get_scalar_constant_value(owner.inputs[1])
                grandparent = leftmost_parent.owner.inputs[0]
                gp_broadcastable = grandparent.type.broadcastable
                ndim = grandparent.type.ndim
@@ -3107,6 +3106,48 @@ def batched_dot(x, y):
    return result
+def batched_tensordot(x, y, axes=2):
+    """
+    :param x: A Tensor with sizes e.g.: for 3D (dim1, dim3, dim2)
+    :param y: A Tensor with sizes e.g.: for 3D (dim1, dim2, dim4)
+    :param axes: an integer or array. If an integer, the number of axes
+                 to sum over. If an array, it must have two array
+                 elements containing the axes to sum over in each tensor.
+                 If an integer i, it is converted to an array containing
+                 the last i dimensions of the first tensor and the first
+                 i dimensions of the second tensor (excluding the first 
+                 (batch) dimension):
+                     axes = [range(a.ndim - i, a.ndim), range(1,i+1)]
+                 If an array, its two elements must contain compatible axes
+                 of the two tensors. For example, [[1, 2], [2, 4]] means sum
+                 over the 2nd and 3rd axes of a and the 3rd and 5th axes of b.
+                 (Remember axes are zero-indexed!) The 2nd axis of a and the
+                 3rd axis of b must have the same shape; the same is true for
+                 the 3rd axis of a and the 5th axis of b.
+    :type axes: int or array-like of length 2
+    A hybrid of batched_dot and tensordot, this function computes the
+    tensordot product between the two tensors, by iterating over the
+    first dimension using scan to perform a sequence of tensordots.
+    """
+    if isinstance(axes, (list, numpy.ndarray)):
+        if isinstance(axes, list):
+            axes = numpy.asarray(axes)
+        else:
+            axes = axes.copy()
+        assert numpy.greater(axes,0).all(), "All axes should be greater than one, as the first axis is iterated over (batch-wise scan)"
+        axes -= 1
+    result, updates = theano.scan(fn=lambda x_mat, y_mat:
+            theano.tensor.tensordot(x_mat, y_mat, axes),
+            outputs_info=None,
+            sequences=[x, y],
+            non_sequences=None)
+    return result
 def split(x, splits_size, n_splits, axis=0):
    the_split = Split(n_splits)
    return the_split(x, axis, splits_size)

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -139,7 +139,7 @@ except ImportError:
    pass
 from theano.configparser import config, AddConfigVar, StrParam
-from theano.gof import (utils, Op, view_roots, DestroyHandler,
+from theano.gof import (utils, Op, view_roots,
                        local_optimizer, Optimizer,
                        InconsistencyError, toolbox, SequenceDB,
                        EquilibriumOptimizer, Apply,
@@ -1488,7 +1488,6 @@ class GemmOptimizer(Optimizer):
    def add_requirements(self, fgraph):
        fgraph.attach_feature(toolbox.ReplaceValidate())
-        fgraph.attach_feature(DestroyHandler())
    def apply(self, fgraph):
        did_something = True
@@ -1501,9 +1500,21 @@ class GemmOptimizer(Optimizer):
        time_factor_can = 0
        time_factor_list = 0
        time_toposort = 0
+        if fgraph.profile:
+            validate_before = fgraph.profile.validate_time
+            callbacks_before = fgraph.execute_callbacks_times.copy()
+            callback_before = fgraph.execute_callbacks_time
+        class Updater:
+            def on_import(self, fgraph, new_node, reason):
+                if new_node is not node:
+                    nodelist.append(new_node)
+        u = Updater()
+        fgraph.attach_feature(u)
        while did_something:
+            nb_iter += 1
            t0 = time.time()
-            nodelist = list(fgraph.toposort())
+            nodelist = theano.gof.graph.io_toposort(fgraph.inputs, fgraph.outputs)
            time_toposort += time.time() - t0
            did_something = False
            nodelist.reverse()
@@ -1546,16 +1557,30 @@ class GemmOptimizer(Optimizer):
                    except ReplacementDidntRemovedError, e:
                        nb_replacement_didn_t_remove += 1
                        self.warned = True
-            nb_iter += 1
+        fgraph.remove_feature(u)
+        if fgraph.profile:
+            validate_time = fgraph.profile.validate_time - validate_before
+            callback_time = fgraph.execute_callbacks_time - callback_before
+            callbacks_time = {}
+            for k, v in fgraph.execute_callbacks_times.iteritems():
+                if k in callbacks_before:
+                    callbacks_time[k] = v - callbacks_before[k]
+                else:
+                    callbacks_time[k] = v
+        else:
+            validate_time = None
+            callback_time = None
+            callbacks_time = {}
        return (self, nb_iter, nb_replacement, nb_replacement_didn_t_remove,
                nb_inconsistency_make, nb_inconsistency_replace,
                time_canonicalize, time_factor_can,
-                time_factor_list, time_toposort)
+                time_factor_list, time_toposort,
+                validate_time, callback_time, callbacks_time,)
    @staticmethod
    def print_profile(stream, prof, level=0):
        blanc = ('    ' * level)
-        #1946.912556s - ('gemm_optimizer', 'GemmOptimizer', 1)
        print >> stream, blanc, "GemmOptimizer"
        print >> stream, blanc, " nb_iter", prof[1]
        print >> stream, blanc, " nb_replacement", prof[2]
@@ -1566,6 +1591,12 @@ class GemmOptimizer(Optimizer):
        print >> stream, blanc, " time_factor_can", prof[7]
        print >> stream, blanc, " time_factor_list", prof[8]
        print >> stream, blanc, " time_toposort", prof[9]
+        print >> stream, blanc, " validate_time", prof[10]
+        print >> stream, blanc, " callback_time", prof[11]
+        print >> stream, blanc, " callbacks_time"
+        for i in sorted(prof[12].iteritems(), key=lambda a: a[1]):
+            if i[1] > 0:
+                print i
 class Dot22(GemmRelated):
@@ -1816,17 +1847,15 @@ blas_optdb.register('local_gemm_to_gemv',
        15, 'fast_run')
-# After destroyhandler is in but before we try to make elemwise things inplace
+# After destroyhandler(49.5) but before we try to make elemwise things
-# Try to make gemm inplace
+# inplace (75)
-# Also, need to make the gemm optimisation(step 70) happen before the
-# fusion of elemwise(step 71)
 blas_opt_inplace = in2out(local_inplace_gemm,
                          local_inplace_gemv,
                          local_inplace_ger,
                          name="blas_opt_inplace")
 optdb.register('InplaceBlasOpt',
               blas_opt_inplace,
-        70.0, 'fast_run', 'inplace')
+               70.0, 'fast_run', 'inplace', 'blas_opt_inplace')
 class Dot22Scalar(GemmRelated):

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -562,9 +562,13 @@ conv3D = Conv3D()
 :note: The order of dimensions does not correspond to the one in `conv2d`.
       This is for optimization.
-:note: The GPU implementation is very slow. You are better to use
+:note: The GPU implementation is very slow. You should use
-    :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` that is faster
+    :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` for a GPU
-    on GPU.
+    graph instead.
+:see: Someone made a script that shows how to swap the axes between
+      both 3d convolution implementations in Theano. See the last
+      `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_.
 """

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -178,6 +178,10 @@ def conv3d(signals, filters,
           Another way to define signals: (batch,  time, in channel, row, column)
           Another way to define filters: (out channel,time,in channel, row, column)
+    :see: Someone made a script that shows how to swap the axes between
+          both 3d convolution implementations in Theano. See the last
+          `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_.
    """
    if isinstance(border_mode, str):

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -1396,8 +1396,9 @@ def _check_rows_is_arange_len_labels(rows, labels):
        # ShapeOptimizer, but we keep it if ShapeOptimizer is not present
        if isinstance(stop.owner.op, subtensor.Subtensor):
            shape_subtensor = stop.owner
-            if list(shape_subtensor.op.idx_list) == [0]:
+            if shape_subtensor.op.get_constant_idx(shape_subtensor.inputs,
-                shape_var, = shape_subtensor.inputs
+                                                   allow_partial=True) == [0]:
+                shape_var = shape_subtensor.inputs[0]
                if shape_var.owner and shape_var.owner.op == tensor.shape:
                    return shape_var.owner.inputs[0] is labels
        else:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
--- a/theano/tensor/raw_random.py
+++ b/theano/tensor/raw_random.py
@@ -576,11 +576,11 @@ def random_integers(random_state, size=None, low=0, high=1, ndim=None,
 def choice_helper(random_state, a, replace, p, size):
-    """
+    """Helper function to draw random numbers using numpy's choice function.
-    Helper function to draw random numbers using numpy's choice function.
-    This is a generalization of numpy.random.choice to the case where `a`,
+    This is a generalization of numpy.random.choice that coerces
-    `replace` and `p` are tensors.
+    `replace` to a bool and replaces `p` with None when p is a vector
+    of 0 elements.
    """
    if a.ndim > 1:
        raise ValueError('a.ndim (%i) must be 0 or 1' % a.ndim)
@@ -622,16 +622,6 @@ def choice(random_state, size=None, a=2, replace=True, p=None, ndim=None,
                                                         broadcastable=bcast))
    return op(random_state, size, a, replace, p)
-def poisson_helper(random_state, lam, size):
-    """
-    Helper function to draw random numbers using numpy's poisson function.
-    This is a generalization of numpy.random.poisson to the case where 
-    `lam` is a tensor.
-    """
-    return random_state.poisson(lam, size)
 def poisson(random_state, size=None, lam=1.0, ndim=None, dtype='int64'):
    """
    Draw samples from a Poisson distribution.
@@ -652,7 +642,7 @@ def poisson(random_state, size=None, lam=1.0, ndim=None, dtype='int64'):
    ndim, size, bcast = _infer_ndim_bcast(ndim, size)
-    op = RandomFunction(poisson_helper, tensor.TensorType(dtype=dtype,
+    op = RandomFunction("poisson", tensor.TensorType(dtype=dtype,
                                                     broadcastable=bcast))
    return op(random_state, size, lam)
@@ -668,6 +658,9 @@ def permutation_helper(random_state, n, shape):
    If you wish to perform a permutation of the elements of an existing vector,
    see shuffle_row_elements.
+    This is a generalization of numpy.random.permutation to tensors.
+    Otherwise it behaves the same.
    """
    # n should be a 0-dimension array
    assert n.shape == ()
@@ -680,7 +673,7 @@ def permutation_helper(random_state, n, shape):
        shape = ()
    out_shape = list(shape)
    out_shape.append(n)
-    out = numpy.zeros(out_shape, int)
+    out = numpy.empty(out_shape, int)
    for i in numpy.ndindex(*shape):
        out[i] = random_state.permutation(n)
@@ -869,7 +862,7 @@ class RandomStreamsBase(object):
    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64',
                 prob=None):
        """
-        Sample n times with probability of success prob for each trial,
+        Sample n times with probability of success p for each trial and
        return the number of successes.
        If the size argument is ambiguous on the number of dimensions,

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -47,6 +47,23 @@ class AdvancedIndexingError(TypeError):
 # Helpful functions to deal with Subtensor and IncSubtensor
 ##########
+def make_constant(args):
+    """
+    Convert python literals to theano constants in subtensor arguments.
+    """
+    def conv(a):
+            if a is None:
+                return a
+            elif isinstance(a, slice):
+                return slice(conv(a.start),
+                             conv(a.stop),
+                             conv(a.step))
+            elif isinstance(a, (int, long, numpy.integer)):
+                return scal.ScalarConstant(scal.int64, a)
+            else:
+                return a
+    return tuple(map(conv, args))
 def get_idx_list(inputs, idx_list):
    '''
    Given a list of inputs to the subtensor and its idx_list reorders
@@ -347,24 +364,56 @@ class Subtensor(Op):
                slice_c = None
            return slice(slice_a, slice_b, slice_c)
-        # There is a bug in numpy that results in isinstance(x, int) returning
+        elif isinstance(entry, (int, long, numpy.integer)):
-        # False for numpy integers.
+            # Disallow the use of python scalars in idx_list
-        # See <http://projects.scipy.org/numpy/ticket/2235>.
+            raise TypeError("Python scalar in idx_list."
-        elif isinstance(entry, numpy.integer):
+                            "Please report this error to theano-dev.")
-            return entry
-        # On Windows 64-bit, shapes are returned as Python long, as they can
-        # be bigger than what a Python int can hold.
-        # Shapes should always fit in a numpy.int64, and we support them better
-        # 2) In Python3, long replaced int. So we must assert it fit in int64.
-        elif isinstance(entry, (int, long)):
-            entry64 = numpy.int64(entry)
-            return entry64
        else:
            raise AdvancedIndexingError(Subtensor.e_indextype, entry)
+    def get_constant_idx(self, inputs, allow_partial=False):
+        """
+        Return the idx_list with constant inputs replaced by their
+        python scalar equivalent.  May raise
+        `theano.tensor.NotScalarConstantError` if the idx contains
+        non-constant entries.
+        If allow_partial is True, then entries that are not constant
+        will stay as their input variable rather than raising an
+        exception.
+        None entries are always left as-is.
+        Example usage (where v, a are appropriately typed theano variables):
+            >>> b = a[v, 1:3]
+            >>> b.owner.op.idx_list
+            (Scalar(int64), slice(Scalar(int64), Scalar(int64), None))
+            >>> b.owner.op.get_constant_idx(b.owner.inputs, allow_partial=True)
+            [v, slice(1, 3, None)]
+            >>> b.owner.op.get_constant_idx(b.owner.inputs)
+            NotScalarConstantError: v
+        """
+        real_idx = get_idx_list(inputs, self.idx_list)
+        def conv(val):
+            if val is None:
+                return None
+            elif isinstance(val, slice):
+                return slice(conv(val.start),
+                             conv(val.stop),
+                             conv(val.step))
+            else:
+                try:
+                    return get_scalar_constant_value(val)
+                except theano.tensor.NotScalarConstantError:
+                    if allow_partial:
+                        return val
+                    else:
+                        raise
+        return map(conv, real_idx)
    def __init__(self, idx_list):
        self.idx_list = tuple(map(self.convert, idx_list))
-        self.perform_cache_cdata = None
    @staticmethod
    def my_as_scalar(a):
@@ -404,31 +453,21 @@ class Subtensor(Op):
                    % (input.type, expected_type))
        # infer the broadcasting pattern
-        padded = (idx_list
+        padded = (self.get_constant_idx((None,)+inputs, allow_partial=True)
                  + [slice(None, None, None)] * (x.type.ndim - len(idx_list)))
        broadcastable = []
        for i, (p, bc) in enumerate(izip(padded, x.type.broadcastable)):
            if isinstance(p, slice):
                if bc and p.start in [None, 0]:
-                    # No need to check step when there is only
+                    start = p.start
-                    # one element.
+                    if start is None:
-                    # We could call get_canonical_form_slice() to
-                    # catch more broadcast case. I let this to
-                    # later.
-                    if p.stop is None:
-                        broadcastable.append(bc)
-                        continue
-                    try:
-                        if p.start is None:
                        start = 0
-                        else:
+                    if (p.stop is None or
-                            start = get_scalar_constant_value(p.start)
+                        (isinstance(p.stop, (int, numpy.integer)) and
-                        stop = get_scalar_constant_value(p.stop)
+                         p.stop > start)):
-                        if stop > start:
                        broadcastable.append(True)
                        continue
-                    except theano.tensor.NotScalarConstantError:
-                        pass
                broadcastable.append(False)
        return gof.Apply(self,
@@ -440,18 +479,9 @@ class Subtensor(Op):
        out, = out_
        x = inputs[0]
-        # The subtensor (or idx_list) does not depend on the inputs.
-        # (and cdata was cached on initial call)
-        if self.perform_cache_cdata is not None:
-            out[0] = numpy.asarray(x.__getitem__(self.perform_cache_cdata))
-            return
        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            cdata = cdata[0]
-        # (first call caches cdata here)
-        if len(inputs) == 1:
-            self.perform_cache_cdata = cdata
        out[0] = numpy.asarray(x.__getitem__(cdata))

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -2323,8 +2323,32 @@ def test_batched_dot():
    result_fn = theano.function([first_mat, second_mat], output)
    result = result_fn(first_mat_val, second_mat_val)
+    assert result.shape[0] == first_mat_val.shape[0]
+def test_batched_tensordot():
+    first = theano.tensor.tensor4("first")
+    second = theano.tensor.tensor4("second")
+    axes = [[1,2], [3,1]]
+    output = theano.tensor.basic.batched_tensordot(first, second, axes)
+    first_val = numpy.random.rand(8, 10, 20, 3).astype(config.floatX)
+    second_val = numpy.random.rand(8, 20, 5, 10).astype(config.floatX)
+    result_fn = theano.function([first, second], output)
+    result = result_fn(first_val, second_val)
    assert result.shape[0] == first_val.shape[0]
+    assert result.shape[1] == first_val.shape[3]
+    assert result.shape[2] == second_val.shape[2]
+    first_mat = theano.tensor.dmatrix("first")
+    second_mat = theano.tensor.dmatrix("second")
+    axes = 1
+    output = theano.tensor.basic.batched_tensordot(first_mat, second_mat, axes)
+    first_mat_val = numpy.random.rand(10, 4).astype(config.floatX)
+    second_mat_val = numpy.random.rand(10, 4).astype(config.floatX)
+    result_fn = theano.function([first_mat, second_mat], output)
+    result = result_fn(first_mat_val, second_mat_val)
+    print(result.shape)
+    assert result.shape[0] == first_mat_val.shape[0]
+    assert len(result.shape) == 1
 def test_tensor_values_eq_approx():
    #test, inf, -inf and nan equal themself

--- a/theano/tensor/var.py
+++ b/theano/tensor/var.py
@@ -348,6 +348,8 @@ class _tensor_py_operators:
    def __getitem__(self, args):
        if not isinstance(args, tuple):
            args = args,
+        # Convert python literals to theano constants
+        args = theano.tensor.subtensor.make_constant(args)
        # Determine if advanced indexing is needed or not
        # The logic is already in Subtensor.convert: if it succeeds,
        # standard indexing is used; if it fails with