Merged

00ecd70d · Olivier Delalleau · 253dde5c · 0488b3cf · 00ecd70d · 00ecd70d
--- a/bin/theano-cache
+++ b/bin/theano-cache
@@ -6,7 +6,7 @@ from theano.gof.cc import get_module_cache
 if len(sys.argv) == 1:
    print config.compiledir
 elif sys.argv[1] in ('clear'):
-    get_module_cache().clear()
+    get_module_cache().clear(unversioned_min_age=-1, clear_base_files=True)
 else:
    print 'command "%s" not recognized' % sys.argv[1]
    print 'Type "theano-cache" to print the cache location'

--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -144,7 +144,7 @@ import theano and print the config variable, as in:

 .. attribute:: floatX

-    String value: either 'float64' or 'float32'.
+    String value: either 'float64' or 'float32'

    Default: 'float64'

@@ -152,6 +152,48 @@ import theano and print the config variable, as in:
    and similar functions.  It also sets the default theano bit width for
    arguments passed as Python floating-point numbers.

+.. attribute:: cast_policy
+
+    String value: either 'numpy+floatX', 'numpy' or 'custom'
+
+    Default: 'numpy+floatX'
+
+    This specifies how data types are implicitly figured out in Theano, e.g. for
+    constants or in the results of arithmetic operations. The 'custom' value
+    corresponds to a set of custom rules originally used in
+    Theano (which can be partially customized, see e.g. the in-code help of
+    ``tensor.NumpyAutocaster``), and is now deprecated.
+    The 'numpy' setting attempts to
+    mimic the numpy casting rules. The default value ('numpy+floatX') does
+    the same, except that
+    it prefers to use float32 numbers instead of float64 when ``config.floatX``
+    is set to 'float32'.
+    Note that both 'numpy' and 'numpy+floatX'
+    behave differently from numpy on purpose in the following situations:
+       * Depending on the value of ``config.int_division``, the resulting type
+         of a division of integer types with the ``/`` operator may not match
+         that of numpy.
+       * On mixed scalar / array operations, numpy tries to prevent the scalar
+         from upcasting the array's type unless it is of a fundamentally
+         different type. Theano does not attempt to do the same at this point,
+         so you should be careful that scalars may upcast arrays when they
+         would not when using numpy.
+
+.. attribute:: int_division
+
+    String value: either 'int', 'floatX' or 'raise'
+
+    Default: 'int'
+
+    Specifies what to do when one tries to compute ``x / y``, where both ``x`` and
+    ``y`` are of integer types (possibly unsigned). 'int' means an integer is
+    returned (as in Python 2.X), but this behavior is deprecated. 'floatX'
+    returns a number of type given by ``config.floatX``. 'raise' is the safest
+    choice (and will become default in a future release of Theano) and raises
+    an error when one tries to do such an operation, enforcing the use of the
+    integer division operator (``//``) (if a float result is intended, either
+    cast one of the arguments to a float, or use ``x.__truediv__(y)``).
+
 .. attribute:: mode

    String value: 'Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN', 'FAST_COMPILE'

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -15,11 +15,16 @@ AddConfigVar('floatX',
        EnumStr('float64', 'float32'),
        )

-# TODO Work-in-progress
-#AddConfigVar('casting_policy',
-#        "Rules for implicit casts of constants in arithmetic operations",
-#        EnumStr('theano_0.3', 'numpy'),
-#        )
+AddConfigVar('cast_policy',
+        "Rules for implicit type casting",
+        EnumStr('numpy+floatX', 'numpy', 'custom'),
+        )
+
+AddConfigVar('int_division',
+        "What to do when one computes x / y, where both x and y are of "
+        "integer types",
+        EnumStr('int', 'raise', 'floatX'),
+        )

 #gpu mean let the driver select the gpu. Needed in case of gpu in exclusive mode.
 #gpuX mean use the gpu number X.
@@ -30,7 +35,8 @@ AddConfigVar('device',
            'gpu4', 'gpu5', 'gpu6', 'gpu7',
            'gpu8', 'gpu9', 'gpu10', 'gpu11',
            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-                allow_override=False)
+                allow_override=False),
+        in_c_key=False,
        )

 AddConfigVar('init_gpu_device',
@@ -43,13 +49,13 @@ AddConfigVar('init_gpu_device',
            'gpu4', 'gpu5', 'gpu6', 'gpu7',
            'gpu8', 'gpu9', 'gpu10', 'gpu11',
            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-                allow_override=False)
-        )
+                allow_override=False),
+        in_c_key=False)

 AddConfigVar('force_device',
        "Raise an error if we can't use the specified device",
-        BoolParam(False, allow_override=False)
-        )
+        BoolParam(False, allow_override=False),
+        in_c_key=False)

 #Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
 #The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
@@ -57,7 +63,8 @@ AddConfigVar('force_device',
 AddConfigVar('mode',
        "Default compilation mode",
        EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
-                'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'))
+                'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
+        in_c_key=False)

 # Test whether or not gcc is present: disable C code if it is not
 try:
@@ -65,12 +72,14 @@ try:
    # Keep the default linker the same as the one for the mode FAST_RUN
    AddConfigVar('linker',
                 "Default linker used if the theano flags mode is Mode or ProfileMode",
-                 EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'))
+                 EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'),
+                 in_c_key=False)
 except OSError:
    # gcc is not present, linker should default to python only
    AddConfigVar('linker',
                 "Default linker used if the theano flags mode is Mode or ProfileMode",
-                 EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'))
+                 EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'),
+                 in_c_key=False)
    warning('GCC not detected ! Theano will be unable to execute optimized '+
            'C-implementations (for both CPU and GPU) and will default to '+
            'Python implementations. Performance will be severely degraded.')
@@ -78,32 +87,39 @@ except OSError:
 #Keep the default optimizer the same as the one for the mode FAST_RUN
 AddConfigVar('optimizer',
        "Default optimizer. If not None, will use this linker with the Mode object(not ProfileMode or DebugMode)",
-        EnumStr('fast_run', 'merge', 'fast_compile', 'None'))
+        EnumStr('fast_run', 'merge', 'fast_compile', 'None'),
+        in_c_key=False)

 AddConfigVar('on_opt_error',
        "What to do when an optimization crashes: warn and skip it, or raise the exception",
-        EnumStr('warn', 'raise'))
+        EnumStr('warn', 'raise'),
+        in_c_key=False)

 AddConfigVar('home',
        "User home directory",
-        StrParam(os.getenv("HOME", os.path.expanduser('~'))))
+        StrParam(os.getenv("HOME", os.path.expanduser('~'))),
+        in_c_key=False)
 #This expanduser works on windows (see discussion on theano-users, July 13 2010)

 AddConfigVar('nocleanup',
        "Suppress the deletion of code files that did not compile cleanly",
-        BoolParam(False))
+        BoolParam(False),
+        in_c_key=False)

 AddConfigVar('tensor.cmp_sloppy',
        "Relax tensor._allclose (0) not at all, (1) a bit, (2) more",
-        IntParam(0, lambda i: i in (0,1,2)))
+        IntParam(0, lambda i: i in (0,1,2)),
+        in_c_key=False)

 AddConfigVar('tensor.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
-        BoolParam(True))
+        BoolParam(True),
+        in_c_key=False)

 AddConfigVar('gpu.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
-        BoolParam(True))
+        BoolParam(True),
+        in_c_key=False)

 #http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
 AddConfigVar('lib.amdlibm',
@@ -140,41 +156,47 @@ AddConfigVar('numpy.seterr_all',
              "by the following flags: seterr_divide, seterr_over, "
              "seterr_under and seterr_invalid."),
             EnumStr('ignore', 'warn', 'raise', 'call', 'print', 'log', 'None',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_divide',
             ("Sets numpy's behavior for division by zero, see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_over',
             ("Sets numpy's behavior for floating-point overflow, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_under',
             ("Sets numpy's behavior for floating-point underflow, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_invalid',
             ("Sets numpy's behavior for invalid floating-point operation, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 ###
 ### To disable some warning about old bug that are fixed now.
 ###
 AddConfigVar('warn.ignore_bug_before',
             "If 'None', we warn about all Theano bugs found by default. If 'all', we don't warn about Theano bugs found by default. If a version, we print only the warnings relative to Theano bugs found after that version. Warning for specific bugs can be configured with specific [warn] flags.",
-             EnumStr('None', 'all', '0.3', allow_override=False))
+             EnumStr('None', 'all', '0.3', allow_override=False),
+             in_c_key=False)

 default_0_3 = True
 if config.warn.ignore_bug_before == 'None':
@@ -187,20 +209,25 @@ elif config.warn.ignore_bug_before >= '0.3':

 AddConfigVar('warn.argmax_pushdown_bug',
             "Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.gpusum_01_011_0111_bug',
             "Warn if we are in a case where old version of Theano had a silent bug with GpuSum pattern 01,011 and 0111 when the first dimensions was bigger then 4096. Was fixed 31 may 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.sum_sum_bug',
             "Warn if we are in a case where Theano version between version 9923a40c7b7a and the 2 august 2010(fixed date), generated an error in that case. This happen when their is 2 consecutive sum in the graph, bad code was generated. Was fixed 2 August 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.sum_div_dimshuffle_bug',
             "Warn if previous versions of Theano (between rev. 3bd9b789f5e8, 2010-06-16, and cfc6322e5ad4, 2010-08-03) would have given incorrect result. This bug was triggered by sum of division of dimshuffled tensors.",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('compute_test_value',
        "If 'True', Theano will run each op at graph build time, using Constants, SharedVariables and the tag 'test_value' as inputs to the function. This helps the user track down problems in the graph before it gets optimized.",
-        EnumStr('False', 'True', 'warn', 'err'))
+        EnumStr('False', 'True', 'warn', 'err'),
+        in_c_key=False)
--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -7,6 +7,8 @@ import ConfigParser
 import logging
 import warnings

+import theano
+
 _logger = logging.getLogger('theano.config')

 class TheanoConfigWarning(Warning):
@@ -103,6 +105,21 @@ def _config_print(thing, buf):
        print >> buf, "    Value: ", cv.val
        print >> buf, ""

+
+def get_config_md5():
+    """
+    Return the MD5 hash (as a string) of the current config options. It is
+    meant to be such that we can safely assume that two different config
+    setups will lead to two different strings.
+
+    We only take into account config options for which `in_c_key` is True.
+    """
+    all_opts = sorted([c for c in _config_var_list if c.in_c_key],
+                      key=lambda cv: cv.fullname)
+    return theano.gof.cc.hash_from_code('\n'.join(
+                    ['%s = %s' % (cv.fullname, cv.val) for cv in all_opts]))
+
+
 class TheanoConfigParser(object):
    #properties are installed by AddConfigVar
    _i_am_a_config_class = True
@@ -110,6 +127,7 @@ class TheanoConfigParser(object):
        sio = StringIO.StringIO()
        _config_print(self.__class__, sio)
        return sio.getvalue()
+
 # N.B. all instances of TheanoConfigParser give access to the same properties.
 config = TheanoConfigParser()

@@ -124,17 +142,27 @@ config = TheanoConfigParser()
 # - The subtrees provide the same interface as the root
 # - ConfigParser subclasses control get/set of config properties to guard against craziness.

-def AddConfigVar(name, doc, configparam, root=config):
+def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
    """Add a new variable to theano.config

    :type name: string for form "[section0.[section1.[etc]]].option"
    :param name: the full name for this configuration variable.
+
    :type doc: string
    :param doc: What does this variable specify?
+
    :type configparam: ConfigParam instance
-    :param configparam: an object for getting and setting this configuration  parameter
+    :param configparam: an object for getting and setting this configuration parameter
+
    :type root: object
-    :param root: used for recusive calls -- don't provide an argument for this parameter.
+    :param root: used for recursive calls -- do not provide an argument for this parameter.
+
+    :type in_c_key: boolean
+    :param in_c_key: If True, then whenever this config option changes, the
+    key associated to compiled C modules also changes, i.e. it may trigger a
+    compilation of these modules (this compilation will only be partial if it
+    turns out that the generated C code is unchanged). Set this option to False
+    only if you are confident this option should not affect C code compilation.

    :returns: None
    """
@@ -155,11 +183,13 @@ def AddConfigVar(name, doc, configparam, root=config):
        newroot = getattr(root, sections[0])
        if not getattr(newroot, '_i_am_a_config_class', False) or isinstance(newroot, type):
            raise TypeError('Internal config nodes must be config class instances', newroot)
-        return AddConfigVar('.'.join(sections[1:]), doc, configparam, root=newroot)
+        return AddConfigVar('.'.join(sections[1:]), doc, configparam,
+                            root=newroot, in_c_key=in_c_key)
    else:
        if hasattr(root, name):
            raise AttributeError('This name is already taken', configparam.fullname)
        configparam.doc = doc
+        configparam.in_c_key = in_c_key
        configparam.__get__() # trigger a read of the value from config files and env vars
        setattr(root.__class__, sections[0], configparam)
        _config_var_list.append(configparam)

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -7,6 +7,7 @@ from copy import copy
 import re #for set_compiledir
 import os, sys, StringIO

+
 if sys.version_info[:2] >= (2,5):
    import hashlib
    def hash_from_code(msg):
@@ -16,6 +17,13 @@ else:
    def hash_from_code(msg):
        return md5.new(msg).hexdigest()

+
+def hash_from_file(file_path):
+    """Return the MD5 hash of a file."""
+    return hash_from_code(open(file_path, 'rb').read())
+
+
+import theano
 from theano.gof.python25 import all
 from theano import config

@@ -43,6 +51,7 @@ import cmodule

 import logging
 _logger=logging.getLogger("theano.gof.cc")
+_logger.setLevel(logging.WARN)
 def info(*args):
    _logger.info(' '.join(str(a) for a in args))
 def debug(*args):
@@ -791,7 +800,7 @@ class CLinker(link.Linker):
        The key returned by this function is of the form (version, signature)
        The signature has the following form:
        {{{
-            'CLinker.cmodule_key', compilation args, libraries,
+            'CLinker.cmodule_key', compilation args, libraries, config md5,
            (op0, input_signature0, output_signature0),
            (op1, input_signature1, output_signature1),
            ...
@@ -858,10 +867,16 @@ class CLinker(link.Linker):
        constant_ids = dict()
        op_pos = {} # Apply -> topological position

-        # first we put the header, compile_args, library names into the signature
+        # First we put the header, compile_args, library names and config md5
+        # into the signature.
        sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
        if compile_args is not None: sig.append(tuple(compile_args))
        if libraries is not None: sig.append(tuple(libraries))
+        # IMPORTANT: The 'md5' prefix is used to isolate the compilation
+        # parameters from the rest of the key. If you want to add more key
+        # elements, they should be before this md5 hash if and only if they
+        # can lead to a different compiled file with the same source code.
+        sig.append('md5:' + theano.configparser.get_config_md5())

        # technically this should only be appended for gcc-compiled Ops
        # and the flags of other compilers should be inserted here... but it's not clear how to
@@ -943,11 +958,30 @@ class CLinker(link.Linker):

    def compile_cmodule(self, location=None):
        """
-        This method is a callback for `ModuleCache.module_from_key`
+        Compile the module and return it.
+        """
+        # Go through all steps of the compilation process.
+        for step_result in self.compile_cmodule_by_step(location=location):
+            pass
+        # And return the output of the last step, which should be the module
+        # itself.
+        return step_result
+
+    def compile_cmodule_by_step(self, location=None):
+        """
+        This method is a callback for `ModuleCache.module_from_key`.
+
+        It is a generator (thus the 'by step'), so that:
+            - it first yields the module's C code
+            - it last yields the module itself
+            - it may yield other intermediate outputs in-between if needed
+              in the future (but this is not currently the case)
        """
        if location is None:
            location = cmodule.dlimport_workdir(config.compiledir)
        mod = self.build_dynamic_module()
+        src_code = mod.code()
+        yield src_code
        get_lock()
        try:
            debug("LOCATION", location)
@@ -955,7 +989,7 @@ class CLinker(link.Linker):
            libs = self.libraries()
            preargs = self.compile_args()
            if c_compiler.__name__=='nvcc_module_compile_str' and config.lib.amdlibm:
-                #this lib don't work correctly with nvcc in device code.
+                # This lib does not work correctly with nvcc in device code.
                if '<amdlibm.h>' in mod.includes:
                    mod.includes.remove('<amdlibm.h>')
                if '-DREPLACE_WITH_AMDLIBM' in preargs:
@@ -965,7 +999,7 @@ class CLinker(link.Linker):
            try:
                module = c_compiler(
                    module_name=mod.name,
-                    src_code = mod.code(),
+                    src_code=src_code,
                    location=location,
                    include_dirs=self.header_dirs(),
                    lib_dirs=self.lib_dirs(),
@@ -977,8 +1011,7 @@ class CLinker(link.Linker):
        finally:
            release_lock()

-        return module
-
+        yield module

    def build_dynamic_module(self):
        """Return a cmodule.DynamicModule instance full of the code for our env.
@@ -1041,10 +1074,10 @@ class CLinker(link.Linker):
        except KeyError:
            key = None
        if key is None:
-            #if we can't get a key, then forget the cache mechanism
+            # If we can't get a key, then forget the cache mechanism.
            module = self.compile_cmodule()
        else:
-            module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule, keep_lock=keep_lock)
+            module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)

        vars = self.inputs + self.outputs + self.orphans
        # List of indices that should be ignored when passing the arguments

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -2,10 +2,12 @@
 """
 import os, tempfile, StringIO, sys, logging, subprocess, cPickle, atexit, time, shutil, stat
 import distutils.sysconfig
-from theano.configparser import config
+
 import numpy.distutils #TODO: TensorType should handle this
-import sys
+import theano

+from theano.configparser import config
+from theano.gof.cc import hash_from_code, hash_from_file
 import compilelock # we will abuse the lockfile mechanism when reading and writing the registry

 from theano.configparser import TheanoConfigParser, AddConfigVar, EnumStr, StrParam, IntParam, FloatParam, BoolParam
@@ -202,6 +204,97 @@ def module_name_from_dir(dirname):
    name, = [file for file in files if file.endswith('.so') or file.endswith('.pyd')]
    return os.path.join(dirname, name)

+
+def get_module_hash(src_code, key):
+
+    """
+    Return an MD5 hash that uniquely identifies a module.
+
+    This hash takes into account:
+        1. The C source code of the module (`src_code`).
+        2. The version part of the key.
+        3. The compiler options defined in `key` (command line parameters and
+           libraries to link against).
+    """
+    # `to_hash` will contain any element such that we know for sure that if
+    # it changes, then the module hash should be different.
+    # We start with the source code itself (stripping blanks might avoid
+    # recompiling after a basic indentation fix for instance).
+    to_hash = map(str.strip, src_code.split('\n'))
+    # Get the version part of the key.
+    to_hash += map(str, key[0])
+    c_link_key = key[1]
+    # Currently, in order to catch potential bugs early, we are very
+    # conservative about the structure of the key and raise an exception
+    # if it does not match exactly what we expect. In the future we may
+    # modify this behavior to be less strict and be able to accommodate
+    # changes to the key in an automatic way.
+    error_msg = ("This should not happen unless someone modified the code "
+                 "that defines the CLinker key, in which case you should "
+                 "ensure this piece of code is still valid (and this "
+                 "AssertionError may be removed or modified to accomodate "
+                 "this change)")
+    assert c_link_key[0] == 'CLinker.cmodule_key', error_msg
+    for key_element in c_link_key[1:]:
+        if isinstance(key_element, tuple):
+            # This should be the C++ compilation command line parameters or the
+            # libraries to link against.
+            to_hash += list(key_element)
+        elif isinstance(key_element, str):
+            if key_element.startswith('md5:'):
+                # This is the md5 hash of the config options. We can stop
+                # here.
+                break
+            else:
+                raise AssertionError(error_msg)
+        else:
+            raise AssertionError(error_msg)
+    return hash_from_code('\n'.join(to_hash))
+
+
+class KeyData(object):
+
+    """Used to store the key information in the cache."""
+
+    def __init__(self, keys, module_hash, key_pkl):
+        """
+        Constructor.
+
+        :param keys: Set of keys that are associated to the exact same module.
+
+        :param module_hash: Hash identifying the module (it should hash both
+        the code and the compilation options).
+
+        :param key_pkl: Path to the file in which this KeyData object should be
+        pickled.
+        """
+        self.keys = keys
+        self.module_hash = module_hash
+        self.key_pkl = key_pkl
+
+    def add_key(self, key):
+        """Add a key to the `keys` set, and update the pickled file."""
+        assert key not in self.keys
+        self.keys.add(key)
+        self.save_pkl()
+
+    def save_pkl(self):
+        """
+        Dump this object into its `key_pkl` file.
+
+        May raise a cPickle.PicklingError if such an exception is raised at
+        pickle time (in which case a warning is also displayed).
+        """
+        # Note that writing in binary mode is important under Windows.
+        try:
+            cPickle.dump(self, open(self.key_pkl, 'wb'),
+                         protocol=cPickle.HIGHEST_PROTOCOL)
+        except cPickle.PicklingError:
+            warning("Cache leak due to unpickle-able key data", self.keys)
+            os.remove(self.key_pkl)
+            raise
+
+
 class ModuleCache(object):
    """Interface to the cache of dynamically compiled modules on disk

@@ -209,24 +302,32 @@ class ModuleCache(object):
    It is built to handle the case where multiple programs are also using instances of this
    class to manage the same directory.

+    The cache works on the basis of keys. Each key is mapped to only one
+    dynamic module, but multiple keys may be mapped to the same module (see
+    below for details).

-    The cache works on the basis of keys.  Keys are used to uniquely identify a dynamic module.
    Keys should be tuples of length 2: (version, rest)
    The ``rest`` can be anything hashable and picklable, that uniquely identifies the
    computation in the module.

    The ``version`` should be a hierarchy of tuples of integers.
-    If the ``version`` is either 0 or (), then the corresponding module is unversioned, and
-    will be deleted in an atexit() handler.
-    If the ``version`` is neither 0 nor (), then the module will be kept in the cache between
-    processes.
-
+    If the ``version`` is either 0 or (), then the key is unversioned, and its
+    corresponding module will be deleted in an atexit() handler if it is not
+    associated to another versioned key.
+    If the ``version`` is neither 0 nor (), then the module will be kept in the
+    cache between processes.

    An unversioned module is not deleted by the process that creates it.  Deleting such modules
    does not work on NFS filesystems because the tmpdir in which the library resides is in use
    until the end of the process' lifetime.  Instead, unversioned modules are left in their
    tmpdirs without corresponding .pkl files.  These modules and their directories are erased
    by subsequent processes' refresh() functions.
+
+    Two different keys are mapped to the same module when:
+        - They have the same version.
+        - They share the same compilation options in their ``rest`` part (see
+          ``CLinker.cmodule_key_`` for how this part is built).
+        - They share the same C code.
    """

    dirname = ""
@@ -239,6 +340,9 @@ class ModuleCache(object):
    """Maps keys to the filename of a .so/.pyd.
    """

+    module_hash_to_key_data = {}
+    """Maps hash of a module's code to its corresponding KeyData object."""
+
    stats = []
    """A list with counters for the number of hits, loads, compiles issued by module_from_key()
    """
@@ -260,6 +364,7 @@ class ModuleCache(object):
        self.dirname = dirname
        self.module_from_name = dict(self.module_from_name)
        self.entry_from_key = dict(self.entry_from_key)
+        self.module_hash_to_key_data = dict(self.module_hash_to_key_data)
        self.stats = [0, 0, 0]
        if force_fresh is not None:
            self.force_fresh = force_fresh
@@ -283,8 +388,12 @@ class ModuleCache(object):
                        warning("hash 1:", hash(k1))

    age_thresh_use = 60*60*24*24
-    """The default age threshold for `clear_old` (in seconds)
    """
+    The default age threshold (in seconds) for cache files we want to use.
+
+    Older modules will be deleted in ``clear_old``.
+    """
+
    def refresh(self):
        """Update self.entry_from_key by walking the cache directory structure.

@@ -301,9 +410,10 @@ class ModuleCache(object):
            # add entries that are not in the entry_from_key dictionary
            time_now = time.time()
            for root, dirs, files in os.walk(self.dirname):
-                if os.path.join(root, 'key.pkl') in self.loaded_key_pkl:
+                key_pkl = os.path.join(root, 'key.pkl')
+                if key_pkl in self.loaded_key_pkl:
                    continue
-                elif 'delete.me' in files or len(files)==0:
+                elif 'delete.me' in files or not files:
                    # On NFS filesystems, it is impossible to delete a directory with open
                    # files in it.  So instead, some commands in this file will respond to a
                    # failed rmtree() by touching a 'delete.me' file.  This file is a message
@@ -311,47 +421,117 @@ class ModuleCache(object):
                    try:
                        shutil.rmtree(root)
                    except:
-                        # the directory is still in use??  We just leave it for future removal.
-                        pass
+                        # Maybe directory is still in use? We just leave it
+                        # for future removal (and make sure there is a
+                        # delete.me file in it).
+                        delete_me = os.path.join(root, 'delete.me')
+                        if not os.path.exists(delete_me):
+                            try:
+                                open(delete_me, 'w')
+                            except:
+                                # Giving up!
+                                warning("Cannot mark cache directory for "
+                                        "deletion: %s" % root)
                elif 'key.pkl' in files:
-                    key_pkl = os.path.join(root, 'key.pkl')
                    try:
                        entry = module_name_from_dir(root)
                    except ValueError: # there is a key but no dll!
                        if not root.startswith("/tmp"):
                            # Under /tmp, file are removed periodically by the os.
-                            # So it is normal that this happen from time to time.
+                            # So it is normal that this happens from time to time.
                            warning("ModuleCache.refresh() Found key without dll in cache, deleting it.", key_pkl)
                        info("Erasing broken cache directory", key_pkl)
                        shutil.rmtree(root)
                        continue
-                    if (time_now - last_access_time(entry))<self.age_thresh_use:
+                    if (time_now - last_access_time(entry)) < self.age_thresh_use:
                        debug('refresh adding', key_pkl)
+                        def unpickle_failure():
+                            info("ModuleCache.refresh() Failed to unpickle "
+                                 "cache file", key_pkl)
                        try:
-                            key = cPickle.load(open(key_pkl, 'rb'))
+                            key_data = cPickle.load(open(key_pkl, 'rb'))
+                        except EOFError:
+                            # Happened once... not sure why (would be worth
+                            # investigating).
+                            unpickle_failure()
+                            warning("Erasing broken cache directory [EOF]", root)
+                            shutil.rmtree(root)
+                            continue
                        except:
-                            info("ModuleCache.refresh() Failed to unpickle cache key", key_pkl)
-                            if 0:
-                                info("Erasing broken cache directory", key_pkl)
+                            # For now, raise exceptions, in order to be able to
+                            # figure out which exceptions should be caught.
+                            # TODO Make it more user-friendly by not raising
+                            # the exception.
+                            raise
+                            unpickle_failure()
+                            if False:
+                                info("Erasing broken cache directory", root)
                                shutil.rmtree(root)
                            else:
-                                ## This exception is often triggered by keys that contain
+                                # This exception is often triggered by keys that contain
                                # references to classes that have not yet been imported.  They are
-                                # not necessarily broken
+                                # not necessarily broken.
                                pass
                            continue
-
-                        if not key[0]: #if the version is False
-                            warning("ModuleCache.refresh() Found unversioned key in cache, deleting it.", key_pkl)
-                            info("Erasing broken cache directory", key_pkl)
-                            shutil.rmtree(root)
+                        
+                        if not isinstance(key_data, KeyData):
+                            # This is some old cache data, that does not fit
+                            # the new cache format. It would be possible to
+                            # update it, but it is not entirely safe since we
+                            # do not know the config options that were used.
+                            # As a result, we delete it instead (which is also
+                            # simpler to implement).
+                            _rmtree(root, ignore_nocleanup=True,
+                                    msg='deprecated cache entry')
                            continue

-                        if key not in self.entry_from_key:
-                            self.entry_from_key[key] = entry
-                            # assert that we haven't already got this entry somehow
-                            assert entry not in self.module_from_name
-                            self.loaded_key_pkl.add(key_pkl)
+                        # Find unversioned keys.
+                        to_del = [key for key in key_data.keys if not key[0]]
+                        if to_del:
+                            warning("ModuleCache.refresh() Found unversioned "
+                                    "key in cache, removing it.", key_pkl)
+                            if len(to_del) == len(key_data.keys):
+                                # All keys were unversioned.
+                                info("Erasing broken cache directory", key_pkl)
+                                shutil.rmtree(root)
+                                continue
+                            else:
+                                # Fix the pickled file to only keep the
+                                # versioned keys.
+                                info("Fixing broken cache directory", key_pkl)
+                                key_data.keys = set(
+                                        [key for key in key_data.keys
+                                         if key[0]])
+                                key_data.save_pkl()
+
+                        for key in key_data.keys:
+                            if key not in self.entry_from_key:
+                                self.entry_from_key[key] = entry
+                                # Assert that we have not already got this
+                                # entry somehow.
+                                assert entry not in self.module_from_name
+                            else:
+                                warning("The same cache key is associated to "
+                                        "different modules. This should not "
+                                        "be possible! We will re-use the first"
+                                        " module (%s) instead of the new one "
+                                        "(%s)." % (self.entry_from_key[key],
+                                                   entry))
+                        self.loaded_key_pkl.add(key_pkl)
+
+                        # Remember the map from a module's hash to the KeyData
+                        # object associated with it.
+                        mod_hash = key_data.module_hash
+                        if mod_hash in self.module_hash_to_key_data:
+                            # This should not happen: a given module should
+                            # never be duplicated in the cache.
+                            warning(
+                                "Found duplicated modules in the cache! If "
+                                "you are unable to debug this issue, it is "
+                                "advised to at least clear your cache with "
+                                "'theano-cache clear'.")
+                        else:
+                            self.module_hash_to_key_data[mod_hash] = key_data
                    else:
                        too_old_to_use.append(entry)

@@ -395,9 +575,12 @@ class ModuleCache(object):

    def module_from_key(self, key, fn=None, keep_lock=False):
        """
-        :param fn: a callable object that will return a module for the key (it is called only if the key isn't in
-        the cache).  This function will be called with a single keyword argument "location"
-        that is a path on the filesystem wherein the function should write the module.
+        :param fn: A callable object that will return an iterable object when
+        called, such that the first element in this iterable object is the
+        source code of the module, and the last element is the module itself.
+        `fn` is called only if the key is not already in the cache, with
+        a single keyword argument `location` that is the path to the directory
+        where the module should be compiled.
        """
        rval = None
        try:
@@ -419,61 +602,129 @@ class ModuleCache(object):
            rval = self.module_from_name[name]
        else:
            hash_key = hash(key)
-            # we have never seen this key before
+            key_data = None
+            # We have never seen this key before.
            # Acquire lock before creating things in the compile cache,
-            # to avoid that other processes remove the compile dire while it
-            # is still empty
+            # to avoid that other processes remove the compile dir while it
+            # is still empty.
            compilelock.get_lock()
-            location = dlimport_workdir(self.dirname)
-            #debug("LOCATION*", location)
+            # This try/finally block ensures that the lock is released once we
+            # are done writing in the cache file or after raising an exception.
            try:
-                module = fn(location=location)  # WILL FAIL FOR BAD C CODE
-            except Exception, e:
-                _rmtree(location)
+                location = dlimport_workdir(self.dirname)
+                #debug("LOCATION*", location)
+
+                compile_steps = fn(location=location).__iter__()
+
+                # Check if we already know a module with the same hash. If we
+                # do, then there is no need to even compile it.
+                duplicated_module = False
+                # The first compilation step is to yield the source code.
+                src_code = compile_steps.next()
+                module_hash = get_module_hash(src_code, key)
+                if module_hash in self.module_hash_to_key_data:
+                    debug("Duplicated module! Will re-use the previous one")
+                    duplicated_module = True
+                    # Load the already existing module.
+                    key_data = self.module_hash_to_key_data[module_hash]
+                    # Note that we do not pass the `fn` argument, since it
+                    # should not be used considering that the module should
+                    # already be compiled.
+                    module = self.module_from_key(
+                            key=key_data.keys.__iter__().next())
+                    name = module.__file__
+                    # Add current key to the set of keys associated to the same
+                    # module.
+                    key_data.add_key(key)
+                    # We can delete the work directory.
+                    _rmtree(location, ignore_nocleanup=True)
+                else:
+                    try:
+                        # Will fail if there is an error compiling the C code.
+                        while True:
+                            try:
+                                # The module should be returned by the last
+                                # step of the compilation.
+                                module = compile_steps.next()
+                            except StopIteration:
+                                break
+                    except Exception, e:
+                        _rmtree(location)
+                        raise
+
+                    # Obtain path to the '.so' module file.
+                    name = module.__file__
+                    
+                    debug("Adding module to cache", key, name)
+                    assert name.startswith(location)
+                    assert name not in self.module_from_name
+                    # Changing the hash of the key is not allowed during
+                    # compilation. That is the only cause found that makes the
+                    # following assert fail.
+                    assert hash(key) == hash_key
+                    assert key not in self.entry_from_key
+
+                    if _version: # save the key
+                        key_pkl = os.path.join(location, 'key.pkl')
+                        assert not os.path.exists(key_pkl)
+                        key_data = KeyData(
+                                keys=set([key]),
+                                module_hash=module_hash,
+                                key_pkl=key_pkl)
+                        try:
+                            key_data.save_pkl()
+                            key_broken = False
+                        except cPickle.PicklingError:
+                            key_broken = True
+
+                        if not key_broken:
+                            try:
+                                kd2 = cPickle.load(open(key_pkl, 'rb'))
+                                assert len(kd2.keys) == 1
+                                key_from_file = kd2.keys.__iter__().next()
+                                if key != key_from_file:
+                                    raise Exception(
+                                        "Key not equal to unpickled version "
+                                        "(Hint: verify the __eq__ and "
+                                        "__hash__ functions for your Ops",
+                                        (key, key_from_file))
+                                # Adding the key file to this set means it is a
+                                # versioned key.
+                                self.loaded_key_pkl.add(key_pkl)
+                                self.module_hash_to_key_data[module_hash] = \
+                                                                    key_data
+                            except cPickle.UnpicklingError:
+                                warning('Cache failure due to un-loadable key',
+                                        key)
+
+            finally:
+                # Release lock if needed.
                if not keep_lock:
                    compilelock.release_lock()
-                #try:
-                #except Exception, ee:
-                    #error('failed to cleanup location', location, ee)
-                raise
-
-            if not keep_lock:
-                compilelock.release_lock()
-            name = module.__file__
-
-            debug("Adding module to cache", key, name)
-            assert name.startswith(location)
-            assert name not in self.module_from_name
-#Changing the hash of the key is not allowed during compilation
-#That is the only cause found that make the last assert fail.
-            assert hash(key)==hash_key
-            assert key not in self.entry_from_key
-
-            assert key not in self.entry_from_key
-            if _version: # save they key
-                key_pkl = os.path.join(location, 'key.pkl')
-                # Note that using a binary file is important under Windows.
-                key_file = open(key_pkl, 'wb')
-                try:
-                    cPickle.dump(key, key_file, cPickle.HIGHEST_PROTOCOL)
-                    key_file.close()
-                    key_broken = False
-                except cPickle.PicklingError:
-                    key_file.close()
-                    os.remove(key_pkl)
-                    warning("Cache leak due to unpickle-able key", key)
-                    key_broken = True
-
-                if not key_broken:
-                    try:
-                        key_from_file = cPickle.load(open(key_pkl, 'rb'))
-                        if key != key_from_file:
-                            raise Exception("key not equal to unpickled version (Hint: verify the __eq__ and __hash__ functions for your Ops", (key, key_from_file))
-                        self.loaded_key_pkl.add(key_pkl) # adding the key file to this set means it is a versioned key
-                    except cPickle.UnpicklingError:
-                        warning('Cache failure due to un-loadable key', key)
-            self.entry_from_key[key] = name
-            self.module_from_name[name] = module
+
+            # Update map from key to module name for all keys associated to
+            # this same module.
+            if key_data is None:
+                # Should only happen if unversioned.
+                assert not _version
+                all_keys = [key]
+            else:
+                assert key in key_data.keys
+                all_keys = key_data.keys
+            for k in all_keys:
+                if k in self.entry_from_key:
+                    # If we had already seen this key, then it should be
+                    # associated to the same module.
+                    assert self.entry_from_key[k] == name
+                else:
+                    self.entry_from_key[k] = name
+
+            if name in self.module_from_name:
+                # May happen if we are re-using an existing module.
+                assert duplicated_module
+                assert self.module_from_name[name] is module
+            else:
+                self.module_from_name[name] = module

            self.stats[2] += 1
            rval = module
@@ -481,16 +732,17 @@ class ModuleCache(object):
        return rval

    age_thresh_del = 60*60*24*31#31 days
-    age_thresh_del_unversionned = 60*60*24*7#7 days
+    age_thresh_del_unversioned = 60*60*24*7#7 days

    """The default age threshold for `clear_old` (in seconds)
    """
-    def clear_old(self, age_thresh_del=None): #default to a 31-day age_thresh_delold
+    def clear_old(self, age_thresh_del=None):
        """
        Delete entries from the filesystem for cache entries that are too old.

-        :param age_thresh_del: dynamic modules whose last access time is more than ``age_thresh_del``
-        seconds ago will be erased.
+        :param age_thresh_del: Dynamic modules whose last access time is more
+        than ``age_thresh_del`` seconds ago will be erased. Defaults to 31-day
+        age if not provided.
        """
        if age_thresh_del is None:
            age_thresh_del = self.age_thresh_del
@@ -500,84 +752,146 @@ class ModuleCache(object):
            # update the age of modules that have been accessed by other processes
            # and get all module that are too old to use.(not loaded in self.entry_from_key)
            too_old_to_use = self.refresh()
-            too_old_to_use = [(None,entry) for entry in too_old_to_use]
+            too_old_to_use = [(None, entry) for entry in too_old_to_use]
            time_now = time.time()

-            # the .items() is important here:
+            # the list() around .iteritems() is important here:
            # we need to get a copy of the whole list of keys and entries
            items_copy = list(self.entry_from_key.iteritems())
-            for key, entry in items_copy+too_old_to_use:
+            all_items = items_copy + too_old_to_use
+            # Since multiple keys may share the same entry, we turn this list
+            # of pairs into a dictionary that maps an entry to the list of keys
+            # that use it.
+            entry_to_keys = dict((entry, [])
+                                 for key, entry in all_items)
+            for key, entry in all_items:
+                entry_to_keys[entry].append(key)
+            for entry, keys in entry_to_keys.iteritems():
                age = time_now - last_access_time(entry)
                if age > age_thresh_del:
                    # TODO: we are assuming that modules that haven't been accessed in over
                    # age_thresh_del are not currently in use by other processes, but that could be
                    # false for long-running jobs...
                    assert entry not in self.module_from_name
-                    if key is not None:
-                        del self.entry_from_key[key]
+                    for key in keys:
+                        if key is not None:
+                            del self.entry_from_key[key]
                    parent = os.path.dirname(entry)
                    assert parent.startswith(os.path.join(self.dirname, 'tmp'))
-                    info("clear_old removing cache dir", parent)
-                    _rmtree(parent)
+                    _rmtree(parent, msg='old cache directory', level='info')

        finally:
            compilelock.release_lock()

-    def clear(self):
+    def clear(self, unversioned_min_age=None, clear_base_files=False):
        """
-        Clear all the elements of the cache
+        Clear all elements in the cache.
+
+        :param unversioned_min_age: Forwarded to `clear_unversioned`. In
+        particular, you can set it to -1 in order to delete all unversioned
+        cached modules regardless of their age.
+
+        :param clear_base_files: If True, then delete base directories
+        'cuda_ndarray' and 'cutils_ext' if they are present. If False, those
+        directories are left intact.
        """
-        self.clear_old(-1.0)
-        self.clear_unversioned()
+        compilelock.get_lock()
+        try:
+            self.clear_old(-1.0)
+            self.clear_unversioned(min_age=unversioned_min_age)
+            if clear_base_files:
+                self.clear_base_files()
+        finally:
+            compilelock.release_lock()

-    def clear_unversioned(self):
-        """Delete unversioned dynamic modules from the internal dictionaries and from the
+    def clear_base_files(self):
+        """
+        Delete base directories 'cuda_ndarray' and 'cutils_ext' if present.
+        """
+        compilelock.get_lock()
+        try:
+            for base_dir in ('cuda_ndarray', 'cutils_ext'):
+                to_delete = os.path.join(self.dirname, base_dir)
+                if os.path.isdir(to_delete):
+                    try:
+                        shutil.rmtree(to_delete)
+                        debug('Deleted: %s' % to_delete)
+                    except:
+                        warning('Could not delete %s' % to_delete)
+        finally:
+            compilelock.release_lock()
+
+    def clear_unversioned(self, min_age=None):
+        """
+        Delete unversioned dynamic modules.
+        
+        They are deleted both from the internal dictionaries and from the
        filesystem.
+
+        :param min_age: Minimum age to be deleted, in seconds. Defaults to
+        7-day age if not provided.
        """
+        if min_age is None:
+            min_age = self.age_thresh_del_unversioned
        items_copy = list(self.entry_from_key.iteritems())
-        for key, entry in items_copy:
-            version, rest = key
-            if not version:
-                del self.entry_from_key[key]

-                # entry is guaranteed to be in this dictionary,
-                # because an unversioned entry should never have been loaded via refresh
-                assert entry in self.module_from_name
+        compilelock.get_lock()

-                del self.module_from_name[entry]
+        try:
+            for key, entry in items_copy:
+                version, rest = key
+                if not version:
+                    del self.entry_from_key[key]

-                parent = os.path.dirname(entry)
-                assert parent.startswith(os.path.join(self.dirname, 'tmp'))
-                info("clear_unversioned removing cache dir", parent)
-                _rmtree(parent)
+                    # entry is guaranteed to be in this dictionary,
+                    # because an unversioned entry should never have been loaded via refresh
+                    assert entry in self.module_from_name

-        time_now = time.time()
-        for filename in os.listdir(self.dirname):
-            if filename.startswith('tmp'):
-                try:
-                    open(os.path.join(self.dirname, filename, 'key.pkl')).close()
-                    has_key = True
-                except IOError:
-                    has_key = False
-                if not has_key:
-                    age = time_now - last_access_time(os.path.join(self.dirname, filename))
-                    #In normal case, the processus that created this directory will delete it
-                    #In case this processus crash, it won't be cleaned up.
-                    #As we don't know how to know if this directory is still used
-                    #we wait 1 weak and suppose that the processus crashed
-                    #and we do the clean up for it.
-                    if age > self.age_thresh_del_unversionned:
-                        info("clear_unversioned removing cache dir", filename)
-                        _rmtree(os.path.join(self.dirname, filename))
+                    del self.module_from_name[entry]
+
+                    parent = os.path.dirname(entry)
+                    assert parent.startswith(os.path.join(self.dirname, 'tmp'))
+                    _rmtree(parent, msg='unversioned', level='info')
+
+            time_now = time.time()
+            for filename in os.listdir(self.dirname):
+                if filename.startswith('tmp'):
+                    try:
+                        open(os.path.join(self.dirname, filename, 'key.pkl')).close()
+                        has_key = True
+                    except IOError:
+                        has_key = False
+                    if not has_key:
+                        age = time_now - last_access_time(os.path.join(self.dirname, filename))
+                        # Normally, the process that created this directory
+                        # will delete it. However, if that process crashes, it
+                        # will not be cleaned up.
+                        # As we don't know if this directory is still used, we
+                        # wait `min_age` seconds (one week by default), assume
+                        # the process crashed, and take care of the clean-up.
+                        if age > min_age:
+                            info("clear_unversioned removing cache dir", filename)
+                            _rmtree(os.path.join(self.dirname, filename),
+                                    msg='unversioned', level='info')
+        finally:
+            compilelock.release_lock()

    def _on_atexit(self):
-        #self.refresh()#refresh is called by clear_old(), this can be long for big directory
-        self.clear_old()
-        self.clear_unversioned()
+        # Note: no need to call refresh() since it is called by clear_old().
+        compilelock.get_lock()
+        try:
+            self.clear_old()
+            self.clear_unversioned()
+        finally:
+            compilelock.release_lock()

-def _rmtree(parent):
+def _rmtree(parent, ignore_nocleanup=False, msg='', level='debug'):
    try:
-        if not config.nocleanup:
+        if ignore_nocleanup or not config.nocleanup:
+            log_msg = 'Deleting'
+            if msg:
+                log_msg += ' (%s)' % msg
+            eval(level)('%s: %s' % (log_msg, parent))
            shutil.rmtree(parent)
    except Exception, e:
        # If parent still exists, mark it for deletion by a future refresh()

--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -37,7 +37,7 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
 class NaiveAlgo(object):
    verbose = 0 # 1, 2 or 3 for more verbose output.
    cache_version = ()
-    cache_version = ('debug', 14, verbose)
+    cache_version = (14, verbose)

    def __init__(self, scalar_op, sync=True, inplace_pattern={}):
        """
@@ -56,7 +56,7 @@ class NaiveAlgo(object):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
-        print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename, id(self), nd)
+        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (self.scalar_op.__class__.__name__,nodename, nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
        #declare inputs
@@ -159,10 +159,9 @@ class NaiveAlgo(object):
                print >> sio, "//    Input  ", ipos, str(i.type)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "//    Output ", ipos, str(i.type)
-            print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
+            print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(
                    self.scalar_op.__class__.__name__,
                    nodename,
-                    id(self),
                    'tiling%i'%nd)
            if (nd):
                print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
@@ -262,10 +261,9 @@ class NaiveAlgo(object):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
-        print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
+        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(
                self.scalar_op.__class__.__name__,
                nodename,
-                id(self),
                'tiling%i_less_registers'%nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
@@ -472,7 +470,6 @@ class NaiveAlgo(object):
        nd = node.outputs[0].type.ndim
        nb_inputs = len(node.inputs)
        nb_outputs = len(node.outputs)
-        id_self = id(self)
        d = dict()
        #input_params and output_params go into the function declaration/definition
        input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
@@ -512,7 +509,7 @@ class NaiveAlgo(object):
        """ %locals()
        if self.verbose:
            print >> sio, """
-                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s_%(id_self)s     w numEls" << numEls << " dims"<< d << "\\n";
+                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s     w numEls" << numEls << " dims"<< d << "\\n";
            """ %locals()
            print >> sio, 'std::cerr << ' + " << ' ' <<  ".join(['"  "']+list("dims[%i]"%di
                for di in xrange(nd)) + ["'\\n';"])
@@ -693,7 +690,7 @@ nd_collapse_[i]=0;
                print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'


-        def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
+        def launch_Ccontiguous(nodename, scalar_op, sync=True):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(node.inputs)):
                kernel_call_args.append("i%i_data"%ipos)
@@ -736,7 +733,7 @@ nd_collapse_[i]=0;
            else:
                print >> sio, " return 0; " %locals()

-        def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
+        def launch_General(nodename, scalar_op, force_nd, sync=True):
            # kernel_call_args are used to invoke the cuda kernel
            local="local_"
            kernel_call_args = ["numEls"]
@@ -769,7 +766,7 @@ nd_collapse_[i]=0;
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

-                kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
+                kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
                """ %locals()
            if sync:
                print >> sio, """
@@ -791,11 +788,11 @@ nd_collapse_[i]=0;
        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
        print >> sio, "case 0: {"
-        launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
+        launch_Ccontiguous(nodename, scalar_op, self.sync)
        print >> sio, "        } break;"
        for i in range(1, nd+1):
            print >> sio, "case "+str(i)+": {"
-            launch_General(nodename, id_self, scalar_op, i, self.sync)
+            launch_General(nodename, scalar_op, i, self.sync)
            print >> sio, "        } break;"

        print >> sio, "}"#end case

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -318,11 +318,11 @@ def test_elemwise3():
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fvector()
    print b.type
-    print tensor.constant(1).type
-    print (1 + b).type
-    print (1 + b**a).type
-    print tensor.exp((1 + b**a)).type
-    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
+    fone = tensor.constant(1, dtype='float32')
+    print (fone + b).type
+    print (fone + b**a).type
+    print tensor.exp((fone + b**a)).type
+    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(fone +
        b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -144,7 +144,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
 def test_print_op():
    """ Test that print ops don't block gpu optimization"""
    b = tensor.fmatrix()
-    f = theano.function([b],theano.printing.Print()(b)*2, mode=mode_with_gpu)
+    ftwo = tensor.constant(2, dtype='float32')
+    f = theano.function([b],theano.printing.Print()(b) * ftwo, mode=mode_with_gpu)
    #theano.printing.debugprint(f)
    #print f.maker.env.toposort()
 #[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -246,13 +246,13 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    neib_shape = T.as_tensor_variable(neib_shape)
    original_shape = T.as_tensor_variable(original_shape)

-    new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] )
+    new_neib_shape = T.stack(original_shape[-1] // neib_shape[1], neib_shape[1])
    output_2d = images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape, mode=mode)
    
    if mode == 'ignore_borders':
        valid_shape = list(original_shape)
-        valid_shape[2]  = valid_shape[2] / neib_shape[0] * neib_shape[0]
-        valid_shape[3]  = valid_shape[3] / neib_shape[1] * neib_shape[1]
+        valid_shape[2]  = (valid_shape[2] // neib_shape[0]) * neib_shape[0]
+        valid_shape[3]  = (valid_shape[3] // neib_shape[1]) * neib_shape[1]
        output_4d = output_2d.reshape(valid_shape)
        #padding the borders with zeros
        for d in [2,3]:

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -49,6 +49,17 @@ def multMatVect(v, A, m1, B, m2):
    r[3:] = matVecModM(B, v[3:], m2)
    return r

+def cast_if_untyped(x, dtype):
+    """Return `x` cast as a 0-d numpy array of type `dtype` if `x` is untyped."""
+    if hasattr(x, 'dtype'):
+        # `x` is already typed.
+        return x
+    else:
+        # We intend to do this on regular Python int / float objects.
+        assert isinstance(x, int) or isinstance(x, float)
+        return numpy.array(x, dtype=dtype)
+
+
 #MRG31k3p
 #generator constants :
 M1 = numpy.int32(2147483647)    #2^31 - 1
@@ -263,7 +274,7 @@ class mrg_uniform(mrg_uniform_base):
        if (%(size)s->dimensions[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
-                %(ndim)s, %(size)s->dimensions[0]);
+                %(ndim)s, int(%(size)s->dimensions[0]));
            %(fail)s
        }
        if (%(size)s->descr->type_num != PyArray_INT32)
@@ -589,6 +600,35 @@ class GPU_mrg_uniform(mrg_uniform_base):
    def c_code_cache_version(self):
        return (5,)

+
+def guess_n_streams(size, warn=True):
+    """
+    Return a guess at a good number of streams for a sample of shape `size`.
+
+    :param size: Requested shape. A guess is only made when it is a tuple or
+    list of Python ints; otherwise the fallback 30 * 256 is returned.
+    :param warn: If True, print a warning on stderr when no guess can be made.
+    """
+    # TODO: a smart way of choosing the number of streams, see #612.
+    # Note that this code was moved out of `MRG_RandomStreams` so that it can
+    # be easily accessed from tests, where we want to disable the warning.
+    if (isinstance(size, (tuple, list)) and
+        all([isinstance(i, int) for i in size])):
+        # We can make a guess.
+        r = 1
+        for s in size:
+            r *= s
+        if r > 6:
+            r = r // 6 # chosen as fastest for rbm_benchmark
+        return r
+    else:
+        if warn:
+            print >> sys.stderr, (
+                    "MRG_RandomStreams Can't determine #streams from "
+                    "size (%s), guessing 30*256") % str(size)
+        return 30 * 256
+
+
 class MRG_RandomStreams(object):
    """Module component with similar interface to numpy.random (numpy.random.RandomState)"""

@@ -654,18 +694,7 @@ class MRG_RandomStreams(object):
        return rval

    def n_streams(self, size):
-        # TODO: a smart way of choosing the number of streams, see #612.
-        if isinstance(size, (tuple, list)) and all([isinstance(i,int) for i in size]):
-            r = 1
-            for s in size:
-                r *= s
-            if r > 6:
-                r = r/6 # chosen as fastest for rbm_benchmark
-            return r
-
-        print >> sys.stderr, ("MRG_RandomStreams Can't determine #streams from "
-                "size (%s), guessing 30*256")%str(size)
-        return 30*256
+        return guess_n_streams(size, warn=True)

    def pretty_return(self, node_rstate, new_rstate, sample):
        sample.rstate = node_rstate
@@ -674,7 +703,8 @@ class MRG_RandomStreams(object):
        node_rstate.default_update = new_rstate
        return sample

-    def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX, nstreams=None):
+    def uniform(self, size, low=0, high=1, ndim=None, dtype='floatX',
+                nstreams=None):
        """
        Sample a tensor of given size whose element from a uniform
        distribution between low and high.
@@ -683,10 +713,25 @@ class MRG_RandomStreams(object):
        ndim may be a plain integer to supplement the missing
        information.

-        :param: size: Can be a list of integer or Theano variable
+        :param low: Lower bound of the interval on which values are sampled.
+        If not already typed, it is cast into dtype.
+
+        :param high: Higher bound of the interval on which values are sampled.
+        If not already typed, it is cast into dtype.
+
+        :param size: Can be a list of integer or Theano variable
                (ex: the shape of other Theano Variable)
-                TODO: can size be None?
+
+        :param dtype: The output data type.
        """
+        if dtype == 'floatX':
+            dtype = config.floatX
+
+        # We cast `low` and `high` into `dtype` to make sure we do not upcast
+        # e.g. float32 into float64.
+        low = cast_if_untyped(low, dtype)
+        high = cast_if_untyped(high, dtype)
+
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i,int) or isinstance(i,Variable)
@@ -726,18 +771,23 @@ class MRG_RandomStreams(object):

        if u.type.broadcastable != r.type.broadcastable:
            raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
+
+        assert r.dtype == dtype
        return  r

-    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64'):
+    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64',
+                 nstreams=None):
        if n == 1:
-            if dtype=='float32' and self.use_cuda:
-                return cast(self.uniform(size=size, dtype=dtype) < p, dtype)
+            if dtype == 'float32' and self.use_cuda:
+                x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
            else:
-                return cast(self.uniform(size=size) < p, dtype)
+                x = self.uniform(size=size, nstreams=nstreams)
+            return cast(x < p, dtype)
        else:
            raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")

-    def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
+    def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
+                    nstreams=None):
        """
        Sample `n` (currently `n` needs to be 1) times from a multinomial
        distribution defined by probabilities pvals.
@@ -758,22 +808,36 @@ class MRG_RandomStreams(object):
                    ndim, size, pvals[:,0])
            assert ndim==1
            bcast = bcast+(pvals.type.broadcastable[-1],)
-            unis = self.uniform(size=size, ndim=1)
+            unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
            op = multinomial.MultinomialFromUniform(dtype)
            return op(pvals, unis)
        else:
            raise NotImplementedError(("MRG_RandomStreams.multinomial only"
                " implemented with n == 1 and pvals.ndim = 2"))

-    def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
+    def normal(self, size=None, avg=0, std=1, ndim=None,
+               dtype='floatX', nstreams=None):
        """
-        :param: size: Can be a list of integer or Theano variable(ex: the shape of other Theano Variable)
+        :param size: Can be a list of integers or Theano variables (ex: the
+        shape of another Theano Variable)
+
+        :param dtype: The output data type.
+
+        :param nstreams: Number of streams.
        """
        # We need an even number of ]0,1[ samples. Then we split them
        # in two halves. First half becomes our U1's for Box-Muller,
        # second half our U2's. See Wikipedia page:
        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform

+        if dtype == 'floatX':
+            dtype = config.floatX
+
+        # We cast `avg` and `std` into `dtype` to make sure we do not upcast
+        # e.g. float32 into float64.
+        avg = cast_if_untyped(avg, dtype)
+        std = cast_if_untyped(std, dtype)
+
        evened = False
        constant = False
        if isinstance(size, tuple) and all([isinstance(i,int) for i in size]):
@@ -786,25 +850,26 @@ class MRG_RandomStreams(object):
        else:
            #if even, don't change, if odd, +1
            n_samples = prod(size)+(prod(size)%2)
-        flattened = self.uniform(size=(n_samples,), dtype=dtype)
+        flattened = self.uniform(size=(n_samples,), dtype=dtype,
+                                 nstreams=nstreams)

        if constant:
-            U1 = flattened[:n_samples/2]
-            U2 = flattened[n_samples/2:]
+            U1 = flattened[:n_samples // 2]
+            U2 = flattened[n_samples // 2:]
        else:
-            U1 = flattened[:prod(flattened.shape)/2]
-            U2 = flattened[prod(flattened.shape)/2:]
+            U1 = flattened[:prod(flattened.shape) // 2]
+            U2 = flattened[prod(flattened.shape) // 2:]

        #normal_samples = zeros_like(flattened)
-        sqrt_ln_U1 = sqrt(-2.0*log(U1))
+        sqrt_ln_U1 = sqrt(numpy.array(-2.0, dtype=dtype) * log(U1))
        # TypeError: 'TensorVariable' object does not support item assignment
        # so this doesn't work...
        #normal_samples[:n_samples/2] = sqrt_ln_U1 * cos(2.0*numpy.pi*U2)
        #normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)

        # so trying this instead
-        first_half = sqrt_ln_U1 * cos(2.0*cast(numpy.pi,dtype)*U2)
-        second_half = sqrt_ln_U1 * sin(2.0*cast(numpy.pi,dtype)*U2)
+        first_half = sqrt_ln_U1 * cos(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
+        second_half = sqrt_ln_U1 * sin(numpy.array(2.0 * numpy.pi, dtype=dtype)*U2)
        normal_samples = join(0, first_half, second_half)

        final_samples = None
@@ -820,6 +885,7 @@ class MRG_RandomStreams(object):

        final_samples = avg + std * final_samples

+        assert final_samples.dtype == dtype
        return final_samples

 @local_optimizer([None])

--- a/theano/sandbox/test_multinomial.py
+++ b/theano/sandbox/test_multinomial.py
@@ -3,7 +3,7 @@ import copy
 import numpy

 import theano
-from theano import tensor, function
+from theano import config, function, tensor
 import multinomial
 from theano.compile.mode import get_default_mode, predefined_linkers
 import theano.sandbox.cuda as cuda
@@ -77,7 +77,14 @@ def test_multinomial_large():
        mval = f(pval,uval)

        assert mval.shape == pval.shape
-        assert mval.dtype == pval.dtype
+        if config.cast_policy == 'custom':
+            assert mval.dtype == pval.dtype
+        elif config.cast_policy == 'numpy+floatX':
+            assert mval.dtype == config.floatX
+        elif config.cast_policy == 'numpy':
+            assert mval.dtype == 'float64'
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.allclose(mval.sum(axis=1), 2)
        asdf = numpy.asarray([0, 0, 2, 0])+0*pval
        assert numpy.allclose(mval, asdf) #broadcast over all rows

--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -350,7 +350,9 @@ def test_uniform():
        print 'ON CPU with size=(%s):'%str(size)
        x = tensor.matrix()
        R = MRG_RandomStreams(234, use_cuda=False)
-        u = R.uniform(size=size)
+        # Note: we specify `nstreams` to avoid a warning.
+        u = R.uniform(size=size,
+                      nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, u, mode=mode)
        assert any([isinstance(node.op,theano.sandbox.rng_mrg.mrg_uniform)
                    for node in f.maker.env.toposort()])
@@ -366,7 +368,8 @@ def test_uniform():
            print ''
            print 'ON GPU with size=(%s):'%str(size)
            R = MRG_RandomStreams(234, use_cuda=True)
-            u = R.uniform(size=size, dtype='float32')
+            u = R.uniform(size=size, dtype='float32',
+                          nstreams=rng_mrg.guess_n_streams(size, warn=False))
            assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
            f = theano.function(var_input, theano.Out(
                    theano.sandbox.cuda.basic_ops.gpu_from_host(u),
@@ -421,7 +424,9 @@ def test_binomial():
            print ''
            print 'ON CPU with size=(%s) and mean(%d):'%(str(size),mean)
            R = MRG_RandomStreams(234, use_cuda=False)
-            u = R.binomial(size=size, p=mean)
+            # Note: we specify `nstreams` to avoid a warning.
+            u = R.binomial(size=size, p=mean,
+                           nstreams=rng_mrg.guess_n_streams(size, warn=False))
            f = theano.function(var_input, u, mode=mode)
            theano.printing.debugprint(f)
            out = f(*input)
@@ -433,7 +438,9 @@ def test_binomial():
                print ''
                print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
                R = MRG_RandomStreams(234, use_cuda=True)
-                u = R.binomial(size=size, p=mean, dtype='float32')
+                u = R.binomial(size=size, p=mean, dtype='float32',
+                               nstreams=rng_mrg.guess_n_streams(size,
+                                                                warn=False))
                assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
                f = theano.function(var_input, theano.Out(
                        theano.sandbox.cuda.basic_ops.gpu_from_host(u),
@@ -478,7 +485,9 @@ def test_normal0():
        print 'ON CPU:'

        R = MRG_RandomStreams(234, use_cuda=False)
-        n = R.normal(size=size, avg=avg, std=std)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.normal(size=size, avg=avg, std=std,
+                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n, mode=mode)
        theano.printing.debugprint(f)
        out  = f(*input)
@@ -491,7 +500,8 @@ def test_normal0():
            print ''
            print 'ON GPU:'
            R = MRG_RandomStreams(234, use_cuda=True)
-            n = R.normal(size=size, avg=avg, std=std, dtype='float32')
+            n = R.normal(size=size, avg=avg, std=std, dtype='float32',
+                         nstreams=rng_mrg.guess_n_streams(size, warn=False))
            assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
            f = theano.function(var_input, theano.Out(
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
@@ -557,7 +567,8 @@ def test_multinomial():
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
-    m = R.multinomial(pvals=pvals, dtype=config.floatX)
+    # Note: we specify `nstreams` to avoid a warning.
+    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
    theano.printing.debugprint(f)
    out = f()

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -12,8 +12,9 @@ If you want to use a scalar variable in a Theano graph,
 you probably want to use theano.tensor.[c,z,f,d,b,w,i,l,]scalar!
 """

-import math
+import math, warnings
 from copy import copy
+from itertools import imap

 import numpy, theano

@@ -26,11 +27,37 @@ builtin_complex = complex
 builtin_int = int
 builtin_float = float

+
+class ComplexError(Exception):
+    """Raised if complex numbers are used in an unsupported operation."""
+    pass
+
+class IntegerDivisionError(Exception):
+    """Raised if someone tries to divide integers with '/' instead of '//'."""
+    pass
+
+
 def upcast(dtype, *dtypes):
-    z = numpy.zeros((), dtype = dtype)
-    for dtype in dtypes:
-        z = z + numpy.zeros((), dtype = dtype)
-    return str(z.dtype)
+    # Should we try to keep float32 instead of float64? This is used so that
+    # for instance mixing int64 with float32 yields float32 instead of float64.
+    # Note that we store this boolean as a one-element list so that it can be
+    # modified within `make_array`.
+    keep_float32 = [(config.cast_policy == 'numpy+floatX' and
+                     config.floatX == 'float32')]
+    def make_array(dt):
+        if dt == 'float64':
+            # There is an explicit float64 dtype: we cannot keep float32.
+            keep_float32[0] = False
+        return numpy.zeros((), dtype=dt)
+    z = make_array(dtype)
+    for dt in dtypes:
+        z = z + make_array(dt=dt)
+    rval = str(z.dtype)
+    if rval == 'float64' and keep_float32[0]:
+        return 'float32'
+    else:
+        return rval
+

 def as_scalar(x, name = None):
    if isinstance(x, gof.Apply):
@@ -47,6 +74,7 @@ def as_scalar(x, name = None):
    except TypeError:
        raise TypeError("Cannot convert %s to Scalar" % x, type(x))

+
 def constant(x):
    # pass through numpy scalars, since they are already typed on purpose typically.
    if hasattr(x,'dtype'):
@@ -383,8 +411,9 @@ uint_types = uint8, uint16, uint32, uint64
 float_types = float32, float64
 complex_types = complex64, complex128

+discrete_types = int_types + uint_types
 continuous_types = float_types + complex_types
-
+ 
 class _scalar_py_operators:

    #UNARY
@@ -416,7 +445,8 @@ class _scalar_py_operators:
    def __sub__(self,other): return sub(self,other)
    def __mul__(self,other): return mul(self,other)
    def __div__(self,other): return div_proxy(self,other)
-    def __mod__(self,other): return mod(self,other)
+    def __floordiv__(self, other): return int_div(self, other)
+    def __mod__(self, other): return mod_check(self, other)
    def __pow__(self,other): return pow(self,other)

    #ARITHMETIC - RIGHT-OPERAND
@@ -994,32 +1024,74 @@ class Sub(BinaryScalarOp):
        return first_part, second_part
 sub = Sub(upcast_out, name = 'sub')

-def div_proxy(x, y):
-    """Proxy for either true_div or int_div, depending on types of x, y.
+
+def int_or_true_div(x_discrete, y_discrete):
+    """
+    Return 'int' or 'true' depending on the type of division used for x / y.
+
+    :param x_discrete: True if `x` is discrete ([unsigned] integer).
+
+    :param y_discrete: True if `y` is discrete ([unsigned] integer).
+
+    :returns: 'int' if `x / y` should be an integer division, or `true` if it
+    should be a true division.
+
+    Raises an IntegerDivisionError if both `x_discrete` and `y_discrete` are
+    True and `config.int_division` is set to 'raise'.
+
+    This function is used by both scalar/basic.py and tensor/basic.py.
    """
-    if as_scalar(x).type.dtype.startswith('int') and as_scalar(y).type.dtype.startswith('int'):
-        return int_div(x, y)
+    if (x_discrete and y_discrete):
+        if config.int_division == 'raise':
+            raise IntegerDivisionError(
+                "With `config.int_division` set to 'raise', dividing two "
+                "integer types with '/' is forbidden to avoid confusion "
+                "between integer and floating point divisions. Please "
+                "use // for integer division, or if you want a float result "
+                "either cast one of the arguments to a float or directly call "
+                "`x.__truediv__(y)`.")
+        elif config.int_division == 'int':
+            warnings.warn(
+                    "Division of two integer types with x / y is deprecated, "
+                    "please use x // y for an integer division "
+                    "(set `config.int_division = raise` to track the origin "
+                    "of this warning)",
+                    DeprecationWarning)
+            return 'int'
+        elif config.int_division == 'floatX':
+            return 'true'
+        else:
+            raise NotImplementedError(config.int_division)
    else:
-        return true_div(x, y)
+        return 'true'
+
+
+def div_proxy(x, y):
+    """Proxy for either true_div or int_div, depending on types of x, y."""
+    f = eval('%s_div' % int_or_true_div(as_scalar(x).type in discrete_types,
+                                        as_scalar(y).type in discrete_types))
+    return f(x, y)
+

 class TrueDiv(BinaryScalarOp):
    def output_types(self, types):
-        if all(t not in continuous_types for t in types):
-            return [float64]
+        if all(t in discrete_types for t in types):
+            return [Scalar(config.floatX)]
        else:
            return super(TrueDiv, self).output_types(types)
    def impl(self, x, y):
        x = numpy.asarray(x)
        y = numpy.asarray(y)
-        if str(x.dtype).startswith('int') and str(y.dtype).startswith('int'):
-            return float(x) / y
+        if all(a.dtype in discrete_types for a in (x, y)):
+            return numpy.array(float(x) / y, dtype=config.floatX)
        else:
            return x / y
    def c_code(self, node, name, (x, y), (z, ), sub):
        #we generate good c code only when both are complex!
        if sum([node.inputs[0].type in complex_types, node.inputs[1].type in complex_types])==1:
            raise NotImplementedError('type not supported', type)
-        if node.inputs[0].type in int_types and node.inputs[1].type in int_types:
+        if (node.inputs[0].type in discrete_types and
+            node.inputs[1].type in discrete_types):
            return "%(z)s = ((double)%(x)s) / %(y)s;" % locals()
        return "%(z)s = %(x)s / %(y)s;" % locals()
    def grad(self, (x, y), (gz, )):
@@ -1028,11 +1100,15 @@ class TrueDiv(BinaryScalarOp):
        if x.type in float_types:
            first_part = cast(gz / y, x.type.dtype)
        else:
+            assert x.type in discrete_types
            first_part = None

+        if y.type in complex_types:
+            raise NotImplementedError()
        if y.type in float_types:
            second_part = cast(-(gz * x) / (y * y), y.type.dtype)
        else:
+            assert y.type in discrete_types
            second_part = None
        return first_part, second_part
 true_div = TrueDiv(upcast_out, name = 'true_div')
@@ -1048,9 +1124,29 @@ int_div = IntDiv(upcast_out, name = 'int_div')

 floor_div = int_div

+
+def raise_complex_error():
+    raise ComplexError(
+                "Theano does not support the mod operator (%) on "
+                "complex numbers, since numpy deprecated it.")
+
+
+def mod_check(x, y):
+    if (as_scalar(x).type in complex_types or
+        as_scalar(y).type in complex_types):
+        # Currently forbidden.
+        raise_complex_error()
+    else:
+        return mod(x, y)
+
+
 class Mod(BinaryScalarOp):
+
    def impl(self, x, y):
+        if isinstance(x, numpy.complex) or isinstance(y, numpy.complex):
+            raise_complex_error()
        return x % y
+
    def c_code_cache_version(self):
        return (5,)

@@ -1060,20 +1156,34 @@ class Mod(BinaryScalarOp):

    def c_code(self, node, name, (x, y), (z, ), sub):
        """
-        We want the result to have the same sign as python, not the other implementaiton of mod.
+        We want the result to have the same sign as python, not the other implementation of mod.
        """
        #raise NotImplementedError("Unlike Python, C's modulo returns negative modulo on negative dividend (to implement)")
        t = node.inputs[0].type.upcast(*[ i.type for i in node.inputs[1:]])
-        if t in int_types or t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64']:
+        if (str(t) in imap(str, discrete_types) or
+            t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64'] or
+            t in discrete_types):
+            # The above or's should not be needed anymore. However, for now we
+            # keep them out of safety, and verify they are useless with an
+            # assert.
+            assert str(t) in imap(str, discrete_types)
            x_mod_y = "THEANO_MACRO_MOD(%(x)s, %(y)s)"%locals()
            x_mod_ymm = "THEANO_MACRO_MOD(-%(x)s, -%(y)s)"%locals()
            x_mod_ypm = "THEANO_MACRO_MOD(%(x)s, -%(y)s)"%locals()
            x_mod_ymp = "THEANO_MACRO_MOD(-%(x)s, %(y)s)"%locals()
-        elif t in float_types or t in ['float32','float64']:
+        elif (str(t) in imap(str, float_types) or
+              t in ['float32','float64'] or
+              t in float_types):
+            # The above or's should not be needed anymore. However, for now we
+            # keep them out of safety, and verify they are useless with an
+            # assert.
+            assert str(t) in imap(str, float_types)
            x_mod_y = "fmod(%(x)s,%(y)s)"%locals()
            x_mod_ymm = "fmod(-%(x)s,-%(y)s)"%locals()
            x_mod_ypm = "fmod(%(x)s,-%(y)s)"%locals()
            x_mod_ymp = "fmod(-%(x)s,%(y)s)"%locals()
+        elif str(t) in imap(str, complex_types):
+            raise_complex_error()
        else:
            raise NotImplementedError('type not supported', type)


--- a/theano/scalar/tests/test_basic.py
+++ b/theano/scalar/tests/test_basic.py
@@ -37,6 +37,7 @@ class test_ScalarOps(unittest.TestCase):
    #As we use theano.scalar normally, but we use theano.tensor.scalar
    #that is not important. Also this make the theano fct fail at call time
    #so this is not a silent bug.
+    # --> This is why it is purposely named 'tes_mod' instead of 'test_mod'.
    def tes_mod(self):
        """
        We add this test as not all language and C implementation give the same
@@ -174,6 +175,19 @@ class test_logical(unittest.TestCase):
            self.assertTrue(fn(a,b) == ~a, (a,))


+class test_complex_mod(unittest.TestCase):
+    """Make sure % fails on complex numbers."""
+
+    def test_fail(self):
+        x = complex64()
+        y = int32()
+        try:
+            x % y
+            assert False
+        except ComplexError:
+            pass
+
+
 class test_div(unittest.TestCase):
    def test_0(self):
        a = int8()
@@ -182,9 +196,9 @@ class test_div(unittest.TestCase):
        d = float64()
        f = float32()

-        print (a/b).owner.op
-        assert isinstance((a/b).owner.op, IntDiv)
-        assert isinstance((b/a).owner.op, IntDiv)
+        print (a//b).owner.op
+        assert isinstance((a//b).owner.op, IntDiv)
+        assert isinstance((b//a).owner.op, IntDiv)
        assert isinstance((b/d).owner.op, TrueDiv)
        assert isinstance((b/f).owner.op, TrueDiv)
        assert isinstance((f/a).owner.op, TrueDiv)

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -10,7 +10,7 @@ except ImportError:
    pass#the variable enable_sparse will be used to disable the test file.

 import theano
-from theano import compile
+from theano import compile, config
 from theano.sparse import enable_sparse
 if enable_sparse == False:
    raise SkipTest('Optional package sparse disabled')
@@ -239,8 +239,18 @@ class T_AddMul(unittest.TestCase):
            self.assertRaises(NotImplementedError, add, a_sv, c_dv)
            self.assertRaises(NotImplementedError, add, c_sv, a_dv)

-            # mul upcasts the dense input if needed
-            self.assertRaises(NotImplementedError, mul, a_sv, b_dv)
+            # mul may upcast the dense input if needed
+            if (config.cast_policy in ('custom', 'numpy') or
+                (config.cast_policy == 'numpy+floatX' and
+                 config.floatX == 'float64')):
+                # The result should be a float64 (not implemented).
+                self.assertRaises(NotImplementedError, mul, a_sv, b_dv)
+            elif (config.cast_policy == 'numpy+floatX' and
+                  config.floatX == 'float32'):
+                # The result should be a float32.
+                assert mul(a_sv, b_dv).dtype == 'float32'
+            else:
+                raise NotImplementedError()
            self.assertRaises(NotImplementedError, mul, b_sv, a_dv)
            assert mul(b_sv, c_dv).dtype == 'int32'
            self.assertRaises(NotImplementedError, mul, c_sv, b_dv)

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -7,6 +7,7 @@ import sys # for sys.maxint
 from theano.configparser import config, AddConfigVar, BoolParam
 import traceback #for overriding Op.__call__
 import warnings
+from itertools import izip

 import numpy, theano
 #from copy import copy as python_copy
@@ -22,6 +23,9 @@ from theano.gof.python25 import partial, any, all
 from theano import compile, printing
 from theano.printing import pprint

+# We use these exceptions as well.
+from theano.scalar import ComplexError, IntegerDivisionError
+
 ### set up the external interface
 from elemwise import Elemwise, DimShuffle, CAReduce, Sum

@@ -35,6 +39,17 @@ def _warn(*msg):
 #This is needed as we will hide it later
 python_complex=complex

+# Define common subsets of dtypes (as strings).
+int_dtypes = map(str, scal.int_types)
+discrete_dtypes = map(str, scal.discrete_types)
+complex_dtypes = map(str, scal.complex_types)
+
+
+class ShapeError(Exception):
+    """Raised when the shape cannot be computed."""
+    pass
+
+
 def check_equal_numpy(x, y):
    """
    Returns True iff x and y are equal (checks the dtype and
@@ -161,36 +176,64 @@ class NumpyAutocaster(object):
    """
    This class is used to cast python ints and floats to numpy arrays.

-    The behaviour for numpy scalars is a bit tricky... but tends to work in
-    practice.
-    If the dtype of a numpy scalar is in the self.dtypes list, then this 'cast'
-    is a no-op.
-
-    When config.floatX is float32 (at the time of calling), then this function
-    downcasts float and numpy.float arguments to numpy.float32, if float32 is
-    in the self.dtypes list.
-
-    Python ints are always 64bit and floats are always double precision.
-    This class uses the algorithm in __call__ to use a narrower dtype when no
-    precision would be lost, and to even lose precision when this is demanded
-    by the list of dtypes (e.g. to automatically cast all floats to
-    single-precision if self.dtypes does not include full precision floats).
-
+    The behavior when called on scalar `x` depends on `config.cast_policy`:
+        - 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
+        - 'numpy+floatX' will do the same, except it will use float32 instead
+          of float64 if `x` is a Python float and `config.floatX` is set to
+          'float32' (note that if `x` is a numpy scalar whose data type is
+          float64, it is not modified since we assume the user is purposely
+          using float64).
+        - 'custom' lets one define a tuple of data types such that:
+            - if `x` is already a numpy scalar and its data type is in this
+              tuple, then it is returned unchanged;
+            - otherwise, the first data type in this tuple that can represent
+              `x` without loss of precision will be used, unless `x` is a float
+              and 'float32' is in the tuple (in which case `x` is cast as a
+              float32);
+            - if no data type can represent `x` without loss of precision, then
+              the last data type in the tuple will be used.
    """
    def __init__(self, dtypes):
+        """
+        Constructor.
+
+        :type dtypes: Tuple of strings.
+        :param dtypes: The ordered list of preferred data types (only used when
+        `config.cast_policy` is set to 'custom', see the `NumpyAutocaster` help
+        for details).
+        """
        self.dtypes = tuple(dtypes)

    def __call__(self, x):
-        # Change the default casting behaviour for python floats to always cast
-        # to float32
-        dtype = None
+        # Make sure we only deal with scalars.
+        assert (isinstance(x, int) or
+                isinstance(x, float) or
+                (isinstance(x, numpy.ndarray) and x.ndim == 0))
+
+        if config.cast_policy == 'numpy':
+            return numpy.asarray(x)
+        elif config.cast_policy == 'numpy+floatX':
+            rval = numpy.asarray(x)
+            if (rval.dtype == 'float64' and         # numpy wants float64
+                config.floatX == 'float32' and      # but we prefer float32
+                not hasattr(x, 'dtype')):           # and `x` was not typed
+                rval = theano._asarray(rval, dtype='float32')
+            return rval
+
+        # The following is the original code, corresponding to the 'custom'
+        # option for `config.cast_policy`.
+        assert config.cast_policy == 'custom'

        try:
            # Pass through numpy scalars, since they are already typed on
            # purpose typically.
            if str(x.dtype) in self.dtypes:
-                return theano._asarray(x, dtype=x.dtype) #leave dtype alone
+                # No need to cast `x` into a new dtype. Note that we still
+                # need to convert it into an array, because it may not be
+                # one already (e.g. if x == numpy.float64(1.1)).
+                return numpy.asarray(x)
        except AttributeError:
+            # Means `x` has no 'dtype' attribute.
            pass

        # unsafe downcast of float64 variables when config.floatX == 'float32'
@@ -222,7 +265,10 @@ autocast_float = NumpyAutocaster(('float32', 'float64'))
 # have the same type as the xmatrix().
 #
 class autocast_float_as(object):
-    """This class makes it possible to temporarily and locally adjust autocasting behaviour.
+    """
+    This class makes it possible to temporarily and locally adjust autocasting
+    behavior when `config.cast_policy` is set to 'custom'.
+    If `config.cast_policy` is not 'custom', an exception is raised.

    For example:
    >>> with autocast_float_as('float32') as _dummy:
@@ -234,10 +280,13 @@ class autocast_float_as(object):
    """
    def __init__(self, *dtypes):
        self.dtypes = dtypes
+        assert config.cast_policy == 'custom'
    def __enter__(self):
+        assert config.cast_policy == 'custom'
        self.old_dtypes = autocast_float.dtypes
        autocast_float.dtypes = self.dtypes
    def __exit__(self, *args):
+        assert config.cast_policy == 'custom'
        autocast_float.dtypes = self.old_dtypes

 def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
@@ -259,6 +308,11 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
            x_ = autocast_int(x)
        elif rtype is TensorConstant and isinstance(x, float):
            x_ = autocast_float(x)
+        elif rtype is TensorConstant and isinstance(x, long):
+            # It is not clear what would happen if one were to use a `long`
+            # number as a constant in a Theano graph. As a result, we throw
+            # an exception in this situation.
+            raise NotImplementedError('Constants of type `long` not supported')
        elif isinstance(x, numpy.ndarray):
            x_ = x
            # Currently we do not have a bool dtype in Theano.
@@ -350,7 +404,7 @@ def _allclose(a, b):
        rtol = float64_rtol

    # Work around bug in Numpy, see http://projects.scipy.org/numpy/ticket/1684
-    if str(b.dtype).startswith('int') and (numpy.absolute(b) < 0).any():
+    if str(b.dtype) in int_dtypes and (numpy.absolute(b) < 0).any():
        b = theano._asarray(b, dtype='float64')

    return numpy.allclose(a,b, atol=atol, rtol=rtol)
@@ -1092,6 +1146,10 @@ class _tensor_py_operators:
    def __div__(self,other):
        try:
            return div_proxy(self,other)
+        except IntegerDivisionError:
+            # This is to raise the exception that occurs when trying to divide
+            # two integer arrays (currently forbidden).
+            raise
        except Exception, e:
            return NotImplemented
    def __pow__(self,other):
@@ -1101,7 +1159,11 @@ class _tensor_py_operators:
            return NotImplemented
    def __mod__(self,other):
        try:
-            return mod(self,other)
+            return mod_check(self, other)
+        except ComplexError:
+            # This is to raise the exception that occurs when trying to compute
+            # x % y with either x or y a complex number.
+            raise
        except Exception, e:
            return NotImplemented

@@ -1850,7 +1912,7 @@ def min(x, axis='DEFAULT'):
        "flatten the tensor before calling min()."),
        stacklevel=2)
    str_x_type = str(x.dtype)
-    if str_x_type.startswith('float') or str_x_type.startswith('int'):
+    if str_x_type.startswith('float') or str_x_type in int_dtypes:
        return -max(-x, axis=axis)
    else:
        #Be careful about unsigned integers, complex
@@ -1880,7 +1942,7 @@ def argmin(x, axis='DEFAULT'):
        "axis before calling argmin."),
        stacklevel=2)
    str_x_type = str(x.dtype)
-    if str_x_type.startswith('float') or str_x_type.startswith('int'):
+    if str_x_type.startswith('float') or str_x_type in int_dtypes:
        return argmax(-x, axis=axis)
    else:
        #Be careful about unsigned integers, complex
@@ -2383,7 +2445,7 @@ def mean(input, axis = None, op = False):
    if op:
        return Mean(axis)(input)

-    if str(input.dtype).startswith('int'):
+    if str(input.dtype) in discrete_dtypes:
            # we need to cast eventually anyway, and this helps
            # to prevents overflow
        input = cast(input, 'float64')
@@ -2527,12 +2589,11 @@ def minimum(x,y):
    # see decorator for function body

 def div_proxy(x, y):
-    """Proxy for either true_div or int_div, depending on types of x, y.
-    """
-    if as_tensor_variable(x).type.dtype.startswith('int') and as_tensor_variable(y).type.dtype.startswith('int'):
-        return int_div(x, y)
-    else:
-        return true_div(x, y)
+    """Proxy for either true_div or int_div, depending on types of x, y."""
+    f = eval('%s_div' % scal.int_or_true_div(
+        as_tensor_variable(x).dtype in discrete_dtypes,
+        as_tensor_variable(y).dtype in discrete_dtypes))
+    return f(x, y)

 @_scal_elemwise_with_nfunc('add', 2, 1)
 def add(a, *other_terms):
@@ -2564,6 +2625,15 @@ def int_div(a, b):
    """elementwise integer-division"""
    # see decorator for function body

+def mod_check(x, y):
+    """Make sure we do not try to use complex numbers."""
+    if (as_tensor_variable(x).dtype in complex_dtypes or
+        as_tensor_variable(y).dtype in complex_dtypes):
+        # Currently forbidden.
+        scal.raise_complex_error()
+    else:
+        return mod(x, y)
+
 @_scal_elemwise_with_nfunc('mod', 2, 1)
 def mod(a, b):
    """elementwise modulo"""
@@ -2866,7 +2936,7 @@ class Subtensor(Op):
        padded = ( actual_idx_list +
                  [slice(None, None, None)]*(len(xshp)-len(self.idx_list)))
        i = 0
-        for idx, xl in zip(padded, xshp):
+        for idx, xl in izip(padded, xshp):
            if isinstance(idx, slice):
                # If it is the default (None, None, None) slice, or a variant,
                # the shape will be xl
@@ -2876,7 +2946,7 @@ class Subtensor(Op):
                    outshp.append(xl)
                else:
                    cnf = get_canonical_form_slice(idx, xl)
-                    length = (cnf[0].stop - cnf[0].start -1)/cnf[0].step + 1
+                    length = (cnf[0].stop - cnf[0].start -1) // cnf[0].step + 1
                    length = switch(lt(length,0), 0, length)
                    outshp.append(length)
                i += 1
@@ -3993,6 +4063,31 @@ def arange(start, stop=None, step=1, dtype=None):
    # If dtype is not provided, infer it from the other arguments
    if dtype is None:
        dtype = scal.upcast(start.type.dtype, stop.type.dtype, step.type.dtype)
+        if config.cast_policy in ('numpy', 'numpy+floatX'):
+            # We enforce numpy semantics, except in the special case where
+            # `config.cast_policy` is 'numpy+floatX' and we want to use float32
+            # rather than float64.
+            # As an example, if `start`, `stop` and `step` are all int32,
+            # `numpy.arange` returns an int64 array (on 64-bit platforms),
+            # while the upcast above returns int32.
+            numpy_dtype = numpy.arange(
+                    start=numpy.array(0, dtype=start.dtype),
+                    stop=numpy.array(1, dtype=stop.dtype),
+                    step=numpy.array(1, dtype=step.dtype)).dtype
+            if numpy_dtype != dtype:
+                if (config.cast_policy == 'numpy+floatX' and
+                    config.floatX == 'float32' and
+                    numpy_dtype == 'float64' and
+                    # No explicit float64 in the three arguments?
+                    all(dt != 'float64'
+                        for dt in [s.dtype for s in (start, stop, step)])):
+                    # We use float32 instead.
+                    assert dtype != 'float64'
+                    dtype = 'float32'
+                else:
+                    # We use the same dtype as numpy instead of the result of
+                    # the upcast.
+                    dtype = str(numpy_dtype)

    if dtype not in _arange:
        _arange[dtype] = ARange(dtype)

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -453,7 +453,7 @@ class Elemwise(Op):
        """

        inputs = map(as_tensor_variable, inputs)
-        shadow = self.scalar_op.make_node(*[Scalar(dtype = t.type.dtype)() for t in inputs])
+        shadow = self.scalar_op.make_node(*[Scalar(dtype=i.type.dtype)() for i in inputs])

        target_length = max([input.type.ndim for input in inputs])

@@ -1200,7 +1200,8 @@ class Prod(CAReduce):
        self.no_zeros_in_input = no_zeros_in_input

    def __setstate__(self, dct):
-        self.__dict__.update(dct)
+        super(Prod, self).__setstate__(dct)
+        # Add default value to be able to reload old pickled objects.
        if 'no_zeros_in_input' not in dct:
            self.no_zeros_in_input = False


--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -135,9 +135,9 @@ class Conv3D(theano.Op):
        vidDur = V_shape[3]
        filterDur = W_shape[3]

-        output_height = T.floor( (vidHeight - filterHeight) / dr )+1
-        output_width = T.floor( (vidWidth - filterWidth) / dc )+1
-        output_dur = T.floor( (vidDur - filterDur) / dt ) +1
+        output_height = T.floor((vidHeight - filterHeight) // dr) + 1
+        output_width = T.floor((vidWidth - filterWidth) // dc) + 1
+        output_dur = T.floor((vidDur - filterDur) // dt) + 1

        rval = (batch_size,  output_height, output_width, output_dur, output_channels )


--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -575,14 +575,15 @@ class ConvOp(Op):
            try:
                fmshp = ConvOp.getOutputShape(imshp[1:], kshp, (self.dx,self.dy), self.out_mode)
            except TypeError:
-                raise NotImplementedError()
+                raise theano.tensor.ShapeError()
            outshp = (batch_size,fmo) + tuple(fmshp)
            return [outshp]
        else:
            # Haven't implemented this case. imshp and kshp may be symbollic
            # and ConvOp.getOutputShape doesn't handle this. In this case
            # we simply let the default function do its work.
-            raise NotImplementedError()
+            raise theano.tensor.ShapeError()
+            

    def perform(self,node, inp, out):
        """

--- a/theano/tensor/nnet/tests/test_nnet.py
+++ b/theano/tensor/nnet/tests/test_nnet.py
@@ -879,6 +879,7 @@ def test_argmax_pushdown():
            [x],
            [out])

+    config.warn.argmax_pushdown_bug = False
    theano.compile.mode.optdb.query(
            theano.compile.mode.OPT_FAST_RUN).optimize(env)

@@ -922,6 +923,7 @@ def test_argmax_pushdown_bias():
            [x,b],
            [out])

+    config.warn.argmax_pushdown_bug = False
    theano.compile.mode.optdb.query(
            theano.compile.mode.OPT_FAST_RUN).optimize(env)


--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -27,11 +27,12 @@ from theano import compile  #to register the optimizer built by this file
 from theano.gof.python25 import any, all
 from theano.gof.opt import Optimizer, pre_constant_merge, pre_greedy_local_optimizer
 from theano.gof import toolbox, DestroyHandler
-from basic import get_constant_value
+from basic import get_constant_value, ShapeError


 # Utilities

+
 def out2in(*local_opts):
    """WRITEME """
    return opt.TopoOptimizer(opt.LocalOptGroup(*local_opts),
@@ -528,7 +529,7 @@ class ShapeFeature(object):
    the cost of many Ops accurately, and generate c-code that is specific [e.g. unrolled] to
    particular sizes.

-    If you can determine the shape only in some case, return NotImplementedError when you can't
+    In cases where you cannot figure out the shape, raise a ShapeError.

    .. note::

@@ -719,8 +720,15 @@ class ShapeFeature(object):

        try:
            o_shapes = shape_infer(node, [self.shape_of[r] for r in node.inputs])
-        except NotImplementedError:
+        except ShapeError:
            o_shapes = self.default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
+        except NotImplementedError, e:
+            raise NotImplementedError(
+                    'Code called by infer_shape failed raising a '
+                    'NotImplementedError. Raising NotImplementedError to '
+                    'indicate that a shape cannot be computed is no longer '
+                    'supported, and one should now use tensor.ShapeError '
+                    'instead. The original exception message is: %s' % e)
        except Exception, e:
            _logger.error('Failed to infer_shape from Op %s.\nInput shapes:%s\nException encountered during infer_shape: %s\nException message: %s\nTraceback: %s'% (node.op,
                [self.shape_of[r] for r in node.inputs],
@@ -3427,11 +3435,12 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
    """
    def local_fuse(node):
        """
-        As part of specialisation, we fuse two consecutive elemwise op of the same shape.
-
-        For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast.
-        The number of dimension is validated at call time by theano itself.
+        As part of specialization, we fuse two consecutive elemwise Ops of the
+        same shape.

+        For mixed dtype, we let the Composite op do the cast. It lets the C
+        compiler do the cast.
+        The number of dimensions is validated at call time by theano itself.
        """
        # META TODO:  PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
        # TODO: use broadcast flag?
@@ -3547,7 +3556,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
        if new_nb_input != len(inputs) or len(s_inputs) != len(inputs):
            raise Exception("""Something has gone wrong with the elemwise
 fusion optimization. We skip this optimization. You can ignore this message,
-your code will run correctly, but maybe slower.""")
+your code will run correctly, but may be slower.""")

        otype = node.outputs[0].type
        s_new_out=node.op.scalar_op(*s_g)

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -47,6 +47,75 @@ def eval_outputs(outputs):
        return variables[0]
    return variables

+def get_numeric_subclasses(cls=numpy.number, ignore=None):
+    """
+    Return subclasses of `cls` in the numpy scalar hierarchy.
+
+    We only return subclasses that correspond to unique data types.
+    The hierarchy can be seen here:
+        http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
+    """
+    if ignore is None:
+        ignore = []
+    rval = []
+    dtype = numpy.dtype(cls)
+    dtype_num = dtype.num
+    if dtype_num not in ignore:
+        # Safety check: we should be able to represent 0 with this data type.
+        numpy.array(0, dtype=dtype)
+        rval.append(cls)
+        ignore.append(dtype_num)
+    for sub in cls.__subclasses__():
+        rval += [c for c in get_numeric_subclasses(sub, ignore=ignore)]
+    return rval
+
+
+def get_numeric_types(with_int=True, with_float=True, with_complex=False,
+                      with_128_bit=False):
+    """
+    Return numpy numeric data types.
+
+    :param with_int: Whether to include integer types.
+
+    :param with_float: Whether to include floating point types.
+
+    :param with_complex: Whether to include complex types.
+
+    :param with_128_bit: Whether to include 128/256-bit types.
+
+    :returns: A list of unique data type objects. Note that multiple data types
+    may share the same string representation, but can be differentiated through
+    their `num` attribute.
+
+    Note that we could probably rely on the lists of types defined in the
+    `scalar` module. However with this function we can test more unique dtype
+    objects, and possibly detect defects in dtypes that may be introduced in
+    numpy in the future.
+    """
+    rval = []
+    def is_within(cls1, cls2):
+        # Return True if scalars defined from `cls1` are within the hierarchy
+        # starting from `cls2`.
+        # The third test below is to catch for instance the fact that
+        # one can use ``dtype=numpy.number`` and obtain a float64 scalar, even
+        # though `numpy.number` is not under `numpy.floating` in the class
+        # hierarchy.
+        return (cls1 is cls2 or
+                issubclass(cls1, cls2) or
+                isinstance(numpy.array([0], dtype=cls1)[0], cls2))
+    for cls in get_numeric_subclasses():
+        dtype = numpy.dtype(cls)
+        if ((not with_complex and is_within(cls, numpy.complexfloating)) or
+            (not with_int and is_within(cls, numpy.integer)) or
+            (not with_float and is_within(cls, numpy.floating)) or
+            (not with_128_bit and ('128' in str(dtype) or
+                                   '256' in str(dtype)))):
+            # Ignore this class.
+            continue
+        rval.append([str(dtype), dtype, dtype.num])
+    # We sort it to be deterministic, then remove the string and num elements.
+    return [x[1] for x in sorted(rval, key=str)]
+
 def _numpy_checker(x, y):
    """
    Checks if x.data and y.data have the same contents.
@@ -374,6 +443,18 @@ _good_broadcast_div_mod_normal_float_inplace = dict(same_shapes = (rand(2, 3), r
 _good_broadcast_div_mod_normal_float = dict(empty2 = (numpy.asarray([0]), numpy.asarray([])),
                                            **_good_broadcast_div_mod_normal_float_inplace
                                            )
+def no_complex(d):
+    """Remove pairs from dictionary d when the value contains complex data."""
+    return dict((k, v) for k, v in d.iteritems()
+                if all(str(x.dtype) not in tensor.complex_dtypes for x in v))
+
+
+# 'No-complex' versions.
+_good_broadcast_div_mod_normal_float_no_complex = no_complex(
+                                        _good_broadcast_div_mod_normal_float)
+_good_broadcast_div_mod_normal_float_inplace_no_complex = no_complex(
+                                _good_broadcast_div_mod_normal_float_inplace)
+
 _grad_broadcast_div_mod_normal = dict(same_shapes = (rand(2, 3), rand(2, 3)),
                                      scalar = (rand(2, 3), rand(1, 1)),
                                      row = (rand(2, 3), rand(1, 3)),
@@ -389,8 +470,9 @@ _grad_broadcast_div_mod_normal = dict(same_shapes = (rand(2, 3), rand(2, 3)),

 div_grad_rtol=None
 if config.floatX=='float32':
-    #We raise the relative tolerence for the grad as their is error in float32
-    #This is probably caused by our way of computing the gradient error.
+    # We raise the relative tolerance for the grad as there can be errors in
+    # float32.
+    # This is probably caused by our way of computing the gradient error.
    div_grad_rtol=0.025
 DivTester = makeBroadcastTester(op = true_div,
                                  expected = lambda x, y: x / y,
@@ -410,14 +492,14 @@ DivInplaceTester = makeBroadcastTester(op = inplace.true_div_inplace,

 ModTester = makeBroadcastTester(op = mod,
                                  expected = lambda x, y: numpy.asarray(x % y, dtype=theano.scalar.basic.upcast(x.dtype, y.dtype)),
-                                  good = _good_broadcast_div_mod_normal_float,
+                                  good = _good_broadcast_div_mod_normal_float_no_complex,
 #                                               integers = (randint(2, 3), randint_nonzero(2, 3)),
 #                                               dtype_mixup_1 = (rand(2, 3), randint_nonzero(2, 3)),
 #                                               dtype_mixup_2 = (randint_nonzero(2, 3), rand(2, 3))),
                                  )
 ModInplaceTester = makeBroadcastTester(op = inplace.mod_inplace,
                                         expected = lambda x, y: numpy.asarray(x % y, dtype=theano.scalar.basic.upcast(x.dtype, y.dtype)),
-                                         good = _good_broadcast_div_mod_normal_float_inplace,
+                                         good = _good_broadcast_div_mod_normal_float_inplace_no_complex,
                                         inplace = True)

 _good_broadcast_pow_normal_float = dict(same_shapes = (rand_ranged(1, 5, (2, 3)), rand_ranged(-3, 3, (2, 3))),
@@ -2180,7 +2262,7 @@ class T_Join_and_Split(unittest.TestCase):

    def test_stack_scalar_make_vector(self):
        '''Test that calling stack() on scalars instantiates MakeVector,
-        not Join. Test that the floatX dtype stay floatX, not down casted to int64'''
+        not Join. Test that the floatX dtype stays floatX, not downcast to int64'''
        a = tensor.scalar('a')
        b = tensor.scalar('b')
        s = stack(a, b, a, b)
@@ -2665,9 +2747,9 @@ class T_divimpl(unittest.TestCase):
                (5.0/11.0))
        assert numpy.allclose(function([i, ii, d, f, c], f/i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
                (11.0/5.0))
-        assert numpy.allclose(function([i, ii, d, f, c], i/ii)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
+        assert numpy.allclose(function([i, ii, d, f, c], i//ii)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
                (5/3))
-        assert numpy.allclose(function([i, ii, d, f, c], ii/i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
+        assert numpy.allclose(function([i, ii, d, f, c], ii//i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
                (3/5))
        assert numpy.allclose(function([i, ii, d, f, c], true_div(i,ii))(5, 3, 7.0, 11.0, numpy.complex(5,3)),
                (5./3.))
@@ -3056,7 +3138,13 @@ class T_scalarfromtensor(unittest.TestCase):
        v = eval_outputs([ss])

        self.assertTrue(v == 56, v)
-        self.assertTrue(isinstance(v, numpy.int8))
+        if config.cast_policy == 'custom':
+            self.assertTrue(isinstance(v, numpy.int8))
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            self.assertTrue(isinstance(
+                v, getattr(numpy, str(numpy.asarray(56).dtype))))
+        else:
+            raise NotImplementedError(config.cast_policy)
        self.assertTrue(v.shape == (), v.shape)
        tt = lscalar()
        ss = scalar_from_tensor(tt)
@@ -3496,7 +3584,13 @@ class TestARange(unittest.TestCase):
        out = arange(start, stop, step)
        f = function([start, stop, step], out)

-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            numpy_dtype = numpy.arange(numpy.array(1, dtype='int32')).dtype
+            assert out.dtype == numpy_dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.all(f(0,5,1) == numpy.arange(0,5,1))
        assert numpy.all(f(2,11,4) == numpy.arange(2,11,4))
        assert numpy.all(f(-5,1,1) == numpy.arange(-5,1,1))
@@ -3510,13 +3604,31 @@ class TestARange(unittest.TestCase):
        out = arange(start, stop, step)
        f = function([start, stop, step], out)

-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy == 'numpy':
+            numpy_dtype = numpy.arange(numpy.array(0, dtype=start.dtype),
+                                       numpy.array(1, dtype=stop.dtype),
+                                       numpy.array(1, dtype=step.dtype)).dtype
+            assert out.dtype == numpy_dtype
+        elif config.cast_policy == 'numpy+floatX':
+            assert out.dtype == config.floatX
+        else:
+            raise NotImplementedError(config.cast_policy)
        arg_vals = [ (0,5,1), (2,11,4), (-5,1.1,1.2), (1.3,2,-2.1), (10,2,2) ]
        for arg_v in arg_vals:
            start_v, stop_v, step_v = arg_v
            start_v_, stop_v_, step_v_ = numpy.asarray(arg_v, dtype=start.type.dtype)
-            assert numpy.all(f(start_v_, stop_v_, step_v_) == \
-                    numpy.arange(start_v, stop_v, step_v, dtype=start.type.dtype))
+            f_val = f(start_v_, stop_v_, step_v_)
+            if config.cast_policy == 'custom':
+                expected_val = numpy.arange(start_v, stop_v, step_v,
+                                            dtype=start.type.dtype)
+            elif config.cast_policy in ('numpy', 'numpy+floatX'):
+                expected_val = numpy.arange(start_v_, stop_v_, step_v_,
+                                            dtype=out.dtype)
+            else:
+                raise NotImplementedError(config.cast_policy)
+            assert numpy.all(f_val == expected_val)

    def test_float64(self):
        """Test arange constructor, on float64 outputs"""
@@ -3529,8 +3641,15 @@ class TestARange(unittest.TestCase):
        for arg_v in arg_vals:
            start_v, stop_v, step_v = arg_v
            start_v_, stop_v_, step_v_ = numpy.asarray(arg_v, dtype=start.type.dtype)
-            assert numpy.all(f(start_v_, stop_v_, step_v_) == \
-                    numpy.arange(start_v, stop_v, step_v, dtype=start.type.dtype))
+            f_val = f(start_v_, stop_v_, step_v_)
+            if config.cast_policy == 'custom':
+                expected_val = numpy.arange(start_v, stop_v, step_v,
+                                            dtype=start.type.dtype)
+            elif config.cast_policy in ('numpy', 'numpy+floatX'):
+                expected_val = numpy.arange(start_v_, stop_v_, step_v_)
+            else:
+                raise NotImplementedError(config.cast_policy)
+            assert numpy.all(f_val == expected_val)

    def test_default_step(self):
        """Test that arange constructor uses the correct default step"""
@@ -3538,7 +3657,13 @@ class TestARange(unittest.TestCase):
        out = arange(start, stop)
        f = function([start, stop], out)

-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            assert out.dtype == numpy.arange(numpy.int32(0),
+                                             numpy.int32(1)).dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.all(f(0,5) == numpy.arange(0,5))
        assert numpy.all(f(-5,1) == numpy.arange(-5,1))
        assert numpy.all(f(0,0) == numpy.arange(0,0))
@@ -3560,7 +3685,12 @@ class TestARange(unittest.TestCase):
        out = arange(stop)
        f = function([stop], out)

-        assert out.dtype == stop.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == stop.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            assert out.dtype == numpy.arange(numpy.int32(1)).dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.all(f(8) == numpy.arange(8))
        assert numpy.all(f(-2) == numpy.arange(-2))

@@ -3568,24 +3698,93 @@ class TestARange(unittest.TestCase):
        fout = arange(fstop)
        ff = function([fstop], fout)

-        assert fout.dtype == fstop.type.dtype
+        if config.cast_policy == 'custom':
+            assert fout.dtype == fstop.type.dtype
+        elif config.cast_policy == 'numpy':
+            assert fout.dtype == numpy.arange(numpy.float32(1)).dtype
+        elif config.cast_policy == 'numpy+floatX':
+            if config.floatX == 'float32':
+                assert fout.dtype == 'float32'
+            else:
+                assert fout.dtype == numpy.arange(numpy.float32(1)).dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
+
        fstop_values = [0.2, -0.7, 8.5]
        for fstop_v in fstop_values:
            fstop_v32 = numpy.float32(fstop_v)
            assert numpy.all(ff(fstop_v32) == numpy.arange(fstop_v))

    def test_upcast(self):
-        """Test that arange compute output type adequately"""
-        assert arange(iscalar()).dtype == iscalar().dtype
-        assert arange(fscalar()).dtype == fscalar().dtype
-        assert arange(dscalar()).dtype == dscalar().dtype
-
-        # int32 + float32 -> float64
-        assert arange(iscalar(), fscalar()).dtype == dscalar().dtype
-        assert arange(iscalar(), dscalar()).dtype == dscalar().dtype
-        assert arange(fscalar(), dscalar()).dtype == dscalar().dtype
-
-        assert arange(iscalar(), fscalar(), dscalar()).dtype == dscalar().dtype
+        """Test that arange computes output type adequately"""
+        if config.cast_policy == 'custom':
+            assert arange(iscalar()).dtype == iscalar().dtype
+            assert arange(fscalar()).dtype == fscalar().dtype
+            assert arange(dscalar()).dtype == dscalar().dtype
+
+            # int32 + float32 -> float64
+            assert arange(iscalar(), fscalar()).dtype == dscalar().dtype
+            assert arange(iscalar(), dscalar()).dtype == dscalar().dtype
+            assert arange(fscalar(), dscalar()).dtype == dscalar().dtype
+
+            assert arange(iscalar(), fscalar(), dscalar()).dtype == dscalar().dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            for dtype in get_numeric_types():
+                # Test with a single argument.
+                arange_dtype = arange(scalar(dtype=str(dtype))).dtype
+                numpy_dtype = numpy.arange(numpy.array(1, dtype=dtype)).dtype
+                if (dtype != 'float64' and
+                    numpy_dtype == 'float64' and
+                    config.cast_policy == 'numpy+floatX' and
+                    config.floatX == 'float32'):
+                    # We want a float32 arange.
+                    assert arange_dtype == 'float32'
+                else:
+                    # Follow numpy.
+                    assert arange_dtype == numpy_dtype
+                
+                # Test with two arguments.
+                for stop_dtype in get_numeric_types():
+                    arange_dtype = arange(
+                            start=scalar(dtype=str(dtype)),
+                            stop=scalar(dtype=str(stop_dtype))).dtype
+                    numpy_dtype = numpy.arange(
+                            start=numpy.array(0, dtype=dtype),
+                            stop=numpy.array(1, dtype=stop_dtype)).dtype
+                    if (dtype != 'float64' and
+                        stop_dtype != 'float64' and
+                        numpy_dtype == 'float64' and
+                        config.cast_policy == 'numpy+floatX' and
+                        config.floatX == 'float32'):
+                        # We want a float32 arange.
+                        assert arange_dtype == 'float32'
+                    else:
+                        # Follow numpy.
+                        assert arange_dtype == numpy_dtype
+
+                    # Test with three arguments.
+                    for step_dtype in get_numeric_types():
+                        arange_dtype = arange(
+                                start=scalar(dtype=str(dtype)),
+                                stop=scalar(dtype=str(stop_dtype)),
+                                step=scalar(dtype=str(step_dtype))).dtype
+                        numpy_dtype = numpy.arange(
+                                start=numpy.array(0, dtype=dtype),
+                                stop=numpy.array(1, dtype=stop_dtype),
+                                step=numpy.array(1, dtype=step_dtype)).dtype
+                        if (dtype != 'float64' and
+                            stop_dtype != 'float64' and
+                            step_dtype != 'float64' and
+                            numpy_dtype == 'float64' and
+                            config.cast_policy == 'numpy+floatX' and
+                            config.floatX == 'float32'):
+                            # We want a float32 arange.
+                            assert arange_dtype == 'float32'
+                        else:
+                            # Follow numpy.
+                            assert arange_dtype == numpy_dtype
+        else:
+            raise NotImplementedError(config.cast_policy)

    def test_dtype_cache(self):
        """Checks that the same Op is returned on repeated calls to arange
@@ -3593,8 +3792,8 @@ class TestARange(unittest.TestCase):

        start, stop, step = iscalars('start', 'stop', 'step')
        out1 = arange(start, stop, step)
-        out2 = arange(start, stop, step, dtype=start.type.dtype)
-        out3 = arange(start, stop, 2., dtype=start.type.dtype)
+        out2 = arange(start, stop, step, dtype=out1.dtype)
+        out3 = arange(start, stop, 2., dtype=out1.dtype)
        out4 = arange(start, stop, 2.)

        assert out1.owner.op is out2.owner.op
@@ -3612,7 +3811,16 @@ class TestARange(unittest.TestCase):
        assert len(f.maker.env.toposort())==7
 #7 [Elemwise{sub,no_inplace}(stop, start), Elemwise{Cast{float64}}(Elemwise{sub,no_inplace}.0), Elemwise{TrueDiv{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{Cast{float64}}.0, step), Elemwise{Ceil{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{TrueDiv{output_types_preference=transfer_type{0}}}[(0, 0)].0), Elemwise{Cast{int64}}(Elemwise{Ceil{output_types_preference=transfer_type{0}}}[(0, 0)].0), Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{Cast{int64}}.0, 0), MakeVector(Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)].0)]

-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            numpy_dtype = numpy.arange(numpy.array(0, dtype=start.dtype),
+                                       numpy.array(1, dtype=stop.dtype),
+                                       numpy.array(1, dtype=step.dtype)).dtype
+            assert out.dtype == numpy_dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
+
        assert numpy.all(f(0,5,1) == len(numpy.arange(0,5,1)))
        assert numpy.all(f(2,11,4) == len(numpy.arange(2,11,4)))
        assert numpy.all(f(-5,1,1) == len(numpy.arange(-5,1,1)))
@@ -3624,7 +3832,13 @@ class TestARange(unittest.TestCase):
        f = function([start, stop], out.shape, mode=mode)
        assert len(f.maker.env.toposort())==4
 #4 [Elemwise{sub,no_inplace}(stop, start), Elemwise{Cast{int64}}(Elemwise{sub,no_inplace}.0), Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{Cast{int64}}.0, 0), MakeVector(Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)].0)]
-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            assert out.dtype == numpy.arange(
+                    numpy.int32(0), numpy.int32(1), numpy.int32(1)).dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.all(f(0,5) == len(numpy.arange(0,5)))
        assert numpy.all(f(2,11) == len(numpy.arange(2,11)))
        assert numpy.all(f(-5,1) == len(numpy.arange(-5,1)))
@@ -3637,7 +3851,16 @@ class TestARange(unittest.TestCase):
        assert len(f.maker.env.toposort())==2
        #[Elemwise{Cast{int64}}(stop), MakeVector(Elemwise{Cast{int64}}.0)]

-        assert out.dtype == start.type.dtype
+        if config.cast_policy == 'custom':
+            assert out.dtype == start.type.dtype
+        elif config.cast_policy in ('numpy', 'numpy+floatX'):
+            numpy_dtype = numpy.arange(0,
+                                       numpy.array(1, dtype=stop.dtype),
+                                       1).dtype
+            assert out.dtype == numpy_dtype
+        else:
+            raise NotImplementedError(config.cast_policy)
+
        assert numpy.all(f(5) == len(numpy.arange(0,5)))
        assert numpy.all(f(11) == len(numpy.arange(0,11)))
        assert numpy.all(f(1) == len(numpy.arange(0,1)))
@@ -4074,6 +4297,22 @@ def test_default_state():
    assert numpy.allclose(f(numpy.asarray(2.2, dtype=config.floatX)), 7)

 def test_autocast():
+    """Run every `_test_autocast_*` helper under its `config.cast_policy`."""
+    backup_config = config.cast_policy
+    for autocast_cfg, run_checks in (
+            ('custom', _test_autocast_custom),
+            ('numpy', _test_autocast_numpy),
+            ('numpy+floatX', _test_autocast_numpy_floatX),
+            ):
+        config.cast_policy = autocast_cfg
+        try:
+            run_checks()
+        finally:
+            config.cast_policy = backup_config  # restored even on failure
+
+def _test_autocast_custom():
+    """Called from `test_autocast`."""
+    assert config.cast_policy == 'custom'
    orig_autocast = autocast_float.dtypes

    # Test that autocast_float_as sets the autocast dtype correctly
@@ -4165,6 +4404,180 @@ def test_autocast():
    finally:
        ac.__exit__()

+
+def _test_autocast_numpy():
+    """Called from `test_autocast`."""
+    assert config.cast_policy == 'numpy'
+    # Go through some typical scalar values.
+    def ok(z):
+        assert tensor.constant(z).dtype == numpy.asarray(z).dtype
+    for x in ([2**i for i in xrange(63)] +
+              [0] +
+              [0., 1., 1.1, 1.5]):
+        n_x = numpy.asarray(x)
+        # Make sure the data type is the same as the one found by numpy.
+        ok(x)
+        ok(-x)
+        ok(x - 1)
+        ok(-x + 1)
+        ok(n_x)
+
+
+def _test_autocast_numpy_floatX():
+    """Called from `test_autocast`."""
+    assert config.cast_policy == 'numpy+floatX'
+    backup_floatX = config.floatX
+    def ok(z, floatX):
+        if (isinstance(z, float) and
+            floatX == 'float32' and
+            not hasattr(z, 'dtype')):
+            # Special case where we use 'float32' instead of 'float64'.
+            assert tensor.constant(z).dtype == 'float32'
+        else:
+            assert tensor.constant(z).dtype == numpy.asarray(z).dtype
+    try:
+        # Test with various values of `config.floatX`.
+        for floatX in ('float32', 'float64'):
+            config.floatX = floatX
+            # Go through some typical scalar values.
+            for x in ([2**i for i in xrange(63)] +
+                      [0] +
+                      [0., 1., 1.1, 1.5]):
+                ok(x, floatX)
+                ok(-x, floatX)
+                ok(x - 1, floatX)
+                ok(-x + 1, floatX)
+                ok(numpy.asarray(x), floatX)
+                ok(numpy.float64(x), floatX)
+    finally:
+        config.floatX = backup_floatX
+
+
+class test_arithmetic_cast(unittest.TestCase):
+
+    """
+    Test output types of basic arithmetic operations (* / + - //).
+
+    We only test the behavior for `config.cast_policy` set to either 'numpy' or
+    'numpy+floatX': the 'custom' behavior is (at least partially) tested in
+    `_test_autocast_custom`.
+    """
+
+    def test_arithmetic_cast(self):
+        backup_config = config.cast_policy
+        dtypes = get_numeric_types(with_complex=True)
+        # Here:
+        # scalar == scalar stored as a 0d array
+        # array == 1d array
+        # i_scalar == scalar type used internally by Theano
+        theano_scalar = lambda dtype: tensor.scalar(dtype=str(dtype))
+        numpy_scalar = lambda dtype: numpy.array(1, dtype=dtype)
+        theano_array = lambda dtype: tensor.vector(dtype=str(dtype))
+        numpy_array = lambda dtype: numpy.array([1], dtype=dtype)
+        theano_i_scalar = lambda dtype: theano.scalar.Scalar(str(dtype))()
+        numpy_i_scalar = numpy_scalar
+        try:
+            for cfg in ('numpy', 'numpy+floatX'):
+                config.cast_policy = cfg
+                for op in (operator.add, operator.sub, operator.mul,
+                           operator.div, operator.floordiv):
+                    for a_type in dtypes:
+                        for b_type in dtypes:
+                            # Note that we do not test division between
+                            # integers if it is forbidden.
+                            # Theano deals with integer division in its own
+                            # special way (depending on `config.int_division`).
+                            is_int_division = (
+                                    op is operator.div and
+                                    a_type in tensor.discrete_dtypes and
+                                    b_type in tensor.discrete_dtypes)
+                            # We will test all meaningful combinations of
+                            # scalar and array operations.
+                            for combo in (
+                                          ('scalar', 'scalar'),
+                                          ('array', 'array'),
+                                          ('scalar', 'array'),
+                                          ('array', 'scalar'),
+                                          ('i_scalar', 'i_scalar'),
+                                          ):
+
+                                theano_args = map(eval,
+                                        ['theano_%s' % c for c in combo])
+                                numpy_args = map(eval,
+                                        ['numpy_%s' % c for c in combo])
+                                try:
+                                    theano_dtype = op(
+                                        theano_args[0](a_type),
+                                        theano_args[1](b_type)).type.dtype
+                                    # Should have crashed if it is an integer
+                                    # division and `config.int_division` does
+                                    # not allow it.
+                                    assert not (is_int_division and
+                                                config.int_division == 'raise')
+                                except theano.scalar.IntegerDivisionError:
+                                    assert (is_int_division and
+                                            config.int_division == 'raise')
+                                    # This is the expected behavior.
+                                    continue
+                                # For numpy we have a problem:
+                                #   http://projects.scipy.org/numpy/ticket/1827
+                                # As a result we only consider the highest data
+                                # type that numpy may return.
+                                numpy_dtypes = [
+                                        op(numpy_args[0](a_type),
+                                           numpy_args[1](b_type)).dtype,
+                                        op(numpy_args[1](b_type),
+                                           numpy_args[0](a_type)).dtype]
+                                numpy_dtype = theano.scalar.upcast(
+                                        *map(str, numpy_dtypes))
+                                if numpy_dtype == theano_dtype:
+                                    # Same data type found, all is good!
+                                    continue
+                                if (cfg == 'numpy+floatX' and
+                                    config.floatX == 'float32' and
+                                    a_type != 'float64' and
+                                    b_type != 'float64' and
+                                    numpy_dtype == 'float64'):
+                                    # We should keep float32.
+                                    assert theano_dtype == 'float32'
+                                    continue
+                                if 'array' in combo and 'scalar' in combo:
+                                    # For mixed scalar / array operations,
+                                    # Theano may differ from numpy as it does
+                                    # not try to prevent the scalar from
+                                    # upcasting the array.
+                                    array_type, scalar_type = (
+                                            (a_type, b_type)[
+                                                        list(combo).index(arg)]
+                                            for arg in ('array', 'scalar'))
+                                    up_type = theano.scalar.upcast(array_type,
+                                                                   scalar_type)
+                                    if (
+                                        # The two data types are different.
+                                        scalar_type != array_type and
+                                        # The array type alone cannot hold
+                                        # the scalar type as well.
+                                        array_type != up_type and
+                                        # Theano upcasted the result array.
+                                        theano_dtype == up_type and
+                                        # But Numpy kept its original type.
+                                        # (not an equality because of numpy bug
+                                        # mentioned above).
+                                        array_type in numpy_dtypes):
+                                        # Then we accept this difference in
+                                        # behavior.
+                                        continue
+                                if (is_int_division and
+                                    config.int_division == 'floatX'):
+                                    assert theano_dtype == config.floatX
+                                    continue
+                                # In any other situation: something wrong is
+                                # going on!
+                                assert False
+        finally:
+            config.cast_policy = backup_config
+
+
 class test_broadcast(unittest.TestCase):
    def test_broadcast_bigdim(self):
        def f():
@@ -4373,6 +4786,18 @@ class T_as_tensor_variable(unittest.TestCase):
        assert ten.type.dtype == 'uint8'


+class test_complex_mod(unittest.TestCase):
+    """Make sure % fails on complex numbers."""
+
+    def test_fail(self):
+        x = vector(dtype='complex64')
+        try:
+            x % 5
+            assert False
+        except ComplexError:
+            pass
+
+
 if __name__ == '__main__':
    if 1:
        unittest.main()

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
-
-import time
-import unittest
+import cPickle, time, unittest

 from theano.gof import Variable, Op
 from theano import gof
@@ -399,6 +397,14 @@ class test_Prod(unittest.TestCase):

        fn_debug(a)

+    def test_pickle_bug(self):
+        # Regression test for bug fixed in 24d4fd291054.
+        o = Prod()
+        s = cPickle.dumps(o)
+        o = cPickle.loads(s)
+        cPickle.dumps(o)
+
+
 if __name__ == '__main__':
    #unittest.main()
    suite = unittest.TestSuite([test_Prod('test_mul_without_zeros_zeros')])

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -183,15 +183,15 @@ class test_canonize(unittest.TestCase):
 #            (fx*fy*(fx+fy+dz),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type add
 #            (dz*fy*(fx+fy),(fx,fy,dz),(dxv,dyv,dzv),2,'float64'),#check mixed type mul
            #check with dimshuffle of constant
-            (fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
-            (fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
+            (fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1, {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
+            (fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1, {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
 #            (2+fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
 #            (2*fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
-            (2+fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
-            (2*fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1,'float32'),
+            (2+fx+fy+fz+2,(fx,fy,fz),(fxv,fyv,fzv),1, {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
+            (2*fx*fy*fz*2,(fx,fy,fz),(fxv,fyv,fzv),1, {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
 #            (fx*fy*2*(fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
 #            (fx*fy*(2+fx+fy+fz),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
-            (fx*fy*2*(fx+fy+fz+2),(fx,fy,fz),(fxv,fyv,fzv),2,'float32'),
+            (fx*fy*2*(fx+fy+fz+2),(fx,fy,fz),(fxv,fyv,fzv),2, {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),

            #check with broadcast of row
 #            (fx+fy+fz+fv,(fx,fy,fz,fv),(fxv,fyv,fzv,fvv),1,'float32'),
@@ -220,6 +220,8 @@ class test_canonize(unittest.TestCase):
            mode._optimizer=gof.Query(["canonicalize"])
            mode._optimizer=mode._optimizer.excluding('local_elemwise_fusion')
            for id, [g, sym_inputs, val_inputs, nb_elemwise, out_dtype] in enumerate(cases):
+                if isinstance(out_dtype, dict):
+                    out_dtype = out_dtype[config.cast_policy]
                f = compile.function(list(sym_inputs), g,
                                     #we need the optimisation enabled, debug do this.
                                     mode=mode)
@@ -445,12 +447,15 @@ class test_canonize(unittest.TestCase):
            #test (2.0 * x) / (4.0 * y) -> (0.5 * x) / y
            for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
                                                           (((2.0*dx)/(4.0*dy)),[dx,dy],[dxv,dyv],'float64'),
-                                                           (((2.0*fx)/(4.0*fy)),[fx,fy],[fxv,fyv],'float32'),
+                                                           (((2.0*fx)/(4.0*fy)),[fx,fy],[fxv,fyv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                                                           (((2.0*dv)/(4.0*dy)),[dv,dy],[dvv,dyv],'float64'),
-                                                           (((2.0*fv)/(4.0*fy)),[fv,fy],[fvv,fyv],'float32'),
+                                                           (((2.0*fv)/(4.0*fy)),[fv,fy],[fvv,fyv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                                                           (((2.0*dx)/(4.0*dv)),[dx,dv],[dxv,dvv],'float64'),
-                                                           (((2.0*fx)/(4.0*fv)),[fx,fv],[fxv,fvv],'float32'),
+                                                           (((2.0*fx)/(4.0*fv)),[fx,fv],[fxv,fvv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                ]):
+
+                if isinstance(out_dtype, dict):
+                    out_dtype = out_dtype[config.cast_policy]
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
                out = f(*val_inputs)
@@ -468,10 +473,12 @@ class test_canonize(unittest.TestCase):
            #test 2 * x / 2 -> x
            for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
                                                           ((2*dx)/2,[dx],[dxv],'float64'),
-                                                           ((2*fx)/2,[fx],[fxv],'float32'),
+                                                           ((2*fx)/2,[fx],[fxv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                                                           ((2*dv)/2,[dv],[dvv],'float64'),
-                                                           ((2*fv)/2,[fv],[fvv],'float32'),
+                                                           ((2*fv)/2,[fv],[fvv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                ]):
+                if isinstance(out_dtype, dict):
+                    out_dtype = out_dtype[config.cast_policy]
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
                out = f(*val_inputs)
@@ -484,11 +491,11 @@ class test_canonize(unittest.TestCase):
            #test x / abs(x) -> sign(x)
            for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
                                                           (dx/abs(dx),[dx],[0.5-dxv],'float64'),
-                                                           (fx/abs(fx),[fx],[0.5-fxv],'float32'),
+                                                           (fx/abs(fx),[fx],[0.5-fxv], 'float32'),
                                                           (dx/abs(dx),[dx],[0.1*dxv],'float64'),
-                                                           (fx/abs(fx),[fx],[0.1*fxv],'float32'),
+                                                           (fx/abs(fx),[fx],[0.1*fxv], 'float32'),
                                                           (dv/abs(dv),[dv],[0.5-dvv],'float64'),
-                                                           (fv/abs(fv),[fv],[0.5-fvv],'float32'),
+                                                           (fv/abs(fv),[fv],[0.5-fvv], 'float32'),
                ]):
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
@@ -501,12 +508,15 @@ class test_canonize(unittest.TestCase):
            #test (2*x) / (3*abs(x)) -> sign(x)
            for id,(g, sym_inputs, val_inputs, out_dtype) in enumerate([
                    ((2*dx)/(3*abs(dx)),[dx],[0.5-dxv],'float64'),
-                    ((2*fx)/(3*abs(fx)),[fx],[0.5-fxv],'float32'),
+                    ((2*fx)/(3*abs(fx)),[fx],[0.5-fxv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                    ((2*dx)/(3*abs(dx)),[dx],[0.1*dxv],'float64'),
-                    ((2*fx)/(3*abs(fx)),[fx],[0.1*fxv],'float32'),
+                    ((2*fx)/(3*abs(fx)),[fx],[0.1*fxv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                    ((2*dv)/(3*abs(dv)),[dv],[0.5-dvv],'float64'),
-                    ((2*fv)/(3*abs(fv)),[fv],[0.5-fvv],'float32'),
+                    ((2*fv)/(3*abs(fv)),[fv],[0.5-fvv], {'custom': 'float32', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
                ]):
+
+                if isinstance(out_dtype, dict):
+                    out_dtype = out_dtype[config.cast_policy]
                f = compile.function(list(sym_inputs), g,
                                     mode=mode)
                topo = f.maker.env.toposort()
@@ -647,10 +657,14 @@ def test_local_merge_abs():


 def test_mixeddiv():
-    """Test that int division is preserved"""
+    """Test that int division raises an exception."""
    i = iscalar()
    d = dscalar()
-    assert 0 == function([i,d], d*(i/(i+1)))(3, 1.0)
+    try:
+        function([i,d], d*(i/(i+1)))(3, 1.0)
+        assert False  # only reached if no IntegerDivisionError was raised
+    except theano.scalar.IntegerDivisionError:
+        pass

 def test_const_type_in_mul_canonizer():
    input = dmatrix()
@@ -715,6 +729,7 @@ class test_fusion(unittest.TestCase):
        izv = theano._asarray(my_init(shp,num=70),dtype='int32')
        fwx=fw+fx
        ftanx = theano.tensor.tan(fx)
+        ftwo = tensor.constant(2, dtype='float32')
        cases = [
            (fx+fy+fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+fzv,'float32'),#0
            (fx*fy*fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv*fzv,'float32'),#1
@@ -733,12 +748,12 @@ class test_fusion(unittest.TestCase):
            (fx*fy+fz+fy,(fx,fy,fz),(fxv,fyv,fzv),1,fxv*fyv+fzv+fyv,'float32'),
            (fx*fy*fz*fw+fx+fy+fz+fw,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fxv*fyv*fzv*fwv+fxv+fyv+fzv+fwv,'float32'),#15
            #test with constant
-            ((fw+fx)+(fy+fz)+2,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
-            (((fw+fx)+2+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
-            ((fw+(fx+2+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
-            ((fw+(fx+fy)+2+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
-            (fw+(fx+(fy+fz)+2),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
-            (2+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            ((fw+fx)+(fy+fz)+ ftwo,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            (((fw+fx)+ftwo+fy)+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            ((fw+(fx+ftwo+fy))+fz,(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            ((fw+(fx+fy)+ftwo+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
+            (fw+(fx+(fy+fz)+ftwo),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),#20
+            (ftwo+(fw+fx)+(fy+fz),(fw,fx,fy,fz),(fwv,fxv,fyv,fzv),1,fwv+fxv+fyv+fzv+2,'float32'),
            #mix float32 and float64
            (2+(dw+fx)+(fy+fz),(dw,fx,fy,fz),(dwv,fxv,fyv,fzv),1,dwv+fxv+fyv+fzv+2,'float64'),
            (2+(fw+dw)+(fy+fz),(fw,dw,fy,fz),(fwv,dwv,fyv,fzv),1,fwv+dwv+fyv+fzv+2,'float64'),
@@ -768,10 +783,10 @@ class test_fusion(unittest.TestCase):
            (fx+fy+theano.tensor.exp(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv+fyv+numpy.exp(fzv),'float32'),#35
            (fx-fy-fz,(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv-fzv,'float32'),
            (fx-(fy/fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
-            (fx-theano.tensor.true_div(fy,2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
+            (fx-theano.tensor.true_div(fy,ftwo),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
            (fx-theano.tensor.true_div(fy,fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv/fzv),'float32'),
-            (fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)),'float64'),#int32 - float32 = float64 #No c_code for int_div#40
-            (fx-(fy/2),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
+            (fx-theano.tensor.int_div(ix*100,iy*1000),(fx,ix,iy),(fxv,ixv,iyv),4,fxv-((ixv*100)//(iyv*1000)), {'custom': 'float64', 'numpy+floatX': config.floatX, 'numpy': 'float64'}), #No c_code for int_div#40
+            (fx-(fy/ftwo),(fx,fy),(fxv,fyv),1,fxv-(fyv/2),'float32'),
            (fx-(fy%fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv%fzv),'float32'),
            (fx-(fy>fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>fzv),'float32'),
            (fx-(fy>=fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-(fyv>=fzv),'float32'),
@@ -790,10 +805,10 @@ class test_fusion(unittest.TestCase):
            (fx-fy+theano.tensor.round(fz),(fx,fy,fz),(fxv,fyv,fzv),1,fxv-fyv+numpy.round(fzv),'float32'),
            (ix-iy+theano.tensor.iround(fz),(ix,iy,fz),(ixv,iyv,fzv),1,ixv-iyv+numpy.round(fzv),'int64'),
            # Bit op
-            (fx-theano.tensor.or_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv|izv),'float64'),
-            (fx-theano.tensor.xor(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv^izv),'float64'),#60
-            (fx-theano.tensor.and_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv&izv),'float64'),
-            (fx-theano.tensor.invert(iy),(fx,iy),(fxv,iyv),1,fxv-(~iyv),'float64'),
+            (fx-theano.tensor.or_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv|izv), {'custom': 'float64', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
+            (fx-theano.tensor.xor(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv^izv), {'custom': 'float64', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),#60
+            (fx-theano.tensor.and_(iy,iz),(fx,iy,iz),(fxv,iyv,izv),1,fxv-(iyv&izv), {'custom': 'float64', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),
+            (fx-theano.tensor.invert(iy),(fx,iy),(fxv,iyv),1,fxv-(~iyv), {'custom': 'float64', 'numpy+floatX': config.floatX, 'numpy': 'float64'}),

            (fx-theano.tensor.cast(fy,dtype='float64'),(fx,fy),(fxv,fyv),1,
                              fxv-numpy.asarray(fyv,'float64'),'float64'),
@@ -819,8 +834,10 @@ class test_fusion(unittest.TestCase):
        fail3=[]
        fail4=[]
        for id, [g, sym_inputs, val_inputs, nb_elemwise, answer, out_dtype] in enumerate(cases):
+            if isinstance(out_dtype, dict):
+                out_dtype = out_dtype[config.cast_policy]
            if gpu and (out_dtype!='float32' or any(i.dtype != 'float32' for i in g.owner.inputs)):
-                print "Skip test %d as the gpu code currently support only float32" % id
+                print "Skip test %d as the gpu code currently supports only float32" % id
                continue
            print "new cases", id

@@ -831,7 +848,8 @@ class test_fusion(unittest.TestCase):
                    out=f(*val_inputs)
                t1=time.time()
            else:
-                out=shared_fn(numpy.zeros(shp, dtype=out_dtype),'out')
+                out = shared_fn(numpy.zeros(shp, dtype=out_dtype), 'out')
+                assert out.dtype == g.dtype
                f = function(sym_inputs,[],updates=[(out, g)],mode=mode)
                t0=time.time()
                for x in range(nb_repeat):
@@ -2497,6 +2515,7 @@ class T_local_sum(unittest.TestCase):
        assert numpy.allclose(f(input),input.sum())


+        config.warn.sum_sum_bug = False
        f = theano.function([a],a.sum(0).sum(0).sum(0),mode=self.mode)
        assert len(f.maker.env.nodes)==1
        assert numpy.allclose(f(input),input.sum())
@@ -2506,6 +2525,7 @@ class T_local_sum(unittest.TestCase):
        input=numpy.arange(3*3*3, dtype=config.floatX).reshape(3,3,3)
        dims=[(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)]

+        config.warn.sum_sum_bug = False
        for d,dd in dims:
            f = theano.function([a],a.sum(d).sum(dd),mode=self.mode)
            assert numpy.allclose(f(input),input.sum(d).sum(dd))
@@ -2551,6 +2571,7 @@ class T_local_sum(unittest.TestCase):
                assert len(f.maker.env.nodes)==nb_nodes[2]
                assert f.maker.env.toposort()[-1].op==T.alloc

+            config.warn.sum_sum_bug = False
            for d, dd in [(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)]:
                f = theano.function([a],t_like(a).sum(d).sum(dd),mode=mode)
                print f.maker.env.toposort()
@@ -2610,6 +2631,8 @@ class T_local_sum_dimshuffle(unittest.TestCase):
        c_val = rng.randn(2,2,2).astype(config.floatX)
        d_val = numpy.asarray(rng.randn(), config.floatX)

+        config.warn.sum_sum_bug = False
+        config.warn.sum_div_dimshuffle_bug = False
        for i,s in enumerate(sums):
            print i
            f = theano.function([a,b,c,d], s, mode=self.mode)
@@ -2753,8 +2776,17 @@ def test_local_mul_to_neg():
    f1 = theano.function([a], -1*a)
    f2 = theano.function([a], -1.0*a)
    aval = numpy.random.randint(0,10,(2,2)).astype('int32')
-    assert f1(aval).dtype == a.dtype
-    assert f2(aval).dtype == 'float64'
+    if config.cast_policy == 'custom':
+        assert f1(aval).dtype == a.dtype
+        assert f2(aval).dtype == 'float64'
+    elif config.cast_policy == 'numpy':
+        assert f1(aval).dtype == str(numpy.array(0).dtype)
+        assert f2(aval).dtype == 'float64'
+    elif config.cast_policy == 'numpy+floatX':
+        assert f1(aval).dtype == str(numpy.array(0).dtype)
+        assert f2(aval).dtype == config.floatX
+    else:
+        raise NotImplementedError(config.cast_policy)

 def test_local_add_specialize():


--- a/theano/tests/test_tutorial.py
+++ b/theano/tests/test_tutorial.py
 """ test code snippet in the Theano tutorials.
 """

-import unittest
+import os, unittest
 import theano
 import theano.tensor as T
 from theano import function
@@ -722,6 +722,15 @@ class T_loading_and_saving(unittest.TestCase):

        mode_instance = theano.compile.mode.get_mode(None)
        if not isinstance(mode_instance, theano.compile.debugmode.DebugMode):
+            if os.path.exists('obj.save') or os.path.exists('objects.save'):
+                # We do not want to delete these files silently, in case
+                # they are, for some reason, something other than
+                # test-generated files.
+                # Ideally we would save those files in a temporary directory...
+                raise AssertionError(
+                        'Please get rid of files obj.save and '
+                        'objects.save in directory %s' % os.getcwd())
+
            f = file('obj.save', 'wb')
            cPickle.dump(my_obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
            f.close()
@@ -746,6 +755,9 @@ class T_loading_and_saving(unittest.TestCase):
                loaded_objects.append(cPickle.load(f))
            f.close()

+            # Cleanup created files.
+            os.remove('obj.save')
+            os.remove('objects.save')

 class T_modes(unittest.TestCase):
    ## All tests here belong to