Commit 77223d49 authored by Olivier Delalleau

Merged

......@@ -6,7 +6,7 @@ from theano.gof.cc import get_module_cache
if len(sys.argv) == 1:
print config.compiledir
elif sys.argv[1] in ('clear',):
get_module_cache().clear()
get_module_cache().clear(unversioned_min_age=-1)
else:
print 'command "%s" not recognized' % sys.argv[1]
print 'Type "theano-cache" to print the cache location'
......
......@@ -144,7 +144,7 @@ import theano and print the config variable, as in:
.. attribute:: floatX
String value: either 'float64' or 'float32'.
String value: either 'float64' or 'float32'
Default: 'float64'
......@@ -152,6 +152,47 @@ import theano and print the config variable, as in:
and similar functions. It also sets the default theano bit width for
arguments passed as Python floating-point numbers.
.. attribute:: cast_policy
String value: either 'numpy+floatX', 'numpy' or 'custom'
Default: 'custom'
This specifies how data types are implicitly figured out in Theano, e.g. for
constants or in the results of arithmetic operations. The current default
value ('custom') corresponds to a set of custom rules originally used in
Theano (which can be partially customized, see e.g. the in-code help of
``tensor.NumpyAutocaster``). However the 'custom' option will be
deprecated in a future release of Theano. The 'numpy' setting attempts to
mimic the numpy casting rules. 'numpy+floatX' does the same, except that
it prefers to use float32 numbers instead of float64 when ``config.floatX``
is set to 'float32' (this will become the default value in a future
release of Theano). Note that both 'numpy' and 'numpy+floatX'
behave differently from numpy on purpose in the following situations:
* Depending on the value of ``config.int_division``, the resulting type
of a division of integer types with the ``/`` operator may not match
that of numpy.
* On mixed scalar / array operations, numpy tries to prevent the scalar
from upcasting the array's type unless it is of a fundamentally
different type. However it is not practical to implement in Theano
a behavior similar to the one currently found in numpy, so Theano
does not attempt to do the same.
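For illustration only (an editor's sketch, not part of this patch), assuming ``config.floatX`` is set to 'float32'::

    from theano import tensor
    x = tensor.constant(1.5)   # a plain Python float with no explicit dtype
    # x.dtype is 'float64' under cast_policy='numpy' (plain numpy semantics),
    # but 'float32' under 'numpy+floatX' (and under the current 'custom'
    # default), since the constant carries no explicit dtype.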
.. attribute:: int_division
String value: either 'int', 'floatX' or 'raise'
Default: 'int'
Specifies what to do when one tries to compute ``x / y``, where both ``x`` and
``y`` are of integer types (possibly unsigned). 'int' means an integer is
returned (as in Python 2.X), but this behavior is deprecated. 'floatX'
returns a number of type given by ``config.floatX``. 'raise' is the safest
choice (and will become the default in a future release of Theano) and raises
an error when one tries to do such an operation, enforcing the use of the
integer division operator (``//``) (if a float result is intended, either
cast one of the arguments to a float, or use ``x.__truediv__(y)``).
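A hypothetical sketch (editor's illustration, not part of this patch), using two int64 vectors::

    from theano import tensor
    x = tensor.lvector()
    y = tensor.lvector()
    # With int_division='int' (the current default), x / y builds an integer
    # division of dtype 'int64' and emits a DeprecationWarning.
    # With int_division='floatX', x / y instead has dtype config.floatX.
    # With int_division='raise', x / y raises an error; use x // y for an
    # integer division, or x.__truediv__(y) for a true division.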
.. attribute:: mode
String value: 'Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN', 'FAST_COMPILE'
......
......@@ -15,11 +15,16 @@ AddConfigVar('floatX',
EnumStr('float64', 'float32'),
)
# TODO Work-in-progress
#AddConfigVar('casting_policy',
# "Rules for implicit casts of constants in arithmetic operations",
# EnumStr('theano_0.3', 'numpy'),
# )
AddConfigVar('cast_policy',
"Rules for implicit type casting (until further notice, do not modify within a script, and clear your Theano cache whenever it is modified)",
EnumStr('custom', 'numpy+floatX', 'numpy'),
)
AddConfigVar('int_division',
"What to do when one computes x / y, where both x and y are of "
"integer types",
EnumStr('int', 'raise', 'floatX'),
)
#gpu mean let the driver select the gpu. Needed in case of gpu in exclusive mode.
#gpuX mean use the gpu number X.
......
......@@ -7,6 +7,8 @@ import ConfigParser
import logging
import warnings
import theano
_logger = logging.getLogger('theano.config')
class TheanoConfigWarning(Warning):
......@@ -103,6 +105,17 @@ def _config_print(thing, buf):
print >> buf, " Value: ", cv.val
print >> buf, ""
def get_config_md5():
"""
Return an MD5 string of the current config options. It should be such that
we can safely assume that two different config setups will lead to two
different strings.
"""
all_opts = sorted(_config_var_list, key=lambda cv: cv.fullname)
return theano.gof.cc.hash_from_code('\n'.join(['%s = %s' % (cv.fullname, cv.val) for cv in all_opts]))
class TheanoConfigParser(object):
#properties are installed by AddConfigVar
_i_am_a_config_class = True
......@@ -110,6 +123,7 @@ class TheanoConfigParser(object):
sio = StringIO.StringIO()
_config_print(self.__class__, sio)
return sio.getvalue()
# N.B. all instances of TheanoConfigParser give access to the same properties.
config = TheanoConfigParser()
......
......@@ -4,6 +4,7 @@ This is not used currently very used. It appear in some case, but I'm not sure i
It could help the current system detect problems earlier, when constructing the graph instead of during optimization.
"""
import sys
import theano
from theano import gof
def ishape(v):
......@@ -35,7 +36,7 @@ class Apply(gof.Apply):
try:
oshapes = infer_shape(self, ishapes)
except NotImplementedError:
except theano.tensor.ShapeError:
return
for o, oshp in zip(outputs, oshapes):
......
......@@ -7,6 +7,7 @@ from copy import copy
import re #for set_compiledir
import os, sys, StringIO
if sys.version_info[:2] >= (2,5):
import hashlib
def hash_from_code(msg):
......@@ -16,6 +17,13 @@ else:
def hash_from_code(msg):
return md5.new(msg).hexdigest()
def hash_from_file(file_path):
"""Return the MD5 hash of a file."""
return hash_from_code(open(file_path, 'rb').read())
import theano
from theano.gof.python25 import all
from theano import config
......@@ -43,6 +51,7 @@ import cmodule
import logging
_logger=logging.getLogger("theano.gof.cc")
_logger.setLevel(logging.WARN)
def info(*args):
_logger.info(' '.join(str(a) for a in args))
def debug(*args):
......@@ -791,7 +800,7 @@ class CLinker(link.Linker):
The key returned by this function is of the form (version, signature)
The signature has the following form:
{{{
'CLinker.cmodule_key', compilation args, libraries,
'CLinker.cmodule_key', compilation args, libraries, config md5,
(op0, input_signature0, output_signature0),
(op1, input_signature1, output_signature1),
...
......@@ -858,10 +867,16 @@ class CLinker(link.Linker):
constant_ids = dict()
op_pos = {} # Apply -> topological position
# first we put the header, compile_args, library names into the signature
# First we put the header, compile_args, library names and config md5
# into the signature.
sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
if compile_args is not None: sig.append(tuple(compile_args))
if libraries is not None: sig.append(tuple(libraries))
# IMPORTANT: The 'md5' prefix is used to isolate the compilation
# parameters from the rest of the key. If you want to add more key
# elements, they should be before this md5 hash if and only if they
# can lead to a different compiled file with the same source code.
sig.append('md5:' + theano.configparser.get_config_md5())
# technically this should only be appended for gcc-compiled Ops
# and the flags of other compilers should be inserted here... but it's not clear how to
......
......@@ -2,10 +2,12 @@
"""
import os, tempfile, StringIO, sys, logging, subprocess, cPickle, atexit, time, shutil, stat
import distutils.sysconfig
from theano.configparser import config
import numpy.distutils #TODO: TensorType should handle this
import sys
import theano
from theano.configparser import config
from theano.gof.cc import hash_from_code, hash_from_file
import compilelock # we will abuse the lockfile mechanism when reading and writing the registry
from theano.configparser import TheanoConfigParser, AddConfigVar, EnumStr, StrParam, IntParam, FloatParam, BoolParam
......@@ -202,6 +204,81 @@ def module_name_from_dir(dirname):
name, = [file for file in files if file.endswith('.so') or file.endswith('.pyd')]
return os.path.join(dirname, name)
def get_module_hash(module_file, key):
"""
Return an MD5 hash that identifies a module.
This hash takes into account:
1. The 'mod.cpp' file used to compile `module_file`.
2. The compiler options defined in `key`.
"""
source_code = os.path.join(os.path.dirname(module_file), 'mod.cpp')
if not os.path.exists(source_code):
source_code = os.path.join(os.path.dirname(module_file), 'mod.cu')
assert os.path.exists(source_code)
source_hash = hash_from_file(source_code)
c_link_key = key[1]
# Currently, in order to catch potential bugs early, we are very
# conservative about the structure of the key and raise an exception
# if it does not match exactly what we expect. In the future we may
# modify this behavior to be less strict and be able to accommodate
# changes to the key in an automatic way.
error_msg = ("This should not happen unless someone modified the code "
"that defines the CLinker key, in which case you should "
"ensure this piece of code is still valid (and this "
"AssertionError may be removed or modified to accomodate "
"this change)")
assert c_link_key[0] == 'CLinker.cmodule_key', error_msg
to_hash = [source_hash]
for key_element in c_link_key[1:]:
if isinstance(key_element, tuple):
to_hash += list(key_element)
elif isinstance(key_element, str):
if key_element.startswith('md5:'):
# This is the md5 hash of the config options. We can stop
# here.
break
else:
raise AssertionError(error_msg)
else:
raise AssertionError(error_msg)
return hash_from_code('\n'.join(to_hash))
class KeyData(object):
"""Used to store the key information in the cache."""
def __init__(self, keys, module_hash, key_pkl):
"""
Constructor.
:param keys: Set of keys that are associated to the exact same module.
:param module_hash: Hash identifying the module (it should hash both
the code and the compilation options).
:param key_pkl: Path to the file in which this KeyData object should be
pickled.
"""
self.keys = keys
self.module_hash = module_hash
self.key_pkl = key_pkl
def add_key(self, key):
"""Add a key to the `keys` set, and update the pickled file."""
assert key not in self.keys
self.keys.add(key)
self.save_pkl()
def save_pkl(self):
"""Dump this object into its `key_pkl` file."""
cPickle.dump(self, open(self.key_pkl, 'wb'),
protocol=cPickle.HIGHEST_PROTOCOL)
class ModuleCache(object):
"""Interface to the cache of dynamically compiled modules on disk
......@@ -239,6 +316,9 @@ class ModuleCache(object):
"""Maps keys to the filename of a .so/.pyd.
"""
loaded_modules_hash = {}
"""Maps hash of a module's code to its corresponding KeyData object."""
stats = []
"""A list with counters for the number of hits, loads, compiles issued by module_from_key()
"""
......@@ -260,6 +340,7 @@ class ModuleCache(object):
self.dirname = dirname
self.module_from_name = dict(self.module_from_name)
self.entry_from_key = dict(self.entry_from_key)
self.loaded_modules_hash = dict(self.loaded_modules_hash)
self.stats = [0, 0, 0]
if force_fresh is not None:
self.force_fresh = force_fresh
......@@ -301,7 +382,8 @@ class ModuleCache(object):
# add entries that are not in the entry_from_key dictionary
time_now = time.time()
for root, dirs, files in os.walk(self.dirname):
if os.path.join(root, 'key.pkl') in self.loaded_key_pkl:
key_pkl = os.path.join(root, 'key.pkl')
if key_pkl in self.loaded_key_pkl:
continue
elif 'delete.me' in files or len(files)==0:
# On NFS filesystems, it is impossible to delete a directory with open
......@@ -314,44 +396,86 @@ class ModuleCache(object):
# the directory is still in use?? We just leave it for future removal.
pass
elif 'key.pkl' in files:
key_pkl = os.path.join(root, 'key.pkl')
try:
entry = module_name_from_dir(root)
except ValueError: # there is a key but no dll!
if not root.startswith("/tmp"):
# Under /tmp, files are removed periodically by the OS.
# So it is normal that this happen from time to time.
# So it is normal that this happens from time to time.
warning("ModuleCache.refresh() Found key without dll in cache, deleting it.", key_pkl)
info("Erasing broken cache directory", key_pkl)
shutil.rmtree(root)
continue
if (time_now - last_access_time(entry))<self.age_thresh_use:
if (time_now - last_access_time(entry)) < self.age_thresh_use:
debug('refresh adding', key_pkl)
try:
key = cPickle.load(open(key_pkl, 'rb'))
key_data = cPickle.load(open(key_pkl, 'rb'))
except:
info("ModuleCache.refresh() Failed to unpickle cache key", key_pkl)
if 0:
info("ModuleCache.refresh() Failed to unpickle "
"cache file", key_pkl)
if False:
info("Erasing broken cache directory", key_pkl)
shutil.rmtree(root)
else:
## This exception is often triggered by keys that contain
# This exception is often triggered by keys that contain
# references to classes that have not yet been imported. They are
# not necessarily broken
pass
continue
if not key[0]: #if the version is False
warning("ModuleCache.refresh() Found unversioned key in cache, deleting it.", key_pkl)
info("Erasing broken cache directory", key_pkl)
shutil.rmtree(root)
continue
if key not in self.entry_from_key:
self.entry_from_key[key] = entry
# assert that we haven't already got this entry somehow
assert entry not in self.module_from_name
self.loaded_key_pkl.add(key_pkl)
if not isinstance(key_data, KeyData):
# Backward-compatibility with older cache mechanism
# that used single keys with no hash of the
# compiled file.
key_data = KeyData(
keys=set([key_data]),
module_hash=get_module_hash(entry, key_data),
key_pkl=key_pkl)
debug("Updating cache key to new format", key_pkl)
key_data.save_pkl()
# Find unversioned keys.
to_del = [key for key in key_data.keys if not key[0]]
if to_del:
warning("ModuleCache.refresh() Found unversioned "
"key in cache, removing it.", key_pkl)
if len(to_del) == len(key_data.keys):
# All keys were unversioned.
info("Erasing broken cache directory", key_pkl)
shutil.rmtree(root)
continue
else:
# Fix the pickled file to only keep the
# versioned keys.
info("Fixing broken cache directory", key_pkl)
key_data.keys = set(
[key for key in key_data.keys
if key[0]])
key_data.save_pkl()
for key in key_data.keys:
if key not in self.entry_from_key:
self.entry_from_key[key] = entry
# Assert that we have not already got this
# entry somehow.
assert entry not in self.module_from_name
self.loaded_key_pkl.add(key_pkl)
# Remember the map from a module's hash to the KeyData
# object associated with it.
mod_hash = key_data.module_hash
if mod_hash in self.loaded_modules_hash:
# This should not happen anymore, but may happen
# with the previous cache mechanism, which did not
# ensure uniqueness of the compiled modules.
# TODO Convert into an error in the future.
warning(
"Found duplicated modules in the cache, you "
"are probably using an old cache. Clear it "
"with 'theano-cache clear' to benefit from "
"recent cache optimizations.")
else:
self.loaded_modules_hash[mod_hash] = key_data
else:
too_old_to_use.append(entry)
......@@ -419,59 +543,97 @@ class ModuleCache(object):
rval = self.module_from_name[name]
else:
hash_key = hash(key)
# we have never seen this key before
# We have never seen this key before.
# Acquire lock before creating things in the compile cache,
# to avoid that other processes remove the compile dire while it
# is still empty
# to prevent other processes from removing the compile dir while
# it is still empty.
compilelock.get_lock()
location = dlimport_workdir(self.dirname)
#debug("LOCATION*", location)
# This try/finally block ensures that the lock is released once we
# are done writing in the cache file or after raising an exception.
try:
module = fn(location=location) # WILL FAIL FOR BAD C CODE
except Exception, e:
_rmtree(location)
if not keep_lock:
compilelock.release_lock()
#try:
#except Exception, ee:
#error('failed to cleanup location', location, ee)
raise
if not keep_lock:
compilelock.release_lock()
name = module.__file__
debug("Adding module to cache", key, name)
assert name.startswith(location)
assert name not in self.module_from_name
#Changing the hash of the key is not allowed during compilation
#That is the only cause found that make the last assert fail.
assert hash(key)==hash_key
assert key not in self.entry_from_key
assert key not in self.entry_from_key
if _version: # save they key
key_pkl = os.path.join(location, 'key.pkl')
# Note that using a binary file is important under Windows.
key_file = open(key_pkl, 'wb')
location = dlimport_workdir(self.dirname)
#debug("LOCATION*", location)
try:
cPickle.dump(key, key_file, cPickle.HIGHEST_PROTOCOL)
key_file.close()
key_broken = False
except cPickle.PicklingError:
key_file.close()
os.remove(key_pkl)
warning("Cache leak due to unpickle-able key", key)
key_broken = True
if not key_broken:
module = fn(location=location) # WILL FAIL FOR BAD C CODE
except Exception, e:
_rmtree(location)
#try:
#except Exception, ee:
#error('failed to cleanup location', location, ee)
raise
name = module.__file__
debug("Adding module to cache", key, name)
assert name.startswith(location)
assert name not in self.module_from_name
# Changing the hash of the key is not allowed during
# compilation. That is the only known cause of the
# following assert failing.
assert hash(key) == hash_key
assert key not in self.entry_from_key
# Check if we already know a module with the same hash.
duplicated_module = False
module_hash = get_module_hash(name, key)
if module_hash in self.loaded_modules_hash:
debug("Duplicated module! Will re-use the previous one")
duplicated_module = True
# Load the already existing module.
key_data = self.loaded_modules_hash[module_hash]
module = self.module_from_key(
key=key_data.keys.__iter__().next(),
keep_lock=True)
# Add current key to the set of keys associated to the same
# module.
key_data.add_key(key)
# We can delete this module.
debug("Deleting: ", os.path.dirname(name))
shutil.rmtree(os.path.dirname(name))
name = module.__file__
if not duplicated_module and _version: # save the key
key_pkl = os.path.join(location, 'key.pkl')
key_data = KeyData(
keys=set([key]),
module_hash=get_module_hash(name, key),
key_pkl=key_pkl)
# Note that using a binary file is important under Windows.
key_file = open(key_pkl, 'wb')
try:
key_from_file = cPickle.load(open(key_pkl, 'rb'))
if key != key_from_file:
raise Exception("key not equal to unpickled version (Hint: verify the __eq__ and __hash__ functions for your Ops", (key, key_from_file))
self.loaded_key_pkl.add(key_pkl) # adding the key file to this set means it is a versioned key
except cPickle.UnpicklingError:
warning('Cache failure due to un-loadable key', key)
cPickle.dump(key_data, key_file,
cPickle.HIGHEST_PROTOCOL)
key_file.close()
key_broken = False
except cPickle.PicklingError:
key_file.close()
os.remove(key_pkl)
warning("Cache leak due to unpickle-able key", key)
key_broken = True
if not key_broken:
try:
kd_from_file = cPickle.load(open(key_pkl, 'rb'))
assert len(kd_from_file.keys) == 1
key_from_file = kd_from_file.keys.__iter__().next()
if key != key_from_file:
raise Exception(
"key not equal to unpickled version (Hint:"
" verify the __eq__ and __hash__ functions"
" for your Ops", (key, key_from_file))
# Adding the key file to this set means it is a
# versioned key.
self.loaded_key_pkl.add(key_pkl)
self.loaded_modules_hash[module_hash] = key_data
except cPickle.UnpicklingError:
warning('Cache failure due to un-loadable key',
key)
finally:
# Release lock if needed.
if not keep_lock:
compilelock.release_lock()
self.entry_from_key[key] = name
self.module_from_name[name] = module
......@@ -481,7 +643,7 @@ class ModuleCache(object):
return rval
age_thresh_del = 60*60*24*31#31 days
age_thresh_del_unversionned = 60*60*24*7#7 days
age_thresh_del_unversioned = 60*60*24*7#7 days
"""The default age threshold for `clear_old` (in seconds)
"""
......@@ -500,13 +662,13 @@ class ModuleCache(object):
# update the age of modules that have been accessed by other processes
# and get all module that are too old to use.(not loaded in self.entry_from_key)
too_old_to_use = self.refresh()
too_old_to_use = [(None,entry) for entry in too_old_to_use]
too_old_to_use = [(None, entry) for entry in too_old_to_use]
time_now = time.time()
# the .items() is important here:
# we need to get a copy of the whole list of keys and entries
items_copy = list(self.entry_from_key.iteritems())
for key, entry in items_copy+too_old_to_use:
for key, entry in items_copy + too_old_to_use:
age = time_now - last_access_time(entry)
if age > age_thresh_del:
# TODO: we are assuming that modules that haven't been accessed in over
......@@ -523,17 +685,19 @@ class ModuleCache(object):
finally:
compilelock.release_lock()
def clear(self):
def clear(self, unversioned_min_age=None):
"""
Clear all the elements of the cache
"""
self.clear_old(-1.0)
self.clear_unversioned()
self.clear_unversioned(min_age=unversioned_min_age)
def clear_unversioned(self):
def clear_unversioned(self, min_age=None):
"""Delete unversioned dynamic modules from the internal dictionaries and from the
filesystem.
"""
if min_age is None:
min_age = self.age_thresh_del_unversioned
items_copy = list(self.entry_from_key.iteritems())
for key, entry in items_copy:
version, rest = key
......@@ -561,12 +725,13 @@ class ModuleCache(object):
has_key = False
if not has_key:
age = time_now - last_access_time(os.path.join(self.dirname, filename))
#In normal case, the processus that created this directory will delete it
#In case this processus crash, it won't be cleaned up.
#As we don't know how to know if this directory is still used
#we wait 1 weak and suppose that the processus crashed
#and we do the clean up for it.
if age > self.age_thresh_del_unversionned:
# In the normal case, the process that created this directory
# will delete it. However, if this process crashes, it
# will not be cleaned up.
# As we don't know if this directory is still used, we wait
# one week, assume that the process crashed, and then
# take care of the clean-up.
if age > min_age:
info("clear_unversioned removing cache dir", filename)
_rmtree(os.path.join(self.dirname, filename))
......
......@@ -246,13 +246,13 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
neib_shape = T.as_tensor_variable(neib_shape)
original_shape = T.as_tensor_variable(original_shape)
new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] )
new_neib_shape = T.stack(original_shape[-1] // neib_shape[1], neib_shape[1])
output_2d = images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape, mode=mode)
if mode == 'ignore_borders':
valid_shape = list(original_shape)
valid_shape[2] = valid_shape[2] / neib_shape[0] * neib_shape[0]
valid_shape[3] = valid_shape[3] / neib_shape[1] * neib_shape[1]
valid_shape[2] = (valid_shape[2] // neib_shape[0]) * neib_shape[0]
valid_shape[3] = (valid_shape[3] // neib_shape[1]) * neib_shape[1]
output_4d = output_2d.reshape(valid_shape)
#padding the borders with zeros
for d in [2,3]:
......
......@@ -263,7 +263,7 @@ class mrg_uniform(mrg_uniform_base):
if (%(size)s->dimensions[0] != %(ndim)s)
{
PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
%(ndim)s, %(size)s->dimensions[0]);
%(ndim)s, int(%(size)s->dimensions[0]));
%(fail)s
}
if (%(size)s->descr->type_num != PyArray_INT32)
......@@ -589,6 +589,35 @@ class GPU_mrg_uniform(mrg_uniform_base):
def c_code_cache_version(self):
return (4,)
def guess_n_streams(size, warn=True):
"""
Return a guess at a good number of streams.
:param warn: If True, warn when a guess cannot be made (in which case
we return 30 * 256).
"""
# TODO: a smart way of choosing the number of streams, see #612.
# Note that this code was moved out of `MRG_RandomStreams` so that it can
# be easily accessed from tests, where we want to disable the warning.
if (isinstance(size, (tuple, list)) and
all([isinstance(i, int) for i in size])):
# We can make a guess.
r = 1
for s in size:
r *= s
if r > 6:
r = r/6 # chosen as fastest for rbm_benchmark
return r
else:
if warn:
print >> sys.stderr, (
"MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 30*256") % str(size)
return 30 * 256
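# Editor's note (not part of the patch), illustrating the guess above:
#     guess_n_streams((10, 10))  ->  r = 10 * 10 = 100 > 6, so 100 / 6 = 16
# streams; for a symbolic size it falls back to 30 * 256 (with a warning
# unless warn=False).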
class MRG_RandomStreams(object):
"""Module component with similar interface to numpy.random (numpy.random.RandomState)"""
......@@ -654,18 +683,7 @@ class MRG_RandomStreams(object):
return rval
def n_streams(self, size):
# TODO: a smart way of choosing the number of streams, see #612.
if isinstance(size, (tuple, list)) and all([isinstance(i,int) for i in size]):
r = 1
for s in size:
r *= s
if r > 6:
r = r/6 # chosen as fastest for rbm_benchmark
return r
print >> sys.stderr, ("MRG_RandomStreams Can't determine #streams from "
"size (%s), guessing 30*256")%str(size)
return 30*256
return guess_n_streams(size, warn=True)
def pretty_return(self, node_rstate, new_rstate, sample):
sample.rstate = node_rstate
......@@ -674,7 +692,8 @@ class MRG_RandomStreams(object):
node_rstate.default_update = new_rstate
return sample
def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX, nstreams=None):
def uniform(self, size, low=0.0, high=1.0, ndim=None, dtype='floatX',
nstreams=None):
"""
Sample a tensor of given size whose elements are drawn from a uniform
distribution between low and high.
......@@ -683,10 +702,14 @@ class MRG_RandomStreams(object):
ndim may be a plain integer to supplement the missing
information.
:param: size: Can be a list of integer or Theano variable
:param size: Can be a list of integer or Theano variable
(ex: the shape of other Theano Variable)
TODO: can size be None?
:param dtype: The output data type.
"""
if dtype == 'floatX':
dtype = config.floatX
if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i,int) or isinstance(i,Variable)
......@@ -728,16 +751,19 @@ class MRG_RandomStreams(object):
raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
return r
def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64'):
def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64',
nstreams=None):
if n == 1:
if dtype=='float32' and self.use_cuda:
return cast(self.uniform(size=size, dtype=dtype) < p, dtype)
if dtype == 'float32' and self.use_cuda:
x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
else:
return cast(self.uniform(size=size) < p, dtype)
x = self.uniform(size=size, nstreams=nstreams)
return cast(x < p, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
nstreams=None):
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial
distribution defined by probabilities pvals.
......@@ -758,22 +784,31 @@ class MRG_RandomStreams(object):
ndim, size, pvals[:,0])
assert ndim==1
bcast = bcast+(pvals.type.broadcastable[-1],)
unis = self.uniform(size=size, ndim=1)
unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
op = multinomial.MultinomialFromUniform(dtype)
return op(pvals, unis)
else:
raise NotImplementedError(("MRG_RandomStreams.multinomial only"
" implemented with n == 1 and pvals.ndim = 2"))
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
dtype='floatX', nstreams=None):
"""
:param: size: Can be a list of integer or Theano variable(ex: the shape of other Theano Variable)
:param size: Can be a list of integers or Theano variables (ex: the
shape of another Theano Variable)
:param dtype: The output data type.
:param nstreams: Number of streams.
"""
# We need an even number of ]0,1[ samples. Then we split them
# in two halves. First half becomes our U1's for Box-Muller,
# second half our U2's. See Wikipedia page:
# http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
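# (Editor's reminder, not part of the patch: Box-Muller maps two uniform
#  samples U1, U2 in ]0,1[ to two independent standard normals
#      z0 = sqrt(-2*ln(U1)) * cos(2*pi*U2),
#      z1 = sqrt(-2*ln(U1)) * sin(2*pi*U2),
#  which are then scaled by `std` and shifted by `avg`.)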
if dtype == 'floatX':
dtype = config.floatX
evened = False
constant = False
if isinstance(size, tuple) and all([isinstance(i,int) for i in size]):
......@@ -786,14 +821,15 @@ class MRG_RandomStreams(object):
else:
#if even, don't change, if odd, +1
n_samples = prod(size)+(prod(size)%2)
flattened = self.uniform(size=(n_samples,), dtype=dtype)
flattened = self.uniform(size=(n_samples,), dtype=dtype,
nstreams=nstreams)
if constant:
U1 = flattened[:n_samples/2]
U2 = flattened[n_samples/2:]
U1 = flattened[:n_samples // 2]
U2 = flattened[n_samples // 2:]
else:
U1 = flattened[:prod(flattened.shape)/2]
U2 = flattened[prod(flattened.shape)/2:]
U1 = flattened[:prod(flattened.shape) // 2]
U2 = flattened[prod(flattened.shape) // 2:]
#normal_samples = zeros_like(flattened)
sqrt_ln_U1 = sqrt(-2.0*log(U1))
......
......@@ -350,7 +350,9 @@ def test_uniform():
print 'ON CPU with size=(%s):'%str(size)
x = tensor.matrix()
R = MRG_RandomStreams(234, use_cuda=False)
u = R.uniform(size=size)
# Note: we specify `nstreams` to avoid a warning.
u = R.uniform(size=size,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, u, mode=mode)
assert any([isinstance(node.op,theano.sandbox.rng_mrg.mrg_uniform)
for node in f.maker.env.toposort()])
......@@ -366,7 +368,8 @@ def test_uniform():
print ''
print 'ON GPU with size=(%s):'%str(size)
R = MRG_RandomStreams(234, use_cuda=True)
u = R.uniform(size=size, dtype='float32')
u = R.uniform(size=size, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
......@@ -421,7 +424,9 @@ def test_binomial():
print ''
print 'ON CPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=False)
u = R.binomial(size=size, p=mean)
# Note: we specify `nstreams` to avoid a warning.
u = R.binomial(size=size, p=mean,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, u, mode=mode)
theano.printing.debugprint(f)
out = f(*input)
......@@ -433,7 +438,9 @@ def test_binomial():
print ''
print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=True)
u = R.binomial(size=size, p=mean, dtype='float32')
u = R.binomial(size=size, p=mean, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size,
warn=False))
assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(u),
......@@ -478,7 +485,9 @@ def test_normal0():
print 'ON CPU:'
R = MRG_RandomStreams(234, use_cuda=False)
n = R.normal(size=size, avg=avg, std=std)
# Note: we specify `nstreams` to avoid a warning.
n = R.normal(size=size, avg=avg, std=std,
nstreams=rng_mrg.guess_n_streams(size, warn=False))
f = theano.function(var_input, n, mode=mode)
theano.printing.debugprint(f)
out = f(*input)
......@@ -491,7 +500,8 @@ def test_normal0():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
n = R.normal(size=size, avg=avg, std=std, dtype='float32')
n = R.normal(size=size, avg=avg, std=std, dtype='float32',
nstreams=rng_mrg.guess_n_streams(size, warn=False))
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function(var_input, theano.Out(
theano.sandbox.cuda.basic_ops.gpu_from_host(n),
......@@ -557,7 +567,8 @@ def test_multinomial():
pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
R = MRG_RandomStreams(234, use_cuda=False)
m = R.multinomial(pvals=pvals, dtype=config.floatX)
# Note: we specify `nstreams` to avoid a warning.
m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
f = theano.function([], m, mode=mode_)
theano.printing.debugprint(f)
out = f()
......
......@@ -12,8 +12,9 @@ If you want to use a scalar variable in a Theano graph,
you probably want to use theano.tensor.[c,z,f,d,b,w,i,l,]scalar!
"""
import math
import math, warnings
from copy import copy
from itertools import imap
import numpy, theano
......@@ -26,11 +27,37 @@ builtin_complex = complex
builtin_int = int
builtin_float = float
class ComplexError(Exception):
"""Raised if complex numbers are used in an unsupported operation."""
pass
class IntegerDivisionError(Exception):
"""Raised if someone tries to divide integers with '/' instead of '//'."""
pass
def upcast(dtype, *dtypes):
z = numpy.zeros((), dtype = dtype)
for dtype in dtypes:
z = z + numpy.zeros((), dtype = dtype)
return str(z.dtype)
# Should we try to keep float32 instead of float64? This is used so that
# for instance mixing int64 with float32 yields float32 instead of float64.
# Note that we store this boolean as a one-element list so that it can be
# modified within `make_array`.
keep_float32 = [(config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32')]
def make_array(dt):
if dt == 'float64':
# There is an explicit float64 dtype: we cannot keep float32.
keep_float32[0] = False
return numpy.zeros((), dtype=dt)
z = make_array(dtype)
for dt in dtypes:
z = z + make_array(dt=dt)
rval = str(z.dtype)
if rval == 'float64' and keep_float32[0]:
return 'float32'
else:
return rval
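# Editor's illustration (not part of the patch): with
# config.cast_policy == 'numpy+floatX' and config.floatX == 'float32',
#     upcast('int64', 'float32')  ->  'float32'
# whereas under the other policies (or with floatX == 'float64') the same
# call returns 'float64', matching plain numpy upcasting.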
def as_scalar(x, name = None):
if isinstance(x, gof.Apply):
......@@ -47,6 +74,7 @@ def as_scalar(x, name = None):
except TypeError:
raise TypeError("Cannot convert %s to Scalar" % x, type(x))
def constant(x):
# pass through numpy scalars, since they are already typed on purpose typically.
if hasattr(x,'dtype'):
......@@ -383,8 +411,9 @@ uint_types = uint8, uint16, uint32, uint64
float_types = float32, float64
complex_types = complex64, complex128
discrete_types = int_types + uint_types
continuous_types = float_types + complex_types
class _scalar_py_operators:
#UNARY
......@@ -416,7 +445,8 @@ class _scalar_py_operators:
def __sub__(self,other): return sub(self,other)
def __mul__(self,other): return mul(self,other)
def __div__(self,other): return div_proxy(self,other)
def __mod__(self,other): return mod(self,other)
def __floordiv__(self, other): return int_div(self, other)
def __mod__(self, other): return mod_check(self, other)
def __pow__(self,other): return pow(self,other)
#ARITHMETIC - RIGHT-OPERAND
......@@ -994,32 +1024,74 @@ class Sub(BinaryScalarOp):
return first_part, second_part
sub = Sub(upcast_out, name = 'sub')
def div_proxy(x, y):
"""Proxy for either true_div or int_div, depending on types of x, y.
def int_or_true_div(x_discrete, y_discrete):
"""
Return 'int' or 'true' depending on the type of division used for x / y.
:param x_discrete: True if `x` is discrete ([unsigned] integer).
:param y_discrete: True if `y` is discrete ([unsigned] integer).
:returns: 'int' if `x / y` should be an integer division, or `true` if it
should be a true division.
Raises an IntegerDivisionError if both `x_discrete` and `y_discrete` are
True and `config.int_division` is set to 'raise'.
This function is used by both scalar/basic.py and tensor/basic.py.
"""
if as_scalar(x).type.dtype.startswith('int') and as_scalar(y).type.dtype.startswith('int'):
return int_div(x, y)
if (x_discrete and y_discrete):
if config.int_division == 'raise':
raise IntegerDivisionError(
"With `config.int_division` set to 'raise', dividing two "
"integer types with '/' is forbidden to avoid confusion "
"between integer and floating point divisions. Please "
"use // for integer division, or if you want a float result "
"either cast one of the arguments to a float or directly call "
"`x.__truediv__(y)`.")
elif config.int_division == 'int':
warnings.warn(
"Division of two integer types with x / y is deprecated, "
"please use x // y for an integer division "
"(set `config.int_division = raise` to track the origin "
"of this warning)",
DeprecationWarning)
return 'int'
elif config.int_division == 'floatX':
return 'true'
else:
raise NotImplementedError(config.int_division)
else:
return true_div(x, y)
return 'true'
def div_proxy(x, y):
"""Proxy for either true_div or int_div, depending on types of x, y."""
f = eval('%s_div' % int_or_true_div(as_scalar(x).type in discrete_types,
as_scalar(y).type in discrete_types))
return f(x, y)
class TrueDiv(BinaryScalarOp):
def output_types(self, types):
if all(t not in continuous_types for t in types):
return [float64]
if all(t in discrete_types for t in types):
return [Scalar(config.floatX)]
else:
return super(TrueDiv, self).output_types(types)
def impl(self, x, y):
x = numpy.asarray(x)
y = numpy.asarray(y)
if str(x.dtype).startswith('int') and str(y.dtype).startswith('int'):
return float(x) / y
if all(a.dtype in discrete_types for a in (x, y)):
return numpy.array(float(x) / y, dtype=config.floatX)
else:
return x / y
def c_code(self, node, name, (x, y), (z, ), sub):
#we generate good c code only when both are complex!
if sum([node.inputs[0].type in complex_types, node.inputs[1].type in complex_types])==1:
raise NotImplementedError('type not supported', type)
if node.inputs[0].type in int_types and node.inputs[1].type in int_types:
if (node.inputs[0].type in discrete_types and
node.inputs[1].type in discrete_types):
return "%(z)s = ((double)%(x)s) / %(y)s;" % locals()
return "%(z)s = %(x)s / %(y)s;" % locals()
def grad(self, (x, y), (gz, )):
......@@ -1028,11 +1100,15 @@ class TrueDiv(BinaryScalarOp):
if x.type in float_types:
first_part = cast(gz / y, x.type.dtype)
else:
assert x.type in discrete_types
first_part = None
if y.type in complex_types:
raise NotImplementedError()
if y.type in float_types:
second_part = cast(-(gz * x) / (y * y), y.type.dtype)
else:
assert y.type in discrete_types
second_part = None
return first_part, second_part
true_div = TrueDiv(upcast_out, name = 'true_div')
......@@ -1048,9 +1124,29 @@ int_div = IntDiv(upcast_out, name = 'int_div')
floor_div = int_div
def raise_complex_error():
raise ComplexError(
"Theano does not support the mod operator (%) on "
"complex numbers, since numpy deprecated it.")
def mod_check(x, y):
if (as_scalar(x).type in complex_types or
as_scalar(y).type in complex_types):
# Currently forbidden.
raise_complex_error()
else:
return mod(x, y)
class Mod(BinaryScalarOp):
def impl(self, x, y):
if isinstance(x, numpy.complex) or isinstance(y, numpy.complex):
raise_complex_error()
return x % y
def c_code_cache_version(self):
return (5,)
......@@ -1060,20 +1156,34 @@ class Mod(BinaryScalarOp):
def c_code(self, node, name, (x, y), (z, ), sub):
"""
We want the result to have the same sign as python, not the other implementaiton of mod.
We want the result to have the same sign as python, not the other implementation of mod.
"""
#raise NotImplementedError("Unlike Python, C's modulo returns negative modulo on negative dividend (to implement)")
t = node.inputs[0].type.upcast(*[ i.type for i in node.inputs[1:]])
if t in int_types or t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64']:
if (str(t) in imap(str, discrete_types) or
t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64'] or
t in discrete_types):
# The above or's should not be needed anymore. However, for now we
# keep them to be safe, and verify they are redundant with an
# assert.
assert str(t) in imap(str, discrete_types)
x_mod_y = "THEANO_MACRO_MOD(%(x)s, %(y)s)"%locals()
x_mod_ymm = "THEANO_MACRO_MOD(-%(x)s, -%(y)s)"%locals()
x_mod_ypm = "THEANO_MACRO_MOD(%(x)s, -%(y)s)"%locals()
x_mod_ymp = "THEANO_MACRO_MOD(-%(x)s, %(y)s)"%locals()
elif t in float_types or t in ['float32','float64']:
elif (str(t) in imap(str, float_types) or
t in ['float32','float64'] or
t in float_types):
# The above or's should not be needed anymore. However, for now we
# keep them to be safe, and verify they are redundant with an
# assert.
assert str(t) in imap(str, float_types)
x_mod_y = "fmod(%(x)s,%(y)s)"%locals()
x_mod_ymm = "fmod(-%(x)s,-%(y)s)"%locals()
x_mod_ypm = "fmod(%(x)s,-%(y)s)"%locals()
x_mod_ymp = "fmod(-%(x)s,%(y)s)"%locals()
elif str(t) in imap(str, complex_types):
raise_complex_error()
else:
raise NotImplementedError('type not supported', type)
......
......@@ -37,6 +37,7 @@ class test_ScalarOps(unittest.TestCase):
#As we use theano.scalar normally, but we use theano.tensor.scalar
#that is not important. Also this make the theano fct fail at call time
#so this is not a silent bug.
# --> This is why it is purposely named 'tes_mod' instead of 'test_mod'.
def tes_mod(self):
"""
We add this test as not all language and C implementation give the same
......@@ -174,6 +175,19 @@ class test_logical(unittest.TestCase):
self.assertTrue(fn(a,b) == ~a, (a,))
class test_complex_mod(unittest.TestCase):
"""Make sure % fails on complex numbers."""
def test_fail(self):
x = complex64()
y = int32()
try:
x % y
assert False
except ComplexError:
pass
class test_div(unittest.TestCase):
def test_0(self):
a = int8()
......@@ -182,9 +196,9 @@ class test_div(unittest.TestCase):
d = float64()
f = float32()
print (a/b).owner.op
assert isinstance((a/b).owner.op, IntDiv)
assert isinstance((b/a).owner.op, IntDiv)
print (a//b).owner.op
assert isinstance((a//b).owner.op, IntDiv)
assert isinstance((b//a).owner.op, IntDiv)
assert isinstance((b/d).owner.op, TrueDiv)
assert isinstance((b/f).owner.op, TrueDiv)
assert isinstance((f/a).owner.op, TrueDiv)
......
......@@ -7,6 +7,7 @@ import sys # for sys.maxint
from theano.configparser import config, AddConfigVar, BoolParam
import traceback #for overriding Op.__call__
import warnings
from itertools import izip
import numpy, theano
#from copy import copy as python_copy
......@@ -23,6 +24,9 @@ from theano.gof.python25 import partial, any, all
from theano import compile, printing
from theano.printing import pprint
# We use these exceptions as well.
from theano.scalar import ComplexError, IntegerDivisionError
### set up the external interface
from elemwise import Elemwise, DimShuffle, CAReduce, Sum
......@@ -36,6 +40,17 @@ def _warn(*msg):
#This is needed as we will hide it later
python_complex=complex
# Define common subsets of dtypes (as strings).
int_dtypes = map(str, scal.int_types)
discrete_dtypes = map(str, scal.discrete_types)
complex_dtypes = map(str, scal.complex_types)
class ShapeError(Exception):
"""Raised when the shape cannot be computed."""
pass
def check_equal_numpy(x, y):
"""
Returns True iff x and y are equal (checks the dtype and
......@@ -162,36 +177,64 @@ class NumpyAutocaster(object):
"""
This class is used to cast python ints and floats to numpy arrays.
The behaviour for numpy scalars is a bit tricky... but tends to work in
practice.
If the dtype of a numpy scalar is in the self.dtypes list, then this 'cast'
is a no-op.
When config.floatX is float32 (at the time of calling), then this function
downcasts float and numpy.float arguments to numpy.float32, if float32 is
in the self.dtypes list.
Python ints are always 64bit and floats are always double precision.
This class uses the algorithm in __call__ to use a narrower dtype when no
precision would be lost, and to even lose precision when this is demanded
by the list of dtypes (e.g. to automatically cast all floats to
single-precision if self.dtypes does not include full precision floats).
The behavior when called on scalar `x` depends on `config.cast_policy`:
- 'numpy' will simply use the same type as found by `numpy.asarray(x)`.
- 'numpy+floatX' will do the same, except it will use float32 instead
of float64 if `x` is a Python float and `config.floatX` is set to
'float32' (note that if `x` is a numpy scalar whose data type is
float64, it is not modified since we assume the user is purposely
using float64).
- 'custom' lets one define a tuple of data types such that:
- if `x` is already a numpy scalar and its data type is in this
tuple, then it is returned unchanged;
- otherwise, the first data type in this tuple that can represent
`x` without loss of precision will be used, unless `x` is a float
and 'float32' is in the tuple (in which case `x` is cast as a
float32);
- if no data type can represent `x` without loss of precision, then
the last data type in the tuple will be used.
"""
def __init__(self, dtypes):
"""
Constructor.
:type dtypes: Tuple of strings.
:param dtypes: The ordered list of preferred data types (only used when
`config.cast_policy` is set to 'custom', see the `NumpyAutocaster` help
for details).
"""
self.dtypes = tuple(dtypes)
def __call__(self, x):
# Change the default casting behaviour for python floats to always cast
# to float32
dtype = None
# Make sure we only deal with scalars.
assert (isinstance(x, int) or
isinstance(x, float) or
(isinstance(x, numpy.ndarray) and x.ndim == 0))
if config.cast_policy == 'numpy':
return numpy.asarray(x)
elif config.cast_policy == 'numpy+floatX':
rval = numpy.asarray(x)
if (rval.dtype == 'float64' and # numpy wants float64
config.floatX == 'float32' and # but we prefer float32
not hasattr(x, 'dtype')): # and `x` was not typed
rval = theano._asarray(rval, dtype='float32')
return rval
# The following is the original code, corresponding to the 'custom'
# option for `config.cast_policy`.
assert config.cast_policy == 'custom'
try:
# Pass through numpy scalars, since they are typically already
# typed on purpose.
if str(x.dtype) in self.dtypes:
return theano._asarray(x, dtype=x.dtype) #leave dtype alone
# No need to cast `x` into a new dtype. Note that we still
# need to convert it into an array, because it may not be
# one already (e.g. if x == numpy.float64(1.1)).
return numpy.asarray(x)
except AttributeError:
# Means `x` has no 'dtype' attribute.
pass
# unsafe downcast of float64 variables when config.floatX == 'float32'
......@@ -223,7 +266,10 @@ autocast_float = NumpyAutocaster(('float32', 'float64'))
# have the same type as the xmatrix().
#
class autocast_float_as(object):
"""This class makes it possible to temporarily and locally adjust autocasting behaviour.
"""
This class makes it possible to temporarily and locally adjust autocasting
behavior when `config.cast_policy` is set to 'custom'.
If `config.cast_policy` is not 'custom', an exception is raised.
For example:
>>> with autocast_float_as('float32') as _dummy:
......@@ -235,10 +281,13 @@ class autocast_float_as(object):
"""
def __init__(self, *dtypes):
self.dtypes = dtypes
assert config.cast_policy == 'custom'
def __enter__(self):
assert config.cast_policy == 'custom'
self.old_dtypes = autocast_float.dtypes
autocast_float.dtypes = self.dtypes
def __exit__(self, *args):
assert config.cast_policy == 'custom'
autocast_float.dtypes = self.old_dtypes
def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
......@@ -260,6 +309,11 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
x_ = autocast_int(x)
elif rtype is TensorConstant and isinstance(x, float):
x_ = autocast_float(x)
elif rtype is TensorConstant and isinstance(x, long):
# It is not clear what would happen if one was to use a `long`
# number as a constant in a Theano graph. As a result, we throw
# an exception in this situation.
raise NotImplementedError('Constants of type `long` not supported')
elif isinstance(x, numpy.ndarray):
x_ = x
# Currently we do not have a bool dtype in Theano.
......@@ -352,7 +406,7 @@ def _allclose(a, b):
rtol = float64_rtol
# Work around bug in Numpy, see http://projects.scipy.org/numpy/ticket/1684
if str(b.dtype).startswith('int') and (numpy.absolute(b) < 0).any():
if str(b.dtype) in int_dtypes and (numpy.absolute(b) < 0).any():
b = theano._asarray(b, dtype='float64')
return numpy.allclose(a,b, atol=atol, rtol=rtol)
......@@ -1094,6 +1148,10 @@ class _tensor_py_operators:
def __div__(self,other):
try:
return div_proxy(self,other)
except IntegerDivisionError:
# This is to raise the exception that occurs when trying to divide
# two integer arrays (currently forbidden).
raise
except Exception, e:
return NotImplemented
def __pow__(self,other):
......@@ -1103,7 +1161,11 @@ class _tensor_py_operators:
return NotImplemented
def __mod__(self,other):
try:
return mod(self,other)
return mod_check(self, other)
except ComplexError:
# This is to raise the exception that occurs when trying to compute
# x % y with either x or y a complex number.
raise
except Exception, e:
return NotImplemented
......@@ -1852,7 +1914,7 @@ def min(x, axis='DEFAULT'):
"flatten the tensor before calling min()."),
stacklevel=2)
str_x_type = str(x.dtype)
if str_x_type.startswith('float') or str_x_type.startswith('int'):
if str_x_type.startswith('float') or str_x_type in int_dtypes:
return -max(-x, axis=axis)
else:
#Be careful about unsigned integers, complex
......@@ -1882,7 +1944,7 @@ def argmin(x, axis='DEFAULT'):
"axis before calling argmin."),
stacklevel=2)
str_x_type = str(x.dtype)
if str_x_type.startswith('float') or str_x_type.startswith('int'):
if str_x_type.startswith('float') or str_x_type in int_dtypes:
return argmax(-x, axis=axis)
else:
#Be careful about unsigned integers, complex
......@@ -2385,7 +2447,7 @@ def mean(input, axis = None, op = False):
if op:
return Mean(axis)(input)
if str(input.dtype).startswith('int'):
if str(input.dtype) in discrete_dtypes:
# we need to cast eventually anyway, and this helps
# to prevents overflow
input = cast(input, 'float64')
......@@ -2529,12 +2591,11 @@ def minimum(x,y):
# see decorator for function body
def div_proxy(x, y):
"""Proxy for either true_div or int_div, depending on types of x, y.
"""
if as_tensor_variable(x).type.dtype.startswith('int') and as_tensor_variable(y).type.dtype.startswith('int'):
return int_div(x, y)
else:
return true_div(x, y)
"""Proxy for either true_div or int_div, depending on types of x, y."""
f = eval('%s_div' % scal.int_or_true_div(
as_tensor_variable(x).dtype in discrete_dtypes,
as_tensor_variable(y).dtype in discrete_dtypes))
return f(x, y)
@_scal_elemwise_with_nfunc('add', 2, 1)
def add(a, *other_terms):
......@@ -2566,6 +2627,15 @@ def int_div(a, b):
"""elementwise integer-division"""
# see decorator for function body
def mod_check(x, y):
"""Make sure we do not try to use complex numbers."""
if (as_tensor_variable(x).dtype in complex_dtypes or
as_tensor_variable(y).dtype in complex_dtypes):
# Currently forbidden.
scal.raise_complex_error()
else:
return mod(x, y)
@_scal_elemwise_with_nfunc('mod', 2, 1)
def mod(a, b):
"""elementwise modulo"""
......@@ -2868,7 +2938,7 @@ class Subtensor(Op):
padded = ( actual_idx_list +
[slice(None, None, None)]*(len(xshp)-len(self.idx_list)))
i = 0
for idx, xl in zip(padded, xshp):
for idx, xl in izip(padded, xshp):
if isinstance(idx, slice):
# If it is the default (None, None, None) slice, or a variant,
# the shape will be xl
......@@ -2878,7 +2948,7 @@ class Subtensor(Op):
outshp.append(xl)
else:
cnf = get_canonical_form_slice(idx, xl)
length = (cnf[0].stop - cnf[0].start -1)/cnf[0].step + 1
length = (cnf[0].stop - cnf[0].start -1) // cnf[0].step + 1
length = switch(lt(length,0), 0, length)
outshp.append(length)
i += 1
......@@ -2978,15 +3048,28 @@ class SubtensorPrinter:
pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor), SubtensorPrinter())
def setsubtensor(x, y, idx_list, inplace=False):
print >> sys.stderr, "tensor.setsubtensor is deprecated - please use set_subtensor"
def setsubtensor(x, y, idx_list, inplace=False, show_warning=True):
# Note that `show_warning` should only be set to False by tests, in order
# to make sure this old code is still working.
if show_warning:
print >> sys.stderr, (
"tensor.setsubtensor is deprecated - please use set_subtensor")
the_op = IncSubtensor(idx_list, inplace, set_instead_of_inc=True)
return the_op(x, y, *Subtensor.collapse(idx_list, lambda entry: isinstance(entry, Variable)))
def incsubtensor(x, y, idx_list, inplace=False):
print >> sys.stderr, "tensor.incsubtensor is deprecated - please use inc_subtensor"
return the_op(x, y, *Subtensor.collapse(
idx_list,
lambda entry: isinstance(entry, Variable)))
def incsubtensor(x, y, idx_list, inplace=False, show_warning=True):
# Note that `show_warning` should only be set to False by tests, in order
# to make sure this old code is still working.
if show_warning:
print >> sys.stderr, "tensor.incsubtensor is deprecated - please use inc_subtensor"
the_op = IncSubtensor(idx_list, inplace, set_instead_of_inc=False)
return the_op(x, y, *Subtensor.collapse(idx_list, lambda entry: isinstance(entry, Variable)))
def set_subtensor(x, y, inplace=False):
"""Return x with the given subtensor overwritten by y.
......@@ -3519,14 +3602,14 @@ class Join(Op):
# that whenever I get a None. Should we just remove gof/apply_shape
# if it is deprecated ??
if ishapes[1] is None:
raise NotImplementedError
raise ShapeError()
n_dim = len(ishapes[1])
for shape in ishapes[1:]:
if shape is None:
raise NotImplementedError
raise ShapeError()
for shape_i in shape:
if shape_i is None:
raise NotImplementedError
raise ShapeError()
# at this point the inputs have been broadcasted so they should
# all have the same shape
assert len(shape) == n_dim
......@@ -4025,6 +4108,31 @@ def arange(start, stop=None, step=1, dtype=None):
# If dtype is not provided, infer it from the other arguments
if dtype is None:
dtype = scal.upcast(start.type.dtype, stop.type.dtype, step.type.dtype)
if config.cast_policy in ('numpy', 'numpy+floatX'):
# We enforce numpy semantics, except in the special case where
# `config.cast_policy` is 'numpy+floatX' and we want to use float32
# rather than float64.
# As an example, if `start`, `stop` and `step` are all int32,
# `numpy.arange` returns an int64 array (on 64-bit platforms),
# while the upcast above returns int32.
numpy_dtype = numpy.arange(
start=numpy.array(0, dtype=start.dtype),
stop=numpy.array(1, dtype=stop.dtype),
step=numpy.array(1, dtype=step.dtype)).dtype
if numpy_dtype != dtype:
if (config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32' and
numpy_dtype == 'float64' and
# No explicit float64 in the three arguments?
all(dt != 'float64'
for dt in [s.dtype for s in (start, stop, step)])):
# We use float32 instead.
assert dtype != 'float64'
dtype = 'float32'
else:
# We use the same dtype as numpy instead of the result of
# the upcast.
dtype = str(numpy_dtype)
if dtype not in _arange:
_arange[dtype] = ARange(dtype)
......
......@@ -454,7 +454,7 @@ class Elemwise(Op):
"""
inputs = map(as_tensor_variable, inputs)
shadow = self.scalar_op.make_node(*[Scalar(dtype = t.type.dtype)() for t in inputs])
shadow = self.scalar_op.make_node(*[Scalar(dtype=i.type.dtype)() for i in inputs])
target_length = max([input.type.ndim for input in inputs])
......
......@@ -135,9 +135,9 @@ class Conv3D(theano.Op):
vidDur = V_shape[3]
filterDur = W_shape[3]
output_height = T.floor( (vidHeight - filterHeight) / dr )+1
output_width = T.floor( (vidWidth - filterWidth) / dc )+1
output_dur = T.floor( (vidDur - filterDur) / dt ) +1
output_height = T.floor((vidHeight - filterHeight) // dr) + 1
output_width = T.floor((vidWidth - filterWidth) // dc) + 1
output_dur = T.floor((vidDur - filterDur) // dt) + 1
rval = (batch_size, output_height, output_width, output_dur, output_channels )
......
......@@ -575,14 +575,15 @@ class ConvOp(Op):
try:
fmshp = ConvOp.getOutputShape(imshp[1:], kshp, (self.dx,self.dy), self.out_mode)
except TypeError:
raise NotImplementedError()
raise theano.tensor.ShapeError()
outshp = (batch_size,fmo) + tuple(fmshp)
return [outshp]
else:
# Haven't implemented this case. imshp and kshp may be symbolic
# and ConvOp.getOutputShape doesn't handle this. In this case
# we simply let the default function do its work.
raise NotImplementedError()
raise theano.tensor.ShapeError()
def perform(self,node, inp, out):
"""
......
......@@ -879,6 +879,7 @@ def test_argmax_pushdown():
[x],
[out])
config.warn.argmax_pushdown_bug = False
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(env)
......@@ -922,6 +923,7 @@ def test_argmax_pushdown_bias():
[x,b],
[out])
config.warn.argmax_pushdown_bug = False
theano.compile.mode.optdb.query(
theano.compile.mode.OPT_FAST_RUN).optimize(env)
......
......@@ -28,11 +28,12 @@ from theano import compile #to register the optimizer built by this file
from theano.gof.python25 import any, all
from theano.gof.opt import Optimizer, pre_constant_merge, pre_greedy_local_optimizer
from theano.gof import toolbox, DestroyHandler
from basic import get_constant_value
from basic import get_constant_value, ShapeError
# Utilities
def out2in(*local_opts):
"""WRITEME """
return opt.TopoOptimizer(opt.LocalOptGroup(*local_opts),
......@@ -529,7 +530,7 @@ class ShapeFeature(object):
the cost of many Ops accurately, and generate c-code that is specific [e.g. unrolled] to
particular sizes.
If you can determine the shape only in some case, return NotImplementedError when you can't
In cases where you cannot figure out the shape, raise a ShapeError.
.. note::
......@@ -728,8 +729,15 @@ class ShapeFeature(object):
try:
o_shapes = shape_infer(node, [self.shape_of[r] for r in node.inputs])
except NotImplementedError:
except ShapeError:
o_shapes = self.default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
except NotImplementedError, e:
raise NotImplementedError(
'Code called by infer_shape failed raising a '
'NotImplementedError. Raising NotImplementedError to '
'indicate that a shape cannot be computed is no longer '
'supported, and one should now use tensor.ShapeError '
'instead. The original exception message is: %s' % e)
except Exception, e:
_logger.error('Failed to infer_shape from Op %s.\nInput shapes:%s\nException encountered during infer_shape: %s\nException message: %s\nTraceback: %s'% (node.op,
[self.shape_of[r] for r in node.inputs],
......@@ -3431,11 +3439,12 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
"""
def local_fuse(node):
"""
As part of specialisation, we fuse two consecutive elemwise op of the same shape.
For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast.
The number of dimension is validated at call time by theano itself.
As part of specialization, we fuse two consecutive elemwise Ops of the
same shape.
For mixed dtype, we let the Composite op do the cast. It lets the C
compiler do the cast.
The number of dimensions is validated at call time by theano itself.
"""
# META TODO: PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
# TODO: use broadcast flag?
......@@ -3551,7 +3560,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
if new_nb_input != len(inputs) or len(s_inputs) != len(inputs):
raise Exception("""Something has gone wrong with the elemwise
fusion optimization. We skip this optimization. You can ignore this message,
your code will run correctly, but maybe slower.""")
your code will run correctly, but may be slower.""")
otype = node.outputs[0].type
s_new_out=node.op.scalar_op(*s_g)
......
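For context, a standard-usage sketch (ordinary Theano code, not part of the patch) of the kind of graph this fusion optimization targets:

import theano
import theano.tensor as T

x = T.vector('x')
y = T.vector('y')
z = T.vector('z')
# (x + y) * z is two consecutive elemwise ops on same-shape inputs; under
# FAST_RUN the fusion optimization may rewrite them into a single
# Elemwise{Composite{...}} node, and any needed casts are then emitted by
# the generated C code.
f = theano.function([x, y, z], (x + y) * z, mode='FAST_RUN')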
......@@ -47,6 +47,75 @@ def eval_outputs(outputs):
return variables[0]
return variables
def get_numeric_subclasses(cls=numpy.number, ignore=None):
"""
Return subclasses of `cls` in the numpy scalar hierarchy.
We only return subclasses that correspond to unique data types.
The hierarchy can be seen here:
http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
"""
if ignore is None:
ignore = []
rval = []
dtype = numpy.dtype(cls)
dtype_num = dtype.num
if dtype_num not in ignore:
# Safety check: we should be able to represent 0 with this data type.
numpy.array(0, dtype=dtype)
rval.append(cls)
ignore.append(dtype_num)
for sub in cls.__subclasses__():
rval += [c for c in get_numeric_subclasses(sub, ignore=ignore)]
return rval
def get_numeric_types(with_int=True, with_float=True, with_complex=False,
with_128_bit=False):
"""
Return numpy numeric data types.
:param with_int: Whether to include integer types.
:param with_float: Whether to include floating point types.
:param with_complex: Whether to include complex types.
:param with_128_bit: Whether to include 128/256-bit types.
:returns: A list of unique data type objects. Note that multiple data types
may share the same string representation, but can be differentiated through
their `num` attribute.
Note that we could probably rely on the lists of types defined in the
`scalar` module. However with this function we can test more unique dtype
objects, and possibly detect defects in dtypes that may be introduced in
numpy in the future.
"""
rval = []
def is_within(cls1, cls2):
# Return True if scalars defined from `cls1` are within the hierarchy
# starting from `cls2`.
# The third test below is to catch for instance the fact that
# one can use ``dtype=numpy.number`` and obtain a float64 scalar, even
# though `numpy.number` is not under `numpy.floating` in the class
# hierarchy.
return (cls1 is cls2 or
issubclass(cls1, cls2) or
isinstance(numpy.array([0], dtype=cls1)[0], cls2))
for cls in get_numeric_subclasses():
dtype = numpy.dtype(cls)
if ((not with_complex and is_within(cls, numpy.complexfloating)) or
(not with_int and is_within(cls, numpy.integer)) or
(not with_float and is_within(cls, numpy.floating)) or
(not with_128_bit and ('128' in str(dtype) or
'256' in str(dtype)))):
# Ignore this class.
continue
rval.append([str(dtype), dtype, dtype.num])
# We sort it to be deterministic, then remove the string and num elements.
return [x[1] for x in sorted(rval, key=str)]
def _numpy_checker(x, y):
"""
Checks if x.data and y.data have the same contents.
......@@ -374,6 +443,18 @@ _good_broadcast_div_mod_normal_float_inplace = dict(same_shapes = (rand(2, 3), r
_good_broadcast_div_mod_normal_float = dict(empty2 = (numpy.asarray([0]), numpy.asarray([])),
**_good_broadcast_div_mod_normal_float_inplace
)
def no_complex(d):
"""Remove pairs from dictionary d when the value contains complex data."""
return dict((k, v) for k, v in d.iteritems()
if all(str(x.dtype) not in tensor.complex_dtypes for x in v))
# 'No-complex' versions.
_good_broadcast_div_mod_normal_float_no_complex = no_complex(
_good_broadcast_div_mod_normal_float)
_good_broadcast_div_mod_normal_float_inplace_no_complex = no_complex(
_good_broadcast_div_mod_normal_float_inplace)
_grad_broadcast_div_mod_normal = dict(same_shapes = (rand(2, 3), rand(2, 3)),
scalar = (rand(2, 3), rand(1, 1)),
row = (rand(2, 3), rand(1, 3)),
......@@ -389,8 +470,9 @@ _grad_broadcast_div_mod_normal = dict(same_shapes = (rand(2, 3), rand(2, 3)),
div_grad_rtol=None
if config.floatX=='float32':
#We raise the relative tolerence for the grad as their is error in float32
#This is probably caused by our way of computing the gradient error.
# We raise the relative tolerance for the grad as there can be errors in
# float32.
# This is probably caused by our way of computing the gradient error.
div_grad_rtol=0.025
DivTester = makeBroadcastTester(op = true_div,
expected = lambda x, y: x / y,
......@@ -410,14 +492,14 @@ DivInplaceTester = makeBroadcastTester(op = inplace.true_div_inplace,
ModTester = makeBroadcastTester(op = mod,
expected = lambda x, y: numpy.asarray(x % y, dtype=theano.scalar.basic.upcast(x.dtype, y.dtype)),
good = _good_broadcast_div_mod_normal_float,
good = _good_broadcast_div_mod_normal_float_no_complex,
# integers = (randint(2, 3), randint_nonzero(2, 3)),
# dtype_mixup_1 = (rand(2, 3), randint_nonzero(2, 3)),
# dtype_mixup_2 = (randint_nonzero(2, 3), rand(2, 3))),
)
ModInplaceTester = makeBroadcastTester(op = inplace.mod_inplace,
expected = lambda x, y: numpy.asarray(x % y, dtype=theano.scalar.basic.upcast(x.dtype, y.dtype)),
good = _good_broadcast_div_mod_normal_float_inplace,
good = _good_broadcast_div_mod_normal_float_inplace_no_complex,
inplace = True)
_good_broadcast_pow_normal_float = dict(same_shapes = (rand_ranged(1, 5, (2, 3)), rand_ranged(-3, 3, (2, 3))),
......@@ -2180,7 +2262,7 @@ class T_Join_and_Split(unittest.TestCase):
def test_stack_scalar_make_vector(self):
'''Test that calling stack() on scalars instantiates MakeVector,
not Join. Test that the floatX dtype stay floatX, not down casted to int64'''
not Join. Test that the floatX dtype stays floatX and is not downcasted to int64'''
a = tensor.scalar('a')
b = tensor.scalar('b')
s = stack(a, b, a, b)
......@@ -2665,9 +2747,9 @@ class T_divimpl(unittest.TestCase):
(5.0/11.0))
assert numpy.allclose(function([i, ii, d, f, c], f/i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
(11.0/5.0))
assert numpy.allclose(function([i, ii, d, f, c], i/ii)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
assert numpy.allclose(function([i, ii, d, f, c], i//ii)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
(5/3))
assert numpy.allclose(function([i, ii, d, f, c], ii/i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
assert numpy.allclose(function([i, ii, d, f, c], ii//i)(5, 3, 7.0, 11.0, numpy.complex(5,3)),
(3/5))
assert numpy.allclose(function([i, ii, d, f, c], true_div(i,ii))(5, 3, 7.0, 11.0, numpy.complex(5,3)),
(5./3.))
......@@ -3056,7 +3138,13 @@ class T_scalarfromtensor(unittest.TestCase):
v = eval_outputs([ss])
self.assertTrue(v == 56, v)
self.assertTrue(isinstance(v, numpy.int8))
if config.cast_policy == 'custom':
self.assertTrue(isinstance(v, numpy.int8))
elif config.cast_policy in ('numpy', 'numpy+floatX'):
self.assertTrue(isinstance(
v, getattr(numpy, str(numpy.asarray(56).dtype))))
else:
raise NotImplementedError(config.cast_policy)
self.assertTrue(v.shape == (), v.shape)
tt = lscalar()
ss = scalar_from_tensor(tt)
......@@ -3538,7 +3626,13 @@ class TestARange(unittest.TestCase):
out = arange(start, stop)
f = function([start, stop], out)
assert out.dtype == start.type.dtype
if config.cast_policy == 'custom':
assert out.dtype == start.type.dtype
elif config.cast_policy in ('numpy', 'numpy+floatX'):
assert out.dtype == numpy.arange(numpy.int32(0),
numpy.int32(1)).dtype
else:
raise NotImplementedError(config.cast_policy)
assert numpy.all(f(0,5) == numpy.arange(0,5))
assert numpy.all(f(-5,1) == numpy.arange(-5,1))
assert numpy.all(f(0,0) == numpy.arange(0,0))
......@@ -3560,7 +3654,12 @@ class TestARange(unittest.TestCase):
out = arange(stop)
f = function([stop], out)
assert out.dtype == stop.type.dtype
if config.cast_policy == 'custom':
assert out.dtype == stop.type.dtype
elif config.cast_policy in ('numpy', 'numpy+floatX'):
assert out.dtype == numpy.arange(numpy.int32(1)).dtype
else:
raise NotImplementedError(config.cast_policy)
assert numpy.all(f(8) == numpy.arange(8))
assert numpy.all(f(-2) == numpy.arange(-2))
......@@ -3568,24 +3667,93 @@ class TestARange(unittest.TestCase):
fout = arange(fstop)
ff = function([fstop], fout)
assert fout.dtype == fstop.type.dtype
if config.cast_policy == 'custom':
assert fout.dtype == fstop.type.dtype
elif config.cast_policy == 'numpy':
assert fout.dtype == numpy.arange(numpy.float32(1)).dtype
elif config.cast_policy == 'numpy+floatX':
if config.floatX == 'float32':
assert fout.dtype == 'float32'
else:
assert fout.dtype == numpy.arange(numpy.float32(1)).dtype
else:
raise NotImplementedError(config.cast_policy)
fstop_values = [0.2, -0.7, 8.5]
for fstop_v in fstop_values:
fstop_v32 = numpy.float32(fstop_v)
assert numpy.all(ff(fstop_v32) == numpy.arange(fstop_v))
def test_upcast(self):
"""Test that arange compute output type adequately"""
assert arange(iscalar()).dtype == iscalar().dtype
assert arange(fscalar()).dtype == fscalar().dtype
assert arange(dscalar()).dtype == dscalar().dtype
# int32 + float32 -> float64
assert arange(iscalar(), fscalar()).dtype == dscalar().dtype
assert arange(iscalar(), dscalar()).dtype == dscalar().dtype
assert arange(fscalar(), dscalar()).dtype == dscalar().dtype
assert arange(iscalar(), fscalar(), dscalar()).dtype == dscalar().dtype
"""Test that arange computes output type adequately"""
if config.cast_policy == 'custom':
assert arange(iscalar()).dtype == iscalar().dtype
assert arange(fscalar()).dtype == fscalar().dtype
assert arange(dscalar()).dtype == dscalar().dtype
# int32 + float32 -> float64
assert arange(iscalar(), fscalar()).dtype == dscalar().dtype
assert arange(iscalar(), dscalar()).dtype == dscalar().dtype
assert arange(fscalar(), dscalar()).dtype == dscalar().dtype
assert arange(iscalar(), fscalar(), dscalar()).dtype == dscalar().dtype
elif config.cast_policy in ('numpy', 'numpy+floatX'):
for dtype in get_numeric_types():
# Test with a single argument.
arange_dtype = arange(scalar(dtype=str(dtype))).dtype
numpy_dtype = numpy.arange(numpy.array(1, dtype=dtype)).dtype
if (dtype != 'float64' and
numpy_dtype == 'float64' and
config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32'):
# We want a float32 arange.
assert arange_dtype == 'float32'
else:
# Follow numpy.
assert arange_dtype == numpy_dtype
# Test with two arguments.
for stop_dtype in get_numeric_types():
arange_dtype = arange(
start=scalar(dtype=str(dtype)),
stop=scalar(dtype=str(stop_dtype))).dtype
numpy_dtype = numpy.arange(
start=numpy.array(0, dtype=dtype),
stop=numpy.array(1, dtype=stop_dtype)).dtype
if (dtype != 'float64' and
stop_dtype != 'float64' and
numpy_dtype == 'float64' and
config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32'):
# We want a float32 arange.
assert arange_dtype == 'float32'
else:
# Follow numpy.
assert arange_dtype == numpy_dtype
# Test with three arguments.
for step_dtype in get_numeric_types():
arange_dtype = arange(
start=scalar(dtype=str(dtype)),
stop=scalar(dtype=str(stop_dtype)),
step=scalar(dtype=str(step_dtype))).dtype
numpy_dtype = numpy.arange(
start=numpy.array(0, dtype=dtype),
stop=numpy.array(1, dtype=stop_dtype),
step=numpy.array(1, dtype=step_dtype)).dtype
if (dtype != 'float64' and
stop_dtype != 'float64' and
step_dtype != 'float64' and
numpy_dtype == 'float64' and
config.cast_policy == 'numpy+floatX' and
config.floatX == 'float32'):
# We want a float32 arange.
assert arange_dtype == 'float32'
else:
# Follow numpy.
assert arange_dtype == numpy_dtype
else:
raise NotImplementedError(config.cast_policy)
def test_dtype_cache(self):
"""Checks that the same Op is returned on repeated calls to arange
......@@ -3624,7 +3792,13 @@ class TestARange(unittest.TestCase):
f = function([start, stop], out.shape, mode=mode)
assert len(f.maker.env.toposort())==4
#4 [Elemwise{sub,no_inplace}(stop, start), Elemwise{Cast{int64}}(Elemwise{sub,no_inplace}.0), Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)](Elemwise{Cast{int64}}.0, 0), MakeVector(Elemwise{Maximum{output_types_preference=transfer_type{0}}}[(0, 0)].0)]
assert out.dtype == start.type.dtype
if config.cast_policy == 'custom':
assert out.dtype == start.type.dtype
elif config.cast_policy in ('numpy', 'numpy+floatX'):
assert out.dtype == numpy.arange(
numpy.int32(0), numpy.int32(1), numpy.int32(1)).dtype
else:
raise NotImplementedError(config.cast_policy)
assert numpy.all(f(0,5) == len(numpy.arange(0,5)))
assert numpy.all(f(2,11) == len(numpy.arange(2,11)))
assert numpy.all(f(-5,1) == len(numpy.arange(-5,1)))
......@@ -4074,6 +4248,22 @@ def test_default_state():
assert numpy.allclose(f(numpy.asarray(2.2, dtype=config.floatX)), 7)
def test_autocast():
backup_config = config.cast_policy
# Call test functions for all possible values of `config.cast_policy`.
for autocast_cfg in (
'custom',
'numpy',
'numpy+floatX',
):
config.cast_policy = autocast_cfg
try:
eval('_test_autocast_' + autocast_cfg.replace('+', '_'))()
finally:
config.cast_policy = backup_config
def _test_autocast_custom():
"""Called from `test_autocast`."""
assert config.cast_policy == 'custom'
orig_autocast = autocast_float.dtypes
# Test that autocast_float_as sets the autocast dtype correctly
......@@ -4165,6 +4355,180 @@ def test_autocast():
finally:
ac.__exit__()
def _test_autocast_numpy():
"""Called from `test_autocast`."""
assert config.cast_policy == 'numpy'
# Go through some typical scalar values.
def ok(z):
assert tensor.constant(z).dtype == numpy.asarray(z).dtype
for x in ([2**i for i in xrange(63)] +
[0] +
[0., 1., 1.1, 1.5]):
n_x = numpy.asarray(x)
# Make sure the data type is the same as the one found by numpy.
ok(x)
ok(-x)
ok(x - 1)
ok(-x + 1)
ok(n_x)
def _test_autocast_numpy_floatX():
"""Called from `test_autocast`."""
assert config.cast_policy == 'numpy+floatX'
backup_floatX = config.floatX
def ok(z, floatX):
if (isinstance(z, float) and
floatX == 'float32' and
not hasattr(z, 'dtype')):
# Special case where we use 'float32' instead of 'float64'.
assert tensor.constant(z).dtype == 'float32'
else:
assert tensor.constant(z).dtype == numpy.asarray(z).dtype
try:
# Test with various values of `config.floatX`.
for floatX in ('float32', 'float64'):
config.floatX = floatX
# Go through some typical scalar values.
for x in ([2**i for i in xrange(63)] +
[0] +
[0., 1., 1.1, 1.5]):
ok(x, floatX)
ok(-x, floatX)
ok(x - 1, floatX)
ok(-x + 1, floatX)
ok(numpy.asarray(x), floatX)
ok(numpy.float64(x), floatX)
finally:
config.floatX = backup_floatX
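In other words, the expectation encoded by ok() above is roughly the following (a sketch; it assumes config.cast_policy is 'numpy+floatX' and config.floatX is 'float32' at the time the constants are built):

import numpy
from theano import tensor

# A plain Python float carries no dtype of its own, so it follows floatX...
assert tensor.constant(1.5).dtype == 'float32'
# ...whereas a numpy scalar has an explicit dtype, which is preserved.
assert tensor.constant(numpy.float64(1.5)).dtype == 'float64'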
class test_arithmetic_cast(unittest.TestCase):
"""
Test output types of basic arithmetic operations (* / + - //).
We only test the behavior for `config.cast_policy` set to either 'numpy' or
'numpy+floatX': the 'custom' behavior is (at least partially) tested in
`_test_autocast_custom`.
"""
def test_arithmetic_cast(self):
backup_config = config.cast_policy
dtypes = get_numeric_types(with_complex=True)
# Here:
# scalar == scalar stored as a 0d array
# array == 1d array
# i_scalar == scalar type used internally by Theano
theano_scalar = lambda dtype: tensor.scalar(dtype=str(dtype))
numpy_scalar = lambda dtype: numpy.array(1, dtype=dtype)
theano_array = lambda dtype: tensor.vector(dtype=str(dtype))
numpy_array = lambda dtype: numpy.array([1], dtype=dtype)
theano_i_scalar = lambda dtype: theano.scalar.Scalar(str(dtype))()
numpy_i_scalar = numpy_scalar
try:
for cfg in ('numpy', 'numpy+floatX'):
config.cast_policy = cfg
for op in (operator.add, operator.sub, operator.mul,
operator.div, operator.floordiv):
for a_type in dtypes:
for b_type in dtypes:
# Note that we do not test division between
# integers if it is forbidden.
# Theano deals with integer division in its own
# special way (depending on `config.int_division`).
is_int_division = (
op is operator.div and
a_type in tensor.discrete_dtypes and
b_type in tensor.discrete_dtypes)
# We will test all meaningful combinations of
# scalar and array operations.
for combo in (
('scalar', 'scalar'),
('array', 'array'),
('scalar', 'array'),
('array', 'scalar'),
('i_scalar', 'i_scalar'),
):
theano_args = map(eval,
['theano_%s' % c for c in combo])
numpy_args = map(eval,
['numpy_%s' % c for c in combo])
try:
theano_dtype = op(
theano_args[0](a_type),
theano_args[1](b_type)).type.dtype
# Should have crashed if it is an integer
# division and `config.int_division` does
# not allow it.
assert not (is_int_division and
config.int_division == 'raise')
except theano.scalar.IntegerDivisionError:
assert (is_int_division and
config.int_division == 'raise')
# This is the expected behavior.
continue
# For numpy we have a problem:
# http://projects.scipy.org/numpy/ticket/1827
# As a result we only consider the highest data
# type that numpy may return.
numpy_dtypes = [
op(numpy_args[0](a_type),
numpy_args[1](b_type)).dtype,
op(numpy_args[1](b_type),
numpy_args[0](a_type)).dtype]
numpy_dtype = theano.scalar.upcast(
*map(str, numpy_dtypes))
if numpy_dtype == theano_dtype:
# Same data type found, all is good!
continue
if (cfg == 'numpy+floatX' and
config.floatX == 'float32' and
a_type != 'float64' and
b_type != 'float64' and
numpy_dtype == 'float64'):
# We should keep float32.
assert theano_dtype == 'float32'
continue
if 'array' in combo and 'scalar' in combo:
# For mixed scalar / array operations,
# Theano may differ from numpy as it does
# not try to prevent the scalar from
# upcasting the array.
array_type, scalar_type = (
(a_type, b_type)[
list(combo).index(arg)]
for arg in ('array', 'scalar'))
up_type = theano.scalar.upcast(array_type,
scalar_type)
if (
# The two data types are different.
scalar_type != array_type and
# The array type is not enough to hold
# the scalar type as well.
array_type != up_type and
# Theano upcasted the result array.
theano_dtype == up_type and
# But Numpy kept its original type.
# (not an equality because of numpy bug
# mentioned above).
array_type in numpy_dtypes):
# Then we accept this difference in
# behavior.
continue
if (is_int_division and
config.int_division == 'floatX'):
assert theano_dtype == config.floatX
continue
# In any other situation: something wrong is
# going on!
assert False
finally:
config.cast_policy = backup_config
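The scalar/array special case accepted above reflects numpy's own rule (in the numpy versions this patch targets) that a scalar upcasts an array only when it is of a fundamentally different kind; a plain-numpy illustration of the same-kind case, assuming such a numpy version:

import numpy

arr = numpy.array([1, 2], dtype='int8')
scal = numpy.int64(1)
# The wider integer scalar does not upcast the integer array: the result
# stays int8, whereas Theano, which only sees dtypes and not values, would
# upcast to int64.
print (arr + scal).dtype   # expected: int8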
class test_broadcast(unittest.TestCase):
def test_broadcast_bigdim(self):
def f():
......@@ -4373,6 +4737,18 @@ class T_as_tensor_variable(unittest.TestCase):
assert ten.type.dtype == 'uint8'
class test_complex_mod(unittest.TestCase):
"""Make sure % fails on complex numbers."""
def test_fail(self):
x = vector(dtype='complex64')
try:
x % 5
assert False
except ComplexError:
pass
if __name__ == '__main__':
if 1:
unittest.main()
......
......@@ -30,9 +30,11 @@ class Test_incsubtensor(unittest.TestCase):
for do_set in [False,True]:
if do_set:
resut = T.setsubtensor(a, increment, [sl1, sl2])
resut = T.setsubtensor(a, increment, [sl1, sl2],
show_warning=False)
else:
resut = T.incsubtensor(a, increment, [sl1, sl2])
resut = T.incsubtensor(a, increment, [sl1, sl2],
show_warning=False)
f = theano.function([a, increment, sl2_end], resut)
......@@ -59,7 +61,7 @@ class Test_incsubtensor(unittest.TestCase):
def inc_slice(*s):
def just_numeric_args(a,b):
return T.incsubtensor(a, b, s)
return T.incsubtensor(a, b, s, show_warning=False)
return just_numeric_args
# vector
......
......@@ -647,10 +647,14 @@ def test_local_merge_abs():
def test_mixeddiv():
"""Test that int division is preserved"""
"""Test that int division raises an exception."""
i = iscalar()
d = dscalar()
assert 0 == function([i,d], d*(i/(i+1)))(3, 1.0)
try:
0 == function([i,d], d*(i/(i+1)))(3, 1.0)
assert False
except theano.scalar.IntegerDivisionError:
pass
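With config.int_division set to 'raise', the intended behavior has to be spelled out explicitly. A hedged sketch of the two explicit alternatives, assuming the imports already used in this test file (function, iscalar, dscalar and theano.tensor as T):

i = iscalar()
d = dscalar()
# Explicit integer (floor) division keeps an integer intermediate result.
f_floor = function([i, d], d * (i // (i + 1)))
# Explicit true division gives a floating-point intermediate result.
f_true = function([i, d], d * T.true_div(i, i + 1))
assert f_floor(3, 1.0) == 0.0
assert abs(f_true(3, 1.0) - 0.75) < 1e-6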
def test_const_type_in_mul_canonizer():
input = dmatrix()
......@@ -2487,6 +2491,7 @@ class T_local_sum(unittest.TestCase):
assert numpy.allclose(f(input),input.sum())
config.warn.sum_sum_bug = False
f = theano.function([a],a.sum(0).sum(0).sum(0),mode=self.mode)
assert len(f.maker.env.nodes)==1
assert numpy.allclose(f(input),input.sum())
......@@ -2496,6 +2501,7 @@ class T_local_sum(unittest.TestCase):
input=numpy.arange(3*3*3, dtype=config.floatX).reshape(3,3,3)
dims=[(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)]
config.warn.sum_sum_bug = False
for d,dd in dims:
f = theano.function([a],a.sum(d).sum(dd),mode=self.mode)
assert numpy.allclose(f(input),input.sum(d).sum(dd))
......@@ -2541,6 +2547,7 @@ class T_local_sum(unittest.TestCase):
assert len(f.maker.env.nodes)==nb_nodes[2]
assert f.maker.env.toposort()[-1].op==T.alloc
config.warn.sum_sum_bug = False
for d, dd in [(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)]:
f = theano.function([a],t_like(a).sum(d).sum(dd),mode=mode)
print f.maker.env.toposort()
......@@ -2600,6 +2607,8 @@ class T_local_sum_dimshuffle(unittest.TestCase):
c_val = rng.randn(2,2,2).astype(config.floatX)
d_val = numpy.asarray(rng.randn(), config.floatX)
config.warn.sum_sum_bug = False
config.warn.sum_div_dimshuffle_bug = False
for i,s in enumerate(sums):
print i
f = theano.function([a,b,c,d], s, mode=self.mode)
......
""" test code snippet in the Theano tutorials.
"""
import unittest
import os, unittest
import theano
import theano.tensor as T
from theano import function
......@@ -722,6 +722,15 @@ class T_loading_and_saving(unittest.TestCase):
mode_instance = theano.compile.mode.get_mode(None)
if not isinstance(mode_instance, theano.compile.debugmode.DebugMode):
if os.path.exists('obj.save') or os.path.exists('objects.save'):
# We do not want to delete these files silently, in case for
# some reason they are something other than test-generated
# files.
# Ideally we would save those files in a temporary directory...
raise AssertionError(
'Please get rid of files obj.save and '
'objects.save in directory %s' % os.getcwd())
f = file('obj.save', 'wb')
cPickle.dump(my_obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()
......@@ -746,6 +755,9 @@ class T_loading_and_saving(unittest.TestCase):
loaded_objects.append(cPickle.load(f))
f.close()
# Cleanup created files.
os.remove('obj.save')
os.remove('objects.save')
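As the comment above notes, a temporary directory would avoid touching files in the current working directory; a sketch of that alternative (illustrative only, not what the test currently does):

import os
import tempfile
import cPickle

tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, 'obj.save')
f = open(path, 'wb')
cPickle.dump({'example': 1}, f, protocol=cPickle.HIGHEST_PROTOCOL)
f.close()
f = open(path, 'rb')
loaded = cPickle.load(f)
f.close()
os.remove(path)
os.rmdir(tmpdir)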
class T_modes(unittest.TestCase):
## All tests here belong to
......