Merged

00ecd70d · Olivier Delalleau · 253dde5c · 0488b3cf · 00ecd70d · 00ecd70d
--- a/bin/theano-cache
+++ b/bin/theano-cache
@@ -6,7 +6,7 @@ from theano.gof.cc import get_module_cache
 if len(sys.argv) == 1:
    print config.compiledir
 elif sys.argv[1] in ('clear'):
-    get_module_cache().clear()
+    get_module_cache().clear(unversioned_min_age=-1, clear_base_files=True)
 else:
    print 'command "%s" not recognized' % sys.argv[1]
    print 'Type "theano-cache" to print the cache location'

--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -144,7 +144,7 @@ import theano and print the config variable, as in:

 .. attribute:: floatX

-    String value: either 'float64' or 'float32'.
+    String value: either 'float64' or 'float32'

    Default: 'float64'

@@ -152,6 +152,48 @@ import theano and print the config variable, as in:
    and similar functions.  It also sets the default theano bit width for
    arguments passed as Python floating-point numbers.

+.. attribute:: cast_policy
+
+    String value: either 'numpy+floatX', 'numpy' or 'custom'
+
+    Default: 'numpy+floatX'
+
+    This specifies how data types are implicitly figured out in Theano, e.g. for
+    constants or in the results of arithmetic operations. The 'custom' value
+    corresponds to a set of custom rules originally used in
+    Theano (which can be partially customized, see e.g. the in-code help of
+    ``tensor.NumpyAutocaster``), and is now deprecated.
+    The 'numpy' setting attempts to
+    mimic the numpy casting rules. The default value ('numpy+floatX') does
+    the same, except that
+    it prefers to use float32 numbers instead of float64 when ``config.floatX``
+    is set to 'float32'.
+    Note that both 'numpy' and 'numpy+floatX'
+    behave differently from numpy on purpose in the following situations:
+       * Depending on the value of ``config.int_division``, the resulting type
+         of a division of integer types with the ``/`` operator may not match
+         that of numpy.
+       * On mixed scalar / array operations, numpy tries to prevent the scalar
+         from upcasting the array's type unless it is of a fundamentally
+         different type. Theano does not attempt to do the same at this point,
+         so you should be careful that scalars may upcast arrays when they
+         would not when using numpy.
+
+.. attribute:: int_division
+
+    String value: either 'int', 'floatX' or 'raise'
+
+    Default: 'int'
+
+    Specifies what to do when one tries to compute ``x / y``, where both ``x`` and
+    ``y`` are of integer types (possibly unsigned). 'int' means an integer is
+    returned (as in Python 2.X), but this behavior is deprecated. 'floatX'
+    returns a number of type given by ``config.floatX``. 'raise' is the safest
+    choice (and will become default in a future release of Theano) and raises
+    an error when one tries to do such an operation, enforcing the use of the
+    integer division operator (``//``) (if a float result is intended, either
+    cast one of the arguments to a float, or use ``x.__truediv__(y)``).
+
 .. attribute:: mode

    String value: 'Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN', 'FAST_COMPILE'

--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -15,11 +15,16 @@ AddConfigVar('floatX',
        EnumStr('float64', 'float32'),
        )

-# TODO Work-in-progress
-#AddConfigVar('casting_policy',
-#        "Rules for implicit casts of constants in arithmetic operations",
-#        EnumStr('theano_0.3', 'numpy'),
-#        )
+AddConfigVar('cast_policy',
+        "Rules for implicit type casting",
+        EnumStr('numpy+floatX', 'numpy', 'custom'),
+        )
+
+AddConfigVar('int_division',
+        "What to do when one computes x / y, where both x and y are of "
+        "integer types",
+        EnumStr('int', 'raise', 'floatX'),
+        )

 #gpu mean let the driver select the gpu. Needed in case of gpu in exclusive mode.
 #gpuX mean use the gpu number X.
@@ -30,7 +35,8 @@ AddConfigVar('device',
            'gpu4', 'gpu5', 'gpu6', 'gpu7',
            'gpu8', 'gpu9', 'gpu10', 'gpu11',
            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-                allow_override=False)
+                allow_override=False),
+        in_c_key=False,
        )

 AddConfigVar('init_gpu_device',
@@ -43,13 +49,13 @@ AddConfigVar('init_gpu_device',
            'gpu4', 'gpu5', 'gpu6', 'gpu7',
            'gpu8', 'gpu9', 'gpu10', 'gpu11',
            'gpu12', 'gpu13', 'gpu14', 'gpu15',
-                allow_override=False)
-        )
+                allow_override=False),
+        in_c_key=False)

 AddConfigVar('force_device',
        "Raise an error if we can't use the specified device",
-        BoolParam(False, allow_override=False)
-        )
+        BoolParam(False, allow_override=False),
+        in_c_key=False)

 #Don't add FAST_RUN_NOGC to this list(as well as other ALL CAPS short cut)
 #The way to get FAST_RUN_NOGC is with the flag 'linker=c|py_nogc'
@@ -57,7 +63,8 @@ AddConfigVar('force_device',
 AddConfigVar('mode',
        "Default compilation mode",
        EnumStr('Mode', 'ProfileMode', 'DebugMode', 'FAST_RUN',
-                'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'))
+                'FAST_COMPILE', 'PROFILE_MODE', 'DEBUG_MODE'),
+        in_c_key=False)

 # Test whether or not gcc is present: disable C code if it is not
 try:
@@ -65,12 +72,14 @@ try:
    # Keep the default linker the same as the one for the mode FAST_RUN
    AddConfigVar('linker',
                 "Default linker used if the theano flags mode is Mode or ProfileMode",
-                 EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'))
+                 EnumStr('c|py', 'py', 'c', 'c|py_nogc', 'c&py'),
+                 in_c_key=False)
 except OSError:
    # gcc is not present, linker should default to python only
    AddConfigVar('linker',
                 "Default linker used if the theano flags mode is Mode or ProfileMode",
-                 EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'))
+                 EnumStr('py', 'c|py', 'c', 'c|py_nogc', 'c&py'),
+                 in_c_key=False)
    warning('GCC not detected ! Theano will be unable to execute optimized '+
            'C-implementations (for both CPU and GPU) and will default to '+
            'Python implementations. Performance will be severely degraded.')
@@ -78,32 +87,39 @@ except OSError:
 #Keep the default optimizer the same as the one for the mode FAST_RUN
 AddConfigVar('optimizer',
        "Default optimizer. If not None, will use this linker with the Mode object(not ProfileMode or DebugMode)",
-        EnumStr('fast_run', 'merge', 'fast_compile', 'None'))
+        EnumStr('fast_run', 'merge', 'fast_compile', 'None'),
+        in_c_key=False)

 AddConfigVar('on_opt_error',
        "What to do when an optimization crashes: warn and skip it, or raise the exception",
-        EnumStr('warn', 'raise'))
+        EnumStr('warn', 'raise'),
+        in_c_key=False)

 AddConfigVar('home',
        "User home directory",
-        StrParam(os.getenv("HOME", os.path.expanduser('~'))))
+        StrParam(os.getenv("HOME", os.path.expanduser('~'))),
+        in_c_key=False)
 #This expanduser works on windows (see discussion on theano-users, July 13 2010)

 AddConfigVar('nocleanup',
        "Suppress the deletion of code files that did not compile cleanly",
-        BoolParam(False))
+        BoolParam(False),
+        in_c_key=False)

 AddConfigVar('tensor.cmp_sloppy',
        "Relax tensor._allclose (0) not at all, (1) a bit, (2) more",
-        IntParam(0, lambda i: i in (0,1,2)))
+        IntParam(0, lambda i: i in (0,1,2)),
+        in_c_key=False)

 AddConfigVar('tensor.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the elemwise fusion optimization",
-        BoolParam(True))
+        BoolParam(True),
+        in_c_key=False)

 AddConfigVar('gpu.local_elemwise_fusion',
        "Enable or not in fast_run mode(fast_run optimization) the gpu elemwise fusion optimization",
-        BoolParam(True))
+        BoolParam(True),
+        in_c_key=False)

 #http://developer.amd.com/CPU/LIBRARIES/LIBM/Pages/default.aspx
 AddConfigVar('lib.amdlibm',
@@ -140,41 +156,47 @@ AddConfigVar('numpy.seterr_all',
              "by the following flags: seterr_divide, seterr_over, "
              "seterr_under and seterr_invalid."),
             EnumStr('ignore', 'warn', 'raise', 'call', 'print', 'log', 'None',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_divide',
             ("Sets numpy's behavior for division by zero, see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_over',
             ("Sets numpy's behavior for floating-point overflow, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_under',
             ("Sets numpy's behavior for floating-point underflow, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 AddConfigVar('numpy.seterr_invalid',
             ("Sets numpy's behavior for invalid floating-point operation, "
              "see numpy.seterr. "
              "'None' means using the default, defined by numpy.seterr_all."),
             EnumStr('None', 'ignore', 'warn', 'raise', 'call', 'print', 'log',
-                 allow_override=False))
+                 allow_override=False),
+             in_c_key=False)

 ###
 ### To disable some warning about old bug that are fixed now.
 ###
 AddConfigVar('warn.ignore_bug_before',
             "If 'None', we warn about all Theano bugs found by default. If 'all', we don't warn about Theano bugs found by default. If a version, we print only the warnings relative to Theano bugs found after that version. Warning for specific bugs can be configured with specific [warn] flags.",
-             EnumStr('None', 'all', '0.3', allow_override=False))
+             EnumStr('None', 'all', '0.3', allow_override=False),
+             in_c_key=False)

 default_0_3 = True
 if config.warn.ignore_bug_before == 'None':
@@ -187,20 +209,25 @@ elif config.warn.ignore_bug_before >= '0.3':

 AddConfigVar('warn.argmax_pushdown_bug',
             "Warn if in past version of Theano we generated a bug with the optimisation theano.tensor.nnet.nnet.local_argmax_pushdown optimization. Was fixed 27 may 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.gpusum_01_011_0111_bug',
             "Warn if we are in a case where old version of Theano had a silent bug with GpuSum pattern 01,011 and 0111 when the first dimensions was bigger then 4096. Was fixed 31 may 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.sum_sum_bug',
             "Warn if we are in a case where Theano version between version 9923a40c7b7a and the 2 august 2010(fixed date), generated an error in that case. This happen when their is 2 consecutive sum in the graph, bad code was generated. Was fixed 2 August 2010",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('warn.sum_div_dimshuffle_bug',
             "Warn if previous versions of Theano (between rev. 3bd9b789f5e8, 2010-06-16, and cfc6322e5ad4, 2010-08-03) would have given incorrect result. This bug was triggered by sum of division of dimshuffled tensors.",
-             BoolParam(default_0_3))
+             BoolParam(default_0_3),
+             in_c_key=False)

 AddConfigVar('compute_test_value',
        "If 'True', Theano will run each op at graph build time, using Constants, SharedVariables and the tag 'test_value' as inputs to the function. This helps the user track down problems in the graph before it gets optimized.",
-        EnumStr('False', 'True', 'warn', 'err'))
+        EnumStr('False', 'True', 'warn', 'err'),
+        in_c_key=False)
--- a/theano/configparser.py
+++ b/theano/configparser.py
@@ -7,6 +7,8 @@ import ConfigParser
 import logging
 import warnings

+import theano
+
 _logger = logging.getLogger('theano.config')

 class TheanoConfigWarning(Warning):
@@ -103,6 +105,21 @@ def _config_print(thing, buf):
        print >> buf, "    Value: ", cv.val
        print >> buf, ""

+
+def get_config_md5():
+    """
+    Return a string md5 of the current config options. It should be such that
+    we can safely assume that two different config setups will lead to two
+    different strings.
+
+    We only take into account config options for which `in_c_key` is True.
+    """
+    all_opts = sorted([c for c in _config_var_list if c.in_c_key],
+                      key=lambda cv: cv.fullname)
+    return theano.gof.cc.hash_from_code('\n'.join(
+                    ['%s = %s' % (cv.fullname, cv.val) for cv in all_opts]))
+
+
 class TheanoConfigParser(object):
    #properties are installed by AddConfigVar
    _i_am_a_config_class = True
@@ -110,6 +127,7 @@ class TheanoConfigParser(object):
        sio = StringIO.StringIO()
        _config_print(self.__class__, sio)
        return sio.getvalue()
+
 # N.B. all instances of TheanoConfigParser give access to the same properties.
 config = TheanoConfigParser()

@@ -124,17 +142,27 @@ config = TheanoConfigParser()
 # - The subtrees provide the same interface as the root
 # - ConfigParser subclasses control get/set of config properties to guard against craziness.

-def AddConfigVar(name, doc, configparam, root=config):
+def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
    """Add a new variable to theano.config

    :type name: string for form "[section0.[section1.[etc]]].option"
    :param name: the full name for this configuration variable.
+
    :type doc: string
    :param doc: What does this variable specify?
+
    :type configparam: ConfigParam instance
    :param configparam: an object for getting and setting this configuration parameter
+
    :type root: object
-    :param root: used for recusive calls -- don't provide an argument for this parameter.
+    :param root: used for recursive calls -- do not provide an argument for this parameter.
+
+    :type in_c_key: boolean
+    :param in_c_key: If True, then whenever this config option changes, the
+    key associated with compiled C modules also changes, i.e. it may trigger a
+    compilation of these modules (this compilation will only be partial if it
+    turns out that the generated C code is unchanged). Set this option to False
+    only if you are confident this option should not affect C code compilation.

    :returns: None
    """
@@ -155,11 +183,13 @@ def AddConfigVar(name, doc, configparam, root=config):
        newroot = getattr(root, sections[0])
        if not getattr(newroot, '_i_am_a_config_class', False) or isinstance(newroot, type):
            raise TypeError('Internal config nodes must be config class instances', newroot)
-        return AddConfigVar('.'.join(sections[1:]), doc, configparam, root=newroot)
+        return AddConfigVar('.'.join(sections[1:]), doc, configparam,
+                            root=newroot, in_c_key=in_c_key)
    else:
        if hasattr(root, name):
            raise AttributeError('This name is already taken', configparam.fullname)
        configparam.doc = doc
+        configparam.in_c_key = in_c_key
        configparam.__get__() # trigger a read of the value from config files and env vars
        setattr(root.__class__, sections[0], configparam)
        _config_var_list.append(configparam)

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -7,6 +7,7 @@ from copy import copy
 import re #for set_compiledir
 import os, sys, StringIO

+
 if sys.version_info[:2] >= (2,5):
    import hashlib
    def hash_from_code(msg):
@@ -16,6 +17,13 @@ else:
    def hash_from_code(msg):
        return md5.new(msg).hexdigest()

+
+def hash_from_file(file_path):
+    """Return the MD5 hash of a file."""
+    return hash_from_code(open(file_path, 'rb').read())
+
+
+import theano
 from theano.gof.python25 import all
 from theano import config

@@ -43,6 +51,7 @@ import cmodule

 import logging
 _logger=logging.getLogger("theano.gof.cc")
+_logger.setLevel(logging.WARN)
 def info(*args):
    _logger.info(' '.join(str(a) for a in args))
 def debug(*args):
@@ -791,7 +800,7 @@ class CLinker(link.Linker):
        The key returned by this function is of the form (version, signature)
        The signature has the following form:
        {{{
-            'CLinker.cmodule_key', compilation args, libraries,
+            'CLinker.cmodule_key', compilation args, libraries, config md5,
            (op0, input_signature0, output_signature0),
            (op1, input_signature1, output_signature1),
            ...
@@ -858,10 +867,16 @@ class CLinker(link.Linker):
        constant_ids = dict()
        op_pos = {} # Apply -> topological position

-        # first we put the header, compile_args, library names into the signature
+        # First we put the header, compile_args, library names and config md5
+        # into the signature.
        sig = ['CLinker.cmodule_key'] # will be cast to tuple on return
        if compile_args is not None: sig.append(tuple(compile_args))
        if libraries is not None: sig.append(tuple(libraries))
+        # IMPORTANT: The 'md5' prefix is used to isolate the compilation
+        # parameters from the rest of the key. If you want to add more key
+        # elements, they should be before this md5 hash if and only if they
+        # can lead to a different compiled file with the same source code.
+        sig.append('md5:' + theano.configparser.get_config_md5())

        # technically this should only be appended for gcc-compiled Ops
        # and the flags of other compilers should be inserted here... but it's not clear how to
@@ -943,11 +958,30 @@ class CLinker(link.Linker):

    def compile_cmodule(self, location=None):
        """
-        This method is a callback for `ModuleCache.module_from_key`
+        Compile the module and return it.
+        """
+        # Go through all steps of the compilation process.
+        for step_result in self.compile_cmodule_by_step(location=location):
+            pass
+        # And return the output of the last step, which should be the module
+        # itself.
+        return step_result
+
+    def compile_cmodule_by_step(self, location=None):
+        """
+        This method is a callback for `ModuleCache.module_from_key`.
+
+        It is a generator (thus the 'by step'), so that:
+            - it first yields the module's C code
+            - it last yields the module itself
+            - it may yield other intermediate outputs in-between if needed
+              in the future (but this is not currently the case)
        """
        if location is None:
            location = cmodule.dlimport_workdir(config.compiledir)
        mod = self.build_dynamic_module()
+        src_code = mod.code()
+        yield src_code
        get_lock()
        try:
            debug("LOCATION", location)
@@ -955,7 +989,7 @@ class CLinker(link.Linker):
            libs = self.libraries()
            preargs = self.compile_args()
            if c_compiler.__name__=='nvcc_module_compile_str' and config.lib.amdlibm:
-                #this lib don't work correctly with nvcc in device code.
+                # This lib does not work correctly with nvcc in device code.
                if '<amdlibm.h>' in mod.includes:
                    mod.includes.remove('<amdlibm.h>')
                if '-DREPLACE_WITH_AMDLIBM' in preargs:
@@ -965,7 +999,7 @@ class CLinker(link.Linker):
            try:
                module = c_compiler(
                    module_name=mod.name,
-                    src_code = mod.code(),
+                    src_code=src_code,
                    location=location,
                    include_dirs=self.header_dirs(),
                    lib_dirs=self.lib_dirs(),
@@ -977,8 +1011,7 @@ class CLinker(link.Linker):
        finally:
            release_lock()

-        return module
-
+        yield module

    def build_dynamic_module(self):
        """Return a cmodule.DynamicModule instance full of the code for our env.
@@ -1041,10 +1074,10 @@ class CLinker(link.Linker):
        except KeyError:
            key = None
        if key is None:
-            #if we can't get a key, then forget the cache mechanism
+            # If we can't get a key, then forget the cache mechanism.
            module = self.compile_cmodule()
        else:
-            module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule, keep_lock=keep_lock)
+            module = get_module_cache().module_from_key(key=key, fn=self.compile_cmodule_by_step, keep_lock=keep_lock)

        vars = self.inputs + self.outputs + self.orphans
        # List of indices that should be ignored when passing the arguments

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -37,7 +37,7 @@ def get_str_list_logical_scalar(node, value_str='ii_i%i_value', data_str='ii_i%i
 class NaiveAlgo(object):
    verbose = 0 # 1, 2 or 3 for more verbose output.
    cache_version = ()
-    cache_version = ('debug', 14, verbose)
+    cache_version = (14, verbose)

    def __init__(self, scalar_op, sync=True, inplace_pattern={}):
        """
@@ -56,7 +56,7 @@ class NaiveAlgo(object):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
-        print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(self.scalar_op.__class__.__name__,nodename, id(self), nd)
+        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (self.scalar_op.__class__.__name__,nodename, nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
        #declare inputs
@@ -159,10 +159,9 @@ class NaiveAlgo(object):
                print >> sio, "//    Input  ", ipos, str(i.type)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "//    Output ", ipos, str(i.type)
-            print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
+            print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(
                    self.scalar_op.__class__.__name__,
                    nodename,
-                    id(self),
                    'tiling%i'%nd)
            if (nd):
                print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
@@ -262,10 +261,9 @@ class NaiveAlgo(object):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
-        print >> sio, "static __global__ void kernel_%s_%s_%s_%s(unsigned int numEls" %(
+        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %(
                self.scalar_op.__class__.__name__,
                nodename,
-                id(self),
                'tiling%i_less_registers'%nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
@@ -472,7 +470,6 @@ class NaiveAlgo(object):
        nd = node.outputs[0].type.ndim
        nb_inputs = len(node.inputs)
        nb_outputs = len(node.outputs)
-        id_self = id(self)
        d = dict()
        #input_params and output_params go into the function declaration/definition
        input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
@@ -512,7 +509,7 @@ class NaiveAlgo(object):
        """ %locals()
        if self.verbose:
            print >> sio, """
-                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s_%(id_self)s     w numEls" << numEls << " dims"<< d << "\\n";
+                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s     w numEls" << numEls << " dims"<< d << "\\n";
            """ %locals()
            print >> sio, 'std::cerr << ' + " << ' ' <<  ".join(['"  "']+list("dims[%i]"%di
                for di in xrange(nd)) + ["'\\n';"])
@@ -693,7 +690,7 @@ nd_collapse_[i]=0;
                print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]"%locals() for x in range(nd)])+'<<"\\n";'


-        def launch_Ccontiguous(nodename, id_self, scalar_op, sync=True):
+        def launch_Ccontiguous(nodename, scalar_op, sync=True):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(node.inputs)):
                kernel_call_args.append("i%i_data"%ipos)
@@ -736,7 +733,7 @@ nd_collapse_[i]=0;
            else:
                print >> sio, " return 0; " %locals()

-        def launch_General(nodename, id_self, scalar_op, force_nd, sync=True):
+        def launch_General(nodename, scalar_op, force_nd, sync=True):
            # kernel_call_args are used to invoke the cuda kernel
            local="local_"
            kernel_call_args = ["numEls"]
@@ -769,7 +766,7 @@ nd_collapse_[i]=0;
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

-                kernel_%(scalar_op)s_%(nodename)s_%(id_self)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
+                kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
                """ %locals()
            if sync:
                print >> sio, """
@@ -791,11 +788,11 @@ nd_collapse_[i]=0;
        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
        print >> sio, "case 0: {"
-        launch_Ccontiguous(nodename, id_self, scalar_op, self.sync)
+        launch_Ccontiguous(nodename, scalar_op, self.sync)
        print >> sio, "        } break;"
        for i in range(1, nd+1):
            print >> sio, "case "+str(i)+": {"
-            launch_General(nodename, id_self, scalar_op, i, self.sync)
+            launch_General(nodename, scalar_op, i, self.sync)
            print >> sio, "        } break;"

        print >> sio, "}"#end case

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -318,11 +318,11 @@ def test_elemwise3():
    a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a')
    b = tensor.fvector()
    print b.type
-    print tensor.constant(1).type
-    print (1 + b).type
-    print (1 + b**a).type
-    print tensor.exp((1 + b**a)).type
-    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
+    fone = tensor.constant(1, dtype='float32')
+    print (fone + b).type
+    print (fone + b**a).type
+    print tensor.exp((fone + b**a)).type
+    f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(fone +
        b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu)
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -144,7 +144,8 @@ def test_opt_gpujoin_joinvectors_elemwise_then_minusone():
 def test_print_op():
    """ Test that print ops don't block gpu optimization"""
    b = tensor.fmatrix()
-    f = theano.function([b],theano.printing.Print()(b)*2, mode=mode_with_gpu)
+    ftwo = tensor.constant(2, dtype='float32')
+    f = theano.function([b],theano.printing.Print()(b) * ftwo, mode=mode_with_gpu)
    #theano.printing.debugprint(f)
    #print f.maker.env.toposort()
 #[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -246,13 +246,13 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
    neib_shape = T.as_tensor_variable(neib_shape)
    original_shape = T.as_tensor_variable(original_shape)

-    new_neib_shape = T.stack( original_shape[-1]/neib_shape[1], neib_shape[1] )
+    new_neib_shape = T.stack(original_shape[-1] // neib_shape[1], neib_shape[1])
    output_2d = images2neibs(neibs.dimshuffle('x','x',0,1), new_neib_shape, mode=mode)
    
    if mode == 'ignore_borders':
        valid_shape = list(original_shape)
-        valid_shape[2]  = valid_shape[2] / neib_shape[0] * neib_shape[0]
-        valid_shape[3]  = valid_shape[3] / neib_shape[1] * neib_shape[1]
+        valid_shape[2]  = (valid_shape[2] // neib_shape[0]) * neib_shape[0]
+        valid_shape[3]  = (valid_shape[3] // neib_shape[1]) * neib_shape[1]
        output_4d = output_2d.reshape(valid_shape)
        #padding the borders with zeros
        for d in [2,3]:

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -49,6 +49,17 @@ def multMatVect(v, A, m1, B, m2):
    r[3:] = matVecModM(B, v[3:], m2)
    return r

+def cast_if_untyped(x, dtype):
+    """Return `x` cast as a numpy scalar of type `dtype` if `x` is untyped."""
+    if hasattr(x, 'dtype'):
+        # `x` is already typed.
+        return x
+    else:
+        # We intend to do this on regular Python int / float objects.
+        assert isinstance(x, int) or isinstance(x, float)
+        return numpy.array(x, dtype=dtype)
+
+
 #MRG31k3p
 #generator constants :
 M1 = numpy.int32(2147483647)    #2^31 - 1
@@ -263,7 +274,7 @@ class mrg_uniform(mrg_uniform_base):
        if (%(size)s->dimensions[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
-                %(ndim)s, %(size)s->dimensions[0]);
+                %(ndim)s, int(%(size)s->dimensions[0]));
            %(fail)s
        }
        if (%(size)s->descr->type_num != PyArray_INT32)
@@ -589,6 +600,35 @@ class GPU_mrg_uniform(mrg_uniform_base):
    def c_code_cache_version(self):
        return (5,)

+
+def guess_n_streams(size, warn=True):
+    """
+    Return a guess at a good number of streams.
+
+    :param size: Requested sample shape; a guess is only made when it is a
+    tuple or list of Python ints.
+    :param warn: If True, print a warning on stderr when a guess cannot be
+    made (in which case we return 30 * 256).
+    """
+    # TODO: a smart way of choosing the number of streams, see #612.
+    # NOTE: kept at module level so tests can call it with warn=False.
+    if (isinstance(size, (tuple, list)) and
+        all([isinstance(i, int) for i in size])):
+        # We can make a guess.
+        r = 1
+        for s in size:
+            r *= s
+        if r > 6:
+            r = r // 6 # chosen as fastest for rbm_benchmark
+        return r
+    else:
+        if warn:
+            # Only warn: the fallback value below is still returned.
+            print >> sys.stderr, (
+                    "MRG_RandomStreams Can't determine #streams from "
+                    "size (%s), guessing 30*256") % str(size)
+        return 30 * 256
+
+
 class MRG_RandomStreams(object):
    """Module component with similar interface to numpy.random (numpy.random.RandomState)"""

@@ -654,18 +694,7 @@ class MRG_RandomStreams(object):
        return rval

    def n_streams(self, size):
-        # TODO: a smart way of choosing the number of streams, see #612.
-        if isinstance(size, (tuple, list)) and all([isinstance(i,int) for i in size]):
-            r = 1
-            for s in size:
-                r *= s
-            if r > 6:
-                r = r/6 # chosen as fastest for rbm_benchmark
-            return r
-
-        print >> sys.stderr, ("MRG_RandomStreams Can't determine #streams from "
-                "size (%s), guessing 30*256")%str(size)
-        return 30*256
+        return guess_n_streams(size, warn=True)

    def pretty_return(self, node_rstate, new_rstate, sample):
        sample.rstate = node_rstate
@@ -674,7 +703,8 @@ class MRG_RandomStreams(object):
        node_rstate.default_update = new_rstate
        return sample

-    def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX, nstreams=None):
+    def uniform(self, size, low=0, high=1, ndim=None, dtype='floatX',
+                nstreams=None):
        """
        Sample a tensor of given size whose element from a uniform
        distribution between low and high.
@@ -683,10 +713,25 @@ class MRG_RandomStreams(object):
        ndim may be a plain integer to supplement the missing
        information.

-        :param: size: Can be a list of integer or Theano variable
+        :param low: Lower bound of the interval on which values are sampled.
+        If not already typed, it is cast into dtype.
+
+        :param high: Higher bound of the interval on which values are sampled.
+        If not already typed, it is cast into dtype.
+
+        :param size: Can be a list of integer or Theano variable
                (ex: the shape of other Theano Variable)
-                TODO: can size be None?
+
+        :param dtype: The output data type.
        """
+        if dtype == 'floatX':
+            dtype = config.floatX
+
+        # We cast `low` and `high` into `dtype` to make sure we do not upcast
+        # e.g. float32 into float64.
+        low = cast_if_untyped(low, dtype)
+        high = cast_if_untyped(high, dtype)
+
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i,int) or isinstance(i,Variable)
@@ -726,18 +771,23 @@ class MRG_RandomStreams(object):

        if u.type.broadcastable != r.type.broadcastable:
            raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
+
+        assert r.dtype == dtype
        return  r

-    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64'):
+    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64',
+                 nstreams=None):
        if n == 1:
-            if dtype=='float32' and self.use_cuda:
-                return cast(self.uniform(size=size, dtype=dtype) < p, dtype)
+            if dtype == 'float32' and self.use_cuda:
+                x = self.uniform(size=size, dtype=dtype, nstreams=nstreams)
            else:
-                return cast(self.uniform(size=size) < p, dtype)
+                x = self.uniform(size=size, nstreams=nstreams)
+            return cast(x < p, dtype)
        else:
            raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")

-    def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
+    def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64',
+                    nstreams=None):
        """
        Sample `n` (currently `n` needs to be 1) times from a multinomial
        distribution defined by probabilities pvals.
@@ -758,22 +808,36 @@ class MRG_RandomStreams(object):
                    ndim, size, pvals[:,0])
            assert ndim==1
            bcast = bcast+(pvals.type.broadcastable[-1],)
-            unis = self.uniform(size=size, ndim=1)
+            unis = self.uniform(size=size, ndim=1, nstreams=nstreams)
            op = multinomial.MultinomialFromUniform(dtype)
            return op(pvals, unis)
        else:
            raise NotImplementedError(("MRG_RandomStreams.multinomial only"
                " implemented with n == 1 and pvals.ndim = 2"))

-    def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
+    def normal(self, size=None, avg=0, std=1, ndim=None,
+               dtype='floatX', nstreams=None):
        """
-        :param: size: Can be a list of integer or Theano variable(ex: the shape of other Theano Variable)
+        :param size: Can be a list of integers or Theano variables (ex: the
+        shape of another Theano Variable)
+
+        :param dtype: The output data type.
+
+        :param nstreams: Number of streams.
        """
        # We need an even number of ]0,1[ samples. Then we split them
        # in two halves. First half becomes our U1's for Box-Muller,
        # second half our U2's. See Wikipedia page:
        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform

+        if dtype == 'floatX':
+            dtype = config.floatX
+
+        # We cast `avg` and `std` into `dtype` to make sure we do not upcast
+        # e.g. float32 into float64.
+        avg = cast_if_untyped(avg, dtype)
+        std = cast_if_untyped(std, dtype)
+
        evened = False
        constant = False
        if isinstance(size, tuple) and all([isinstance(i,int) for i in size]):
@@ -786,25 +850,26 @@ class MRG_RandomStreams(object):
        else:
            #if even, don't change, if odd, +1
            n_samples = prod(size)+(prod(size)%2)
-        flattened = self.uniform(size=(n_samples,), dtype=dtype)
+        flattened = self.uniform(size=(n_samples,), dtype=dtype,
+                                 nstreams=nstreams)

        if constant:
-            U1 = flattened[:n_samples/2]
-            U2 = flattened[n_samples/2:]
+            U1 = flattened[:n_samples // 2]
+            U2 = flattened[n_samples // 2:]
        else:
-            U1 = flattened[:prod(flattened.shape)/2]
-            U2 = flattened[prod(flattened.shape)/2:]
+            U1 = flattened[:prod(flattened.shape) // 2]
+            U2 = flattened[prod(flattened.shape) // 2:]

        #normal_samples = zeros_like(flattened)
-        sqrt_ln_U1 = sqrt(-2.0*log(U1))
+        sqrt_ln_U1 = sqrt(numpy.array(-2.0, dtype=dtype) * log(U1))
        # TypeError: 'TensorVariable' object does not support item assignment
        # so this doesn't work...
        #normal_samples[:n_samples/2] = sqrt_ln_U1 * cos(2.0*numpy.pi*U2)
        #normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)

        # so trying this instead
-        first_half = sqrt_ln_U1 * cos(2.0*cast(numpy.pi,dtype)*U2)
-        second_half = sqrt_ln_U1 * sin(2.0*cast(numpy.pi,dtype)*U2)
+        first_half = sqrt_ln_U1 * cos(numpy.array(2.0 * numpy.pi, dtype=dtype) * U2)
+        second_half = sqrt_ln_U1 * sin(numpy.array(2.0 * numpy.pi, dtype=dtype)*U2)
        normal_samples = join(0, first_half, second_half)

        final_samples = None
@@ -820,6 +885,7 @@ class MRG_RandomStreams(object):

        final_samples = avg + std * final_samples

+        assert final_samples.dtype == dtype
        return final_samples

 @local_optimizer([None])

--- a/theano/sandbox/test_multinomial.py
+++ b/theano/sandbox/test_multinomial.py
@@ -3,7 +3,7 @@ import copy
 import numpy

 import theano
-from theano import tensor, function
+from theano import config, function, tensor
 import multinomial
 from theano.compile.mode import get_default_mode, predefined_linkers
 import theano.sandbox.cuda as cuda
@@ -77,7 +77,14 @@ def test_multinomial_large():
        mval = f(pval,uval)

        assert mval.shape == pval.shape
+        if config.cast_policy == 'custom':
            assert mval.dtype == pval.dtype
+        elif config.cast_policy == 'numpy+floatX':
+            assert mval.dtype == config.floatX
+        elif config.cast_policy == 'numpy':
+            assert mval.dtype == 'float64'
+        else:
+            raise NotImplementedError(config.cast_policy)
        assert numpy.allclose(mval.sum(axis=1), 2)
        asdf = numpy.asarray([0, 0, 2, 0])+0*pval
        assert numpy.allclose(mval, asdf) #broadcast over all rows

--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
@@ -350,7 +350,9 @@ def test_uniform():
        print 'ON CPU with size=(%s):'%str(size)
        x = tensor.matrix()
        R = MRG_RandomStreams(234, use_cuda=False)
-        u = R.uniform(size=size)
+        # Note: we specify `nstreams` to avoid a warning.
+        u = R.uniform(size=size,
+                      nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, u, mode=mode)
        assert any([isinstance(node.op,theano.sandbox.rng_mrg.mrg_uniform)
                    for node in f.maker.env.toposort()])
@@ -366,7 +368,8 @@ def test_uniform():
            print ''
            print 'ON GPU with size=(%s):'%str(size)
            R = MRG_RandomStreams(234, use_cuda=True)
-            u = R.uniform(size=size, dtype='float32')
+            u = R.uniform(size=size, dtype='float32',
+                          nstreams=rng_mrg.guess_n_streams(size, warn=False))
            assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
            f = theano.function(var_input, theano.Out(
                    theano.sandbox.cuda.basic_ops.gpu_from_host(u),
@@ -421,7 +424,9 @@ def test_binomial():
            print ''
            print 'ON CPU with size=(%s) and mean(%d):'%(str(size),mean)
            R = MRG_RandomStreams(234, use_cuda=False)
-            u = R.binomial(size=size, p=mean)
+            # Note: we specify `nstreams` to avoid a warning.
+            u = R.binomial(size=size, p=mean,
+                           nstreams=rng_mrg.guess_n_streams(size, warn=False))
            f = theano.function(var_input, u, mode=mode)
            theano.printing.debugprint(f)
            out = f(*input)
@@ -433,7 +438,9 @@ def test_binomial():
                print ''
                print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
                R = MRG_RandomStreams(234, use_cuda=True)
-                u = R.binomial(size=size, p=mean, dtype='float32')
+                u = R.binomial(size=size, p=mean, dtype='float32',
+                               nstreams=rng_mrg.guess_n_streams(size,
+                                                                warn=False))
                assert u.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
                f = theano.function(var_input, theano.Out(
                        theano.sandbox.cuda.basic_ops.gpu_from_host(u),
@@ -478,7 +485,9 @@ def test_normal0():
        print 'ON CPU:'

        R = MRG_RandomStreams(234, use_cuda=False)
-        n = R.normal(size=size, avg=avg, std=std)
+        # Note: we specify `nstreams` to avoid a warning.
+        n = R.normal(size=size, avg=avg, std=std,
+                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n, mode=mode)
        theano.printing.debugprint(f)
        out  = f(*input)
@@ -491,7 +500,8 @@ def test_normal0():
            print ''
            print 'ON GPU:'
            R = MRG_RandomStreams(234, use_cuda=True)
-            n = R.normal(size=size, avg=avg, std=std, dtype='float32')
+            n = R.normal(size=size, avg=avg, std=std, dtype='float32',
+                         nstreams=rng_mrg.guess_n_streams(size, warn=False))
            assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
            f = theano.function(var_input, theano.Out(
                theano.sandbox.cuda.basic_ops.gpu_from_host(n),
@@ -557,7 +567,8 @@ def test_multinomial():
    pvals = numpy.asarray(numpy.random.uniform(size=sample_size))
    pvals = numpy.apply_along_axis(lambda row : row/numpy.sum(row), 1, pvals)
    R = MRG_RandomStreams(234, use_cuda=False)
-    m = R.multinomial(pvals=pvals, dtype=config.floatX)
+    # Note: we specify `nstreams` to avoid a warning.
+    m = R.multinomial(pvals=pvals, dtype=config.floatX, nstreams=30 * 256)
    f = theano.function([], m, mode=mode_)
    theano.printing.debugprint(f)
    out = f()

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -12,8 +12,9 @@ If you want to use a scalar variable in a Theano graph,
 you probably want to use theano.tensor.[c,z,f,d,b,w,i,l,]scalar!
 """

-import math
+import math, warnings
 from copy import copy
+from itertools import imap

 import numpy, theano

@@ -26,11 +27,37 @@ builtin_complex = complex
 builtin_int = int
 builtin_float = float

+
+class ComplexError(Exception):
+    """Raised if complex numbers are used in an unsupported operation."""
+    pass
+
+class IntegerDivisionError(Exception):
+    """Raised if someone tries to divide integers with '/' instead of '//'."""
+    pass
+
+
 def upcast(dtype, *dtypes):
-    z = numpy.zeros((), dtype = dtype)
-    for dtype in dtypes:
-        z = z + numpy.zeros((), dtype = dtype)
-    return str(z.dtype)
+    # Should we try to keep float32 instead of float64? This is used so that
+    # for instance mixing int64 with float32 yields float32 instead of float64.
+    # Note that we store this boolean as a one-element list so that it can be
+    # modified within `make_array`.
+    keep_float32 = [(config.cast_policy == 'numpy+floatX' and
+                     config.floatX == 'float32')]
+    def make_array(dt):
+        if dt == 'float64':
+            # There is an explicit float64 dtype: we cannot keep float32.
+            keep_float32[0] = False
+        return numpy.zeros((), dtype=dt)
+    z = make_array(dtype)
+    for dt in dtypes:
+        z = z + make_array(dt=dt)
+    rval = str(z.dtype)
+    if rval == 'float64' and keep_float32[0]:
+        return 'float32'
+    else:
+        return rval
+

 def as_scalar(x, name = None):
    if isinstance(x, gof.Apply):
@@ -47,6 +74,7 @@ def as_scalar(x, name = None):
    except TypeError:
        raise TypeError("Cannot convert %s to Scalar" % x, type(x))

+
 def constant(x):
    # pass through numpy scalars, since they are already typed on purpose typically.
    if hasattr(x,'dtype'):
@@ -383,6 +411,7 @@ uint_types = uint8, uint16, uint32, uint64
 float_types = float32, float64
 complex_types = complex64, complex128

+discrete_types = int_types + uint_types
 continuous_types = float_types + complex_types
 
 class _scalar_py_operators:
@@ -416,7 +445,8 @@ class _scalar_py_operators:
    def __sub__(self,other): return sub(self,other)
    def __mul__(self,other): return mul(self,other)
    def __div__(self,other): return div_proxy(self,other)
-    def __mod__(self,other): return mod(self,other)
+    def __floordiv__(self, other): return int_div(self, other)
+    def __mod__(self, other): return mod_check(self, other)
    def __pow__(self,other): return pow(self,other)

    #ARITHMETIC - RIGHT-OPERAND
@@ -994,32 +1024,74 @@ class Sub(BinaryScalarOp):
        return first_part, second_part
 sub = Sub(upcast_out, name = 'sub')

-def div_proxy(x, y):
-    """Proxy for either true_div or int_div, depending on types of x, y.
+
+def int_or_true_div(x_discrete, y_discrete):
    """
-    if as_scalar(x).type.dtype.startswith('int') and as_scalar(y).type.dtype.startswith('int'):
-        return int_div(x, y)
-    else:
-        return true_div(x, y)
+    Return 'int' or 'true' depending on the type of division used for x / y.
+
+    :param x_discrete: True if `x` is discrete ([unsigned] integer).
+
+    :param y_discrete: True if `y` is discrete ([unsigned] integer).
+
+    :returns: 'int' if `x / y` should be an integer division, or 'true' if it
+    should be a true division.
+
+    Raises an IntegerDivisionError if both `x_discrete` and `y_discrete` are
+    True and `config.int_division` is set to 'raise'.
+
+    This function is used by both scalar/basic.py and tensor/basic.py.
+    """
+    if (x_discrete and y_discrete):
+        if config.int_division == 'raise':
+            raise IntegerDivisionError(
+                "With `config.int_division` set to 'raise', dividing two "
+                "integer types with '/' is forbidden to avoid confusion "
+                "between integer and floating point divisions. Please "
+                "use // for integer division, or if you want a float result "
+                "either cast one of the arguments to a float or directly call "
+                "`x.__truediv__(y)`.")
+        elif config.int_division == 'int':
+            warnings.warn(
+                    "Division of two integer types with x / y is deprecated, "
+                    "please use x // y for an integer division "
+                    "(set `config.int_division = raise` to track the origin "
+                    "of this warning)",
+                    DeprecationWarning)
+            return 'int'
+        elif config.int_division == 'floatX':
+            return 'true'
+        else:
+            raise NotImplementedError(config.int_division)
+    else:
+        return 'true'
+
+
+def div_proxy(x, y):
+    """Proxy for either true_div or int_div, depending on types of x, y."""
+    f = eval('%s_div' % int_or_true_div(as_scalar(x).type in discrete_types,
+                                        as_scalar(y).type in discrete_types))
+    return f(x, y)
+

 class TrueDiv(BinaryScalarOp):
    def output_types(self, types):
-        if all(t not in continuous_types for t in types):
-            return [float64]
+        if all(t in discrete_types for t in types):
+            return [Scalar(config.floatX)]
        else:
            return super(TrueDiv, self).output_types(types)
    def impl(self, x, y):
        x = numpy.asarray(x)
        y = numpy.asarray(y)
-        if str(x.dtype).startswith('int') and str(y.dtype).startswith('int'):
-            return float(x) / y
+        if all(a.dtype in discrete_types for a in (x, y)):
+            return numpy.array(float(x) / y, dtype=config.floatX)
        else:
            return x / y
    def c_code(self, node, name, (x, y), (z, ), sub):
        #we generate good c code only when both are complex!
        if sum([node.inputs[0].type in complex_types, node.inputs[1].type in complex_types])==1:
            raise NotImplementedError('type not supported', type)
-        if node.inputs[0].type in int_types and node.inputs[1].type in int_types:
+        if (node.inputs[0].type in discrete_types and
+            node.inputs[1].type in discrete_types):
            return "%(z)s = ((double)%(x)s) / %(y)s;" % locals()
        return "%(z)s = %(x)s / %(y)s;" % locals()
    def grad(self, (x, y), (gz, )):
@@ -1028,11 +1100,15 @@ class TrueDiv(BinaryScalarOp):
        if x.type in float_types:
            first_part = cast(gz / y, x.type.dtype)
        else:
+            assert x.type in discrete_types
            first_part = None

+        if y.type in complex_types:
+            raise NotImplementedError()
        if y.type in float_types:
            second_part = cast(-(gz * x) / (y * y), y.type.dtype)
        else:
+            assert y.type in discrete_types
            second_part = None
        return first_part, second_part
 true_div = TrueDiv(upcast_out, name = 'true_div')
@@ -1048,9 +1124,29 @@ int_div = IntDiv(upcast_out, name = 'int_div')

 floor_div = int_div

+
+def raise_complex_error():
+    raise ComplexError(
+                "Theano does not support the mod operator (%) on "
+                "complex numbers, since numpy deprecated it.")
+
+
+def mod_check(x, y):
+    if (as_scalar(x).type in complex_types or
+        as_scalar(y).type in complex_types):
+        # Currently forbidden.
+        raise_complex_error()
+    else:
+        return mod(x, y)
+
+
 class Mod(BinaryScalarOp):
+
    def impl(self, x, y):
+        if isinstance(x, numpy.complex) or isinstance(y, numpy.complex):
+            raise_complex_error()
        return x % y
+
    def c_code_cache_version(self):
        return (5,)

@@ -1060,20 +1156,34 @@ class Mod(BinaryScalarOp):

    def c_code(self, node, name, (x, y), (z, ), sub):
        """
-        We want the result to have the same sign as python, not the other implementaiton of mod.
+        We want the result to have the same sign as python, not the other implementation of mod.
        """
        #raise NotImplementedError("Unlike Python, C's modulo returns negative modulo on negative dividend (to implement)")
        t = node.inputs[0].type.upcast(*[ i.type for i in node.inputs[1:]])
-        if t in int_types or t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64']:
+        if (str(t) in imap(str, discrete_types) or
+            t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64'] or
+            t in discrete_types):
+            # The above or's should not be needed anymore. However, for now we
+            # keep them out of safety, and verify they are useless with an
+            # assert.
+            assert str(t) in imap(str, discrete_types)
            x_mod_y = "THEANO_MACRO_MOD(%(x)s, %(y)s)"%locals()
            x_mod_ymm = "THEANO_MACRO_MOD(-%(x)s, -%(y)s)"%locals()
            x_mod_ypm = "THEANO_MACRO_MOD(%(x)s, -%(y)s)"%locals()
            x_mod_ymp = "THEANO_MACRO_MOD(-%(x)s, %(y)s)"%locals()
-        elif t in float_types or t in ['float32','float64']:
+        elif (str(t) in imap(str, float_types) or
+              t in ['float32','float64'] or
+              t in float_types):
+            # The above or's should not be needed anymore. However, for now we
+            # keep them out of safety, and verify they are useless with an
+            # assert.
+            assert str(t) in imap(str, float_types)
            x_mod_y = "fmod(%(x)s,%(y)s)"%locals()
            x_mod_ymm = "fmod(-%(x)s,-%(y)s)"%locals()
            x_mod_ypm = "fmod(%(x)s,-%(y)s)"%locals()
            x_mod_ymp = "fmod(-%(x)s,%(y)s)"%locals()
+        elif str(t) in imap(str, complex_types):
+            raise_complex_error()
        else:
            raise NotImplementedError('type not supported', type)


--- a/theano/scalar/tests/test_basic.py
+++ b/theano/scalar/tests/test_basic.py
@@ -37,6 +37,7 @@ class test_ScalarOps(unittest.TestCase):
    #As we use theano.scalar normally, but we use theano.tensor.scalar
    #that is not important. Also this make the theano fct fail at call time
    #so this is not a silent bug.
+    # --> This is why it is purposely named 'tes_mod' instead of 'test_mod'.
    def tes_mod(self):
        """
        We add this test as not all language and C implementation give the same
@@ -174,6 +175,19 @@ class test_logical(unittest.TestCase):
            self.assertTrue(fn(a,b) == ~a, (a,))


+class test_complex_mod(unittest.TestCase):
+    """Make sure % fails on complex numbers."""
+
+    def test_fail(self):
+        x = complex64()
+        y = int32()
+        try:
+            x % y
+            assert False
+        except ComplexError:
+            pass
+
+
 class test_div(unittest.TestCase):
    def test_0(self):
        a = int8()
@@ -182,9 +196,9 @@ class test_div(unittest.TestCase):
        d = float64()
        f = float32()

-        print (a/b).owner.op
-        assert isinstance((a/b).owner.op, IntDiv)
-        assert isinstance((b/a).owner.op, IntDiv)
+        print (a//b).owner.op
+        assert isinstance((a//b).owner.op, IntDiv)
+        assert isinstance((b//a).owner.op, IntDiv)
        assert isinstance((b/d).owner.op, TrueDiv)
        assert isinstance((b/f).owner.op, TrueDiv)
        assert isinstance((f/a).owner.op, TrueDiv)

--- a/theano/sparse/tests/test_basic.py
+++ b/theano/sparse/tests/test_basic.py
@@ -10,7 +10,7 @@ except ImportError:
    pass#the variable enable_sparse will be used to disable the test file.

 import theano
-from theano import compile
+from theano import compile, config
 from theano.sparse import enable_sparse
 if enable_sparse == False:
    raise SkipTest('Optional package sparse disabled')
@@ -239,8 +239,18 @@ class T_AddMul(unittest.TestCase):
            self.assertRaises(NotImplementedError, add, a_sv, c_dv)
            self.assertRaises(NotImplementedError, add, c_sv, a_dv)

-            # mul upcasts the dense input if needed
+            # mul may upcast the dense input if needed
+            if (config.cast_policy in ('custom', 'numpy') or
+                (config.cast_policy == 'numpy+floatX' and
+                 config.floatX == 'float64')):
+                # The result should be a float64 (not implemented).
                self.assertRaises(NotImplementedError, mul, a_sv, b_dv)
+            elif (config.cast_policy == 'numpy+floatX' and
+                  config.floatX == 'float32'):
+                # The result should be a float32.
+                assert mul(a_sv, b_dv).dtype == 'float32'
+            else:
+                raise NotImplementedError()
            self.assertRaises(NotImplementedError, mul, b_sv, a_dv)
            assert mul(b_sv, c_dv).dtype == 'int32'
            self.assertRaises(NotImplementedError, mul, c_sv, b_dv)

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -453,7 +453,7 @@ class Elemwise(Op):
        """

        inputs = map(as_tensor_variable, inputs)
-        shadow = self.scalar_op.make_node(*[Scalar(dtype = t.type.dtype)() for t in inputs])
+        shadow = self.scalar_op.make_node(*[Scalar(dtype=i.type.dtype)() for i in inputs])

        target_length = max([input.type.ndim for input in inputs])

@@ -1200,7 +1200,8 @@ class Prod(CAReduce):
        self.no_zeros_in_input = no_zeros_in_input

    def __setstate__(self, dct):
-        self.__dict__.update(dct)
+        super(Prod, self).__setstate__(dct)
+        # Add default value to be able to reload old pickled objects.
        if 'no_zeros_in_input' not in dct:
            self.no_zeros_in_input = False


--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -135,9 +135,9 @@ class Conv3D(theano.Op):
        vidDur = V_shape[3]
        filterDur = W_shape[3]

-        output_height = T.floor( (vidHeight - filterHeight) / dr )+1
-        output_width = T.floor( (vidWidth - filterWidth) / dc )+1
-        output_dur = T.floor( (vidDur - filterDur) / dt ) +1
+        output_height = T.floor((vidHeight - filterHeight) // dr) + 1
+        output_width = T.floor((vidWidth - filterWidth) // dc) + 1
+        output_dur = T.floor((vidDur - filterDur) // dt) + 1

        rval = (batch_size,  output_height, output_width, output_dur, output_channels )


--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -575,14 +575,15 @@ class ConvOp(Op):
            try:
                fmshp = ConvOp.getOutputShape(imshp[1:], kshp, (self.dx,self.dy), self.out_mode)
            except TypeError:
-                raise NotImplementedError()
+                raise theano.tensor.ShapeError()
            outshp = (batch_size,fmo) + tuple(fmshp)
            return [outshp]
        else:
            # Haven't implemented this case. imshp and kshp may be symbollic
            # and ConvOp.getOutputShape doesn't handle this. In this case
            # we simply let the default function do its work.
-            raise NotImplementedError()
+            raise theano.tensor.ShapeError()
+            

    def perform(self,node, inp, out):
        """

--- a/theano/tensor/nnet/tests/test_nnet.py
+++ b/theano/tensor/nnet/tests/test_nnet.py
@@ -879,6 +879,7 @@ def test_argmax_pushdown():
            [x],
            [out])

+    config.warn.argmax_pushdown_bug = False
    theano.compile.mode.optdb.query(
            theano.compile.mode.OPT_FAST_RUN).optimize(env)

@@ -922,6 +923,7 @@ def test_argmax_pushdown_bias():
            [x,b],
            [out])

+    config.warn.argmax_pushdown_bug = False
    theano.compile.mode.optdb.query(
            theano.compile.mode.OPT_FAST_RUN).optimize(env)


--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -27,11 +27,12 @@ from theano import compile  #to register the optimizer built by this file
 from theano.gof.python25 import any, all
 from theano.gof.opt import Optimizer, pre_constant_merge, pre_greedy_local_optimizer
 from theano.gof import toolbox, DestroyHandler
-from basic import get_constant_value
+from basic import get_constant_value, ShapeError


 # Utilities

+
 def out2in(*local_opts):
    """WRITEME """
    return opt.TopoOptimizer(opt.LocalOptGroup(*local_opts),
@@ -528,7 +529,7 @@ class ShapeFeature(object):
    the cost of many Ops accurately, and generate c-code that is specific [e.g. unrolled] to
    particular sizes.

-    If you can determine the shape only in some case, return NotImplementedError when you can't
+    In cases where you cannot figure out the shape, raise a ShapeError.

    .. note::

@@ -719,8 +720,15 @@ class ShapeFeature(object):

        try:
            o_shapes = shape_infer(node, [self.shape_of[r] for r in node.inputs])
-        except NotImplementedError:
+        except ShapeError:
            o_shapes = self.default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
+        except NotImplementedError, e:
+            raise NotImplementedError(
+                    'Code called by infer_shape failed raising a '
+                    'NotImplementedError. Raising NotImplementedError to '
+                    'indicate that a shape cannot be computed is no longer '
+                    'supported, and one should now use tensor.ShapeError '
+                    'instead. The original exception message is: %s' % e)
        except Exception, e:
            _logger.error('Failed to infer_shape from Op %s.\nInput shapes:%s\nException encountered during infer_shape: %s\nException message: %s\nTraceback: %s'% (node.op,
                [self.shape_of[r] for r in node.inputs],
@@ -3427,11 +3435,12 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
    """
    def local_fuse(node):
        """
-        As part of specialisation, we fuse two consecutive elemwise op of the same shape.
-
-        For mixed dtype, we let the Compise op do the cast. It let the C compile do the cast.
-        The number of dimension is validated at call time by theano itself.
+        As part of specialization, we fuse two consecutive elemwise Ops of the
+        same shape.

+        For mixed dtype, we let the Composite op do the cast. It lets the C
+        compiler do the cast.
+        The number of dimensions is validated at call time by theano itself.
        """
        # META TODO:  PUT THESE THINGS IN TRAC, NOT TODO NOTES!!
        # TODO: use broadcast flag?
@@ -3547,7 +3556,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
        if new_nb_input != len(inputs) or len(s_inputs) != len(inputs):
            raise Exception("""Something has gone wrong with the elemwise
 fusion optimization. We skip this optimization. You can ignore this message,
-your code will run correctly, but maybe slower.""")
+your code will run correctly, but may be slower.""")

        otype = node.outputs[0].type
        s_new_out=node.op.scalar_op(*s_g)

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
-
-import time
-import unittest
+import cPickle, time, unittest

 from theano.gof import Variable, Op
 from theano import gof
@@ -399,6 +397,14 @@ class test_Prod(unittest.TestCase):

        fn_debug(a)

+    def test_pickle_bug(self):
+        # Regression test for bug fixed in 24d4fd291054.
+        o = Prod()
+        s = cPickle.dumps(o)
+        o = cPickle.loads(s)
+        cPickle.dumps(o)
+
+
 if __name__ == '__main__':
    #unittest.main()
    suite = unittest.TestSuite([test_Prod('test_mul_without_zeros_zeros')])

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
--- a/theano/tests/test_tutorial.py
+++ b/theano/tests/test_tutorial.py
 """ test code snippet in the Theano tutorials.
 """

-import unittest
+import os, unittest
 import theano
 import theano.tensor as T
 from theano import function
@@ -722,6 +722,15 @@ class T_loading_and_saving(unittest.TestCase):

        mode_instance = theano.compile.mode.get_mode(None)
        if not isinstance(mode_instance, theano.compile.debugmode.DebugMode):
+            if os.path.exists('obj.save') or os.path.exists('objects.save'):
+                # We do not want to delete these files silently, in case for
+                # some reason they would be something else than test-generated
+                # files.
+                # Ideally we would save those files in a temporary directory...
+                raise AssertionError(
+                        'Please get rid of files obj.save and '
+                        'objects.save in directory %s' % os.getcwd())
+
            f = file('obj.save', 'wb')
            cPickle.dump(my_obj, f, protocol=cPickle.HIGHEST_PROTOCOL)
            f.close()
@@ -746,6 +755,9 @@ class T_loading_and_saving(unittest.TestCase):
                loaded_objects.append(cPickle.load(f))
            f.close()

+            # Cleanup created files.
+            os.remove('obj.save')
+            os.remove('objects.save')

 class T_modes(unittest.TestCase):
    ## All tests here belog to