Merged

e9f194f5 · Olivier Delalleau · d4c945c9 · 95d71c9e · e9f194f5 · e9f194f5
--- a/doc/advanced_tutorial/cop.txt
+++ b/doc/advanced_tutorial/cop.txt
@@ -38,11 +38,13 @@ There are less methods to define for an Op than for a Type:
  *Default:* The default behavior is to do nothing.
 .. function:: c_compile_args()
+              c_no_compile_args()
              c_headers()
              c_libraries()
              c_support_code()
-  Allows you to specify headers, libraries, special g++ arguments or
+  Allows you to specify headers, libraries,
+  special g++ arguments to add/exclude or
  helper functions/structs that the type needs. See :ref:`op`.

--- a/doc/advanced_tutorial/ctype.txt
+++ b/doc/advanced_tutorial/ctype.txt
@@ -75,11 +75,13 @@ the most important ones:
  decrease the appropriate reference counts.
 .. function:: c_compile_args()
+              c_no_compile_args()
              c_headers()
              c_libraries()
              c_support_code()
-  Allows you to specify headers, libraries, special g++ arguments or
+  Allows you to specify headers, libraries, 
+  special g++ arguments to add/exclude or
  helper functions/structs that the type needs. See :ref:`type`.

--- a/doc/glossary.txt
+++ b/doc/glossary.txt
@@ -260,7 +260,7 @@ Glossary of terminology
            * making :term:`Apply` instances, which mean "apply this TOI to some particular inputs" (via the ``make_node``),
            * performing the calculation of outputs from given inputs (via the ``perform``),
-            * producing c code to perform calculation of outputs from inputs (via ``c_code, c_code_cleanup, c_support_code, c_headers, c_libraries, c_compile_args``)
+            * producing c code to perform calculation of outputs from inputs (via ``c_code, c_code_cleanup, c_support_code, c_headers, c_libraries, c_compile_args, c_no_compile_args``)
            * [optionally] building gradient-calculating graphs (via ``grad``).
        See :ref:`intro_to_ops`.

--- a/doc/sandbox/module.txt
+++ b/doc/sandbox/module.txt
@@ -135,12 +135,10 @@ Now, using ``Module``:
    m = M.Module()
    n = T.scalar('n')
    m.c = T.scalar() # state variables
-    m.inc = M.Method(n, [], c = m.c + n) # m.c <= m.c + n
+    m.inc = M.Method(n, [], updates = {m.c: m.c + n}) # m.c <= m.c + n
-    m.dec = M.Method(n, [], c = m.c - n) # k.c <= k.c - n
+    m.dec = M.Method(n, [], updates = {m.c: m.c - n}) # m.c <= m.c - n
-    m.dec = M.Method(n, [], updates = {m.c: m.c - n})#alternative syntax
    #m.dec = M.Method(n, [], updates = {c: m.c - n})#global c don't exist
-    #m.dec = M.Method(n, [], m.c = m.c - n) #python don't suppor this syntax
+    #m.plus10 does not update the state
-    #m.plus10 don't update the state
    m.plus10 = M.Method([], m.c + 10) # m.c is always accessible since it is a member of this mlass
    inst = m.make(c = 0) # here, we make an "instance" of the module with c initialized to 0
@@ -192,8 +190,8 @@ Using Module:
        m = M.Module()
        n = T.scalar('n')
        m.c = T.scalar() # state variables
-        m.inc = M.Method(n, [], c = m.c + n) # m.c <= m.c + n
+        m.inc = M.Method(n, [], updates = {m.c: m.c + n}) # m.c <= m.c + n
-        m.dec = M.Method(n, [], c = m.c - n) # m.c <= m.c - n
+        m.dec = M.Method(n, [], updates = {m.c: m.c - n}) # m.c <= m.c - n
        return m
    m = M.Module()

--- a/doc/topics/function.txt
+++ b/doc/topics/function.txt
@@ -36,7 +36,7 @@ The ``inputs`` argument to ``theano.function`` is a list, containing the ``Varia
 .. class:: In
-   .. method:: __init__(variable, name=None, value=None, update=None, mutable=False)
+   .. method:: __init__(variable, name=None, value=None, update=None, mutable=False, strict=False, autoname=True, implicit=None)
      ``variable``: a Variable instance. This will be assigned a value
      before running the function, not computed from its owner.
@@ -46,8 +46,12 @@ The ``inputs`` argument to ``theano.function`` is a list, containing the ``Varia
      can be set by ``kwarg``, and its value can be accessed by
      ``self.<name>``. The default value is ``None``.
-      ``value``: literal or Container. This is the default value of
+      ``value``: literal or ``Container``. The initial/default value for this
-      the Input. The default value of this parameter is ``None``
+        input. If update is ``None``, this input acts just like
+        an argument with a default value in Python. If update is not ``None``,
+        changes to this
+        value will "stick around", whether due to an update or a user's
+        explicit action.
      ``update``: Variable instance. This expression Variable will
      replace ``value`` after each function call. The default value is
@@ -57,11 +61,28 @@ The ``inputs`` argument to ``theano.function`` is a list, containing the ``Varia
      compiled function to modify the Python object being used as the
      default value. The default value is ``False``.
+      ``strict``: Bool (default: ``False``). ``True`` means that the value
+      you pass for this input must have exactly the right type. Otherwise, it
+      may be cast automatically to the proper type.
      ``autoname``: Bool. If set to ``True``, if ``name`` is ``None`` and
      the Variable has a name, it will be taken as the input's
      name. If autoname is set to ``False``, the name is the exact
      value passed as the name parameter (possibly ``None``).
+      ``implicit``: Bool or ``None`` (default: ``None``)
+            ``True``: This input is implicit in the sense that the user is not allowed
+            to provide a value for it. Requires ``value`` to be set.
+            ``False``: The user can provide a value for this input. Be careful
+            when ``value`` is a container, because providing an input value will
+            overwrite the content of this container.
+            ``None``: Automatically choose between ``True`` or ``False`` depending on the
+            situation. It will be set to ``False`` in all cases except if
+            ``value`` is a container (so that there is less risk of accidentally
+            overwriting its content without being aware of it).
 Value: initial and default values
 ---------------------------------
@@ -136,6 +157,31 @@ Theano's Module system uses this mechanism to share storage between Methods.
 The container being shared doesn't have to correspond to the same Variable in both functions, 
 but that's usually how this mechanism is used.
+Note that when an input's ``value`` parameter is a shared container, this
+input is considered as implicit by default. This means it cannot be set by the
+user.
+If ``implicit`` is manually set to ``False``, then it can be set by the user,
+but then it will overwrite the container's content, so one should be careful
+when allowing this.
+This is illustrated in the following example.
+>>> dec(1, 0)   # Try to manually set an implicit input
+<type 'exceptions.TypeError'>: Tried to provide value for implicit input: s
+>>> dec = function([x, In(s, update=(s-x), value=inc.container[s], implicit=False)], [])
+>>> inc[s] = 2
+>>> print dec[s]    # Containers are shared
+2.0
+>>> dec(1)
+[]
+>>> print inc[s]    # Calling dec decreased the value in inc's container
+1.0
+>>> dec(1, 0)       # Update inc[s] with 0 - 1 = -1
+[]
+>>> print inc[s]
+-1.0
+>>> print dec[s]    # Still shared
+-1.0
 Input Argument Restrictions
 ---------------------------
@@ -168,8 +214,8 @@ instance explicitly with the ``autoname`` flag set to False.
 Access to function values and containers
 ----------------------------------------
-For each input, ``theano.function`` will create a ``Container`` if the
+For each input, ``theano.function`` will create a ``Container`` if
-value was not already a ``Container``. At the time of a function call,
+``value`` was not already a ``Container`` (or if ``implicit`` was ``False``). At the time of a function call,
 each of these containers must be filled with a value. Each input (but
 especially ones with a default value or an update expression) may have a
 value between calls. The function interface defines a way to get at

--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -215,7 +215,7 @@ class Function(object):
        self.return_none = return_none
        self.maker = maker
-        # we'll be popping stuff off this `containers` object.  It's a copy
+        # We will be popping stuff off this `containers` object.  It is a copy.
        containers = list(self.input_storage) 
        finder = {}
        inv_finder = {}
@@ -229,15 +229,26 @@ class Function(object):
        #setters = []
        # Initialize the storage
+        # This loop works by modifying the elements (as variable c) of self.input_storage in place.
        for i, ((input, indices, sinputs), (required, refeed, value)) in enumerate(zip(self.indices, defaults)):
            if indices is None: # this is true iff input is not a SymbolicInputKit
                c = containers[0]  #containers is being used as a stack. Here we pop off the next one.
                if input.strict:
                    c.strict = True
                if value is not None:
-                    # always initialize the storage
+                    # Always initialize the storage.
-                    c.data = value
+                    if isinstance(value, gof.Container):
+                        # There is no point in obtaining the current value
+                        # stored in the container, since the container is
+                        # shared.
+                        # For safety, we make sure 'refeed' is False, since
+                        # there is no need to refeed the default value.
+                        assert not refeed
+                    else:
+                        c.value = value
                c.required = required
+                c.implicit = input.implicit
                c.provided = 0 # this is a count of how many times the input has been provided (reinitialized to 0 on __call__)
                finder[i] = c
                finder[input.variable] = c
@@ -247,6 +258,9 @@ class Function(object):
                #setters.append(partial(assign, c))
                containers[:1] = []
            else:
+                # TODO The following code may need to do something to handle
+                # implicit inputs.
                # The input is a SymbolicInputKit, so we take as many containers as the Kit provides inputs
                cs = containers[:len(indices)]
                # distribute does the initialization of the containers
@@ -347,20 +361,27 @@ class Function(object):
        # Set keyword arguments
        for k, arg in kwargs.iteritems():
            self[k] = arg
-        # Check if inputs are missing or if inputs were set more than once
+        # Check if inputs are missing, or if inputs were set more than once, or
+        # if we tried to provide inputs that are supposed to be implicit.
        for c in self.input_storage:
            if c.required and not c.provided:
                raise TypeError("Missing required input: %s" % getattr(self.inv_finder[c], 'variable', self.inv_finder[c]))
            if c.provided > 1:
                raise TypeError("Multiple values for input: %s" % getattr(self.inv_finder[c], 'variable', self.inv_finder[c]))
+            if c.implicit and c.provided > 0:
+                raise TypeError('Tried to provide value for implicit input: %s'
+                        % getattr(self.inv_finder[c], 'variable',
+                            self.inv_finder[c]))
        # Do the actual work
        self.fn()
        # Retrieve the values that were computed
        outputs = [x.data for x in self.output_storage]
-        #remove internal references to required inputs
+        # Remove internal references to required inputs.
-        #these can't be re-used anyway
+        # These cannot be re-used anyway.
        for x in self.input_storage:
            if c.required:
                c.storage[0] = None
@@ -377,12 +398,16 @@ class Function(object):
        # Update the inputs that have an update function
        for input, storage in reversed(zip(self.maker.expanded_inputs, self.input_storage)):
-            if input.update:
+            if input.update is not None:
                storage.data = outputs.pop()
        # Put default values back in the storage
        for i, (required, refeed, value) in enumerate(self.defaults):
            if refeed:
+                if isinstance(value, gof.Container):
+                    value = value.storage[0]
                self[i] = value
        if self.return_none:
            return None
        elif self.unpack_single and len(outputs) == 1:
@@ -404,26 +429,26 @@ class Function(object):
 def _pickle_Function(f):
    #copy of the input storage list
    ins = list(f.input_storage)
-    defaults = []
+    input_storage = []
    for (input, indices, inputs), (required, refeed, default) in zip(f.indices, f.defaults):
        if isinstance(input, SymbolicInputKit):
            li = len(indices)
            if not default:
-                defaults.append(ins[:li])
+                input_storage.append(ins[:li])
            else:
-                defaults.append(default)
+                input_storage.append(default)
            ins[:li] = []
        else:
-            defaults.append(ins[0])
+            input_storage.append(ins[0])
            del ins[0]
    inputs_data = [x.data for x in f.input_storage]
    # HACK to detect aliased storage.
-    # aliased relationships will not be preserved across the pickle operation
+    # This is here because aliased relationships are not [currently] preserved across the pickle operation
    if not (f.pickle_aliased_memory_strategy == 'ignore'):
-        all_data = defaults + inputs_data
+        all_data = input_storage + inputs_data # addition here means list concatenation
        for i, d_i in enumerate(all_data):
            for j, d_j in enumerate(all_data):
                if (i < j) and isinstance(d_i, numpy.ndarray) and isinstance(d_j, numpy.ndarray):
@@ -436,14 +461,14 @@ def _pickle_Function(f):
                        else:
                            raise AliasedMemoryError(d_i, d_j)
-    rval = (_constructor_Function, (f.maker, defaults, inputs_data))
+    rval = (_constructor_Function, (f.maker, input_storage, inputs_data))
    return rval
-def _constructor_Function(maker, defaults, data):
+def _constructor_Function(maker, input_storage, inputs_data):
-    f = maker.create(defaults, trustme = True)
+    f = maker.create(input_storage, trustme = True)
-    assert len(f.input_storage) == len(data)
+    assert len(f.input_storage) == len(inputs_data)
-    for container, x in zip(f.input_storage, data):
+    for container, x in zip(f.input_storage, inputs_data):
-        container.data = x
+        assert (container.data is x) or (container.data == x)
    return f
 copy_reg.pickle(Function, _pickle_Function)
@@ -626,97 +651,53 @@ class FunctionMaker(object):
        self.accept_inplace = accept_inplace
        self.function_builder = function_builder
-    def create(self, defaults = None, trustme = False):
+        self.required = [(i.value == None) for i in self.inputs]
+        self.refeed = [
+                (i.value != None and not isinstance(i.value, gof.Container) and i.update == None)
+                    for i in self.inputs] 
+    def create(self, input_storage=None, trustme=False):
        """
        Create a function.
-        defaults -> a list matching the inputs list and providing default values
+        input_storage -> a list matching the inputs list and providing default values
                    if the default for an input is None, then that input is a
                    required input. For an input with an update, the default
                    acts as initialization.
        trustme -> disables some exceptions, used internally
        """
-        if defaults is None:
+        if input_storage is None:
-            defaults = [None]*len(self.inputs)
+            input_storage = [None]*len(self.inputs)
-        input_storage = [] # list of independent one-element lists, will be passed to the linker
+        input_storage_lists = [] # list of independent one-element lists, will be passed to the linker
-        _defaults = []
+        defaults = []
-        # The following loop is to fill in the input_storage and _defaults lists.
+        # The following loop is to fill in the input_storage_lists and defaults lists.
-        for (input, indices, subinputs), default in zip(self.indices, defaults):
+        assert len(self.indices) == len(input_storage)
-            __default = default
+        for i, ((input, indices, subinputs), input_storage_i) in enumerate(zip(self.indices, input_storage)):
+            # Replace any default value given as a variable by its container.
-            if isinstance(default, gof.Container):
+            # Note that this makes sense only in the context of shared variables,
-                # If the default is a gof.Container, this means we want to share
+            # but for now we avoid dealing directly with them to avoid dependency
-                # the same storage. This is done by appending default.storage
+            # on the shared variables work-in-progress repository.
-                # to input_storage
+            if isinstance(input_storage_i, gof.Variable):
+                input_storage_i = input_storage_i.container
+            if isinstance(input_storage_i, gof.Container):
+                # If the default is a gof.Container, this means we want to
+                # share the same storage. This is done by appending
+                # input_storage_i.storage to input_storage_lists.
                if indices is not None:
                    raise TypeError("Cannot take a Container instance as default for a SymbolicInputKit.")
-                input_storage.append(default.storage)
+                input_storage_lists.append(input_storage_i.storage)
-                default = None
+                defaults.append((self.required[i],
-                required = False
+                    self.refeed[i],
-            elif isinstance(input, SymbolicInputKit):
+                    input_storage_i.storage[0]))
-                # If the input is a SymbolicInputKit, it represents more than
-                # one storage unit. The indices and subinputs lists represent which
-                # of the kit's inputs are active in this graph, so we make as many
-                # storage units as needed
-                if isinstance(default, (list, tuple)) \
-                        and all(isinstance(x, gof.Container) for x in default):
-                    if len(default) == len(indices):
-                        input_storage += [x.storage for x in default]
-                    elif len(default) > len(indices):
-                        input_storage += [default[i].storage for i in indices]
-                    else:
-                        raise ValueError('Not enough storage for SymbolicInputKit', input, indices, default)
-                    default = NODEFAULT
-                else:
-                    input_storage += [[None] for i in indices]
            else:
                # Normal case: one new, independent storage unit
-                input_storage.append([None])
+                input_storage_lists.append([input_storage_i])
+                defaults.append((self.required[i], self.refeed[i], input_storage_i))
-            # Filling _defaults. Each entry is a tuple of three elements:
-            # (required, refeed, value)
-            # - required means that the user must provide a value when calling the function
-            # - refeed means that we want to put the default back in the storage after each function call
-            # - value is the value that will be put in the storage initially
-            # Even though a SymbolicInputKit represents more than one input,
-            # we still only have one entry for the defaults list.
-            if isinstance(input, SymbolicInputKit):
-                if default is NODEFAULT:
-                    _defaults.append((False, False, None))
-                elif default is None:
-                    _defaults.append((True, True, None))
-                else:
-                    _defaults.append((False, False, default))
-            elif input.update is not None:
-                # If the input has an update, then (logically) it is not required since
-                # it is just a parameter and of course we don't want to refeed the default
-                # back into the storage as it would defeat the point of updating it. We
-                # always do this policy.
-                if default is None:
-                    if trustme or isinstance(__default, gof.Container):
-                        _defaults.append((False, False, None))
-                    else:
-                        # This might catch some bugs early
-                        raise ValueError("A default (initial) value is required for an input which can update itself.", input)
-                else:
-                    _defaults.append((False, False, default))
-            else:
-                if default is None:
-                    if trustme or isinstance(__default, gof.Container):
-                        _defaults.append((False, False, None))
-                    else:
-                        # No default, so this is a required input. Nothing to feed back, initial value is None.
-                        _defaults.append((True, False, None))
-                else:
-                    # Default value. It is not required, but we want to put it back into the storage
-                    # everytime so it behaves like most programming languages' default values
-                    _defaults.append((False, True, default))
-        defaults = _defaults
        # Get a function instance
-        _fn, _i, _o = self.linker.make_thunk(input_storage = input_storage)
+        _fn, _i, _o = self.linker.make_thunk(input_storage = input_storage_lists)
        fn = self.function_builder(_fn, _i, _o, self.indices, self.outputs, defaults, self.unpack_single, self.return_none, self)
        return fn
@@ -805,6 +786,7 @@ def function(inputs, outputs, mode=None, accept_inplace = False):
    """
    mode = mode if mode is not None else mode_module.default_mode
    inputs = map(convert_function_input, inputs)
    if outputs is not None:
        outputs = map(FunctionMaker.wrap_out, outputs) if isinstance(outputs, (list, tuple)) else FunctionMaker.wrap_out(outputs)
@@ -820,6 +802,7 @@ def function(inputs, outputs, mode=None, accept_inplace = False):
        else:              
            #return a different kind of function
            def dup_defaults():
+                # TODO This may need to be changed to use containers as defaults.
                return [copy.copy(default.value) if isinstance(default, gof.Container) else
                        copy.copy(default)
                        for default in defaults]

--- a/theano/compile/io.py
+++ b/theano/compile/io.py
 """Define `SymbolicInput`, `SymbolicOutput`, `In`, `Out` """
 __docformat__ = 'restructuredtext en'
+from theano import gof
 class SymbolicInput(object):
    """
    Represents a symbolic input for use with function or FunctionMaker.
@@ -27,9 +29,15 @@ class SymbolicInput(object):
    autoname: Bool (default: True)
        See the name option.
+    implicit: Bool (default: False)
+        See help(In). Note that 'None' is not allowed here, since we are in the
+        symbolic case.
    """
-    def __init__(self, variable, name=None, update=None, mutable=None, strict=False, autoname=True):
+    def __init__(self, variable, name=None, update=None, mutable=None, strict=False, autoname=True,
+            implicit=False):
+        assert implicit is not None # Safety check.
        self.variable = variable
        self.name = variable.name if (autoname and name is None) else name
        if self.name is not None and not isinstance(self.name, str):
@@ -37,6 +45,7 @@ class SymbolicInput(object):
        self.update = update
        self.mutable = mutable if (mutable is not None) else (update is not None)
        self.strict = strict
+        self.implicit = implicit
    def __str__(self):
        if self.update:
@@ -132,14 +141,39 @@ class In(SymbolicInput):
    strict: Bool (default: False)
        True: means that the value you pass for this input must have exactly the right type
-        False: the value you pass for this input may be casted automatically to the proper type
+        False: the value you pass for this input may be cast automatically to the proper type
    autoname: Bool (default: True)
        See the name option.
+    implicit: Bool or None (default: None)
+        True: This input is implicit in the sense that the user is not allowed
+            to provide a value for it. Requires 'value' to be set.
+        False: The user can provide a value for this input. Be careful when
+            'value' is a container, because providing an input value will
+            overwrite the content of this container.
+        None: Automatically choose between True or False depending on the
+            situation. It will be set to False in all cases except if 'value'
+            is a container (so that there is less risk of accidentally
+            overwriting its content without being aware of it).
    """
-    def __init__(self, variable, name=None, value=None, update=None, mutable=None, strict=False, autoname=True):
+    # Note: the documentation above is duplicated in doc/topics/function.txt,
-        super(In, self).__init__(variable, name, update, mutable, strict, autoname)
+    # try to keep it synchronized.
+    def __init__(self, variable, name=None, value=None, update=None,
+            mutable=None, strict=False, autoname=True,
+            implicit=None):
+        if implicit is None:
+            # TODO Having a default value being a Variable only makes sense
+            # if this is a SharedVariable. This should be changed once shared
+            # variables are part of Theano instead of living in a separate
+            # repository.
+            implicit = (isinstance(value, gof.Container) or
+                    isinstance(value, gof.Variable))
+        super(In, self).__init__(variable, name, update, mutable, strict,
+                autoname, implicit = implicit)
        self.value = value
+        if self.implicit and value is None:
+            raise TypeError('An implicit input must be given a default value')
 class SymbolicOutput(object):

--- a/theano/compile/mode.py
+++ b/theano/compile/mode.py
@@ -37,6 +37,7 @@ predefined_linkers = {
    'c&py' : gof.DualLinker(checker = check_equal)
    }
+#Keep default_linker the same as the one for default_mode
 default_linker = 'c|py'
 def register_linker(name, linker):
@@ -63,7 +64,8 @@ predefined_optimizers = {
    'fast_run_stable' : OPT_FAST_RUN_STABLE,
    'fast_compile' : OPT_FAST_COMPILE
    }
-default_optimizer = 'merge'
+#Keep default_optimizer the same as the one for default_mode
+default_optimizer = 'fast_run'
 def register_optimizer(name, opt):
    """Add a `Optimizer` which can be referred to by `name` in `Mode`."""
@@ -157,6 +159,7 @@ predefined_modes = {'FAST_COMPILE': FAST_COMPILE,
 # The default mode used by functions and modules is read from the environment
 # variable THEANO_DEFAULT_MODE. Unit tests will run using this value. If the env. var.
 # is not set, it will default to 'FAST_RUN'
+# keep default_mode.optimizer==default_optimizer and default_mode.linker==default_linker!
 ##
 default_mode = os.getenv('THEANO_DEFAULT_MODE','FAST_RUN')

--- a/theano/compile/module.py
+++ b/theano/compile/module.py
@@ -354,7 +354,7 @@ class Method(Component):
            return memo[self]
        self.resolve_all() # resolve all so we don't have to mess with strings
-        def get_storage(r, require = False):
+        def get_storage(r, require=False):
            # If require is True, we can only get storage from the memo.
            try:
                return memo[r]
@@ -405,7 +405,8 @@ class Method(Component):
                        variable=k,
                        update=v,
                        value=get_storage(k, not allocate_all).value,
-                        mutable=True)
+                        mutable=True,
+                        implicit = True)
                inputs.append(input_k)
            else:
                raise ValueError(('Variable listed in both inputs and updates.'
@@ -437,6 +438,13 @@ class Method(Component):
                    assert storage.mutable == False 
                else:
                    storage = get_storage(input, not allocate_all)
+                # Declare as an implicit input.
+                # TODO Note from OD: is this dangerous? (in case this storage
+                # is shared, and would sometimes need to be implicit, sometimes
+                # not).
+                storage.implicit = True
                assert type(storage) is io.In
                inputs.append(storage)

--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
@@ -2,10 +2,11 @@ import time
 from ..gof.link import WrapLinkerMany
 from ..gof.cutils import run_cthunk
-from ..compile.mode import Mode
+from ..compile.mode import Mode, predefined_linkers
+from ..gof.cc import OpWiseCLinker
 class ProfileMode(Mode):
-    def __init__(self, linker, optimizer=None):
+    def __init__(self, linker=OpWiseCLinker(), optimizer=None):
        local_time = [0.0]
        apply_time = {}
        op_time = {}
@@ -31,6 +32,9 @@ class ProfileMode(Mode):
        self.op_time = op_time
        self.op_cimpl = op_cimpl
+        if isinstance(linker, str):
+            linker = predefined_linkers[linker]
        wrap_linker = WrapLinkerMany([linker], [blah])
        if optimizer:
            super(ProfileMode, self).__init__(wrap_linker, optimizer)

--- a/theano/compile/tests/test_function.py
+++ b/theano/compile/tests/test_function.py
@@ -7,7 +7,7 @@ from theano.compile.function_module import *
 from theano import tensor
 from theano import tensor as T
-import random
+import random, theano
 import numpy as N
@@ -250,9 +250,30 @@ class T_function(unittest.TestCase):
        self.failUnless(f[s] == 2)
        self.failUnless(g[s] == 2)
        f(1, 2)
-        g(1, 2)
        self.failUnless(f[s] == 4)
        self.failUnless(g[s] == 4)
+        g(1, 2) # has no effect on state
+        self.failUnless(f[s] == 4)
+        self.failUnless(g[s] == 4)
+    def test_shared_state_not_implicit(self):
+        # This test is taken from the documentation in
+        # doc/topics/function.txt. If it does not pass anymore and yet the
+        # behavior is still intended the doc and the test should both be
+        # updated accordingly.
+        x, s = T.scalars('xs')
+        inc = function([x, In(s, update=(s+x), value=10.0)], [])
+        dec = function([x, In(s, update=(s-x), value=inc.container[s],
+            implicit = False)], [])
+        self.failUnless(dec[s] is inc[s])
+        inc[s] = 2
+        self.failUnless(dec[s] == 2)
+        dec(1)
+        self.failUnless(inc[s] == 1)
+        dec(1, 0)
+        self.failUnless(inc[s] == -1)
+        self.failUnless(dec[s] == -1)
 class T_picklefunction(unittest.TestCase):
@@ -278,6 +299,13 @@ class T_picklefunction(unittest.TestCase):
        self.failIf(g.container[2].storage is f.container[2].storage)
        self.failIf(x in g.container)
        self.failIf(x in g.value)
+        self.failUnless(len(f.defaults) == len(g.defaults))
+        print 'f.defaults = %s' % (f.defaults, )
+        print 'g.defaults = %s' % (g.defaults, )
+        self.failUnless(all([f_req == g_req and f_feed == g_feed and
+            f_val == g_val
+            for ((f_req, f_feed, f_val), (g_req, g_feed, g_val)) in zip(
+                f.defaults, g.defaults)]))
        self.failIf(g.value[1] is f.value[1]) # should not have been copied
        self.failIf(g.value[2] is f.value[2]) # should have been copied because it is mutable.
@@ -287,6 +315,32 @@ class T_picklefunction(unittest.TestCase):
        self.failUnless(f(2, 1) == g(2)) #they should be in sync, default value should be copied.
        f(1,2) # put them out of sync
        self.failIf(f(1, 2) == g(1, 2)) #they should not be equal anymore.
+        g(1, 2) # put them back in sync
+        self.failUnless(f(3) == g(3)) # They should be in sync again.
+    def test_deepcopy_shared_container(self):
+        # Ensure that shared containers remain shared after a deep copy.
+        a, x = T.scalars('ax')
+        h = function([In(a, value = 0.0)], a)
+        f = function([x, In(a, value=h.container[a], implicit = True)], x + a)
+        try:
+            memo = {}
+            ac = copy.deepcopy(a)
+            memo.update({id(a): ac})
+            hc = copy.deepcopy(h, memo = memo)
+            memo.update({id(h): hc})
+            fc = copy.deepcopy(f, memo = memo)
+        except NotImplementedError, e:
+            if e[0].startswith('DebugMode is not picklable'):
+                return
+            else:
+                raise
+        h[a] = 1
+        hc[ac] = 2
+        self.failUnless(f[a] == 1)
+        self.failUnless(fc[ac] == 2)
    def test_pickle(self):
        a = T.scalar() # the a is for 'anonymous' (un-named).
@@ -472,7 +526,7 @@ if __name__ == '__main__':
    if 1:
        unittest.main()
-    else:
+    elif 0:
        testcases = []
        testcases.append(T_function)
@@ -483,3 +537,11 @@ if __name__ == '__main__':
            suite.addTest(testloader.loadTestsFromTestCase(testcase))
        unittest.TextTestRunner(verbosity=2).run(suite)
        #</boilerplate>
+    elif 0:
+        theano.compile.mode.default_mode = 'FAST_COMPILE'
+        t = T_picklefunction()
+        def fu(b):
+            assert b
+        t.failUnless = fu
+        t.test_deepcopy_shared_container()
--- a/theano/compile/tests/test_module.py
+++ b/theano/compile/tests/test_module.py
@@ -678,6 +678,23 @@ def test_method_mode():
    assert m.h.maker.mode == m.g.maker.mode
    assert numpy.all(m.f([1,2]) == m.g([1,2]))
+def test_method_implicit_ticket_384():
+    """
+    Ensure it is not possible to accidentally overwrite module variables
+    added as implicit inputs.
+    """
+    M = Module()
+    M.x = T.scalar()
+    M.f = Method([M.x], M.x * 3)
+    m = M.make()
+    m.f(0)
+    try:
+        m.f(0, 0)
+        assert False
+    except TypeError, e:
+        if not str(e).startswith('Tried to provide value for implicit input'):
+            raise
 def test_pickle():
    """Test that a module can be pickled"""
    M = Module()

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -526,10 +526,23 @@ class CLinker(link.Linker):
        This might contain duplicates.
        """
-        ret = []
+        ret = ["-O3", "-w"]#-w means supress all warnings
+# These are the flags that -ffast-math activates. They are listed explicitly because FillMissing must disable "-ffinite-math-only"; passing -ffast-math itself would force it to disable all the other flags at the same time.
+        ret += ["-fno-math-errno", "-funsafe-math-optimizations",
+                "-fno-signaling-nans", "-fcx-limited-range",
+                "-fno-rounding-math", "-ffinite-math-only"]
        for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
            try: ret += x.c_compile_args()
            except utils.MethodNotDefined: pass
+        ret=list(set(ret))#to remove duplicate
+        for x in [y.type for y in self.variables] + [y.op for y in self.node_order]:
+            try: 
+                for i in x.c_no_compile_args():
+                    try:
+                        ret.remove(i)
+                    except ValueError: 
+                        pass# in case the value is not there
+            except utils.MethodNotDefined: pass
        return ret
    def headers(self):
@@ -703,16 +716,7 @@ class CLinker(link.Linker):
                    instantiate.customize.add_support_code(support_code)
                instantiate.customize.add_support_code(self.struct_code)
                instantiate.customize.add_support_code(static)
-                for extra_arg in (
-                        "-O2", 
-                        "-ffast-math",
-                        #"-fprefetch-loop-arrays",
-                        #"-ftree-vect-loop-version",
-                        #"-ftree-loop-optimize",
-                        #"-ftree-vectorize"):
-                        "-w" #-w means supress all warnings
-                        ):
-                    instantiate.customize.add_extra_compile_arg(extra_arg)
                for arg in self.compile_args():
                    instantiate.customize.add_extra_compile_arg(arg)
                for header in self.headers():

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -97,6 +97,25 @@ class CLinkerOp(object):
        raise utils.MethodNotDefined('%s.c_compile_args' \
                % self.__class__.__name__)
+    def c_no_compile_args(self):
+        """Optional: Return a list of incompatible gcc compiler arguments.
+        We will remove those arguments from the command line of gcc. So if 
+        another Op adds a compile arg in the graph that is incompatible 
+        with this Op, the incompatible arg will not be used. 
+        Useful for instance to remove -ffast-math.
+        EXAMPLE
+        WRITEME
+        :Exceptions:
+         - `MethodNotDefined`: the subclass does not override this method
+        """
+        raise utils.MethodNotDefined('%s.c_no_compile_args' \
+                % self.__class__.__name__)
    def c_headers(self):
        """Optional: Return a list of header files that must be included to compile the C code.

--- a/theano/gof/type.py
+++ b/theano/gof/type.py
@@ -148,6 +148,24 @@ class CLinkerType(object):
        """
        raise MethodNotDefined("c_compile_args", type(self), self.__class__.__name__)
+    def c_no_compile_args(self):
+        """Optional: Return a list of incompatible gcc compiler arguments.
+        We will remove those arguments from the command line of gcc. So if 
+        another Op adds a compile arg in the graph that is incompatible 
+        with this Type, the incompatible arg will not be used.
+        Useful for instance to remove -ffast-math.
+        EXAMPLE
+        WRITEME
+        :Exceptions:
+         - `MethodNotDefined`: the subclass does not override this method
+        """
+        raise MethodNotDefined("c_no_compile_args", type(self), self.__class__.__name__)
    def c_headers(self):
        """Optional: Return a list of header files required by code returned by
        this class.

--- a/theano/sandbox/conv.py
+++ b/theano/sandbox/conv.py
@@ -8,7 +8,7 @@ def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
    s = -1 if mode=='valid' else 1
    inshp, kshp = N.array(inshp), N.array(kshp)
    return  N.int64(N.ceil((inshp[1:] + s*kshp - s*1)/\
-            N.array([dy,dx], dtype='float')))
+            N.array([dx,dy], dtype='float')))
 class ConvOp(Op):
    """
@@ -44,21 +44,19 @@ class ConvOp(Op):
        self.unroll_kern=unroll_kern
        if self.unroll_batch>0 and self.bsize % self.unroll_batch!=0:
-            if self.bsize<self.unroll_batch:
+            if self.bsize<=self.unroll_batch:
                self.unroll_batch = self.bsize
            else:
-                self.unroll_batch=1
                print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_batch(%s) must be 0 or a multiple of bsize(%s). We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_batch),str(self.bsize))
+                self.unroll_batch=1
        if self.unroll_kern>0 and self.nkern % unroll_kern!=0:
-            if self.nkern<self.unroll_kern:
+            if self.nkern<=self.unroll_kern:
                self.unroll_kern = self.nkern
            else:
-                self.unroll_kern=1
                print "OPTIMISATION WARNING: in ConvOp.__init__() unroll_kern(%s) should be 0 or a multiple of nkern(%s)We revert it to 1. This won't change the result, but may make it slower."%(str(self.unroll_kern),str(self.nkern))
-        if self.dx!=1 or self.dy!=1:
+                self.unroll_kern=1
-            print "Warning, dx!=1 or dy!=1 only supported in python mode!"
-            raise NotImplementedError()
        self.outshp = getFilterOutShp(self.imshp, kshp, (dx,dy), output_mode)
+        self.fulloutshp = getFilterOutShp(self.imshp, kshp, (1,1), output_mode)
        self.out_mode = output_mode
        if not self.out_mode in ["valid", "full"]:
            raise Exception("Mode %s not implemented"%self.out_mode)
@@ -92,7 +90,7 @@ class ConvOp(Op):
            raise Exception("The image and the kernel must have the same type."
                            "inputs(%s), kerns(%s)"%(inputs.dtype, kerns.dtype))
        output = tensor.tensor(dtype=inputs.type.dtype,
-                               broadcastable=[False]*outdim, 
+                               broadcastable=[False]*outdim,
                               name="ConvOp_Output");
        return gof.Apply(self, [inputs, kerns], [output])
@@ -105,7 +103,8 @@ class ConvOp(Op):
        from scipy.signal.signaltools import  _valfrommode, _bvalfromboundary
        from scipy.signal.sigtools import _convolve2d
        if z[0] is None:
-            z[0] = N.zeros((self.bsize,)+(self.nkern,)+tuple(self.outshp))
+            z[0] = N.zeros((self.bsize,)+(self.nkern,)+tuple(self.fulloutshp),
+                           dtype=img2d.dtype)
        zz=z[0]
        val = _valfrommode(self.out_mode)
        bval = _bvalfromboundary('fill')
@@ -119,7 +118,11 @@ class ConvOp(Op):
                for im0 in range(self.imshp[0]):
                    zz[b,n,...] +=  _convolve2d(\
                        img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
-        zz = zz[:,:,0::self.dx,0::self.dy]
+        #We copy it to remove the Stride mismatch warning from DEBUG_MODE.
+        #The copy ensures that we return an object with the same strides as the c version.
+        #The copy doesn't affect performance in our experiments, as in that case we
+        #execute the c version which is much faster.
+        zz = zz[:,:,0::self.dx,0::self.dy].copy()
        z[0]=zz
@@ -131,6 +134,13 @@ class ConvOp(Op):
        * inputs needs to be a 4D tensor. Couldn't get 3D to work
        * will crash if filter the same size as input image
        """
+        outshp = self.fulloutshp
+        if self.dx!=1 or self.dy!=1:
+            upgz = T.as_tensor(N.zeros((self.bsize,self.nkern)+tuple(self.fulloutshp),
+                                       dtype=gz.type.dtype))
+            gz = T.SetSubtensor([slice(self.bsize), slice(self.nkern),
+                                 slice(0,outshp[0],self.dy),
+                                 slice(0,outshp[1],self.dx)])(upgz,gz)
        ####### Determine gradient on kernels ########
        if inputs.ndim == 3:
@@ -144,26 +154,28 @@ class ConvOp(Op):
            (img, filters) = (newin, newgz)
            (bsize, nkern) = (self.imshp[0], self.nkern)
            imshp = N.hstack((self.bsize, self.imshp[1:]))
-            kshp  = self.outshp
+            kshp  = outshp
+            un_b = self.unroll_batch
+            un_k = self.unroll_kern
        elif self.out_mode == 'full':
            (img, filters) = (newgz, newin)
            (bsize, nkern) = (self.nkern, self.imshp[0])
-            imshp = N.hstack((self.bsize, self.outshp))
+            imshp = N.hstack((self.bsize, outshp))
            kshp  = self.imshp[1:]
+            un_b = self.unroll_kern
+            un_k = self.unroll_batch
        else:
            raise NotImplementedError('Only [full,valid] modes are currently supported.')
        filters = filters[:,:,::-1,::-1]
        #find good value for the unroll
-        un_b = self.unroll_batch
-        un_k = self.unroll_kern
        if un_b!=0 and bsize%un_b!=0:
            if bsize<un_b:
                un_b = bsize
            else:
                un_b = 1
-                print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!"
+                print "OPTIMISATION WARNING: in ConvOp.grad() we can't determine a good unroll value for the batch. Maybe you can optimize this!", bsize, un_b, self.unroll_batch, self.unroll_kern
        if un_k!=0 and nkern%un_k!=0:
            if nkern<un_k:
                un_k = nkern
@@ -173,6 +185,7 @@ class ConvOp(Op):
        dw = ConvOp(imshp, kshp, nkern, bsize, 1,1, output_mode='valid',
                    unroll_batch=un_b, unroll_kern=un_k)(img,filters)
+        assert (dw.owner.op.outshp==self.kshp).all()
        if self.out_mode == 'valid':
            # before DimShuffle, dw is of shape visdim x nkern x kshp[0] x kshp[1]
            dw = tensor.DimShuffle(dw.broadcastable, (1,0,2,3))(dw)
@@ -183,11 +196,11 @@ class ConvOp(Op):
        filters = tensor.DimShuffle(gz.broadcastable, (1,0,2,3))(kerns)
        filters = filters[:,:,::-1,::-1]
        nkern = self.imshp[0]
-        imshp = N.hstack((self.nkern,self.outshp))
+        imshp = N.hstack((self.nkern,outshp))
        din = ConvOp(imshp, self.kshp, nkern, self.bsize, 
                     1,1, output_mode=mode,
                     unroll_batch=un_b, unroll_kern=un_k)(gz,filters)
+        assert (din.owner.op.outshp==self.imshp[1:]).all()
        return [din, dw]
 #def c():
@@ -238,7 +251,7 @@ using namespace std;
                                                   self.unroll_kern)
        #TODO: should we choose the unroll size automatically with the bigger divisor under 5? 
-        if self.out_mode == 'valid':
+        if self.out_mode == 'valid' and self.dx==0 and self.dy==0:
 #            print "return gemm version"
            return _conv_op_code_valid_gemm % d
        else:
@@ -388,8 +401,11 @@ if ((!%(z)s)
 }
 int Os[2];
-if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
+Os[0]=%(self_outshp0)s;
-else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
+Os[1]=%(self_outshp1)s;
+//I keep the formula to calculte Os in case we need it in the futur.
+//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
+//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
 for(int b=0;b< %(self_bsize)s;b++){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
@@ -410,12 +426,14 @@ for(int b=0;b< %(self_bsize)s;b++){
      int new_m;
-      for (int m=0; m < Os[0]; m++) {
+      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
-        if (mode == FULL) new_m = m ;
+        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
-        else new_m = (m+dim_ker[0]-1);
+        if (mode == FULL) new_m = pos_m ;
+        else new_m = (pos_m+dim_ker[0]-1);
-        for (int n=0; n < Os[1]; n++) {  // loop over columns 
+        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns
+          int pos_n=iter_n*%(self_dy)s;
          %(type)s sum=0;
          // Sum over kernel, if index into image is out of bounds
@@ -433,7 +451,7 @@ for(int b=0;b< %(self_bsize)s;b++){
              }else{
                //do the part where kernel is to the right of the img
-                int k=0,max_k=max((int)(n-dim_im[1])+1,0);
+                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){ 
                  for(k=0;k<max_k;k++){
@@ -442,9 +460,9 @@ for(int b=0;b< %(self_bsize)s;b++){
                }else {k=max_k;}
                //do the part where the kernel is on the img
-                max_k=min(n+1,(int)dim_ker[1]);
+                max_k=min(pos_n+1,(int)dim_ker[1]);
                const %(type)s * idx_in=&in[ind0*dim_im[1]];
-                for (int ind1=n-k; k<max_k; k++,ind1--) {
+                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
                  sum+= idx_hvals[k] * idx_in[ind1];
                }
                //do the part to the left of the img
@@ -454,14 +472,13 @@ for(int b=0;b< %(self_bsize)s;b++){
            }else{
              const %(type)s* idx_in=&in[ind0*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
              const %(type)s* idx_hvals=&hvals[j*dim_ker[1]];
-              int new_n = (n+dim_ker[1]-1);
+              int new_n = (pos_n+dim_ker[1]-1);
              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
                sum+=idx_hvals[k]*idx_in[last];
              }
            }
          }//for j
-          out[m*dim_zz[1]+n] %(affectation)s sum;
+          out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
        }//for n
      }//for m
    }//for stack_size
@@ -763,7 +780,11 @@ if(%(img2d)s->nd==2){
  img2d_dim[1]=%(img2d)s->dimensions[1];
  img2d_dim[0]=%(img2d)s->dimensions[0];
 }else {
-    PyErr_SetString(PyExc_ValueError, "img don't have a good shape");
+    std:stringstream temp;
+    temp << "nddim="<<%(img2d)s->nd;
+    std::string param = temp.str();
+    PyErr_SetString(PyExc_ValueError,
+      ("img don't have a good shape. " + param).c_str());
    %(fail)s;
 }
@@ -777,11 +798,7 @@ if(%(filtersflipped)s->nd==3){
  kerns_dim[1]=%(filtersflipped)s->dimensions[1];
  kerns_dim[0]=%(filtersflipped)s->dimensions[0];
 }else{
-    std:stringstream temp;
+    PyErr_SetString(PyExc_ValueError, "kernel don't have a good shape");
-    temp << "nddim="<<%(filtersflipped)s->nd;
-    std::string param = temp.str();
-    PyErr_SetString(PyExc_ValueError,
-      ("kernel don't have a good shape. " + param).c_str());
    %(fail)s;
 }
@@ -844,8 +861,12 @@ if ((!%(z)s)
 }
 int Os[2];
-if (mode == FULL) {Os[0] = dim_im[0]+dim_ker[0]-1; Os[1] = dim_im[1]+dim_ker[1]-1;}
+Os[0]=%(self_outshp0)s;
-else {Os[0] = dim_im[0]-dim_ker[0]+1; Os[1] = dim_im[1]-dim_ker[1]+1;}
+Os[1]=%(self_outshp1)s;
+//I keep the formula to calculte Os in case we need it in the futur.
+//if (mode == FULL) {Os[0] = (int)ceil((dim_im[0]+dim_ker[0]-1)/float(%(self_dx)s)); Os[1] = ceil((dim_im[1]+dim_ker[1]-1)/float(%(self_dy)s));}
+//else {Os[0] = (int)ceil((dim_im[0]-dim_ker[0]+1)/float(%(self_dx)s)); Os[1] = (int)ceil((dim_im[1]-dim_ker[1]+1)/float(%(self_dy)s));}
 for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
  for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
@@ -866,12 +887,14 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
      int new_m;
-      for (int m=0; m < Os[0]; m++) {
+      for (int iter_m=0; iter_m < Os[0]; iter_m++) {
        // Reposition index into input image based on requested output size
-        if (mode == FULL) new_m = m ;
+        int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
-        else new_m = (m+dim_ker[0]-1);
+        if (mode == FULL) new_m = pos_m ;
+        else new_m = (pos_m+dim_ker[0]-1);
-        for (int n=0; n < Os[1]; n++) {  // loop over columns 
+        for (int iter_n=0; iter_n < Os[1]; iter_n++) {  // loop over columns 
+          int pos_n=iter_n*%(self_dy)s;
        """%d
    ret+=my_dup("%(type)s sum%(unroll_iter)s=0;", unroll_bsize*unroll_ksize)
    ret+="""
@@ -895,7 +918,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
              }else{
                //do the part where kernel is to the right of the img
-                int k=0,max_k=max((int)(n-dim_im[1])+1,0);
+                int k=0,max_k=max((int)(pos_n-dim_im[1])+1,0);
                if(fill_value!=0){ 
                  for(k=0;k<max_k;k++){
@@ -906,11 +929,11 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
                }else {k=max_k;}
                //do the part where the kernel is on the img
-                max_k=min(n+1,(int)dim_ker[1]);
+                max_k=min(pos_n+1,(int)dim_ker[1]);
 """%d
    ret+=my_dup("const %(type)s * idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+="""
-                for (int ind1=n-k; k<max_k; k++,ind1--) {
+                for (int ind1=pos_n-k; k<max_k; k++,ind1--) {
 """%d
    ret+=my_dup2("sum%(unroll_iter)s+= idx_hvals%(unroll_kiter)s[k] * idx_in%(unroll_biter)s[ind1];")
@@ -929,7 +952,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
    ret+=my_dup("const %(type)s* idx_in%(unroll_iter)s=&in%(unroll_iter)s[ind0*dim_im[1]];", unroll_bsize)
    ret+=my_dup("const %(type)s* idx_hvals%(unroll_iter)s=&hvals%(unroll_iter)s[j*dim_ker[1]];",unroll_ksize)
    ret+="""
-              int new_n = (n+dim_ker[1]-1);
+              int new_n = (pos_n+dim_ker[1]-1);
              for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
 """%d
@@ -940,7 +963,7 @@ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
          }//for j
 """%d
-    ret+=my_dup("out%(unroll_iter)s[m*dim_zz[1]+n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
+    ret+=my_dup("out%(unroll_iter)s[iter_m*dim_zz[1]+iter_n] %(affectation)s sum%(unroll_iter)s;", unroll_bsize*unroll_ksize)
    ret+="""
        }//for n
      }//for m

--- a/theano/sandbox/test_conv.py
+++ b/theano/sandbox/test_conv.py
@@ -90,16 +90,18 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
                ####### test with new sp.convolve2 function ######
                time1 = time.time()
                hid, outshp2 = convolve2(kern, kshp, nkern, img, imshp,  
-                                         bsize, (1,1), mode=conv_mode)
+                                         bsize, (ss[0],ss[1]), mode=conv_mode)
                propup = function([kern, img], hid)
                propup1 = function([kern, img], hid,mode=Mode(linker="py"))
                hidval  = propup(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
-                hidval  = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]]
+                hidval  = hidval.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
+#                hidval = hidval[:,:,::ss[0],::ss[1]]
                hidval = hidval.reshape(bsize, -1)
                for i in range(repeat):
                    hidval1 = propup1(w_flip.reshape(nkern,-1), imgval.reshape(bsize,-1))
-                hidval1  = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])[:,:,::ss[0],::ss[1]]
+                hidval1  = hidval1.reshape(bsize,nkern,outshp2[-2],outshp2[-1])
+#                hidval1  = hidval1[:,:,::ss[0],::ss[1]]
                hidval1 = hidval1.reshape(bsize, -1)
                assert (N.abs(hidval-hidval1)<1e-5).all()
@@ -113,7 +115,7 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
                hidval1=outval.copy()
            # ConvOp
-            conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
+            conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode, unroll_batch=unroll_batch, unroll_kern=unroll_kern)(inputs4, kerns4)
            l1shp=N.hstack((nkern,
                            getFilterOutShp(imshp, kshp, ss, conv_mode)))
            propup2 = function([inputs4, kerns4], conv_op)
@@ -122,14 +124,14 @@ def exec_multilayer_conv_nnet(conv_mode, ss, bsize, imshp, kshps, nkerns, unroll
            time1 = time.time()
            for i in range(repeat):
                hidval2_ = propup2(imgval,w_flip)
-            hidval2 = hidval2_[:,:,0::ss[0],0::ss[1]]
+            hidval2 = hidval2_#[:,:,0::ss[0],0::ss[1]]
            tctot += time.time() - time1
            if conv_op_py:
                time1 = time.time()
                for i in range(repeat):
                    hidval3_ = propup3(imgval,w_flip)
-                hidval3 = hidval3_[:,:,0::ss[0],0::ss[1]]
+                hidval3 = hidval3_#[:,:,0::ss[0],0::ss[1]]
                tpytot += time.time() - time1
                assert (N.abs(hidval2-hidval3)<1e-5).all()
            else:
@@ -235,7 +237,7 @@ class TestConvOp(unittest.TestCase):
                    # compute with new convolve2 (no timing info)
                    output4, outshp4  = convolve2(kerns, kshp, nkern, input,\
-                            imshp, bsize, (1,1), bias=bias, mode=conv_mode)
+                            imshp, bsize, (ss[0],ss[1]), bias=bias, mode=conv_mode)
 #                    print 'output4', output4
                    ttime1 = time.time()
@@ -244,7 +246,7 @@ class TestConvOp(unittest.TestCase):
 #                    print 'out4', out4, img1d, filtersflipped
                    tconv2 += [time.time() - ttime1]
                    out4 = out4.reshape(bsize, nkern, outshp4[1], outshp4[2])
-                    out4 = out4[:,:,0::ss[0],0::ss[1]]
+                    out4 = out4#[:,:,0::ss[0],0::ss[1]]
                    out4 = out4.reshape(bsize, -1)
                    # compute with ConvOp
@@ -252,18 +254,18 @@ class TestConvOp(unittest.TestCase):
                    inputs=dmatrix3()
                    kerns3=dmatrix3()
                    bia=T.dscalar()
-                    conv_op = ConvOp(imshp, kshp, nkern, bsize, 1,1, conv_mode)(inputs, kerns3)
+                    conv_op = ConvOp(imshp, kshp, nkern, bsize, ss[0],ss[1], conv_mode)(inputs, kerns3)
                    f2 = function([inputs, kerns3], conv_op, mode=Mode(linker="c"))
                    f3 = function([inputs, kerns3], conv_op, mode=Mode(linker="py"))
                    ttime1 = time.time()
                    out2_ = f2(img2d, filtersflipped)
-                    out2__ = out2_[:,:,0::ss[0],0::ss[1]]
+                    out2__ = out2_#[:,:,0::ss[0],0::ss[1]]
                    tconvop += [time.time() - ttime1]
                    out2___ = out2__.copy()
                    out2 = out2___ + biasvals.reshape(1,nkern,1,1)
                    out3_ = f3(img2d, filtersflipped)
-                    out3__ = out3_[:,:,0::ss[0],0::ss[1]]
+                    out3__ = out3_#[:,:,0::ss[0],0::ss[1]]
                    out3___ = out3__.copy()
                    out3 = out3___ + biasvals.reshape(1,nkern,1,1)
                    assert (N.abs(out2_-out3_)<1e-5).all()
@@ -302,15 +304,21 @@ class TestConvOp(unittest.TestCase):
        print 'speed up ConvOp vs convolve2d: %.3f'%d.mean(),d
    def test_multilayer_conv(self):
+        print '\n\n*************************************************'
+        print '           TEST MULTILAYER CONVOLUTION' 
+        print '*************************************************'
        # fixed parameters
+        # test multiple configurations at the same time
        bsizes = [6,6] # batch size
-        imshp_starts = [(1,28,28),(1,4,4)]
+        imshp_starts = [(1,13,14),(1,4,5)]
        kshpss = ([[5,6],[7,4]],[[2,2],[2,2]])
        nkernss = [[20,40],[2,2]] # per output pixel
-        ssizess = [[(1,1),(2,2)],[(1,1),(2,2)]]
+        ssizess = [[(1,1),(1,2)],[(1,1),(2,2)]]
        convmodes = ['valid','full']
        do_convolve2=True
        unroll = [(0,0),(1,1),(2,2),(3,2)]#(batch,kern)
+        do_speed_test = False
        # TODO: this version show a bug that was fixed
        # the test is included in the upper test.
@@ -319,15 +327,6 @@ class TestConvOp(unittest.TestCase):
 #        nkerns = [2,2] # per output pixel
 #        ssizes = [(1,1),(2,2)]#2,2)]
-        #test speed
-#        bsize = 10 # batch size
-#        imshp_start = (1,50,49)#un square shape to test more corner case.
-#        kshps = ([11,12],[12,11])#un square shape to test more corner case.
-#        nkerns = [20,20] # per output pixel
-#        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
-#        convmodes = ['valid','full']
-#        do_convolve2=False
        N.set_printoptions(threshold=N.nan)
        # symbolic stuff
@@ -338,7 +337,7 @@ class TestConvOp(unittest.TestCase):
        for i in range(len(kshpss)):
            assert len(kshpss[i])==len(nkernss[i])==len(kerns)
-        if False:
+        if do_speed_test:
            # calculate the speed up of different combination of unroll
            # put the paramter to the same you will try. 
@@ -418,19 +417,23 @@ class TestConvOp(unittest.TestCase):
        d=N.asarray(ntot)/tpytot
        print 'speed up py theano(ConvOp) vs convolve2d: %.3fx'%d.mean(),d
    def test_ConvOpGrad(self):
        """
        test the gradient in float and double
        """
+        print '\n\n*************************************************'
+        print '           TEST ConvOp.grad' 
+        print '*************************************************'
        nkern = 4
        bsize = 3
        types = ["float32", "float64"]
-        kshps = [(5,5), (6,7)]
+        kshps = [(3,4)]
-        imshps = [(1,5,5), (2,8,8), (3,8,7)]
+        imshps = [(2,8,7)]
        modes = ['valid', 'full']
        unroll_batch=[0,1,3]
        unroll_kern=[0,1,4]
+        ssizes = [(1,1),(2,2)]
        for typ in types:
            imgs  = T.TensorType(typ, (False, False, False, False),'imgs')
@@ -445,42 +448,41 @@ class TestConvOp(unittest.TestCase):
                            continue
                        for un_b in unroll_batch:
                            for un_k in unroll_kern:
-                                imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
+                                for ss in ssizes:
-#                                print 'imgvals.shape = ', imgvals.shape, imgvals.dtype
+                                    imgvals = N.array(N.random.random(N.hstack((bsize,imshp))),dtype=imgs.dtype)
-#                                imgvals = imgvals.reshape(bsize,-1)
+                                    kernvals = N.array(N.random.rand(nkern,visdim,kshp[0],
-                                kernvals = N.array(N.random.rand(nkern,visdim,kshp[0],
+                                                             kshp[1]),dtype=kerns.dtype)
-                                                         kshp[1]),dtype=kerns.dtype)
+                                    def testf(imgs, kerns):
-#                                print 'kernvals.shape = ', kernvals.shape, kernvals.dtype
+                                        out, outshp = convolve2(kerns, kshp, nkern, 
-#                                kernvals = kernvals.reshape(nkern,-1)
+                                                                imgs, imshp, bsize, 
+                                                                mode=mode, step=ss,
-                                def testf(imgs, kerns):
+                                                                unroll_batch=un_b,
-                                    out, outshp = convolve2(kerns, kshp, nkern, 
+                                                                unroll_kern=un_k)
-                                                            imgs, imshp, bsize, 
+                                        return out
-                                                            mode=mode,
+                                    #TODO the tolerance needed to pass is very high for float32(0.16). Is this acceptable? Expected?
-                                                            unroll_batch=un_b,
+                                    utt.verify_grad(testf, [imgvals, kernvals],
-                                                            unroll_kern=un_k)
+                                                    cast_to_output_type=True,
-                                    return out
+                                                    tol=None if typ!="float32" else 0.16)
-                                #TODO the tolerance needed to pass is very high for float32(0.16). Is this acceptable? Expected?
-                                utt.verify_grad(testf, [imgvals, kernvals],
-                                                cast_to_output_type=True,
-                                                tol=None if typ!="float32" else 0.16)
 if __name__ == '__main__':
-#    t = TestConvOp("test_convolution")
+    t = TestConvOp("test_convolution")
 #    t.test_convolution()
-#    t.test_multilayer_conv()
+    t.test_multilayer_conv()
 #    from theano.tests import main
 #    main("test_sp")
-    bsize = 20 # batch size
+    if False:
-    imshp_start = (1,100,100)#un square shape to test more corner case.
+        #used to launch 8 jobs at the same time.
-    kshps = ([11,12],[12,11])#un square shape to test more corner case.
+        bsize = 20 # batch size
-    nkerns = [20,20] # per output pixel
+        imshp_start = (1,100,100)#un square shape to test more corner case.
-    ssizes = [(1,1),]#(1,1)]#(2,2) bugged
+        kshps = ([11,12],[12,11])#un square shape to test more corner case.
-    convmodes = ['valid','full']
+        nkerns = [20,20] # per output pixel
-    unroll_batch = 5
+        ssizes = [(1,1),]#(1,1)]#(2,2) bugged
-    unroll_kern = 2
+        convmodes = ['valid','full']
-    ctot=0
+        unroll_batch = 5
-    tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
+        unroll_kern = 2
-    print "total exec time %.3fs"%tctot
+        ctot=0
+        tctot, tpytot, ntot = exec_multilayer_conv_nnet(convmodes[1], ssizes[0], bsize, imshp_start, kshps, nkerns, unroll_batch=unroll_batch, unroll_kern=unroll_kern, validate=False, do_print=False,repeat=5)
+        print "total exec time %.3fs"%tctot
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2054,8 +2054,10 @@ class Reshape(Op):
    The number of dimensions to which to reshape to (ndim) must be known at graph 
    build time."""
    view_map = {0: [0]} #output 0 is potentially aliased to inputs [0]
-    def __init__(self, ndim):
+    def __init__(self, ndim, name = None):
        self.ndim = ndim
+        if name:
+            self.name = name
    def __eq__(self, other):
        return (type(other) is Reshape) and (other.ndim == self.ndim)
    def __hash__(self):
@@ -2075,10 +2077,10 @@ class Reshape(Op):
    def grad(self, (x, shp), (g_out,)):
        return [reshape(g_out, shape(x), ndim=x.ndim), None]
-def reshape(x, newshape, ndim=None):
+def reshape(x, newshape, ndim=None, name=None):
    if ndim is None:
        ndim = get_vector_length(newshape)
-    op = Reshape(ndim)
+    op = Reshape(ndim, name)
    return op(x, newshape)

--- a/theano/tensor/nnet.py
+++ b/theano/tensor/nnet.py
@@ -581,6 +581,11 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
                """,
                inside_row_loop,
                """
+            if ((y_i >= %(x)s->dimensions[1]) || (y_i < 0))
+            {
+                PyErr_SetString(PyExc_ValueError, "y_i value out of bounds");
+                %(fail)s;
+            }
            nll_i[0] = - x_i[y_i*Sx]
                       - b_i[y_i*Sb]
                       + row_max

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -686,7 +686,7 @@ class Canonizer(gof.LocalOptimizer):
        op = node.op
        if op not in [self.main, self.inverse, self.reciprocal]:
            return False
        inputs = node.inputs
        out = node.outputs[0]
        assert len(node.outputs) == 1
@@ -725,8 +725,14 @@ class Canonizer(gof.LocalOptimizer):
        return getattr(self, 'name', 'Canonizer(%s, %s, %s)' % (self.main, self.inverse, self.reciprocal))
-def mul_calculate(num, denum, aslist = False):
+def mul_calculate(num, denum, aslist=False):
-    v = reduce(N.multiply, num, 1.0) / reduce(N.multiply, denum, 1.0)
+    if not num and not denum:
+        # Nothing to multiply: the result is 1, in the smallest dtype (int8).
+        return [] if aslist else N.int8(1)
+    # Make sure we do not accidentally upcast data types.
+    first = num[0] if num else denum[0]
+    one = N.asarray(first).dtype.type(1)
+    v = reduce(N.multiply, num, one) / reduce(N.multiply, denum, one)
    if aslist:
        if N.all(v == 1):
            return []

--- a/theano/tensor/tests/test_naacl09.py
+++ b/theano/tensor/tests/test_naacl09.py
@@ -551,7 +551,8 @@ def test_naacl_model(iters_per_unsup=10, iters_per_sup=10,
        s0 = str(m.finetuning_update(*(inputs + [targets])))
        print iters_per_sup * (i+1), s0
    if iters_per_sup == 10:
-        assert s0.startswith('15.6511')#should check for the 8 decimal only.
+        s0f = float(s0)
+        assert 15.6510 < s0f and s0f < 15.6512
 def jtest_main():
    from theano import gof

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -57,9 +57,13 @@ class test_dimshuffle_lift(unittest.TestCase):
        x, y, z = inputs([False]*1, [False]*2, [False]*3)
        e = x + y + z
        g = Env([x, y, z], [e])
-        self.failUnless(str(g) == "[add(InplaceDimShuffle{x,0,1}(add(InplaceDimShuffle{x,0}(x), y)), z)]", str(g))
+        self.failUnless(str(g) == ("[Elemwise{add,no_inplace}("
+            "InplaceDimShuffle{x,0,1}(Elemwise{add,no_inplace}"
+            "(InplaceDimShuffle{x,0}(x), y)), z)]"), str(g))
        dimshuffle_lift.optimize(g)
-        self.failUnless(str(g) == "[add(add(InplaceDimShuffle{x,x,0}(x), InplaceDimShuffle{x,0,1}(y)), z)]", str(g))
+        self.failUnless(str(g) == ("[Elemwise{add,no_inplace}(Elemwise"
+            "{add,no_inplace}(InplaceDimShuffle{x,x,0}(x), InplaceDimShuffle"
+            "{x,0,1}(y)), z)]"), str(g))
 def test_add_canonizer_problem0():