testgroup / pytensor / Commits / e8c50c78

Commit e8c50c78, authored Dec 01, 2010 by fsavard

merge

Parents: f60f996d, 80a67b27

Showing 17 changed files with 216 additions and 91 deletions (+216 −91)
README.txt                               +1   −1
doc/extending/fibby.txt                  +1   −1
doc/extending/op.txt                     +1   −1
doc/install.txt                          +8   −7
doc/library/config.txt                   +13  −3
theano/compile/debugmode.py              +13  −1
theano/misc/hooks/check_whitespace.py    +40  −1
theano/sandbox/cuda/__init__.py          +4   −1
theano/sandbox/cuda/cuda_ndarray.cu      +13  −40
theano/sparse/basic.py                   +12  −2
theano/sparse/tests/test_basic.py        +22  −0
theano/tensor/basic.py                   +0   −0
theano/tensor/blas.py                    +0   −0
theano/tensor/elemwise.py                +29  −29
theano/tensor/opt.py                     +7   −2
theano/tensor/tests/test_basic.py        +52  −2
theano/tensor/tests/test_sharedvar.py    +0   −0
README.txt
@@ -27,7 +27,7 @@ Theano (current directory) is the distribution directory.
  * scalar depends upon core
  * tensor depends upon scalar
  * sparse depends upon tensor
-* sandbox can depends on everything else
+* sandbox can depend on everything else
  * Theano/examples are copies of the example on the wiki
  * Theano/benchmark, Theano/bin and Theano/examples are in the distribution,
    but not in the python package
doc/extending/fibby.txt
@@ -99,7 +99,7 @@ The ``make_node`` method creates a node to be included in the expression graph.
 It runs when we apply our Op (``fibby``) to Variable (``x``), as in ``fibby(tensor.vector())``.
 When an Op has multiple inputs, their order in the inputs argument to ``Apply``
 is important: Theano will call ``make_node(*inputs)`` to copy the graph,
-so it is important to not change the semantics of the expression by doing
-changing the argument order.
+so it is important not to change the semantics of the expression by
+changing the argument order.
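The argument-order point above can be illustrated with a toy, non-Theano sketch; `ToyApply`, `Subtract`, and `compute` here are hypothetical stand-ins, not Theano's real classes:

```python
# Hypothetical stand-ins illustrating why make_node(*inputs) must keep
# argument order: copying a graph re-applies the op to inputs in order.
class ToyApply(object):
    def __init__(self, op, inputs):
        self.op = op
        self.inputs = list(inputs)  # order is part of the expression's meaning

class Subtract(object):
    def make_node(self, *inputs):
        # Reordering inputs here would silently turn a - b into b - a
        # when the graph is copied via make_node(*node.inputs).
        return ToyApply(self, inputs)

    def compute(self, node, values):
        a, b = values  # positions matter for non-commutative ops
        return a - b

op = Subtract()
node = op.make_node('a', 'b')
copy = op.make_node(*node.inputs)  # how a graph copy re-applies the op
assert copy.inputs == node.inputs  # order preserved, semantics unchanged
print(op.compute(copy, [5, 3]))    # 2, not -2
```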
doc/extending/op.txt
@@ -138,7 +138,7 @@ following methods:
   other criterion C with respect to the Op's input.
   If the outputs of your op are :math:`[ f_1, ... f_n]`, then
-  ``output_derivatives`` gives
+  ``output_gradients`` is
   :math:`[ grad_{f_1}(C), grad_{f_2}(C), ... , grad_{f_n}(C) ]`.
   If the inputs of your op are :math:`[x_1, ..., x_m]`, then your Op.grad
   should return :math:`[ grad_{x_1}(C), grad_{x_2}(C), ..., grad_{x_m}(C) ]`,
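The convention being documented can be sketched numerically; `toy_op_grad` is a hypothetical op with one input and two outputs, not Theano's API:

```python
# Numeric sketch of the grad convention: a hypothetical op with outputs
# f1(x) = x**2 and f2(x) = 3*x. For a cost C, grad receives
# output_gradients = [dC/df1, dC/df2] and must return
# [dC/dx] = [dC/df1 * df1/dx + dC/df2 * df2/dx] by the chain rule.
def toy_op_grad(x, output_gradients):
    g1, g2 = output_gradients
    df1_dx = 2.0 * x   # derivative of x**2
    df2_dx = 3.0       # derivative of 3*x
    return [g1 * df1_dx + g2 * df2_dx]

# With C = f1 + f2, both output gradients are 1, so dC/dx = 2x + 3.
print(toy_op_grad(4.0, [1.0, 1.0]))  # [11.0]
```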
doc/install.txt
@@ -14,7 +14,8 @@ Requirements
 ------------

 In order to use Theano, the following libraries and software will need
-to be installed:
+to be installed (MacOS and Windows users should refer to platform-specific
+instructions below for detailed installation steps):

 Linux, Mac OS X or Windows operating system
     We develop mainly on 64-bit Linux machines. 32-bit architectures are
@@ -394,7 +395,7 @@ Windows V1 (Installing from Scratch)
   You can keep the default install options (except for the installation directory).

 - Install Mercurial. You can download it
-  `here <http://mercurial.selenic.com/downloads>`_. You may get either the command
+  `here <http://mercurial.selenic.com/downloads>`__. You may get either the command
   line Windows version or the TortoiseHG GUI version: it does not matter as
   far as installing Theano is concerned.
@@ -450,7 +451,7 @@ compile GotoBLAS2 (ATLAS may work too, but was not tested, and is
 usually reported to be slower and more difficult to compile -- especially
 on Windows).
 GotoBLAS2 can be downloaded
-`here <http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads>`_
+`here <http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads>`__
 after registering on the website (we tested v1.13).
 To compile it, you will also need to install MSYS and Perl,
 as described below.
@@ -538,8 +539,7 @@ Windows: Using the GPU
 Please note that these are tentative instructions (we have not yet been able to
 get the GPU to work under Windows with Theano).
-Please report your own successes / failures on the
-`theano-users <http://groups.google.com/group/theano-users>`_ mailing list.
+Please report your own successes / failures on the `theano-users`_ mailing list.
 Those are instructions for the 32-bit version of Python (the one that comes
 with Python(x,y) is 32-bit).
@@ -555,14 +555,15 @@ use a compilation directory located somewhere else:
     [global]
     base_compiledir=path_to_a_directory_without_such_characters

-You also need to add in the configuration file those lines:
+You also need to add in the configuration file those lines (make sure this
+is the correct Python installation path):

 .. code-block:: cfg

     [cuda]
     nvccflags=-LC:\Python26\libs

 Then

 1) Install CUDA driver (32-bit on 32-bit Windows, idem for 64-bit).
doc/library/config.txt
@@ -128,16 +128,26 @@ Config Attributes
     Default 'Mode'

-    This set the default compilation mode for theano functions. By default the
+    This sets the default compilation mode for theano functions. By default the
     mode Mode is equivalent to FAST_RUN. See Config attribute linker and optimizer.

+.. attribute:: config.lib.amdlibm
+
+    Bool value: either True or False
+
+    Default False
+
+    This makes the compilation use the
+    `amdlibm <http://developer.amd.com/cpu/libraries/libm/>`__
+    library, which is faster than the standard libm.
+
 .. attribute:: linker

     String value: 'c|py', 'py', 'c', 'c|py_nogc', 'c&py'

     Default: 'c|py'

-    When the mode is Mode, it set the default linker used.
+    When the mode is Mode, it sets the default linker used.

 .. attribute:: optimizer

@@ -145,7 +155,7 @@ Config Attributes
     Default: 'fast_run'

-    When the mode is Mode, it set the default optimizer used.
+    When the mode is Mode, it sets the default optimizer used.

 .. attribute:: warn.ignore_bug_before
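Attributes like these are registered with typed, validated parameters (the `IntParam(1, lambda i: i in (0, 1, 2))` pattern also visible in the debugmode.py diff below). A minimal sketch of that registry pattern, with illustrative names that are not Theano's actual internals:

```python
# Minimal sketch of a validated config registry in the spirit of
# AddConfigVar / IntParam / BoolParam. Class and key names are
# illustrative, not Theano's real internals.
class ConfigVar(object):
    def __init__(self, default, validate, doc=""):
        self.validate = validate
        self.doc = doc
        self.set(default)

    def set(self, value):
        if not self.validate(value):          # reject values outside the domain
            raise ValueError("invalid value: %r" % (value,))
        self.value = value

config = {}
config['linker'] = ConfigVar(
    'c|py',
    lambda v: v in ('c|py', 'py', 'c', 'c|py_nogc', 'c&py'),
    doc="default linker used when the mode is Mode")
config['DebugMode.check_strides'] = ConfigVar(1, lambda i: i in (0, 1, 2))

config['linker'].set('py')           # accepted: a known linker name
try:
    config['linker'].set('fortran')  # rejected by the validator
except ValueError as e:
    print(e)
```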
theano/compile/debugmode.py
@@ -46,7 +46,7 @@ AddConfigVar('DebugMode.check_strides',
         IntParam(1, lambda i: i in (0, 1, 2)))

 AddConfigVar('DebugMode.warn_input_not_reused',
-        ("Generate a warning when the destroy_map tell that an op work inplace, but the op did not reuse the input for its output."),
+        ("Generate a warning when the destroy_map or view_map tell that an op work inplace, but the op did not reuse the input for its output."),
         BoolParam(True))
@@ -519,6 +519,18 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes, clobber_dr_vals,
                 if storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
                     warning("input idx %d marked as destroyed was not changed for node '%s'" % (ii[0], str(node)))
+        if warn_input_not_reused:
+            vmap = getattr(node.op, 'view_map', {})
+            for oo, ii in vmap.iteritems():
+                if hasattr(node.outputs[0].type, "may_share_memory"):
+                    if not node.outputs[0].type.may_share_memory(
+                            storage_map[node.outputs[oo]][0], storage_map[node.inputs[ii[0]]][0]):
+                        #when a subtensor return a tensor of ndim==0, numpy seem to return a copy.
+                        #when we have an empty ndarray (happen with output guard) it is not the same. why?
+                        if storage_map[node.outputs[oo]][0].ndim > 0 and storage_map[node.outputs[oo]][0].size > 0:
+                            warning("input idx %d marked as viewed but new memory allocated by node '%s'" % (ii[0], str(node)))
+                elif storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
+                    warning("input idx %d marked as viewed but new memory allocated by node '%s'" % (ii[0], str(node)))

     for r_idx, r in enumerate(node.inputs):
         if not r.type.values_eq(r_vals[r], storage_map[r][0]):
             # some input node 'r' got changed by running the node
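The new warning compares storage identity between a viewing output and its input. A rough plain-Python illustration of that check, using `is` on plain buffers as a stand-in for `may_share_memory` (names here are illustrative, not DebugMode's):

```python
# Rough illustration of the DebugMode view check: an op whose view_map
# claims output oo views input ii should hand back the same buffer,
# not a freshly allocated copy.
def check_views(view_map, input_buffers, output_buffers, warnings):
    for oo, ii in view_map.items():  # e.g. {0: [0]}: output 0 views input 0
        if output_buffers[oo] is not input_buffers[ii[0]]:
            warnings.append(
                "input idx %d marked as viewed but new memory "
                "allocated for output %d" % (ii[0], oo))

buf = [1.0, 2.0, 3.0]
warnings = []
check_views({0: [0]}, [buf], [buf], warnings)        # same object: no warning
check_views({0: [0]}, [buf], [list(buf)], warnings)  # a copy: warns
print(warnings)
```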
theano/misc/hooks/check_whitespace.py
浏览文件 @
e8c50c78
...
...
@@ -14,6 +14,8 @@ import tokenize
import
argparse
import
reindent
SKIP_WHITESPACE_CHECK_FILENAME
=
".hg/skip_whitespace_check"
def
get_parse_error
(
code
):
"""
Checks code for ambiguous tabs or other basic parsing issues.
...
...
@@ -128,6 +130,20 @@ def save_diffs(diffs, filename):
diff_file
.
write
(
diff
)
diff_file
.
close
()
def
should_skip_commit
():
if
not
os
.
path
.
exists
(
SKIP_WHITESPACE_CHECK_FILENAME
):
return
False
whitespace_check_file
=
open
(
SKIP_WHITESPACE_CHECK_FILENAME
,
"r"
)
whitespace_check_changeset
=
whitespace_check_file
.
read
()
whitespace_check_file
.
close
()
return
whitespace_check_changeset
==
parent_commit
()
def
save_skip_next_commit
():
whitespace_check_file
=
open
(
SKIP_WHITESPACE_CHECK_FILENAME
,
"w"
)
whitespace_check_file
.
write
(
parent_commit
())
whitespace_check_file
.
close
()
def
main
(
argv
=
None
):
if
argv
is
None
:
argv
=
sys
.
argv
[
1
:]
...
...
@@ -145,12 +161,32 @@ def main(argv=None):
const
=
True
,
help
=
"only check indentation if the file was previously correctly indented (or is new)"
)
parser
.
add_argument
(
"-s"
,
"--skip-after-failure"
,
action
=
"store_const"
,
default
=
False
,
const
=
True
,
help
=
"when this pre-commit hook fails, don't run it on the next commit; "
"this lets you check in your changes and then check in "
"any necessary whitespace changes in the subsequent commit"
)
args
=
parser
.
parse_args
(
argv
)
# -i and -s are incompatible; if you skip checking, you end up with a not-correctly-indented
# file, which -i then causes you to ignore!
if
args
.
skip_after_failure
and
args
.
incremental
:
print
>>
sys
.
stderr
,
"*** check whitespace hook misconfigured! -i and -s are incompatible."
return
1
if
is_merge
():
# don't inspect merges: (a) they're complex and (b) they don't really introduce new code
return
0
if
args
.
skip_after_failure
and
should_skip_commit
():
# we're set up to skip this one, so skip it, but
# first, make sure we don't skip the next one as well :)
os
.
remove
(
SKIP_WHITESPACE_CHECK_FILENAME
)
return
0
block_commit
=
False
diffs
=
[]
...
...
@@ -185,12 +221,15 @@ def main(argv=None):
save_diffs
(
diffs
,
diffs_filename
)
print
>>
sys
.
stderr
,
"*** To fix all indentation issues, run: cd `hg root` && patch -p0 <
%
s"
%
diffs_filename
if
block_commit
:
save_filename
=
".hg/commit_message.saved"
save_commit_message
(
save_filename
)
print
>>
sys
.
stderr
,
"*** Commit message saved to
%
s"
%
save_filename
if
args
.
skip_after_failure
:
save_skip_next_commit
()
print
>>
sys
.
stderr
,
"*** Next commit attempt will not be checked. To change this, rm
%
s"
%
SKIP_WHITESPACE_CHECK_FILENAME
return
int
(
block_commit
)
...
...
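The skip-after-failure mechanism above amounts to a marker file holding the parent changeset id: arm it on failure, honor it exactly once, then disarm. A self-contained modern-Python sketch of that protocol (file names and helper names here are illustrative):

```python
# Sketch of the skip-after-failure protocol: the hook records the parent
# changeset in a marker file; on the next run, a matching marker means
# "skip once", after which the marker is removed.
import os
import tempfile

def should_skip(marker_path, parent):
    if not os.path.exists(marker_path):
        return False
    with open(marker_path) as f:
        return f.read() == parent

def save_skip_next(marker_path, parent):
    with open(marker_path, "w") as f:
        f.write(parent)

tmpdir = tempfile.mkdtemp()
marker = os.path.join(tmpdir, "skip_whitespace_check")

save_skip_next(marker, "abc123")          # hook failed: arm the skip
assert should_skip(marker, "abc123")      # same parent commit: skip once
os.remove(marker)                         # ...and disarm immediately
assert not should_skip(marker, "abc123")  # next commit is checked again
```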
theano/sandbox/cuda/__init__.py
@@ -1,4 +1,4 @@
-import atexit, os, stat
+import atexit, gc, os, stat

 from theano.compile import optdb
 from theano import config
@@ -96,6 +96,9 @@ if cuda_available:
         cuda_initialization_error_message = ""
         # actively closing our gpu session presents segfault-on-exit on some systems
         atexit.register(gpu_shutdown)
+        # do garbage collection before releasing the gpu to avoid releasing invalid pointers later
+        # note that atexit-registered calls are called in LIFO order
+        atexit.register(gc.collect)
     except EnvironmentError, e:
         cuda_available = False
         cuda_initialization_error_message = e.message
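The comment relies on `atexit` handlers running in LIFO order: registering `gc.collect` after `gpu_shutdown` makes collection run first at exit. A sketch that records the order with a list instead of real handlers (the `register` helper is a stand-in, not the stdlib function):

```python
# atexit handlers run in LIFO order, so registering gc.collect *after*
# gpu_shutdown means collection happens *before* the gpu is released.
calls = []
handlers = []

def register(fn):            # stand-in for atexit.register
    handlers.append(fn)

register(lambda: calls.append("gpu_shutdown"))
register(lambda: calls.append("gc.collect"))

for fn in reversed(handlers):  # LIFO, as atexit does at interpreter exit
    fn()

print(calls)  # ['gc.collect', 'gpu_shutdown']
```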
theano/sandbox/cuda/cuda_ndarray.cu
@@ -12,43 +12,12 @@
 //If true, we fill with NAN allocated device memory.
 #define ALLOC_MEMSET 0

-#define DEBUG_GPU_CONTEXT_REFCOUNT 0
-
-// g_gpu_context_refcount starts at one b/c the gpu context will be implicitly created
-// on the first successful cuda call. the matching decref is in CudaNdarray_gpu_shutdown.
-static int g_gpu_context_refcount = 1;
-
-///////////////////////////
-// cuda context management
-///////////////////////////
-void gpu_context_incref() {
-    g_gpu_context_refcount++;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-    fprintf(stderr, "gpu_context_incref, to %d\n", g_gpu_context_refcount);
-#endif
-}
-
-void gpu_context_decref() {
-    g_gpu_context_refcount--;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-    fprintf(stderr, "gpu_context_decref, to %d\n", g_gpu_context_refcount);
-#endif
-    if(g_gpu_context_refcount == 0) {
-        // we're now free to close the cuda context; if we don't explicitly
-        // exit our cuda context, some systems segfault on process exit
-        // for as-yet unknown reasons; see
-        // http://groups.google.com/group/theano-users/browse_thread/thread/c351846e5cebe35f
-        cudaThreadExit();
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-        fprintf(stderr, "gpu_context_decref at 0, calling cudaThreadExit\n");
-#endif
-    }
-}
-
 /////////////////////////
 // Alloc and Free
 /////////////////////////

+static int g_gpu_context_active = 0;
+
 /**
  *
  * In the test program I'm using, the _outstanding_mallocs decreases with every call.
@@ -80,9 +49,6 @@ void * device_malloc(size_t size)
         return NULL;
     }
     _outstanding_mallocs[0] += (rval != NULL);
-    if(rval != NULL) {
-        gpu_context_incref(); // keep the gpu context around until we've free this memory
-    }
 #if COMPUTE_GPU_MEM_USED
     for(int i=0;i<TABLE_SIZE;i++){
         if(NULL==_alloc_size_table[i].ptr){
@@ -104,6 +70,10 @@ void * device_malloc(size_t size)
 }

 int device_free(void *ptr)
 {
+    // if there is no gpu context, the call to cudaFree will fail; skip it entirely
+    if(!g_gpu_context_active) {
+        return 0;
+    }
     cudaError_t err = cudaFree(ptr);
     if (cudaSuccess != err)
     {
@@ -116,9 +86,6 @@ int device_free(void *ptr)
         return -1;
     }
     _outstanding_mallocs[0] -= (ptr != NULL);
-    if(ptr != NULL) {
-        gpu_context_decref();
-    }
 #if COMPUTE_GPU_MEM_USED
     int i=0;
     for(;i<TABLE_SIZE;i++)
@@ -1883,6 +1850,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
                 "Unable to get the number of gpus available: %s",
                 cudaGetErrorString(cudaGetLastError()));
     }
+
+    // as soon as the first successful call to a cuda* function is made, a
+    // gpu context has been created
+    g_gpu_context_active = 1;
+
     if(deviceCount <= 0) {
         return PyErr_Format(PyExc_EnvironmentError,
                             "Can't use the GPU, no devices support CUDA");
@@ -1926,7 +1898,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
 PyObject *
 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
-    gpu_context_decref();
+    cudaThreadExit();
+    g_gpu_context_active = 0; // context has now been closed down
     Py_INCREF(Py_None);
     return Py_None;
 }
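This change replaces per-allocation context refcounting with a single active flag that guards frees after shutdown. A Python sketch of the new bookkeeping, under the assumption that freeing without a context must be skipped rather than attempted (`GpuContext` and `device_free` here are illustrative stand-ins, not the C code):

```python
# Sketch of the simplification: instead of refcounting the gpu context
# once per allocation, a single flag marks whether a context exists, and
# device_free silently skips the free when it does not.
class GpuContext(object):
    def __init__(self):
        self.active = False
        self.freed = []

    def init(self):
        self.active = True   # first successful cuda* call creates the context

    def shutdown(self):
        self.active = False  # cudaThreadExit(); later frees must be skipped

    def device_free(self, ptr):
        if not self.active:  # no context: the real cudaFree would fail
            return 0
        self.freed.append(ptr)
        return 0

ctx = GpuContext()
ctx.init()
ctx.device_free("p1")   # freed normally while the context is active
ctx.shutdown()
ctx.device_free("p2")   # skipped entirely after shutdown
print(ctx.freed)  # ['p1']
```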
theano/sparse/basic.py
@@ -213,7 +213,8 @@ class SparseType(gof.Type):
         # a FAST_RUN computation..
         return scipy.sparse.issparse(a) \
                 and scipy.sparse.issparse(b) \
-                and abs(a-b).sum() < (1e-6 * a.nnz)
+                and ((abs(a-b).sum() < (1e-6 * a.nnz))
+                     or (a.nnz == 0 and b.nnz == 0)) #in case a and b are empty

     def values_eq(self, a, b):
         #WARNING: equality comparison of sparse matrices is not fast or easy
@@ -789,7 +790,11 @@ class StructuredDot(gof.Op):
         dtype_out = scalar.upcast(a.type.dtype, b.type.dtype)
         if b.type.ndim != 2:
             raise NotImplementedError('non-matrix b')
-        return gof.Apply(self, [a, b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
+        if _is_sparse_variable(b):
+            return gof.Apply(self, [a, b], [SparseType(a.type.format, dtype_out)()])
+        else:
+            return gof.Apply(self, [a, b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])

     def perform(self, node, (a, b), (out,)):
         if a.shape[1] != b.shape[0]:
@@ -797,6 +802,11 @@ class StructuredDot(gof.Op):
         #variable = a.dot(b) # deprecated
         variable = a * b
+        if isinstance(node.outputs[0].type, SparseType):
+            assert _is_sparse(variable)
+            out[0] = variable
+            return
+
         assert _is_dense(variable)  # scipy 0.7 automatically converts to dense
         # dot of an NxM sparse matrix, with a Mx1 dense matrix, returns vector not matrix
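The `values_eq_approx` fix handles the empty case: the tolerance `1e-6 * a.nnz` is zero when `a` has no stored entries, so two empty matrices used to compare unequal. A plain-Python sketch using dicts as toy sparse matrices (not scipy's API) shows the effect of the new clause:

```python
# Plain-Python sketch of the values_eq_approx fix, with dicts of
# (row, col) -> value as toy sparse matrices. The old test
# diff < 1e-6 * nnz_a is vacuously False when nnz_a == 0; the new
# clause (nnz_a == 0 and nnz_b == 0) accepts two empty matrices.
def approx_eq(a, b):
    nnz_a, nnz_b = len(a), len(b)
    diff = sum(abs(a.get(k, 0.0) - b.get(k, 0.0))
               for k in set(a) | set(b))
    return (diff < 1e-6 * nnz_a) or (nnz_a == 0 and nnz_b == 0)

assert approx_eq({}, {})                            # empty == empty now holds
assert approx_eq({(0, 1): 2.0}, {(0, 1): 2.0})
assert not approx_eq({(0, 1): 2.0}, {(0, 1): 5.0})  # values differ too much
```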
theano/sparse/tests/test_basic.py
@@ -344,6 +344,28 @@ class test_structureddot(unittest.TestCase):
         outvals = f(kernvals, imvals)
         print outvals

+    def test_dot_sparse_sparse(self):
+        #test dot for 2 input sparse matrix
+        sparse_dtype = 'float64'
+        for sparse_format in ['csc', 'csr']:
+            a = SparseType(sparse_format, dtype=sparse_dtype)()
+            b = SparseType(sparse_format, dtype=sparse_dtype)()
+            d = theano.dot(a, b)
+            f = theano.function([a, b], theano.Out(d, borrow=True))
+            topo = f.maker.env.toposort()
+            for M, N, K, nnz in [(4, 3, 2, 3),
+                                 (40, 30, 20, 3),
+                                 (40, 30, 20, 30),
+                                 (400, 3000, 200, 6000),
+                                 ]:
+                if sparse_format == 'csc':
+                    spmat = sp.csc_matrix(random_lil((M, N), sparse_dtype, nnz))
+                    spmat2 = sp.csc_matrix(random_lil((N, K), sparse_dtype, nnz))
+                elif sparse_format == 'csr':
+                    spmat = sp.csr_matrix(random_lil((M, N), sparse_dtype, nnz))
+                    spmat2 = sp.csr_matrix(random_lil((N, K), sparse_dtype, nnz))
+                f(spmat, spmat2)
+
     def test_csc_correct_output_faster_than_scipy(self):
         sparse_dtype = 'float64'
         dense_dtype = 'float64'
theano/tensor/basic.py
(diff collapsed; click to expand)

theano/tensor/blas.py
(diff collapsed; click to expand)
theano/tensor/elemwise.py
(most hunks in this file are whitespace-only: identical-looking −/+ pairs below differ only in trailing or leading whitespace)
@@ -240,10 +240,10 @@ class DimShuffle(Op):
         shape_statements = ['npy_intp dimensions[%i]' % nd_out]
         for i, o in enumerate(self.new_order):
-            if o != 'x':
-                shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')]
-            else:
-                shape_statements += [('dimensions['+str(i)+'] = 1')]
+            if o != 'x':
+                shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')]
+            else:
+                shape_statements += [('dimensions['+str(i)+'] = 1')]
         #backport
         #shape_statements += [('dimensions['+str(i)+'] = %(basename)s->dimensions['+str(o)+']')
         #                     if o != 'x' else
@@ -255,10 +255,10 @@ class DimShuffle(Op):
         #set the strides of the non-broadcasted dimensions
         for i, o in enumerate(self.new_order):
-            if o != 'x':
-                strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')]
-            else:
-                strides_statements += [('strides['+str(i)+'] = 0')]
+            if o != 'x':
+                strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')]
+            else:
+                strides_statements += [('strides['+str(i)+'] = 0')]
         #backport
         #strides_statements += [('strides['+str(i)+'] = %(basename)s->strides['+str(o)+']')
         #                     if o != 'x' else
@@ -276,7 +276,7 @@ class DimShuffle(Op):
         # npy_intp* strides, void* data, int itemsize, int flags, PyObject* obj)
         #
         close_bracket = [
-                #create a new array,
+                #create a new array,
                 ('%(res)s = (PyArrayObject*)PyArray_New(&PyArray_Type, '
                 '' + str(nd_out) + ', dimensions, '
                 'PyArray_TYPE(%(basename)s), strides, '
@@ -287,13 +287,13 @@ class DimShuffle(Op):
                 #recalculate flags: CONTIGUOUS, FORTRAN, ALIGNED
                 'PyArray_UpdateFlags(%(res)s, NPY_UPDATE_ALL)',
                 #we are making a view in both inplace and non-inplace cases
-                '%(res)s->base = (PyObject*)%(basename)s',
+                '%(res)s->base = (PyObject*)%(basename)s',
                 '}']

-        full_code = statements(check_input_nd
+        full_code = statements(check_input_nd
                 + clear_output
                 + get_base
-                + shape_statements
+                + shape_statements
                 + strides_statements
                 + close_bracket)
@@ -345,7 +345,7 @@ class DimShufflePrinter:
             raise TypeError("Can only print DimShuffle.")
         elif isinstance(r.owner.op, DimShuffle):
             ord = r.owner.op.new_order
-            return self.__p(ord, pstate, r.owner.inputs[0])
+            return self.__p(ord, pstate, r.owner.inputs[0])
         else:
             raise TypeError("Can only print DimShuffle.")
@@ -411,7 +411,7 @@ class Elemwise(Op):
         d.pop('__epydoc_asRoutine', None)
         d.pop('_hashval')
         return d

     def __setstate__(self, d):
         self.__dict__.update(d)
         if self.scalar_op.nin > 0:
@@ -441,7 +441,7 @@ class Elemwise(Op):
             else:
                 # TODO: use LComplete instead
                 args.append(DimShuffle(
-                    input.type.broadcastable,
+                    input.type.broadcastable,
                     ['x']*difference + range(length),
                     inplace=True)(input))
         inputs = args
@@ -463,7 +463,7 @@ class Elemwise(Op):
             raise ValueError("Operation cannot be done inplace on an input with broadcasted dimensions.")

         out_dtypes = [o.type.dtype for o in shadow.outputs]
         if any(inputs[i].type.dtype != out_dtypes[o] for o, i in inplace_pattern.items()):
-            raise TypeError("Cannot do an inplace operation on incompatible data types.",
+            raise TypeError("Cannot do an inplace operation on incompatible data types.",
                     ([i.type.dtype for i in inputs], out_dtypes, inplace_pattern))
         outputs = [TensorType(dtype=dtype, broadcastable=broadcastable)()
                    for dtype, broadcastable in zip(out_dtypes, out_broadcastables)]
         return Apply(self, inputs, outputs)
@@ -484,10 +484,10 @@ class Elemwise(Op):
         first_part = [k for k, v in items]
         second_part = []
         for k, v in items:
-            if isinstance(v, (tuple, list)):
-                second_part += [tuple(v)]
-            else:
-                second_part += [v]
+            if isinstance(v, (tuple, list)):
+                second_part += [tuple(v)]
+            else:
+                second_part += [v]
         tuple_items = tuple(first_part + second_part)
         #backport
         #tuple_items = tuple([k for k,v in items] + [(tuple(v) if isinstance(v, (tuple, list)) else v) for k,v in items])
@@ -511,7 +511,7 @@ class Elemwise(Op):
     def grad(self, inputs, ograds):
         # Gradients (especially on the final costs) don't have to be symbolic
-        ograds = map(as_tensor_variable, ograds)
+        ograds = map(as_tensor_variable, ograds)
         scalar_inputs = [Scalar(dtype=t.type.dtype)() for t in inputs]
         scalar_ograds = [Scalar(dtype=ograd.type.dtype)() for ograd in ograds]
         scalar_igrads = self.scalar_op.grad(scalar_inputs, scalar_ograds)
@@ -575,7 +575,7 @@ class Elemwise(Op):
                 msg2 = []
                 for d, b in zip(input.shape, sinput.type.broadcastable):
                     if b:
-                        msg2 += ['*']
+                        msg2 += ['*']
                     else:
                         msg2 += [str(d)]
                 msg.append('(%s)' % ", ".join(msg2))
@@ -616,7 +616,7 @@ class Elemwise(Op):
             # the first (faster) version leads to segfaults
             ufunc_args = inputs # + output_storage
             ufunc = self.ufunc or numpy.frompyfunc(self.scalar_op.impl, len(inputs), self.scalar_op.nout)
             try:
                 variables = ufunc(*ufunc_args)
             except Exception, e:
@@ -655,7 +655,7 @@ class Elemwise(Op):
             # b_dim might still be None, if every input's shape was unknown in dimension 'dim'
             oshp.append(b_dim)
         # TODO: it would be interesting to return the constraining information that if
-        # one of the inputs shape[dim] is known and another input's shape[dim] is not,
+        # one of the inputs shape[dim] is known and another input's shape[dim] is not,
         # that we can now assume that the other input's shape[dim] is the same as the
         # first.
         rval.append(tuple(oshp))
@@ -841,9 +841,9 @@ class CAReduce(Op):
     Examples:
         CAReduce(add) -> sum
         CAReduce(mul) -> product
-        CAReduce(maximum) -> sum
-        CAReduce(_or) -> any # not lazy
-        CAReduce(_and) -> all # not lazy
+        CAReduce(maximum) -> max
+        CAReduce(or_) -> any # not lazy
+        CAReduce(and_) -> all # not lazy

     In order to (eventually) optimize memory usage patterns,
     L{CAReduce} makes zero guarantees on the order in which it
@@ -899,7 +899,7 @@ class CAReduce(Op):
                 assert len(axis) == len(axis2)
                 axis = tuple(axis2)
                 op = self.__class__(self.scalar_op, axis)
-            else:
+            else:
                 op = self
         output = TensorType(dtype=self._output_dtype(input.type.dtype),
                             broadcastable=[x for i, x in enumerate(input.type.broadcastable)
                                            if i not in axis])()
@@ -910,7 +910,7 @@ class CAReduce(Op):
         d = copy(self.__dict__)
         d.pop('ufunc')
         return d

     def __setstate__(self, d):
         self.__dict__.update(d)
         self.ufunc = numpy.frompyfunc(self.scalar_op.impl, 2, 1)
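The corrected CAReduce docstring table can be checked with a small sketch: folding a binary scalar op over a sequence yields the familiar reductions (`ca_reduce` is an illustrative helper, not Theano's implementation):

```python
# Sketch of the corrected CAReduce table: reducing with a binary scalar
# op over a sequence gives the familiar aggregate operations.
from functools import reduce
from operator import add, mul, and_, or_

def ca_reduce(scalar_op, values):
    return reduce(scalar_op, values)

xs = [2, 3, 4]
print(ca_reduce(add, xs))              # 9   (sum)
print(ca_reduce(mul, xs))              # 24  (product)
print(ca_reduce(max, xs))              # 4   (max, not sum)
print(ca_reduce(or_, [True, False]))   # True  (any; not lazy)
print(ca_reduce(and_, [True, False]))  # False (all; not lazy)
```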
theano/tensor/opt.py
@@ -317,8 +317,10 @@ class MakeVector(T.Op):
         inputs = map(T.as_tensor_variable, inputs)
         if (not all(a.type == inputs[0].type for a in inputs)
                 or (len(inputs) > 0 and inputs[0].dtype != self.dtype)):
             dtype = theano.scalar.upcast(self.dtype, *[i.dtype for i in inputs])
-            #upcast the input to the determined dtype, but don't upcast downcast anything
-            assert dtype == self.dtype, "Upcast the input of MakeVector to dtype gived in init without precissino loss only."
+            #upcast the input to the determined dtype, but don't downcast anything
+            assert dtype == self.dtype, ("The upcast of the inputs to MakeVector should match the "
+                    "dtype given in __init__.")
             if not all(self.dtype == T.cast(i, dtype=dtype).dtype for a in inputs):
                 raise TypeError("MakeVector.make_node expected inputs upcastable to %s. got %s" % (self.dtype,
@@ -348,6 +350,9 @@ class MakeVector(T.Op):
         # assume that out has correct dtype. there is no cheap way to check
         out[0][...] = inputs

+    def grad(self, inputs, output_gradients):
+        return [output_gradients[0][i] for i in xrange(len(inputs))]
+
 make_vector = MakeVector()

 class MakeVectorPrinter:
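The new `MakeVector.grad` simply routes one slice of the output gradient back to each scalar input. A numeric sketch of that rule (`make_vector_grad` is an illustrative stand-in, not Theano's symbolic version):

```python
# Numeric sketch of MakeVector.grad: stacking scalars into a vector
# means the gradient w.r.t. each input is the matching element of the
# single output gradient.
def make_vector_grad(inputs, output_gradients):
    g = output_gradients[0]                    # gradient w.r.t. the vector
    return [g[i] for i in range(len(inputs))]  # one slice per scalar input

inputs = [7, 8, 9]         # three scalar inputs
g_out = [0.1, 0.2, 0.3]    # gradient of some cost w.r.t. the stacked vector
print(make_vector_grad(inputs, [g_out]))  # [0.1, 0.2, 0.3]
```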
theano/tensor/tests/test_basic.py
@@ -1552,6 +1552,36 @@ class T_Join_and_Split(unittest.TestCase):
         assert len([n for n in e if isinstance(n, Join)]) == 0
         assert f.maker.env.outputs[0].dtype == config.floatX

+    def test_stack_scalar_make_vector_dtype(self):
+        '''Test that calling stack() on scalars instantiates MakeVector,
+        even when the scalars don't have the same dtype.'''
+        a = tensor.iscalar('a')
+        b = tensor.lscalar('b')
+        s = stack(a, b, a, b)
+        f = function([a, b], s)
+        val = f(1, 2)
+        self.failUnless(numpy.all(val == [1, 2, 1, 2]))
+        e = f.maker.env.toposort()
+        assert len([n for n in e if isinstance(n.op, opt.MakeVector)]) > 0
+        assert len([n for n in e if isinstance(n, Join)]) == 0
+        assert f.maker.env.outputs[0].dtype == 'int64'
+
+    def test_stack_scalar_make_vector_constant(self):
+        '''Test that calling stack() on scalars instantiates MakeVector,
+        even when the scalars are simple int types.'''
+        a = tensor.iscalar('a')
+        b = tensor.lscalar('b')
+        #test when the constant is the first element.
+        #The first element is used in a special way
+        s = stack(10, a, b, numpy.int8(3))
+        f = function([a, b], s)
+        val = f(1, 2)
+        self.failUnless(numpy.all(val == [10, 1, 2, 3]))
+        e = f.maker.env.toposort()
+        assert len([n for n in e if isinstance(n.op, opt.MakeVector)]) > 0
+        assert len([n for n in e if isinstance(n, Join)]) == 0
+        assert f.maker.env.outputs[0].dtype == 'int64'
+
     def test_join_vector(self):
         a = as_tensor_variable(numpy.array([1, 2, 3]))
         b = as_tensor_variable(numpy.array([7, 8, 9]))
@@ -3440,6 +3470,28 @@ def test_dimshuffle_duplicate():
     assert success

+class T_get_constant_value(unittest.TestCase):
+    def test_get_constant_value(self):
+        a = tensor.stack(1, 2, 3)
+        assert get_constant_value(a[0]) == 1
+        assert get_constant_value(a[1]) == 2
+        assert get_constant_value(a[2]) == 3
+
+        b = tensor.iscalar()
+        a = tensor.stack(b, 2, 3)
+        self.assertRaises(TypeError, get_constant_value, a[0])
+        assert get_constant_value(a[1]) == 2
+        assert get_constant_value(a[2]) == 3
+
+        #For now get_constant_value goes through only MakeVector and Join of scalars.
+        v = tensor.ivector()
+        a = tensor.stack(v, 2, 3)
+        self.assertRaises(TypeError, get_constant_value, a[0])
+        self.assertRaises(TypeError, get_constant_value, a[1])
+        self.assertRaises(TypeError, get_constant_value, a[2])
+
 if __name__ == '__main__':
     if 1:
         unittest.main()
@@ -3449,5 +3501,3 @@ if __name__ == '__main__':
         suite = unittest.TestLoader()
         suite = suite.loadTestsFromTestCase(testcase)
         unittest.TextTestRunner(verbosity=2).run(suite)
theano/tensor/tests/test_sharedvar.py
(diff collapsed; click to expand)