Commit e8c50c78 authored by fsavard

merge

@@ -27,7 +27,7 @@ Theano (current directory) is the distribution directory.
 * scalar depends upon core
 * tensor depends upon scalar
 * sparse depends upon tensor
-* sandbox can depends on everything else
+* sandbox can depend on everything else
 * Theano/examples are copies of the example on the wiki
 * Theano/benchmark, Theano/bin and Theano/examples are in the distribution,
   but not in the python package
......
@@ -99,7 +99,7 @@ The ``make_node`` method creates a node to be included in the expression graph.
 It runs when we apply our Op (``fibby``) to Variable (``x``), as in ``fibby(tensor.vector())``.
 When an Op has multiple inputs, their order in the inputs argument to ``Apply``
 is important: Theano will call ``make_node(*inputs)`` to copy the graph,
-so it is important to not change the semantics of the expression by doing changing the argument order.
+so it is important not to change the semantics of the expression by changing the argument order.
......
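The ordering rule above can be illustrated with a plain-Python sketch. The `Apply` and `SubOp` classes below are hypothetical, simplified stand-ins for Theano's machinery, not its real API; they only show why rebuilding a node with reordered inputs changes the expression's meaning.

```python
# Hypothetical, simplified stand-in for Theano's Apply/Op machinery,
# illustrating why make_node(*inputs) must keep the argument order.

class Apply(object):
    def __init__(self, op, inputs):
        self.op = op
        self.inputs = inputs

class SubOp(object):
    """A toy non-commutative op: f(a, b) = a - b."""
    def make_node(self, *inputs):
        # Theano copies graphs by calling make_node(*node.inputs);
        # the order of `inputs` therefore defines the semantics.
        return Apply(self, list(inputs))

    def perform(self, node):
        a, b = node.inputs
        return a - b

op = SubOp()
node = op.make_node(5, 3)
copied = op.make_node(*node.inputs)             # faithful copy: same order
swapped = op.make_node(*reversed(node.inputs))  # order changed: new meaning

print(op.perform(node))     # 2
print(op.perform(copied))   # 2
print(op.perform(swapped))  # -2
```

A faithful copy reproduces the result; the reordered node computes a different function, which is exactly the bug the docstring warns against.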
@@ -138,7 +138,7 @@ following methods:
   other criterion C with respect to the Op's input.
   If the outputs of your op are :math:`[f_1, ..., f_n]`, then
-  ``output_derivatives`` gives
+  ``output_gradients`` is
   :math:`[grad_{f_1}(C), grad_{f_2}(C), ..., grad_{f_n}(C)]`.
   If the inputs of your op are :math:`[x_1, ..., x_m]`, then your Op.grad
   should return :math:`[grad_{x_1}(C), grad_{x_2}(C), ..., grad_{x_m}(C)]`,
......
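A minimal numeric sketch of the convention above, in plain Python rather than Theano (`toy_op_grad` is a hypothetical stand-in for an Op's grad method): given `output_gradients` :math:`[grad_{f_1}(C), ..., grad_{f_n}(C)]`, each input gradient is assembled by the chain rule.

```python
# Plain-Python sketch of the grad() convention: a toy op with
# outputs f1 = x1 + x2 and f2 = x1 * x2 receives
# output_gradients = [grad_{f1}(C), grad_{f2}(C)] and returns
# [grad_{x1}(C), grad_{x2}(C)] via the chain rule.

def toy_op_grad(inputs, output_gradients):
    x1, x2 = inputs
    g_f1, g_f2 = output_gradients
    # df1/dx1 = 1, df1/dx2 = 1, df2/dx1 = x2, df2/dx2 = x1
    grad_x1 = g_f1 * 1 + g_f2 * x2
    grad_x2 = g_f1 * 1 + g_f2 * x1
    return [grad_x1, grad_x2]

# With C = f1 + 2*f2 at (x1, x2) = (3.0, 4.0):
# grad_{f1}(C) = 1, grad_{f2}(C) = 2
grads = toy_op_grad([3.0, 4.0], [1.0, 2.0])
print(grads)  # [9.0, 7.0]
```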
@@ -14,7 +14,8 @@ Requirements
 ------------
 In order to use Theano, the following libraries and software will need
-to be installed:
+to be installed (MacOS and Windows users should refer to platform-specific
+instructions below for detailed installation steps):
 Linux, Mac OS X or Windows operating system
 We develop mainly on 64-bit Linux machines. 32-bit architectures are
@@ -394,7 +395,7 @@ Windows V1 (Installing from Scratch)
 You can keep the default install options (except for the installation directory).
 - Install Mercurial. You can download it
-  `here <http://mercurial.selenic.com/downloads>`_. You may get either the command
+  `here <http://mercurial.selenic.com/downloads>`__. You may get either the command
   line Windows version or the TortoiseHG GUI version: it does not matter as
   far as installing Theano is concerned.
@@ -450,7 +451,7 @@ compile GotoBLAS2 (ATLAS may work too, but was not tested, and is
 usually reported to be slower and more difficult to compile -- especially
 on Windows).
 GotoBLAS2 can be downloaded
-`here <http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads>`_
+`here <http://www.tacc.utexas.edu/tacc-projects/gotoblas2/downloads>`__
 after registering on the website (we tested v1.13).
 To compile it, you will also need to install MSYS and Perl,
 as described below.
@@ -538,8 +539,7 @@ Windows: Using the GPU
 Please note that these are tentative instructions (we have not yet been able to
 get the GPU to work under Windows with Theano).
-Please report your own successes / failures on the
-`theano-users <http://groups.google.com/group/theano-users>`_ mailing list.
+Please report your own successes / failures on the `theano-users`_ mailing list.
 Those are instructions for the 32-bit version of Python (the one that comes
 with Python(x,y) is 32-bit).
@@ -555,14 +555,15 @@ use a compilation directory located somewhere else:
 [global]
 base_compiledir=path_to_a_directory_without_such_characters
-You also need to add in the configuration file those lines:
+You also need to add in the configuration file those lines (make sure this
+is the correct Python installation path):
 .. code-block:: cfg
 [cuda]
 nvccflags=-LC:\Python26\libs
 Then
 1) Install CUDA driver (32-bit on 32-bit Windows, idem for 64-bit).
......
@@ -128,16 +128,26 @@ Config Attributes
 Default 'Mode'
-This set the default compilation mode for theano functions. By default the
+This sets the default compilation mode for theano functions. By default the
 mode Mode is equivalent to FAST_RUN. See Config attribute linker and optimizer.
+.. attribute:: config.lib.amdlibm
+Bool value: either True or False
+Default False
+This makes the compilation use the
+`amdlibm <http://developer.amd.com/cpu/libraries/libm/>`__
+library, which is faster than the standard libm.
 .. attribute:: linker
 String value: 'c|py', 'py', 'c', 'c|py_nogc', 'c&py'
 Default: 'c|py'
-When the mode is Mode, it set the default linker used.
+When the mode is Mode, it sets the default linker used.
 .. attribute:: optimizer
@@ -145,7 +155,7 @@ Config Attributes
 Default: 'fast_run'
-When the mode is Mode, it set the default optimizer used.
+When the mode is Mode, it sets the default optimizer used.
 .. attribute:: warn.ignore_bug_before
......
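For reference, the new `config.lib.amdlibm` attribute would presumably be set like the other flags, e.g. in a `.theanorc` file. The section and key names below are inferred from the attribute path documented above, not taken from this commit:

```cfg
# Hypothetical .theanorc fragment; [lib]/amdlibm inferred from the
# attribute path config.lib.amdlibm described above.
[global]
mode = FAST_RUN
linker = c|py
optimizer = fast_run

[lib]
amdlibm = True
```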
@@ -46,7 +46,7 @@ AddConfigVar('DebugMode.check_strides',
 IntParam(1, lambda i: i in (0,1,2)))
 AddConfigVar('DebugMode.warn_input_not_reused',
-("Generate a warning when the destroy_map tells that an op works inplace, but the op did not reuse the input for its output."
+("Generate a warning when the destroy_map or view_map tells that an op works inplace, but the op did not reuse the input for its output."
 ),
 BoolParam(True))
@@ -519,6 +519,18 @@ def _check_inputs(node, storage_map, r_vals, dr_vals, active_nodes, clobber_dr_v
         if storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
             warning("input idx %d marked as destroyed was not changed for node '%s'"%(ii[0],str(node)))
+    if warn_input_not_reused:
+        vmap = getattr(node.op, 'view_map', {})
+        for oo, ii in vmap.iteritems():
+            if hasattr(node.outputs[0].type, "may_share_memory"):
+                if not node.outputs[0].type.may_share_memory(storage_map[node.outputs[oo]][0], storage_map[node.inputs[ii[0]]][0]):
+                    # when a subtensor returns a tensor of ndim==0, numpy seems to return a copy.
+                    # when we have an empty ndarray (happens with output guard) it is not the same. why?
+                    if storage_map[node.outputs[oo]][0].ndim > 0 and storage_map[node.outputs[oo]][0].size > 0:
+                        warning("input idx %d marked as viewed but new memory allocated by node '%s'"%(ii[0], str(node)))
+            elif storage_map[node.outputs[oo]][0] is not storage_map[node.inputs[ii[0]]][0]:
+                warning("input idx %d marked as viewed but new memory allocated by node '%s'"%(ii[0], str(node)))
     for r_idx, r in enumerate(node.inputs):
         if not r.type.values_eq(r_vals[r], storage_map[r][0]):
             # some input node 'r' got changed by running the node
......
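The new DebugMode check above relies on the output type's `may_share_memory` to tell a genuine view apart from freshly allocated memory. For ndarray-backed types, NumPy provides `numpy.may_share_memory`; the sketch below (assuming NumPy is available) shows the two cases the check distinguishes:

```python
import numpy as np

# Distinguishing a view from a copy, as the DebugMode view_map check
# above does via type.may_share_memory for ndarray-backed types.
base = np.arange(10)

view = base[2:5]          # a slice is a view: memory is shared
copy = base[2:5].copy()   # an explicit copy: new memory was allocated

print(np.may_share_memory(base, view))  # True  -> viewed output is fine
print(np.may_share_memory(base, copy))  # False -> would trigger the warning
```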
@@ -14,6 +14,8 @@ import tokenize
 import argparse
 import reindent
+SKIP_WHITESPACE_CHECK_FILENAME = ".hg/skip_whitespace_check"
 def get_parse_error(code):
     """
     Checks code for ambiguous tabs or other basic parsing issues.
@@ -128,6 +130,20 @@ def save_diffs(diffs, filename):
     diff_file.write(diff)
     diff_file.close()
+def should_skip_commit():
+    if not os.path.exists(SKIP_WHITESPACE_CHECK_FILENAME):
+        return False
+    whitespace_check_file = open(SKIP_WHITESPACE_CHECK_FILENAME, "r")
+    whitespace_check_changeset = whitespace_check_file.read()
+    whitespace_check_file.close()
+    return whitespace_check_changeset == parent_commit()
+def save_skip_next_commit():
+    whitespace_check_file = open(SKIP_WHITESPACE_CHECK_FILENAME, "w")
+    whitespace_check_file.write(parent_commit())
+    whitespace_check_file.close()
 def main(argv=None):
     if argv is None:
         argv = sys.argv[1:]
@@ -145,12 +161,32 @@ def main(argv=None):
         const=True,
         help="only check indentation if the file was previously correctly indented (or is new)"
         )
+    parser.add_argument("-s", "--skip-after-failure",
+        action="store_const",
+        default=False,
+        const=True,
+        help="when this pre-commit hook fails, don't run it on the next commit; "
+             "this lets you check in your changes and then check in "
+             "any necessary whitespace changes in the subsequent commit"
+        )
     args = parser.parse_args(argv)
+    # -i and -s are incompatible; if you skip checking, you end up with a not-correctly-indented
+    # file, which -i then causes you to ignore!
+    if args.skip_after_failure and args.incremental:
+        print >> sys.stderr, "*** check whitespace hook misconfigured! -i and -s are incompatible."
+        return 1
     if is_merge():
         # don't inspect merges: (a) they're complex and (b) they don't really introduce new code
         return 0
+    if args.skip_after_failure and should_skip_commit():
+        # we're set up to skip this one, so skip it, but
+        # first, make sure we don't skip the next one as well :)
+        os.remove(SKIP_WHITESPACE_CHECK_FILENAME)
+        return 0
     block_commit = False
     diffs = []
@@ -185,12 +221,15 @@ def main(argv=None):
     save_diffs(diffs, diffs_filename)
     print >> sys.stderr, "*** To fix all indentation issues, run: cd `hg root` && patch -p0 < %s" % diffs_filename
     if block_commit:
         save_filename = ".hg/commit_message.saved"
         save_commit_message(save_filename)
         print >> sys.stderr, "*** Commit message saved to %s" % save_filename
+        if args.skip_after_failure:
+            save_skip_next_commit()
+            print >> sys.stderr, "*** Next commit attempt will not be checked. To change this, rm %s" % SKIP_WHITESPACE_CHECK_FILENAME
     return int(block_commit)
......
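The skip-after-failure mechanism added above follows a simple marker-file pattern: a failing run records the parent changeset id in a file, the next run skips the check once if the id still matches, then deletes the marker so checking resumes. A standalone sketch of that pattern (here `parent_commit` is a stub returning a fixed id; the real hook queries Mercurial):

```python
import os
import tempfile

# Standalone sketch of the skip-once marker-file pattern used by the
# hook above. parent_commit() is a stub; the real hook asks Mercurial.

MARKER = os.path.join(tempfile.mkdtemp(), "skip_whitespace_check")

def parent_commit():
    return "deadbeef"  # stand-in changeset id

def save_skip_next_commit():
    with open(MARKER, "w") as f:
        f.write(parent_commit())

def should_skip_commit():
    if not os.path.exists(MARKER):
        return False
    with open(MARKER) as f:
        return f.read() == parent_commit()

# First (failing) run: remember to skip the next attempt.
save_skip_next_commit()
assert should_skip_commit()

# Next run: skip once, then remove the marker so checking resumes.
os.remove(MARKER)
assert not should_skip_commit()
```

Keying the marker on the parent changeset means a stale marker (left over from a different working-copy state) does not silently disable the hook.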
-import atexit, os, stat
+import atexit, gc, os, stat
 from theano.compile import optdb
 from theano import config
@@ -96,6 +96,9 @@ if cuda_available:
     cuda_initialization_error_message = ""
     # actively closing our gpu session prevents segfault-on-exit on some systems
     atexit.register(gpu_shutdown)
+    # do garbage collection before releasing the gpu to avoid releasing invalid pointers later
+    # note that atexit-registered calls are called in LIFO order
+    atexit.register(gc.collect)
 except EnvironmentError, e:
     cuda_available = False
     cuda_initialization_error_message = e.message
......
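The comment above depends on `atexit` handlers running in last-in, first-out order: `gc.collect`, registered after `gpu_shutdown`, therefore runs before it, freeing CudaNdarrays while the context is still alive. A quick subprocess check of that ordering:

```python
import subprocess
import sys

# Verify that atexit handlers run in LIFO order, which is what lets
# gc.collect (registered after gpu_shutdown) run before the shutdown.
code = (
    "import atexit\n"
    "atexit.register(lambda: print('registered first'))\n"
    "atexit.register(lambda: print('registered second'))\n"
)
out = subprocess.check_output([sys.executable, "-c", code]).decode()
print(out.splitlines())  # ['registered second', 'registered first']
```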
@@ -12,43 +12,12 @@
 //If true, we fill with NAN allocated device memory.
 #define ALLOC_MEMSET 0
-#define DEBUG_GPU_CONTEXT_REFCOUNT 0
-// g_gpu_context_refcount starts at one b/c the gpu context will be implicitly created
-// on the first successful cuda call. the matching decref is in CudaNdarray_gpu_shutdown.
-static int g_gpu_context_refcount = 1;
-///////////////////////////
-// cuda context management
-///////////////////////////
-void gpu_context_incref() {
-    g_gpu_context_refcount++;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-    fprintf(stderr, "gpu_context_incref, to %d\n", g_gpu_context_refcount);
-#endif
-}
-void gpu_context_decref() {
-    g_gpu_context_refcount--;
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-    fprintf(stderr, "gpu_context_decref, to %d\n", g_gpu_context_refcount);
-#endif
-    if(g_gpu_context_refcount == 0) {
-        // we're now free to close the cuda context; if we don't explicitly
-        // exit our cuda context, some systems segfault on process exit
-        // for as-yet unknown reasons; see
-        // http://groups.google.com/group/theano-users/browse_thread/thread/c351846e5cebe35f
-        cudaThreadExit();
-#if DEBUG_GPU_CONTEXT_REFCOUNT
-        fprintf(stderr, "gpu_context_decref at 0, calling cudaThreadExit\n");
-#endif
-    }
-}
 /////////////////////////
 // Alloc and Free
 /////////////////////////
+static int g_gpu_context_active = 0;
 /**
  *
  * In the test program I'm using, the _outstanding_mallocs decreases with every call.
@@ -80,9 +49,6 @@ void * device_malloc(size_t size)
         return NULL;
     }
     _outstanding_mallocs[0] += (rval != NULL);
-    if(rval != NULL) {
-        gpu_context_incref(); // keep the gpu context around until we've free this memory
-    }
 #if COMPUTE_GPU_MEM_USED
     for(int i=0;i<TABLE_SIZE;i++){
         if(NULL==_alloc_size_table[i].ptr){
@@ -104,6 +70,10 @@ void * device_malloc(size_t size)
 }
 int device_free(void *ptr)
 {
+    // if there is no gpu context, the call to cudaFree will fail; skip it entirely
+    if(!g_gpu_context_active) {
+        return 0;
+    }
     cudaError_t err = cudaFree(ptr);
     if (cudaSuccess != err)
     {
@@ -116,9 +86,6 @@ int device_free(void *ptr)
         return -1;
     }
     _outstanding_mallocs[0] -= (ptr != NULL);
-    if(ptr != NULL) {
-        gpu_context_decref();
-    }
 #if COMPUTE_GPU_MEM_USED
     int i=0;
     for(;i<TABLE_SIZE;i++)
@@ -1883,6 +1850,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
             "Unable to get the number of gpus available: %s",
             cudaGetErrorString(cudaGetLastError()));
     }
+    // as soon as the first successful call to a cuda* function is made, a
+    // gpu context has been created
+    g_gpu_context_active = 1;
     if(deviceCount <= 0) {
         return PyErr_Format(PyExc_EnvironmentError,
             "Can't use the GPU, no devices support CUDA");
@@ -1926,7 +1898,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
 PyObject *
 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
-    gpu_context_decref();
+    cudaThreadExit();
+    g_gpu_context_active = 0; // context has now been closed down
     Py_INCREF(Py_None);
     return Py_None;
 }
......
@@ -213,7 +213,8 @@ class SparseType(gof.Type):
     # a FAST_RUN computation..
     return scipy.sparse.issparse(a) \
         and scipy.sparse.issparse(b) \
-        and abs(a-b).sum() < (1e-6 * a.nnz)
+        and ((abs(a-b).sum() < (1e-6 * a.nnz))
+             or (a.nnz == 0 and b.nnz == 0))  # in case a and b are empty
 def values_eq(self, a, b):
     #WARNING: equality comparison of sparse matrices is not fast or easy
@@ -789,6 +790,10 @@ class StructuredDot(gof.Op):
     dtype_out = scalar.upcast(a.type.dtype, b.type.dtype)
     if b.type.ndim != 2:
         raise NotImplementedError('non-matrix b')
-    return gof.Apply(self, [a,b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
+    if _is_sparse_variable(b):
+        return gof.Apply(self, [a,b], [SparseType(a.type.format,dtype_out)()])
+    else:
+        return gof.Apply(self, [a,b], [tensor.tensor(dtype_out, (False, b.type.broadcastable[1]))])
 def perform(self, node, (a,b), (out,)):
@@ -797,6 +802,11 @@ class StructuredDot(gof.Op):
     #variable = a.dot(b) # deprecated
     variable = a * b
+    if isinstance(node.outputs[0].type, SparseType):
+        assert _is_sparse(variable)
+        out[0] = variable
+        return
     assert _is_dense(variable) # scipy 0.7 automatically converts to dense
     # dot of an NxM sparse matrix, with a Mx1 dense matrix, returns vector not matrix
......
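The empty-matrix clause added to `values_eq_approx` above fixes a corner case: when both matrices are empty, `abs(a-b).sum()` is 0 and `1e-6 * a.nnz` is also 0, so the strict `<` comparison wrongly reports inequality. A pure-Python sketch of the same logic, using a `{(row, col): value}` dict as a stand-in for a scipy sparse matrix:

```python
# Pure-Python sketch of the values_eq_approx fix above, with a dict
# {(row, col): value} standing in for a scipy sparse matrix.

def approx_eq(a, b, tol=1e-6):
    nnz = len(a)  # number of stored (nonzero) entries in a
    diff = sum(abs(a.get(k, 0.0) - b.get(k, 0.0))
               for k in set(a) | set(b))
    # Without the extra clause, two empty matrices compare unequal:
    # diff (0) is not strictly less than tol * nnz (0).
    return diff < tol * nnz or (len(a) == 0 and len(b) == 0)

m1 = {(0, 0): 1.0, (2, 1): 3.0}
m2 = {(0, 0): 1.0, (2, 1): 3.0}
print(approx_eq(m1, m2))  # True
print(approx_eq({}, {}))  # True (the case the patch fixes)
print(approx_eq(m1, {}))  # False
```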
@@ -344,6 +344,28 @@ class test_structureddot(unittest.TestCase):
         outvals = f(kernvals,imvals)
         print outvals
+    def test_dot_sparse_sparse(self):
+        # test dot for 2 input sparse matrix
+        sparse_dtype = 'float64'
+        for sparse_format in ['csc', 'csr']:
+            a = SparseType(sparse_format, dtype=sparse_dtype)()
+            b = SparseType(sparse_format, dtype=sparse_dtype)()
+            d = theano.dot(a, b)
+            f = theano.function([a, b], theano.Out(d, borrow=True))
+            topo = f.maker.env.toposort()
+            for M, N, K, nnz in [(4, 3, 2, 3),
+                                 (40, 30, 20, 3),
+                                 (40, 30, 20, 30),
+                                 (400, 3000, 200, 6000),
+                                 ]:
+                if sparse_format == 'csc':
+                    spmat = sp.csc_matrix(random_lil((M, N), sparse_dtype, nnz))
+                    spmat2 = sp.csc_matrix(random_lil((N, K), sparse_dtype, nnz))
+                elif sparse_format == 'csr':
+                    spmat = sp.csr_matrix(random_lil((M, N), sparse_dtype, nnz))
+                    spmat2 = sp.csr_matrix(random_lil((N, K), sparse_dtype, nnz))
+                f(spmat, spmat2)
     def test_csc_correct_output_faster_than_scipy(self):
         sparse_dtype = 'float64'
         dense_dtype = 'float64'
......
(Diff collapsed.)
@@ -1207,4 +1207,3 @@ from opt import register_specialize, register_canonicalize
 def local_print_as_we_go_along(node):
     if node.op in (T.sub, T.add):
         debugprint(node)
@@ -841,9 +841,9 @@ class CAReduce(Op):
 Examples:
     CAReduce(add) -> sum
     CAReduce(mul) -> product
-    CAReduce(maximum) -> sum
-    CAReduce(_or) -> any # not lazy
-    CAReduce(_and) -> all # not lazy
+    CAReduce(maximum) -> max
+    CAReduce(or_) -> any # not lazy
+    CAReduce(and_) -> all # not lazy
 In order to (eventually) optimize memory usage patterns,
 L{CAReduce} makes zero guarantees on the order in which it
......
@@ -317,8 +317,10 @@ class MakeVector(T.Op):
     inputs = map(T.as_tensor_variable, inputs)
     if not all(a.type == inputs[0].type for a in inputs) or (len(inputs)>0 and inputs[0].dtype != self.dtype):
         dtype = theano.scalar.upcast(self.dtype, *[i.dtype for i in inputs])
-        #upcast the input to the determined dtype, but don't upcast downcast anything
-        assert dtype==self.dtype, "Upcast the input of MakeVector to dtype gived in init without precissino loss only."
+        #upcast the input to the determined dtype, but don't downcast anything
+        assert dtype==self.dtype, (
+            "The upcast of the inputs to MakeVector should match the "
+            "dtype given in __init__.")
         if not all(self.dtype == T.cast(i, dtype=dtype).dtype for a in inputs):
             raise TypeError("MakeVector.make_node expected inputs upcastable to %s. got %s"%(
                 self.dtype,
@@ -348,6 +350,9 @@ class MakeVector(T.Op):
         # assume that out has correct dtype. there is no cheap way to check
         out[0][...] = inputs
+    def grad(self, inputs, output_gradients):
+        return [output_gradients[0][i] for i in xrange(len(inputs))]
 make_vector = MakeVector()
 class MakeVectorPrinter:
......
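The `grad` method added to `MakeVector` above simply slices the incoming vector gradient back into one piece per scalar input. A plain-Python analogue (not Theano code) of that forward/backward pair:

```python
# Plain-Python analogue of the MakeVector.grad added above: forward
# stacks scalars into a vector; grad slices the vector gradient back
# into one piece per scalar input.

def make_vector_perform(inputs):
    return list(inputs)  # stack scalars into a vector

def make_vector_grad(inputs, output_gradients):
    g_out = output_gradients[0]  # gradient w.r.t. the vector output
    return [g_out[i] for i in range(len(inputs))]

out = make_vector_perform([1.0, 2.0, 3.0])
grads = make_vector_grad([1.0, 2.0, 3.0], [[0.1, 0.2, 0.3]])
print(out)    # [1.0, 2.0, 3.0]
print(grads)  # [0.1, 0.2, 0.3]
```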
@@ -1552,6 +1552,36 @@ class T_Join_and_Split(unittest.TestCase):
         assert len([n for n in e if isinstance(n, Join)]) == 0
         assert f.maker.env.outputs[0].dtype == config.floatX
+    def test_stack_scalar_make_vector_dtype(self):
+        '''Test that calling stack() on scalars instantiates MakeVector,
+        even when the scalars don't have the same dtype.'''
+        a = tensor.iscalar('a')
+        b = tensor.lscalar('b')
+        s = stack(a, b, a, b)
+        f = function([a, b], s)
+        val = f(1, 2)
+        self.failUnless(numpy.all(val == [1, 2, 1, 2]))
+        e = f.maker.env.toposort()
+        assert len([n for n in e if isinstance(n.op, opt.MakeVector)]) > 0
+        assert len([n for n in e if isinstance(n, Join)]) == 0
+        assert f.maker.env.outputs[0].dtype == 'int64'
+    def test_stack_scalar_make_vector_constant(self):
+        '''Test that calling stack() on scalars instantiates MakeVector,
+        even when the scalars are plain int constants.'''
+        a = tensor.iscalar('a')
+        b = tensor.lscalar('b')
+        # test when the constant is the first element.
+        # The first element is used in a special way.
+        s = stack(10, a, b, numpy.int8(3))
+        f = function([a, b], s)
+        val = f(1, 2)
+        self.failUnless(numpy.all(val == [10, 1, 2, 3]))
+        e = f.maker.env.toposort()
+        assert len([n for n in e if isinstance(n.op, opt.MakeVector)]) > 0
+        assert len([n for n in e if isinstance(n, Join)]) == 0
+        assert f.maker.env.outputs[0].dtype == 'int64'
     def test_join_vector(self):
         a = as_tensor_variable(numpy.array([1, 2, 3]))
         b = as_tensor_variable(numpy.array([7, 8, 9]))
@@ -3440,6 +3470,28 @@ def test_dimshuffle_duplicate():
     assert success
+class T_get_constant_value(unittest.TestCase):
+    def test_get_constant_value(self):
+        a = tensor.stack(1, 2, 3)
+        assert get_constant_value(a[0]) == 1
+        assert get_constant_value(a[1]) == 2
+        assert get_constant_value(a[2]) == 3
+        b = tensor.iscalar()
+        a = tensor.stack(b, 2, 3)
+        self.assertRaises(TypeError, get_constant_value, a[0])
+        assert get_constant_value(a[1]) == 2
+        assert get_constant_value(a[2]) == 3
+        # For now, get_constant_value looks through only MakeVector and Join of scalars.
+        v = tensor.ivector()
+        a = tensor.stack(v, 2, 3)
+        self.assertRaises(TypeError, get_constant_value, a[0])
+        self.assertRaises(TypeError, get_constant_value, a[1])
+        self.assertRaises(TypeError, get_constant_value, a[2])
 if __name__ == '__main__':
     if 1:
         unittest.main()
@@ -3449,5 +3501,3 @@ if __name__ == '__main__':
     suite = unittest.TestLoader()
     suite = suite.loadTestsFromTestCase(testcase)
     unittest.TextTestRunner(verbosity=2).run(suite)
......