提交 d0184177 authored 作者: Dumitru Erhan's avatar Dumitru Erhan

branch merge

...@@ -41,12 +41,27 @@ precise inspection of what's being computed where, when, and how, see the ...@@ -41,12 +41,27 @@ precise inspection of what's being computed where, when, and how, see the
:ref:`faq_wraplinker`. :ref:`faq_wraplinker`.
How do I print a graph before or after compilation?
----------------------------------------------------------
Theano provides a function to print a graph before and after compilation:
>>> x = T.dscalar('x')
>>> y = x**2
>>> gy = T.grad(y, x)
>>> pp(gy) # print out the gradient prior to optimization
'((fill((x ** 2), 1.0) * 2) * (x ** (2 - 1)))'
>>> f = function([x], gy)
>>> pp(f.maker.env.outputs[0])
'(2.0 * x)'
The parameter in T.dscalar('x') in the first line is the name of this variable (in the graph, not in Python). This name is reused when printing the graph, making it more readable. If no name is given, the variable x is printed as its type, i.e. <TensorType(float64, scalar)>, which is less comprehensible. The string 'x' can be any string, but to keep the code readable, try to use the same name as (or a name derived from) the Python variable.
The function I compiled is too slow, what's up? The function I compiled is too slow, what's up?
----------------------------------------------- -----------------------------------------------
First, make sure you're running in FAST_RUN mode, by passing ``mode='FAST_RUN'``
First, make sure you're running in FAST_RUN mode, by passing to ``theano.function`` or ``theano.make`` or by setting to ``PROFILE_MODE``
``mode='FAST_RUN'`` to ``theano.function`` or ``theano.make``. Some the flags :attr:`config.mode`. Some
operations have excruciatingly slow Python implementations and that operations have excruciatingly slow Python implementations and that
can negatively effect the performance of FAST_COMPILE. can negatively effect the performance of FAST_COMPILE.
......
...@@ -1211,7 +1211,7 @@ class GpuSum(Op): ...@@ -1211,7 +1211,7 @@ class GpuSum(Op):
class GpuReshape(tensor.Reshape): class GpuReshape(tensor.Reshape):
# __hash__, __eq__, __str__ come from tensor.Subtensor # __hash__, __eq__, __str__ come from tensor.Subtensor
def make_node(self, x, shp): def make_node(self, x, shp):
host_reshaped = host_from_gpu(x).reshape(shp) host_reshaped = host_from_gpu(x).reshape(shp,ndim=self.ndim)
return Apply(self, [x, shp], [CudaNdarrayType(host_reshaped.broadcastable)()]) return Apply(self, [x, shp], [CudaNdarrayType(host_reshaped.broadcastable)()])
def perform(self, node, (x, shp), (out,)): def perform(self, node, (x, shp), (out,)):
if (len(shp) != self.ndim): if (len(shp) != self.ndim):
......
...@@ -198,7 +198,7 @@ class GpuConv(Op): ...@@ -198,7 +198,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,2) return (0,3)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
...@@ -626,7 +626,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -626,7 +626,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
...@@ -673,7 +673,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -673,7 +673,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
const int nkern=CudaNdarray_HOST_DIMS(kern)[0]; const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3]; const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2]; const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(img)[3]; const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2]; const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3]; const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2]; const int out_len=CudaNdarray_HOST_DIMS(out)[2];
...@@ -821,13 +821,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -821,13 +821,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version); if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); if (verbose) printf("INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, out_len, nb_split, version); if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version);
if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n",
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"), version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); cudaGetErrorString(sts));
...@@ -1013,7 +1013,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1013,7 +1013,7 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads); if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n", if (verbose) printf("INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed! (%s)", PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
......
...@@ -5,9 +5,6 @@ ...@@ -5,9 +5,6 @@
#include <iostream> #include <iostream>
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#ifndef DONT_UNROLL
#define UNROLL_LOOP
#endif
///////////////////////// /////////////////////////
// Static helper methods // Static helper methods
......
...@@ -1168,6 +1168,8 @@ class ScalarFromTensor(Op): ...@@ -1168,6 +1168,8 @@ class ScalarFromTensor(Op):
out[0] = s.flatten()[0] out[0] = s.flatten()[0]
def grad(self, (s,), (dt,)): def grad(self, (s,), (dt,)):
return [TensorFromScalar(dt)] return [TensorFromScalar(dt)]
def __str__(self):
return self.__class__.__name__
scalar_from_tensor = ScalarFromTensor() scalar_from_tensor = ScalarFromTensor()
......
...@@ -71,6 +71,8 @@ class GemmRelated(Op): ...@@ -71,6 +71,8 @@ class GemmRelated(Op):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self):
return self.__class__.__name__
def c_support_code(self): def c_support_code(self):
#return cblas_header_text() #return cblas_header_text()
mod_str = """ mod_str = """
......
...@@ -1516,7 +1516,7 @@ if(mode != VALID && mode != FULL){ ...@@ -1516,7 +1516,7 @@ if(mode != VALID && mode != FULL){
if(dim_zz[0]<=0 || dim_zz[1]<=0){ if(dim_zz[0]<=0 || dim_zz[1]<=0){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Output dimensions are not valid %%dx%%d",dim_zz[0],dim_zz[1]); "Output dimensions are not valid %%ldx%%ld",(long int)dim_zz[0],(long int)dim_zz[1]);
%(fail)s; %(fail)s;
} }
......
...@@ -495,7 +495,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -495,7 +495,8 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return tensor.hashtype(self) return tensor.hashtype(self)
def __str__(self):
return self.__class__.__name__
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
x = tensor.as_tensor_variable(x) x = tensor.as_tensor_variable(x)
b = tensor.as_tensor_variable(b) b = tensor.as_tensor_variable(b)
...@@ -673,6 +674,8 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op): ...@@ -673,6 +674,8 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return tensor.hashtype(self) return tensor.hashtype(self)
def __str__(self):
return self.__class__.__name__
def make_node(self, dy, sm, y_idx,**kwargs): def make_node(self, dy, sm, y_idx,**kwargs):
dy = tensor.as_tensor_variable(dy) dy = tensor.as_tensor_variable(dy)
sm = tensor.as_tensor_variable(sm) sm = tensor.as_tensor_variable(sm)
...@@ -720,14 +723,14 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op): ...@@ -720,14 +723,14 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
} }
if (%(dnll)s->dimensions[0] != %(sm)s->dimensions[0]) if (%(dnll)s->dimensions[0] != %(sm)s->dimensions[0])
{ {
PyErr_Format(PyExc_ValueError, "dnll.shape[0] (%%d) != sm.shape[0] (%%d)", PyErr_Format(PyExc_ValueError, "dnll.shape[0] (%%ld) != sm.shape[0] (%%ld)",
%(dnll)s->dimensions[0], %(sm)s->dimensions[0]); (long int)%(dnll)s->dimensions[0], (long int)%(sm)s->dimensions[0]);
//PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != sm.shape[0]");
%(fail)s; %(fail)s;
} }
if (%(dnll)s->dimensions[0] != %(y_idx)s->dimensions[0]) if (%(dnll)s->dimensions[0] != %(y_idx)s->dimensions[0])
{ {
PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]"); PyErr_Format(PyExc_ValueError, "dnll.shape[0] (%%ld) != y_idx.shape[0] (%%ld)",
(long int)%(dnll)s->dimensions[0], (long int)%(y_idx)s->dimensions[0]);
%(fail)s; %(fail)s;
} }
if ((NULL == %(dx)s) if ((NULL == %(dx)s)
......
...@@ -170,10 +170,7 @@ class T_RandomStreams(unittest.TestCase): ...@@ -170,10 +170,7 @@ class T_RandomStreams(unittest.TestCase):
# ndim specified, inconsistent with shape, should raise ValueError # ndim specified, inconsistent with shape, should raise ValueError
m3 = Module() m3 = Module()
m3.random = RandomStreams(234) m3.random = RandomStreams(234)
m3.fn = Method([], m3.random.uniform((2,2), ndim=1)) self.assertRaises(ValueError, m3.random.uniform, (2,2), ndim=1)
made3 = m3.make()
made3.random.initialize()
self.assertRaises(ValueError, made3.fn)
def test_uniform(self): def test_uniform(self):
"""Test that RandomStreams.uniform generates the same results as numpy""" """Test that RandomStreams.uniform generates the same results as numpy"""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论