merge

e16f2ec7 · James Bergstra · f57ea7bd · c4b6dca9 · e16f2ec7 · e16f2ec7
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -6,6 +6,9 @@
 #include "cuda_ndarray.cuh"
+//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
+#define COMPUTE_GPU_MEM_USED false
 /////////////////////////
 // Alloc and Free
 /////////////////////////
@@ -18,17 +21,40 @@
 *
 */
 int _outstanding_mallocs[] = {0,0};
+#if COMPUTE_GPU_MEM_USED
+int _allocated_size = 0;
+const int TABLE_SIZE = 10000;
+struct table_struct{
+  void* ptr;
+  int size;
+};
+table_struct _alloc_size_table[TABLE_SIZE];
+#endif
 void * device_malloc(size_t size)
 {
    void * rval=NULL;
    cudaError_t err = cudaMalloc(&rval, size);
    if (cudaSuccess != err)
    {
+#if COMPUTE_GPU_MEM_USED
+        fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n", (long)size, cudaGetErrorString(err),_allocated_size);
+#else
        fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
+#endif
        PyErr_Format(PyExc_MemoryError, "error allocating %li bytes of device memory (%s)", (long)size, cudaGetErrorString(err));
        return NULL;
    }
    _outstanding_mallocs[0] += (rval != NULL);
+#if COMPUTE_GPU_MEM_USED
+    for(int i=0;i<TABLE_SIZE;i++){
+      if(NULL==_alloc_size_table[i].ptr){
+	_alloc_size_table[i].ptr=rval;
+	_alloc_size_table[i].size=size;
+	break;
+      }
+    }
+    _allocated_size += size;
+#endif
    return rval;
 }
 int device_free(void *ptr)
@@ -36,11 +62,28 @@ int device_free(void *ptr)
    cudaError_t err =  cudaFree(ptr);
    if (cudaSuccess != err)
    {
+#if COMPUTE_GPU_MEM_USED
+        fprintf(stderr, "Error freeing device pointer %p (%s).%d byte already allocated\n", ptr, cudaGetErrorString(err), _allocated_size);
+#else
        fprintf(stderr, "Error freeing device pointer %p (%s).\n", ptr, cudaGetErrorString(err));
+#endif
        PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)", ptr, cudaGetErrorString(err));
        return -1;
    }
    _outstanding_mallocs[0] -= (ptr != NULL);
+#if COMPUTE_GPU_MEM_USED
+    int i=0;
+    for(;i<TABLE_SIZE;i++)
+      if(_alloc_size_table[i].ptr==ptr){
+	_allocated_size -= _alloc_size_table[i].size;
+	_alloc_size_table[i].ptr=0;
+	_alloc_size_table[i].size=0;
+	break;
+      }
+    if(i==TABLE_SIZE)
+      printf("Unallocated unknow size!\n");
+#endif
    return 0;
 }
 static PyObject *
@@ -1868,7 +1911,12 @@ initcuda_ndarray(void)
    Py_INCREF(&CudaNdarrayType);
    PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
+#if COMPUTE_GPU_MEM_USED
+    for(int i=0;i<TABLE_SIZE;i++){
+      _alloc_size_table[i].ptr=NULL;
+      _alloc_size_table[i].size=0;
+    }
+#endif
    //    cublasInit();
    //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
    //{

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -278,12 +278,22 @@ def local_gpu_reshape(node):
        if host_input.owner and isinstance(host_input.owner.op, tensor.Reshape):
            rshp = host_input.owner.op
            x, shp = host_input.owner.inputs
-            return [GpuReshape(rshp.ndim)(gpu_from_host(x), shp)]
+            gpu_reshape = GpuReshape(rshp.ndim)(gpu_from_host(x), shp)
+            if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
+                #This can happen as we always return False for all broadcast dims in GpuReshape, but not in Reshape.
+                #Even if we did the same thing, it could still happen because of the constant optimization.
+                gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable)
+            return [gpu_reshape]
    if isinstance(node.op, tensor.Reshape):
        x, shp = node.inputs
        if x.owner and x.owner.op == host_from_gpu:
            gpu_x, = x.owner.inputs
-            return [host_from_gpu(GpuReshape(node.op.ndim)(gpu_x, shp))]
+            gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
+            if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
+                #This can happen as we always return False for all broadcast dims in GpuReshape, but not in Reshape.
+                #Even if we did the same thing, it could still happen because of the constant optimization.
+                gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable)
+            return [host_from_gpu(gpu_reshape)]
    return False
 @register_opt()

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -3,11 +3,12 @@ from theano import Op, Apply
 import theano.tensor as T
 from theano.tensor.opt import register_specialize
 from theano.gof import local_optimizer
 from theano.sandbox.cuda import cuda_available
 if cuda_available:
    from theano.sandbox.cuda import CudaNdarrayType
    from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
+    from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
 class Images2Neibs(Op):
    def __eq__(self, other):
@@ -17,7 +18,7 @@ class Images2Neibs(Op):
    def make_node(self, ten4, neib_shape):
        ten4 = T.as_tensor_variable(ten4)
        neib_shape = T.as_tensor_variable(neib_shape)
-        return Apply(self, [ten4, neib_shape], [T.matrix()])
+        return Apply(self, [ten4, neib_shape], [ten4.type()])
    def grad(self, (pvals, unis), (gz,)):
        return [None, None]
@@ -163,7 +164,7 @@ class GpuImages2Neibs(Images2Neibs):
        #    raise TypeError('unis must be cudandarray', neib_shape)
        #print 'neib_shape type and dtype', type(neib_shape), neib_shape.dtype
-        return Apply(self, [ten4, neib_shape], [CudaNdarrayType(broadcastable=(False,)*2)()])
+        return Apply(self, [ten4, neib_shape], [ten4.type()])
    def c_code_cache_version(self):
        return ()
@@ -360,6 +361,7 @@ gpu_images2neibs = GpuImages2Neibs()
 def use_gpu_images2neibs(node):
    if node.op == images2neibs:
        return [host_from_gpu(gpu_images2neibs(*[gpu_from_host(node.inputs[0]),node.inputs[1]]))]
-if theano.config.device.startswith('gpu'):
-    register_specialize(use_gpu_images2neibs)
+if cuda_available:
+    register_gpu_opt()(use_gpu_images2neibs)
--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
-from numpy import *
+import numpy
 import theano
 from theano import shared, function
 import theano.tensor as T
-from neighbours import images2neibs, neibs2images
+from neighbours import images2neibs, neibs2images, GpuImages2Neibs
+# Skip test if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda
-mode = theano.config.mode
+if theano.config.mode=='FAST_COMPILE':
-if mode=="FAST_COMPILE":
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
-    mode='FAST_RUN'
+    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
+else:
-def neibs_test():
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
+def test_neibs():
    shape = (100,40,18,18)
-    images = shared(arange(prod(shape), dtype='float32').reshape(shape))
+    images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
-    f = function([], images2neibs(images, neib_shape), mode=mode)
+    f = function([], images2neibs(images, neib_shape), mode=mode_without_gpu)
    #print images.value
    neibs = f()
    #print neibs
-    g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode)
+    g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_without_gpu)
    #print g()
-    assert allclose(images.value,g())
+    assert numpy.allclose(images.value,g())
+def test_neibs_gpu():
+    if cuda.cuda_available == False:
+       raise SkipTest('Optional package cuda disabled')
+    shape = (100,40,18,18)
+    images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
+    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
+    from theano.sandbox.cuda.basic_ops import gpu_from_host
+    f = function([], images2neibs(images,neib_shape),
+                 mode=mode_with_gpu)
+    assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
+    #print images.value
+    res1=[[[[  0.,   1.,   4.,   5.],
+         [  2.,   3.,   6.,   7.],
+         [  8.,   9.,  12.,  13.],
+         [ 10.,  11.,  14.,  15.],
+         [ 16.,  17.,  20.,  21.],
+         [ 18.,  19.,  22.,  23.],
+         [ 24.,  25.,  28.,  29.],
+         [ 26.,  27.,  30.,  31.],
+         [ 32.,  33.,  36.,  37.],
+         [ 34.,  35.,  38.,  39.],
+         [ 40.,  41.,  44.,  45.],
+         [ 42.,  43.,  46.,  47.],
+         [ 48.,  49.,  52.,  53.],
+         [ 50.,  51.,  54.,  55.],
+         [ 56.,  57.,  60.,  61.],
+         [ 58.,  59.,  62.,  63.]]]]
+    neibs = numpy.asarray(f())
+    numpy.allclose(neibs,res1)
+    #print neibs
+    g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
+    assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
+    #print numpy.asarray(g())
+    assert numpy.allclose(images.value,g())
-neibs_test()
+if __name__ == '__main__':
+    test_neibs_gpu()
+    test_neibs()
--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -953,7 +953,11 @@ class Mod(BinaryScalarOp):
    def impl(self, x, y):
        return x % y
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
+    def c_support_code(self):
+        #We use a macro because Python treats % as a special character in string formatting.
+        return "#define THEANO_MACRO_MOD(x,y) (x % y)"
    def c_code(self, node, name, (x, y), (z, ), sub):
        """
@@ -962,10 +966,10 @@ class Mod(BinaryScalarOp):
        #raise NotImplementedError("Unlike Python, C's modulo returns negative modulo on negative dividend (to implement)")
        t = node.inputs[0].type.upcast(*[ i.type for i in node.inputs[1:]])
        if t in int_types or t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64']:
-            x_mod_y = "(%(x)s %% %(y)s)"%locals()
+            x_mod_y = "THEANO_MACRO_MOD(%(x)s, %(y)s)"%locals()
-            x_mod_ymm = "(-%(x)s %% -%(y)s)"%locals()
+            x_mod_ymm = "THEANO_MACRO_MOD(-%(x)s, -%(y)s)"%locals()
-            x_mod_ypm = "(%(x)s %% -%(y)s)"%locals()
+            x_mod_ypm = "THEANO_MACRO_MOD(%(x)s, -%(y)s)"%locals()
-            x_mod_ymp = "(-%(x)s %% %(y)s)"%locals()
+            x_mod_ymp = "THEANO_MACRO_MOD(-%(x)s, %(y)s)"%locals()
        elif t in float_types or t in ['float32','float64']:
            x_mod_y = "fmod(%(x)s,%(y)s)"%locals()
            x_mod_ymm = "fmod(-%(x)s,-%(y)s)"%locals()
@@ -1706,8 +1710,6 @@ class Composite(ScalarOp):
                                      [subd[output] for output in node.outputs],
                                      dict(fail = "%(fail)s",
                                           id = "%%(id)s_%i" % j))
-            if any([isinstance(x.op,Mod) for x in env.toposort()]):
-                s = s.replace('% ','%% ')
            _c_code += s
            _c_code += "\n"
        _c_code += "}\n"
@@ -1773,6 +1775,15 @@ class Composite(ScalarOp):
    def c_code_cache_version(self):
        return (1,)+tuple([x.op.c_code_cache_version() for x in self.env.toposort()])
+    def c_support_code(self):
+        str = ""
+        for node in self.env.toposort():
+            try:
+                str += node.op.c_support_code()+"\n"
+            except gof.utils.MethodNotDefined:
+                pass
+        return str
    def __eq__(self, other):
        if self is other: return True
        if not isinstance(other, self.__class__): return False

--- a/theano/scan.py
+++ b/theano/scan.py
@@ -68,10 +68,6 @@ def hash_listsDictsTuples(x):
    return hash_value
-## TODO
-###################################
-## Implement specific function calls : map, reduce, generate
 def map(fn, sequences, non_sequences = [],
        truncate_gradient = -1, go_backwards = False,
        mode = None, name = None):
@@ -88,8 +84,8 @@ def map(fn, sequences, non_sequences = [],
    :param truncate_gradient: see scan for more info
-    :param go_backwards: if map should also inverse the order in the arrays
+    :param go_backwards: set to true if you want map to start at the end of the
-                         see scan for more info
+                         provided arrays in ``sequences`` going towards 0 (back in time)
    :param mode: see scan
@@ -108,15 +104,17 @@ def reduce(fn, sequences, outputs_info, non_sequences = [], go_backwards = False
               sequences ( see scan `fn` for more info)
    :param outputs_info: information about outputs (mainly the initial state
-                        of each )
+                        of each, but other options are available ), see scan for more 
+                        info
    :param sequences: list of arrays over which reduce should 
                      iterate (see scan for more info)
    :param non_sequences: list of other arguments of `fn` over which 
                          reduce shouldn't iterate (see scan for more info)
-    :param go_backwards: if reduce should also inverse the order in the arrays
+    :param go_backwards: set to true if you want reduce to start at the end of the
-                         see scan for more info
+                         provided arrays in ``sequences`` going towards 0 (back in time)
    :param mode: see scan 
    :param name: see scan
@@ -241,8 +239,8 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
        If you are using shared variables over which you do not want to iterate, 
        you do not need to provide them as arguments to ``fn``, though you can if you 
-        wish so. The function should return the outputs after each step plus the updates for
+        wish so. The function should return the outputs after each step plus the updates 
-        any of the shared variables. You can either return only outputs or only
+        for any of the shared variables. You can either return only outputs or only
        updates. If you have both outputs and updates the function should return
        them as a tuple : (outputs, updates) or (updates, outputs).
@@ -281,9 +279,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
          list of ints (only negative .. since you can not use future values of outputs),
          with the same meaning as for ``sequences`` (see above).
        * ``inplace`` -- theano variable pointing to one of the input sequences; this
-          flag tells scan that the output should be computed in the memory spaced occupied
+          flag tells scan that the output should be computed in the memory space occupied
          by that input sequence. Note that scan will only do this if allowed by the
-          rest of your computational graph and if you are not using past taps of the input.
+          rest of your computational graph and if you are not using past taps of the 
+          input.
        * ``return_steps`` how many steps to return from your output. If not given, or 
          0 scan will return all steps, otherwise it will return the last ``return_steps``.
          Note that if you set this to something else then 0, scan will try to be smart
@@ -298,8 +297,8 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
        notation, when ``t = 0``, we would need values for ``y[-1]``, ``y[-2]``
        and ``y[-4]``. These values are provided by the initial state of ``y``,
        which should have same number  of dimension as ``y``, where the first
-        dimension should be large enough to cover all past values, which in this
+        dimension should be large enough to cover all the required past values, which in 
-        case is 4.  If ``init_y`` is the variable containing the initial state
+        this case is 4.  If ``init_y`` is the variable containing the initial state
        of ``y``, then ``init_y[0]`` corresponds to ``y[-4]``, ``init_y[1]``
        corresponds to ``y[-3]``, ``init_y[2]`` corresponds to ``y[-2]``,
        ``init_y[3]`` corresponds to ``y[-1]``. The default behaviour of scan is
@@ -313,13 +312,13 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
          of -1
        * if you wrap an output in a dictionary but you do not provide any initial state,
          it assumes that you are not using any form of taps
-        * if you provide a ``None`` scan assumes that you will not use any taps for this 
+        * if you provide a ``None`` instead of a variable or a dictionary scan assumes 
-          output (this would be the case for map )
+          that you will not use any taps for this output (this would be the case for map)
-        If you did not provide any information for your outputs, scan will assume by default
+        If you did not provide any information for your outputs, scan will assume by 
-        that you are not using any taps for any of the outputs. If you provide information for
+        default that you are not using any taps for any of the outputs. If you provide 
-        just a subset of outputs, scan will not know to which outputs these information 
+        information for just a subset of outputs, scan will not know to which outputs 
-        corresponds and will raise an error.
+        these correspond and will raise an error.
    :param non_sequences:
        Parameters over which scan should not iterate.  These parameters are
@@ -332,18 +331,20 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
        the input sequences. If the value is 0, the outputs will have 0 rows. If the 
        value is negative, scan will run backwards (or if the flag go_backwards is 
        already set to true it will run forward in time). If n_steps is not provided, 
-        or evaluetes to None, inf or nan, scan will figure out the maximal amount of steps it can 
+        or evaluates to None, inf or nan, scan will figure out the maximal amount of 
-        take and do that. 
+        steps it can run given the input sequences and do that.
    :param truncate_gradient:
        Number of steps to use in truncated BPTT.  If you compute gradients
        through a scan op, they are computed using backpropagation through time.
        By providing a different value then -1, you choose to use truncated BPTT
        instead of classical BPTT, where you only do ``truncate_gradient``
-        number of steps. (NOT YET IMPLEMENTED)
+        number of steps.
    :param go_backwards:
-        Flag indicating if you should go backwards through the sequences
+        Flag indicating if you should go backwards through the sequences (if you
+        think of the sequences as being indexed by time, this means going backwards
+        in time)
    :param name:
        The name of the theano function compiled by the Scan op. It will show in the 

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -2676,6 +2676,21 @@ def test_mod():
                ):
        assert fn(a,b) == a%b, (a,)
+def test_mod_compile():
+    """
+    This test generates an Elemwise of Composite such as:
+        Elemwise{Composite{Composite{Composite{Composite{mod,EQ},Switch},mul},add}}
+    The generated c_code did not compile as of 30 June 2010. The compilation is fixed in the same commit.
+    """
+    x = tensor.vector()
+    y = tensor.vector()
+    shape = x.shape
+    out = tensor.switch(tensor.eq(3%x.shape[0],0),y,y[:-1])
+    f = theano.function([x,y],out)
 if __name__ == '__main__':
    if 1:
        unittest.main()