Merged

925a4eb6 · Olivier Delalleau · 646bd761 · 15aebd1c · 925a4eb6 · 925a4eb6
--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -421,7 +421,7 @@ class GpuDownsampleFactorMax(Op):
    #def perform(self, node, input_storage, output_storage):
        #raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
-        return ()
+        return (1)
    def c_code(self, node, nodename, (x,), (z,), sub):
        fail = sub['fail']
        ds0, ds1 = self.ds

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -521,6 +521,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    }

    if (1 && (version==6||version==-1) &&
+	kern_len<=320 &&
 	!work_complete) //conv_valid_row_reduce
    {
        int outsize = CudaNdarray_SIZE(out);

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
 import sys, time
 import numpy
+
+from nose.plugins.skip import SkipTest
+imported_scipy_convolve2d = False
+try:
+    from scipy.signal import convolve2d
+    imported_scipy_convolve2d = True
+except ImportError:
+    pass
+
 import theano

 # Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
@@ -38,9 +46,23 @@ def py_conv_full_numpy(img, kern):
    pad_cols = 2*(kern.shape[3]-1) + img.shape[3]
    padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype)
    padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img
-    return py_conv_valid(padded_img, kern)
+    return py_conv_valid_numpy(padded_img, kern)
+
+def py_conv(img, kern, mode, subsample):
+    """
+    use a scipy or numpy implementation depending is scipy is available.
+    The scipy version is faster.
+    """
+    if imported_scipy_convolve2d:
+        return py_conv_scipy(img, kern, mode, subsample)
+    elif mode=='valid':
+        return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+    elif mode=='full':
+        return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+    else:
+        raise Exception("Can't execute this kernel.")
+
 def py_conv_scipy(img, kern, mode, subsample):
-    from scipy.signal import convolve2d
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
@@ -89,7 +111,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
    rval = True
    try:
        t0 = time.time()
-        cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
+        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = cuda_tensor4()
        k = cuda_tensor4()
@@ -550,7 +572,7 @@ def _test_dummy():
    rval = True

    t0 = time.time()
-    cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
+    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()
    gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
    t2 = time.time()

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -252,13 +252,13 @@ class GpuImages2Neibs(Images2Neibs):
                                                                dtype=ten4.type.dtype)()])

    def c_code_cache_version(self):
-        return ()
-        return (2,)
+        return (6,)

    def c_support_code_apply(self, node, nodename):
-        if self.mode=="valid":
-            return """
-        static __global__ void k_multi_warp_%(nodename)s(
+        mode = self.mode
+        return """
+//a version that use less register but don't work in all case.
+        static __global__ void k_multi_warp_less_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
@@ -274,8 +274,10 @@ class GpuImages2Neibs(Images2Neibs):
            float * global_out
        )
        {
-        
-            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+            const int wrap_centered_idx_shift_x = c/2;
+            const int wrap_centered_idx_shift_y = d/2;
+
+            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
@@ -289,12 +291,23 @@ class GpuImages2Neibs(Images2Neibs):
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
-                            for (int i = 0; i < c; i++)     // loop over c
+                            int i = threadIdx.y;     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
-                                for (int j = threadIdx.x; j < d; j+=blockDim.x)  // loop over d
+                                if("%(mode)s"=="wrap_centered"){
+                                    ten4_2 -= wrap_centered_idx_shift_x;
+                                    if ( ten4_2 < 0 ) ten4_2 += height;
+                                    else if (ten4_2 >= height) ten4_2 -= height;
+                                }
+                                int j = threadIdx.x;  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
+                                    if("%(mode)s"=="wrap_centered"){
+                                        ten4_3 -= wrap_centered_idx_shift_y;
+                                        if ( ten4_3 < 0 ) ten4_3 += width;
+                                        else if (ten4_3 >= width) ten4_3 -= width;
+                                    }
+
                                    //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
                                    //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); 
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; 
@@ -307,9 +320,6 @@ class GpuImages2Neibs(Images2Neibs):
            }
        }

-        """ % locals()
-        if self.mode=="wrap_centered":
-            return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
@@ -329,7 +339,7 @@ class GpuImages2Neibs(Images2Neibs):
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;

-            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
@@ -343,19 +353,23 @@ class GpuImages2Neibs(Images2Neibs):
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
-                            for (int i = 0; i < c; i++)     // loop over c
+                            for (int i = threadIdx.y; i < c; i+=blockDim.y)     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
-                                ten4_2 -= wrap_centered_idx_shift_x;
-                                if ( ten4_2 < 0 ) ten4_2 += height;
-                                else if (ten4_2 >= height) ten4_2 -= height;
-
+                                if("%(mode)s"=="wrap_centered"){
+                                    ten4_2 -= wrap_centered_idx_shift_x;
+                                    if ( ten4_2 < 0 ) ten4_2 += height;
+                                    else if (ten4_2 >= height) ten4_2 -= height;
+                                }
                                for (int j = threadIdx.x; j < d; j+=blockDim.x)  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
-                                    ten4_3 -= wrap_centered_idx_shift_y;
-                                    if ( ten4_3 < 0 ) ten4_3 += width;
-                                    else if (ten4_3 >= width) ten4_3 -= width;
+                                    if("%(mode)s"=="wrap_centered"){
+                                        ten4_3 -= wrap_centered_idx_shift_y;
+                                        if ( ten4_3 < 0 ) ten4_3 += width;
+                                        else if (ten4_3 >= width) ten4_3 -= width;
+                                    }
+
                                    //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
                                    //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); 
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; 
@@ -370,7 +384,6 @@ class GpuImages2Neibs(Images2Neibs):

        """ % locals()

-
    def c_code(self, node, name, (ten4, neib_shape, neib_step), (z,), sub):
        fail = sub['fail']
        mode = self.mode
@@ -473,17 +486,36 @@ class GpuImages2Neibs(Images2Neibs):
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
            
+            dim3 n_threads(d,c,1);
+            //Their is a max of 512 threads per blocks
+            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; 
+            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; 
+
+            //Make bigger block to have better memory access pattern and a higher core utilisation.
+            //for smaller patch size
+            while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
+                n_threads.z++;
+            }
            int nb_block;
-            if (nb_batch %% 32 == 0)
-                nb_block = nb_batch/32;
+            if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
+                nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
            else
-                nb_block = (int)((float)nb_batch/32. + 1.); 
-                
-            dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
-            dim3 n_threads(32,1,1);
+                nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
+            dim3 n_blocks(std::min(32*1024,nb_block));
            int n_shared = 0;

-            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(                
+	    void (*f)(int, int, int ,int,
+                      int, int, int ,int,
+                      int, int,
+                      int, int, int, int,
+                      float*, float*);
+            if(n_threads.x==d && n_threads.y==c){
+                f = k_multi_warp_less_%(name)s;
+            }else{
+                f = k_multi_warp_%(name)s;
+            }
+
+            f<<<n_blocks, n_threads, n_shared>>>(                
                nb_batch,
                nb_stack,
                height, width,

--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -278,26 +278,30 @@ def test_neibs_wrap_centered_step_manual():
 def test_neibs_gpu():
    if cuda.cuda_available == False:
       raise SkipTest('Optional package cuda disabled')
-   
-    shape = (100,40,18,18)
-    images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
-    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
-
-    from theano.sandbox.cuda.basic_ops import gpu_from_host
-
-    f = function([], images2neibs(images,neib_shape),
-                 mode=mode_with_gpu)
-    f_gpu = function([], images2neibs(images,neib_shape),
-                 mode=mode_with_gpu)
-    assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
-    #print images.value
-    neibs = numpy.asarray(f_gpu())
-    assert numpy.allclose(neibs,f())
-    #print neibs
-    g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
-    assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
-    #print numpy.asarray(g())
-    assert numpy.allclose(images.value,g())
+    for shape, pshape in [((100,40,18,18),(2,2)),
+                          ((100,40,6,18),(3,2)),
+                          ((10,40,66,66),(33,33)),
+                          ((10,40,68,66),(34,33))
+                          ]:
+
+        images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
+        neib_shape = T.as_tensor_variable(pshape)
+
+        from theano.sandbox.cuda.basic_ops import gpu_from_host
+
+        f = function([], images2neibs(images,neib_shape),
+                     mode=mode_with_gpu)
+        f_gpu = function([], images2neibs(images,neib_shape),
+                     mode=mode_with_gpu)
+        assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
+        #print images.value
+        neibs = numpy.asarray(f_gpu())
+        assert numpy.allclose(neibs,f())
+        #print neibs
+        g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
+        assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
+        #print numpy.asarray(g())
+        assert numpy.allclose(images.value,g())


 def speed_neibs():

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -12,7 +12,7 @@ import numpy, theano
 #from copy import copy as python_copy

 from theano import gof, shared
-from theano.gof import Variable, Op, utils, Type, Constant,  Value
+from theano.gof import Variable, Op, Type, Constant,  Value
 from theano.tensor.tsor_apply import Apply

 from theano import gradient
@@ -21,7 +21,7 @@ import elemwise
 from theano import scalar as scal
 from theano.gof.python25 import partial, any, all
 from theano import compile, printing
-from theano.printing import pprint, Print
+from theano.printing import pprint

 ### set up the external interface
 from elemwise import Elemwise, DimShuffle, CAReduce, Sum

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -18,6 +18,16 @@ from theano import gof, Op, tensor, config
 from theano.tensor.tsor_apply import Apply
 from theano.gof.python25 import any

+imported_scipy_signal = False
+try:
+    # TODO: move these back out to global scope when they no longer cause an atexit error
+    from scipy.signal.signaltools import  _valfrommode, _bvalfromboundary
+    from scipy.signal.sigtools import _convolve2d
+    imported_scipy_signal = True
+except ImportError:
+    pass
+    
+
 _logger=logging.getLogger("theano.signal.conv")
 def _debug(*msg):
    _logger.debug(' '.join([ str(x) for x in msg]))
@@ -547,9 +557,12 @@ class ConvOp(Op):
        """
        By default if len(img2d.shape)==3, we
        """
+        if not imported_scipy_signal:
+            raise theano.gof.utils.MethodNotDefined(
+                "c_headers", type(self), self.__class__.__name__,
+                "Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
+            
        # TODO: move these back out to global scope when they no longer cause an atexit error
-        from scipy.signal.signaltools import  _valfrommode, _bvalfromboundary
-        from scipy.signal.sigtools import _convolve2d
        imshp = self.imshp
        if imshp is None or any([x is None for x in imshp]):
            imshp = tuple(img2d.shape[1:])
@@ -584,8 +597,6 @@ class ConvOp(Op):
            z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
                           dtype=img2d.dtype)
        zz=z[0]
-        val = _valfrommode(self.out_mode)
-        bval = _bvalfromboundary('fill')

        stacklen = imshp[0]

@@ -616,6 +627,9 @@ class ConvOp(Op):
            filtersflipped = buf
            del buf, rstride, cstride

+        val = _valfrommode(self.out_mode)
+        bval = _bvalfromboundary('fill')
+
        for b in range(bsize):
            for n in range(nkern):
                zz[b,n,...].fill(0)
@@ -623,6 +637,25 @@ class ConvOp(Op):
                    zz[b,n,...] +=  _convolve2d(\
                        img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)

+        if False:
+            if False and self.out_mode=="full":
+                img2d2 = numpy.zeros((bsize,stacklen,
+                                      imshp[1]+2*kshp[0]-2,
+                                      imshp[2]+2*kshp[1]-2))
+                img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1],
+                           kshp[1]-1:kshp[1]-1+imshp[2]] = img2d
+                img2d = img2d2
+            #N_image_shape = image_data.shape
+
+            for b in range(bsize):
+                for n in range(nkern):
+                    zz[b,n,...].fill(0)
+                    for im0 in range(stacklen):
+                        for row in range(0,zz.shape[2],self.dx):
+                            for col in range(0,zz.shape[3],self.dy):
+                                zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\
+                                                            filtersflipped[n,im0,::-1,::-1]).sum()
+
        #We copy it to remove the Stride mismatch warning from DEBUG_MODE.
        #The copy make that we return an object with the same stride as the c version.
        #The copy don't affect the performence during our experience as in that case we

--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
 import sys, time, unittest
 import numpy
-from scipy import signal

 import theano
 import theano.tensor as T
@@ -60,6 +59,7 @@ class TestConv2D(unittest.TestCase):

        ############# REFERENCE IMPLEMENTATION ############
        s = 1.
+        orig_image_data = image_data
        if border_mode is not 'full': s = -1.
        out_shape2d = numpy.array(N_image_shape[-2:]) +\
                      s*numpy.array(N_filter_shape[-2:]) - s
@@ -68,26 +68,41 @@ class TestConv2D(unittest.TestCase):
        ref_output = numpy.zeros(out_shape)

        # loop over output feature maps
-        for k in range(N_filter_shape[0]):
-            # loop over input feature maps
-            for l in range(N_filter_shape[1]):
-
-                filter2d = filter_data[k,l,:,:]
-
-                # loop over mini-batches
-                for b in range(N_image_shape[0]):
-                    image2d = image_data[b,l,:,:]
-                    output2d = signal.convolve2d(image2d, filter2d, border_mode)
-
-                    ref_output[b,k,:,:] +=\
-                       output2d[::subsample[0],::subsample[1]]
+        ref_output.fill(0)
+        if border_mode=='full':
+            image_data2 = numpy.zeros((N_image_shape[0],N_image_shape[1],
+                                      N_image_shape[2]+2*N_filter_shape[2]-2,
+                                      N_image_shape[3]+2*N_filter_shape[3]-2))
+            image_data2[:,:,N_filter_shape[2]-1:N_filter_shape[2]-1+N_image_shape[2],
+                            N_filter_shape[3]-1:N_filter_shape[3]-1+N_image_shape[3]] = image_data
+            image_data = image_data2
+            N_image_shape = image_data.shape
+        for bb in range(N_image_shape[0]):
+            for nn in range(N_filter_shape[0]):
+                for im0 in range(N_image_shape[1]):
+                    filter2d = filter_data[nn,im0,:,:]
+                    image2d = image_data[bb,im0,:,:]
+                    for row in range(ref_output.shape[2]):
+                        irow = row * subsample[0]#image row
+                        for col in range(ref_output.shape[3]):
+                            icol = col * subsample[1]#image col
+                            ref_output[bb,nn,row,col] += (image2d[irow:irow+N_filter_shape[2],
+                                                                  icol:icol+N_filter_shape[3]]*filter2d[::-1,::-1]
+                                                          ).sum()

        self.failUnless(_allclose(theano_output, ref_output))

        ############# TEST GRADIENT ############
        if verify_grad:
-            utt.verify_grad(sym_conv2d, [image_data, filter_data])
+            utt.verify_grad(sym_conv2d, [orig_image_data, filter_data])
+

+    def test_basic1(self):
+        """
+        Tests that basic convolutions work for odd and even dimensions of image and filter
+        shapes, as well as rectangular images and filters.
+        """
+        self.validate((2,2,3,3), (2,2,2,2), 'valid', verify_grad=False)

    def test_basic(self):
        """

--- a/theano/tensor/signal/tests/test_conv.py
+++ b/theano/tensor/signal/tests/test_conv.py
 import sys, time, unittest
 import numpy
-from scipy import signal

 import theano
 import theano.tensor as T
@@ -59,7 +58,13 @@ class TestSignalConv2D(unittest.TestCase):

                image2d = image_data3d[b,:,:]
                filter2d = filter_data3d[k,:,:]
-                output2d = signal.convolve2d(image2d, filter2d, 'valid')
+                output2d = numpy.zeros(ref_output.shape)
+                for row in range(ref_output.shape[0]):
+                    for col in range(ref_output.shape[1]):
+                        output2d[row,col] += (image2d[row:row+filter2d.shape[0],
+                                                            col:col+filter2d.shape[1]]*filter2d[::-1,::-1]
+                                                    ).sum()
+

                self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))