Merged

925a4eb6 · Olivier Delalleau · 646bd761 · 15aebd1c · 925a4eb6 · 925a4eb6
--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
@@ -37,8 +37,9 @@ class ProfileMode(Mode):
        fct_call_time = {}#time passed inside theano fct call including op time.
        fct_call = {}
        message=""
+        outputs_size={}
        self.__setstate__((linker, optimizer, apply_time, op_cimpl,
-                           compile_time, fct_call_time, fct_call, message))
+                           compile_time, fct_call_time, fct_call, message, outputs_size))
    def function_maker(self, i,o,m, *args, **kwargs):
        """Return an instance of `Profiler_Maker` which init the count"""
@@ -51,10 +52,11 @@ class ProfileMode(Mode):
    def __getstate__(self):
        #print "__getstate__",self.provided_linker,self.provided_optimizer
        return (self.provided_linker, self.provided_optimizer, self.apply_time,
-                self.op_cimpl, self.compile_time, self.fct_call_time, self.fct_call, self.message)
+                self.op_cimpl, self.compile_time, self.fct_call_time, 
+                self.fct_call, self.message, self.outputs_size)
    def __setstate__(self, (linker, optimizer, apply_time, op_cimpl,
-                            compile_time, fct_call_time, fct_call, message)):
+                            compile_time, fct_call_time, fct_call, message, outputs_size)):
        self.apply_time = apply_time
        self.op_cimpl = op_cimpl
@@ -64,8 +66,11 @@ class ProfileMode(Mode):
        self.call_time = 0
        self.fn_time = 0
        self.message = ""
+        self.outputs_size = outputs_size
        def profile_thunk(i, node, th):
+            """ Profile only the execution time
+            """
            if hasattr(th, 'cthunk'):
                t0 = time.time()
                failure = run_cthunk(th.cthunk)
@@ -82,12 +87,50 @@ class ProfileMode(Mode):
            apply_time[(i,node)] += dt
+        def profile_thunk2(i, node, th):
+            """ Profile the execution time and the memory size.
+            """
+            if hasattr(th, 'cthunk'):
+                t0 = time.time()
+                failure = run_cthunk(th.cthunk)
+                dt = time.time() - t0
+                if failure:
+                    raise RuntimeError(('A C Op raised an exception.  PROFILE_MODE cannot' 
+                        ' tell you what it was though.  Use a standard mode such as'
+                        ' FAST_RUN_NOGC to correct the problem.'))
+            else:
+                t0 = time.time()
+                th()
+                dt = time.time() - t0
+            size=[]
+            for o in th.outputs:
+                s=o[0].size
+                #can't use o[0].dtype.itemsize as dtype is a str for CudaNdarray
+                dtype = str(o[0].dtype)
+                dtype2=dtype[-2:]
+                if dtype2 == '32':
+                    s *= 4
+                elif dtype2 == '64':
+                    s *= 8
+                elif dtype2 == '16':
+                    s *= 2
+                elif dtype[-1] == '8':
+                    s *= 1
+                elif dtype[-3:] == '128':
+                    s *= 16
+                else:
+                    raise Exception("Can't determine the memory size of dtype",o[0].dtype)
+                size.append(s)
+            outputs_size[node]=size
+            apply_time[(i,node)] += dt
        self.provided_linker = linker
        self.provided_optimizer = optimizer
        if isinstance(linker, str) or linker is None:
            linker = predefined_linkers[linker]
-        linker = WrapLinker([linker], profile_thunk)
+        linker = WrapLinker([linker], profile_thunk2)
        self.linker = linker
        if isinstance(optimizer, str) or optimizer is None:
@@ -116,9 +159,10 @@ class ProfileMode(Mode):
        apply_time = self.apply_time
        op_cimpl = self.op_cimpl
        message = self.message
+        outputs_size = self.outputs_size
        self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
-                            apply_time, op_cimpl, message,
+                            apply_time, op_cimpl, message, outputs_size,
                            n_apply_to_print, n_ops_to_print)
@@ -154,15 +198,16 @@ class ProfileMode(Mode):
        apply_time = diff_dict(self.apply_time, other.apply_time)
        op_cimpl = self.op_cimpl and other.op_cimpl
        message = self.message
+        outputs_size = diff_dict(self.outputs_size,other.outputs_size)
        self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
-                            apply_time, op_cimpl, message,
+                            apply_time, op_cimpl, message, outputs_size,
                            n_apply_to_print=n_apply_to_print,
                            n_ops_to_print=n_ops_to_print, print_apply=False)
    @staticmethod
    def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
-                       apply_time, op_cimpl, message,
+                       apply_time, op_cimpl, message, outputs_size,
                       n_apply_to_print=15, n_ops_to_print=20, print_apply=True):
        """
        do the actual printing of print_summary and print_diff_summary.
@@ -240,7 +285,7 @@ class ProfileMode(Mode):
                print '   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs  %.2es %s %7.1f %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, op_flops.get(a,-1), nb_call, nb_apply, a)
            else:
                print '   %4.1f%%  %5.1f%%  %5.3fs  %5.3fs  %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_apply, a)
-        print '   ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\
+        print '   ... (remaining %i Apply account for %6.2f%%(%.2fs) of the runtime)'\
                %(max(0, len(otimes)-n_ops_to_print),
                  sum(f for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:]),
                  sum(t for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:]))
@@ -312,8 +357,6 @@ class ProfileMode(Mode):
        print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
        print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
-        #imported here to break circular dependency...
-        from theano.tensor.basic import as_tensor_variable
        print
        print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
        print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
@@ -348,6 +391,80 @@ class ProfileMode(Mode):
                    if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
                        print fct.name, i.name, i.type, i
+        if outputs_size:
+            fct_memory={}#env->dict(node->(outputs size))
+            var_mem = {}
+            for node,val in outputs_size.items():
+                fct_memory.setdefault(node.env,{})
+                fct_memory[node.env][node]=val
+                for out,v in zip(node.outputs,val):
+                    var_mem[out]=v
+            print 
+            print "Profile of Theano functions memory:"            
+            for env,nodes_mem in fct_memory.iteritems():
+                print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
+                size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
+                print "    Max without gc, inplace and view (KB)",size_sum/1024
+                node_memory_size = 0
+                node_memory_saved_by_view = 0
+                node_memory_saved_by_inplace = 0
+                running_memory_size = 0
+                running_max_memory_size = 0
+                post_thunk_old_storage = []
+                items = nodes_mem.items()
+                items.sort(key=lambda a: a[1])
+                items.reverse()
+                order = env.toposort()
+                computed, last_user = gof.link.gc_helper(order)
+                for node in order:
+                    post_thunk_old_storage.append([ input_idx
+                                                    for input_idx,input in enumerate(node.inputs)
+                                                    if (input in computed) and (input not in env.outputs) and node == last_user[input]])
+                for node,val in items[:n_apply_to_print]:
+                    dmap = getattr(node.op,'destroy_map',None)
+                    vmap = getattr(node.op,'view_map',None)
+                    for idx,v in enumerate(val):
+                        if dmap and idx in dmap:#TODO check the op returned a view
+                            node_memory_saved_by_inplace += v
+                        elif vmap and idx in vmap:#TODO check the op returned a view
+                            node_memory_saved_by_view += v
+                        else: 
+                            node_memory_size += v
+                            running_memory_size += v
+                            if running_memory_size > running_max_memory_size:
+                                running_max_memory_size = running_memory_size
+                            old_storage = post_thunk_old_storage[order.index(node)]
+                            for old_s in old_storage:
+                                running_memory_size -= var_mem[node.inputs[old_s]]
+                                pass
+                    pass
+                print "    Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
+                print "    Max FAST_RUN (KB)", running_max_memory_size/1024
+                print "    Memory saved by view (KB)", node_memory_saved_by_view/1024
+                print "    Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
+                print "    Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
+                n_apply_to_print+=10#TODO remove this line
+                print "    <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
+                print "    <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected." 
+                for key,val in items[:n_apply_to_print]:
+                    code = ['c']*len(node.outputs)
+                    for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
+                        code[out] = "i"
+                    for out,inp in getattr(key.op,'view_map',{}).iteritems():
+                        code[out] = "v"
+                    print '       %9dB  %s %s %s' % (sum(val), str(val), ' '.join(code), key)
+            print '   ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
+                %(max(0, len(nodes_mem)-n_ops_to_print),
+                  sum(sum(val) for key, val in items[n_ops_to_print:]),
+                  sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
        print
        print "We guess some tips to make your code faster. If you think of new one, suggest them on the mailing list. Test them before use as they are not guaranted to always give a speed up."
        from theano import tensor as T

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -421,7 +421,7 @@ class GpuDownsampleFactorMax(Op):
    #def perform(self, node, input_storage, output_storage):
        #raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
-        return ()
+        return (1)
    def c_code(self, node, nodename, (x,), (z,), sub):
        fail = sub['fail']
        ds0, ds1 = self.ds

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -521,6 +521,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    }
    if (1 && (version==6||version==-1) &&
+	kern_len<=320 &&
 	!work_complete) //conv_valid_row_reduce
    {
        int outsize = CudaNdarray_SIZE(out);

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
 import sys, time
 import numpy
+from nose.plugins.skip import SkipTest
+imported_scipy_convolve2d = False
+try:
+    from scipy.signal import convolve2d
+    imported_scipy_convolve2d = True
+except ImportError:
+    pass
 import theano
 # Skip test if cuda_ndarray is not available.
-from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
@@ -38,9 +46,23 @@ def py_conv_full_numpy(img, kern):
    pad_cols = 2*(kern.shape[3]-1) + img.shape[3]
    padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype)
    padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img
-    return py_conv_valid(padded_img, kern)
+    return py_conv_valid_numpy(padded_img, kern)
+def py_conv(img, kern, mode, subsample):
+    """
+    use a scipy or numpy implementation depending is scipy is available.
+    The scipy version is faster.
+    """
+    if imported_scipy_convolve2d:
+        return py_conv_scipy(img, kern, mode, subsample)
+    elif mode=='valid':
+        return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+    elif mode=='full':
+        return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]]
+    else:
+        raise Exception("Can't execute this kernel.")
 def py_conv_scipy(img, kern, mode, subsample):
-    from scipy.signal import convolve2d
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
@@ -89,7 +111,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
    rval = True
    try:
        t0 = time.time()
-        cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
+        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = cuda_tensor4()
        k = cuda_tensor4()
@@ -550,7 +572,7 @@ def _test_dummy():
    rval = True
    t0 = time.time()
-    cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample)
+    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()
    gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
    t2 = time.time()

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -252,13 +252,13 @@ class GpuImages2Neibs(Images2Neibs):
                                                                dtype=ten4.type.dtype)()])
    def c_code_cache_version(self):
-        return ()
+        return (6,)
-        return (2,)
    def c_support_code_apply(self, node, nodename):
-        if self.mode=="valid":
+        mode = self.mode
-            return """
+        return """
-        static __global__ void k_multi_warp_%(nodename)s(
+//a version that use less register but don't work in all case.
+        static __global__ void k_multi_warp_less_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
@@ -274,8 +274,10 @@ class GpuImages2Neibs(Images2Neibs):
            float * global_out
        )
        {
+            const int wrap_centered_idx_shift_x = c/2;
-            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+            const int wrap_centered_idx_shift_y = d/2;
+            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
@@ -289,12 +291,23 @@ class GpuImages2Neibs(Images2Neibs):
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
-                            for (int i = 0; i < c; i++)     // loop over c
+                            int i = threadIdx.y;     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
-                                for (int j = threadIdx.x; j < d; j+=blockDim.x)  // loop over d
+                                if("%(mode)s"=="wrap_centered"){
+                                    ten4_2 -= wrap_centered_idx_shift_x;
+                                    if ( ten4_2 < 0 ) ten4_2 += height;
+                                    else if (ten4_2 >= height) ten4_2 -= height;
+                                }
+                                int j = threadIdx.x;  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
+                                    if("%(mode)s"=="wrap_centered"){
+                                        ten4_3 -= wrap_centered_idx_shift_y;
+                                        if ( ten4_3 < 0 ) ten4_3 += width;
+                                        else if (ten4_3 >= width) ten4_3 -= width;
+                                    }
                                    //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
                                    //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); 
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; 
@@ -307,9 +320,6 @@ class GpuImages2Neibs(Images2Neibs):
            }
        }
-        """ % locals()
-        if self.mode=="wrap_centered":
-            return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
@@ -329,7 +339,7 @@ class GpuImages2Neibs(Images2Neibs):
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
-            for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){
+            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
@@ -343,19 +353,23 @@ class GpuImages2Neibs(Images2Neibs):
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                            int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
-                            for (int i = 0; i < c; i++)     // loop over c
+                            for (int i = threadIdx.y; i < c; i+=blockDim.y)     // loop over c
                            {
                                int ten4_2 = i + a * step_x;
-                                ten4_2 -= wrap_centered_idx_shift_x;
+                                if("%(mode)s"=="wrap_centered"){
-                                if ( ten4_2 < 0 ) ten4_2 += height;
+                                    ten4_2 -= wrap_centered_idx_shift_x;
-                                else if (ten4_2 >= height) ten4_2 -= height;
+                                    if ( ten4_2 < 0 ) ten4_2 += height;
+                                    else if (ten4_2 >= height) ten4_2 -= height;
+                                }
                                for (int j = threadIdx.x; j < d; j+=blockDim.x)  // loop over d
                                {
                                    int ten4_3 = j + b * step_y;
-                                    ten4_3 -= wrap_centered_idx_shift_y;
+                                    if("%(mode)s"=="wrap_centered"){
-                                    if ( ten4_3 < 0 ) ten4_3 += width;
+                                        ten4_3 -= wrap_centered_idx_shift_y;
-                                    else if (ten4_3 >= width) ten4_3 -= width;
+                                        if ( ten4_3 < 0 ) ten4_3 += width;
+                                        else if (ten4_3 >= width) ten4_3 -= width;
+                                    }
                                    //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
                                    //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); 
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; 
@@ -370,7 +384,6 @@ class GpuImages2Neibs(Images2Neibs):
        """ % locals()
    def c_code(self, node, name, (ten4, neib_shape, neib_step), (z,), sub):
        fail = sub['fail']
        mode = self.mode
@@ -473,17 +486,36 @@ class GpuImages2Neibs(Images2Neibs):
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
+            dim3 n_threads(d,c,1);
+            //Their is a max of 512 threads per blocks
+            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--; 
+            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--; 
+            //Make bigger block to have better memory access pattern and a higher core utilisation.
+            //for smaller patch size
+            while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
+                n_threads.z++;
+            }
            int nb_block;
-            if (nb_batch %% 32 == 0)
+            if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
-                nb_block = nb_batch/32;
+                nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
            else
-                nb_block = (int)((float)nb_batch/32. + 1.); 
+                nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
+            dim3 n_blocks(std::min(32*1024,nb_block));
-            dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
-            dim3 n_threads(32,1,1);
            int n_shared = 0;
-            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(                
+	    void (*f)(int, int, int ,int,
+                      int, int, int ,int,
+                      int, int,
+                      int, int, int, int,
+                      float*, float*);
+            if(n_threads.x==d && n_threads.y==c){
+                f = k_multi_warp_less_%(name)s;
+            }else{
+                f = k_multi_warp_%(name)s;
+            }
+            f<<<n_blocks, n_threads, n_shared>>>(                
                nb_batch,
                nb_stack,
                height, width,

--- a/theano/sandbox/test_neighbours.py
+++ b/theano/sandbox/test_neighbours.py
@@ -278,26 +278,30 @@ def test_neibs_wrap_centered_step_manual():
 def test_neibs_gpu():
    if cuda.cuda_available == False:
       raise SkipTest('Optional package cuda disabled')
+    for shape, pshape in [((100,40,18,18),(2,2)),
-    shape = (100,40,18,18)
+                          ((100,40,6,18),(3,2)),
-    images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
+                          ((10,40,66,66),(33,33)),
-    neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
+                          ((10,40,68,66),(34,33))
+                          ]:
-    from theano.sandbox.cuda.basic_ops import gpu_from_host
+        images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
-    f = function([], images2neibs(images,neib_shape),
+        neib_shape = T.as_tensor_variable(pshape)
-                 mode=mode_with_gpu)
-    f_gpu = function([], images2neibs(images,neib_shape),
+        from theano.sandbox.cuda.basic_ops import gpu_from_host
-                 mode=mode_with_gpu)
-    assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
+        f = function([], images2neibs(images,neib_shape),
-    #print images.value
+                     mode=mode_with_gpu)
-    neibs = numpy.asarray(f_gpu())
+        f_gpu = function([], images2neibs(images,neib_shape),
-    assert numpy.allclose(neibs,f())
+                     mode=mode_with_gpu)
-    #print neibs
+        assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
-    g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
+        #print images.value
-    assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
+        neibs = numpy.asarray(f_gpu())
-    #print numpy.asarray(g())
+        assert numpy.allclose(neibs,f())
-    assert numpy.allclose(images.value,g())
+        #print neibs
+        g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
+        assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
+        #print numpy.asarray(g())
+        assert numpy.allclose(images.value,g())
 def speed_neibs():

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -12,7 +12,7 @@ import numpy, theano
 #from copy import copy as python_copy
 from theano import gof, shared
-from theano.gof import Variable, Op, utils, Type, Constant,  Value
+from theano.gof import Variable, Op, Type, Constant,  Value
 from theano.tensor.tsor_apply import Apply
 from theano import gradient
@@ -21,7 +21,7 @@ import elemwise
 from theano import scalar as scal
 from theano.gof.python25 import partial, any, all
 from theano import compile, printing
-from theano.printing import pprint, Print
+from theano.printing import pprint
 ### set up the external interface
 from elemwise import Elemwise, DimShuffle, CAReduce, Sum

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -18,6 +18,16 @@ from theano import gof, Op, tensor, config
 from theano.tensor.tsor_apply import Apply
 from theano.gof.python25 import any
+imported_scipy_signal = False
+try:
+    # scipy is optional: remember whether its private convolution helpers could be imported.
+    from scipy.signal.signaltools import  _valfrommode, _bvalfromboundary
+    from scipy.signal.sigtools import _convolve2d
+    imported_scipy_signal = True
+except ImportError:
+    pass
 _logger=logging.getLogger("theano.signal.conv")
 def _debug(*msg):
    _logger.debug(' '.join([ str(x) for x in msg]))
@@ -547,9 +557,12 @@ class ConvOp(Op):
        """
        By default if len(img2d.shape)==3, we
        """
+        if not imported_scipy_signal:
+            raise theano.gof.utils.MethodNotDefined(
+                "c_headers", type(self), self.__class__.__name__,
+                "Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
        # TODO: move these back out to global scope when they no longer cause an atexit error
-        from scipy.signal.signaltools import  _valfrommode, _bvalfromboundary
-        from scipy.signal.sigtools import _convolve2d
        imshp = self.imshp
        if imshp is None or any([x is None for x in imshp]):
            imshp = tuple(img2d.shape[1:])
@@ -584,8 +597,6 @@ class ConvOp(Op):
            z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
                           dtype=img2d.dtype)
        zz=z[0]
-        val = _valfrommode(self.out_mode)
-        bval = _bvalfromboundary('fill')
        stacklen = imshp[0]
@@ -616,6 +627,9 @@ class ConvOp(Op):
            filtersflipped = buf
            del buf, rstride, cstride
+        val = _valfrommode(self.out_mode)
+        bval = _bvalfromboundary('fill')
        for b in range(bsize):
            for n in range(nkern):
                zz[b,n,...].fill(0)
@@ -623,6 +637,25 @@ class ConvOp(Op):
                    zz[b,n,...] +=  _convolve2d(\
                        img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
+        if False:
+            if False and self.out_mode=="full":
+                img2d2 = numpy.zeros((bsize,stacklen,
+                                      imshp[1]+2*kshp[0]-2,
+                                      imshp[2]+2*kshp[1]-2))
+                img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1],
+                           kshp[1]-1:kshp[1]-1+imshp[2]] = img2d
+                img2d = img2d2
+            #N_image_shape = image_data.shape
+            for b in range(bsize):
+                for n in range(nkern):
+                    zz[b,n,...].fill(0)
+                    for im0 in range(stacklen):
+                        for row in range(0,zz.shape[2],self.dx):
+                            for col in range(0,zz.shape[3],self.dy):
+                                zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\
+                                                            filtersflipped[n,im0,::-1,::-1]).sum()
        #We copy it to remove the Stride mismatch warning from DEBUG_MODE.
        #The copy make that we return an object with the same stride as the c version.
        #The copy don't affect the performence during our experience as in that case we

--- a/theano/tensor/nnet/tests/test_conv.py
+++ b/theano/tensor/nnet/tests/test_conv.py
 import sys, time, unittest
 import numpy
-from scipy import signal
 import theano
 import theano.tensor as T
@@ -60,6 +59,7 @@ class TestConv2D(unittest.TestCase):
        ############# REFERENCE IMPLEMENTATION ############
        s = 1.
+        orig_image_data = image_data
        if border_mode is not 'full': s = -1.
        out_shape2d = numpy.array(N_image_shape[-2:]) +\
                      s*numpy.array(N_filter_shape[-2:]) - s
@@ -68,26 +68,41 @@ class TestConv2D(unittest.TestCase):
        ref_output = numpy.zeros(out_shape)
        # loop over output feature maps
-        for k in range(N_filter_shape[0]):
+        ref_output.fill(0)
-            # loop over input feature maps
+        if border_mode=='full':
-            for l in range(N_filter_shape[1]):
+            image_data2 = numpy.zeros((N_image_shape[0],N_image_shape[1],
+                                      N_image_shape[2]+2*N_filter_shape[2]-2,
-                filter2d = filter_data[k,l,:,:]
+                                      N_image_shape[3]+2*N_filter_shape[3]-2))
+            image_data2[:,:,N_filter_shape[2]-1:N_filter_shape[2]-1+N_image_shape[2],
-                # loop over mini-batches
+                            N_filter_shape[3]-1:N_filter_shape[3]-1+N_image_shape[3]] = image_data
-                for b in range(N_image_shape[0]):
+            image_data = image_data2
-                    image2d = image_data[b,l,:,:]
+            N_image_shape = image_data.shape
-                    output2d = signal.convolve2d(image2d, filter2d, border_mode)
+        for bb in range(N_image_shape[0]):
+            for nn in range(N_filter_shape[0]):
-                    ref_output[b,k,:,:] +=\
+                for im0 in range(N_image_shape[1]):
-                       output2d[::subsample[0],::subsample[1]]
+                    filter2d = filter_data[nn,im0,:,:]
+                    image2d = image_data[bb,im0,:,:]
+                    for row in range(ref_output.shape[2]):
+                        irow = row * subsample[0]#image row
+                        for col in range(ref_output.shape[3]):
+                            icol = col * subsample[1]#image col
+                            ref_output[bb,nn,row,col] += (image2d[irow:irow+N_filter_shape[2],
+                                                                  icol:icol+N_filter_shape[3]]*filter2d[::-1,::-1]
+                                                          ).sum()
        self.failUnless(_allclose(theano_output, ref_output))
        ############# TEST GRADIENT ############
        if verify_grad:
-            utt.verify_grad(sym_conv2d, [image_data, filter_data])
+            utt.verify_grad(sym_conv2d, [orig_image_data, filter_data])
+    def test_basic1(self):
+        """
+        Smoke test: validate one small 'valid' convolution with shapes
+        (2,2,3,3) and (2,2,2,2) (presumably image and filter - TODO confirm
+        against validate's signature), without the gradient verification.
+        """
+        self.validate((2,2,3,3), (2,2,2,2), 'valid', verify_grad=False)
    def test_basic(self):
        """

--- a/theano/tensor/signal/tests/test_conv.py
+++ b/theano/tensor/signal/tests/test_conv.py
 import sys, time, unittest
 import numpy
-from scipy import signal
 import theano
 import theano.tensor as T
@@ -59,7 +58,13 @@ class TestSignalConv2D(unittest.TestCase):
                image2d = image_data3d[b,:,:]
                filter2d = filter_data3d[k,:,:]
-                output2d = signal.convolve2d(image2d, filter2d, 'valid')
+                output2d = numpy.zeros(ref_output.shape)
+                for row in range(ref_output.shape[0]):
+                    for col in range(ref_output.shape[1]):
+                        output2d[row,col] += (image2d[row:row+filter2d.shape[0],
+                                                            col:col+filter2d.shape[1]]*filter2d[::-1,::-1]
+                                                    ).sum()
                self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))