Commit c649d668 authored by Arjun Jain

- fixed a bug in the caffe conv (values of _M, _N, _K)

- added a test that checks a variety of image and kernel shapes and sizes

- removed the flip from local_conv_gemm in cuda/opt.py, as the code never reached there for me

- added the flip to the kernel in the test code itself (see the flip sketch below)
Parent f18c8498
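For context on the flip that moved out of local_conv_gemm and into the test: scipy's convolve2d flips the kernel, while the GEMM-based GpuConvMM path appears to compute a plain correlation, so rotating the kernel by 180 degrees before the GPU call makes the two agree. A minimal standalone sketch of that identity (not part of this commit; shapes and names are illustrative only):

import numpy
from scipy.signal import convolve2d, correlate2d

# Illustrative shapes only: one 7x7 image and one 3x3 kernel.
img = numpy.random.rand(7, 7).astype('float32')
kern = numpy.random.rand(3, 3).astype('float32')

# Convolution flips the kernel; correlation does not. Rotating the kernel
# by 180 degrees (numpy.rot90(kern, 2)) turns one into the other, which is
# what the updated test does to npy_kern before calling GpuConvMM.
conv = convolve2d(img, kern, mode='valid')
corr_flipped = correlate2d(img, numpy.rot90(kern, 2), mode='valid')
assert numpy.allclose(conv, corr_flipped, rtol=1e-4)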
...
@@ -165,7 +165,7 @@ CudaNdarray* validMM(const CudaNdarray *input,
     float alpha = 1.0f; float beta = 0.0f;
     int m = CudaNdarray_HOST_DIMS(columns)[1];
-    int n = CudaNdarray_HOST_DIMS(weight)[1];
+    int n = CudaNdarray_HOST_DIMS(weight)[0];
     int k = CudaNdarray_HOST_DIMS(columns)[0];
     status = cublasSgemm(handle,
...
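A rough NumPy sketch of the shape bookkeeping behind the m/n/k fix above, assuming a Caffe-style im2col layout (weight flattened to (nFilters, channels*kH*kW), columns as (channels*kH*kW, outH*outW)); this illustrates the dimension choice only, not the actual cuBLAS call, which also involves column-major/transpose details:

import numpy

# Illustrative sizes; the names (nFilters, outH, ...) are not from the commit.
nFilters, channels, kH, kW = 2, 3, 3, 3
outH, outW = 5, 5

weight = numpy.random.rand(nFilters, channels * kH * kW)       # flattened filters
columns = numpy.random.rand(channels * kH * kW, outH * outW)   # im2col buffer

# Mirrors the fixed indices in validMM:
#   m = columns dims[1]  (number of output pixels)
#   n = weight  dims[0]  (number of filters)  <- changed from dims[1]
#   k = columns dims[0]  (reduction length, channels*kH*kW)
m, n, k = columns.shape[1], weight.shape[0], columns.shape[0]

out = weight.dot(columns)   # GEMM: (n, k) x (k, m) -> (n, m)
assert out.shape == (n, m)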
...
@@ -1291,7 +1291,8 @@ def local_conv_gemm(node):
         print "WARNING, YOU ARE USING BUGGED CODE!"
         img, kern = node.inputs
         img = gpu_contiguous(img)
-        kern = gpu_contiguous(kern[:, :, ::-1, ::-1])
+        #kern = kern[:, :, ::-1, ::-1]
+        kern = gpu_contiguous(kern)
         return [GpuConvMM(node.op.border_mode)(img, kern)]
 gpu_optimizer.register("conv_gemm", local_conv_gemm)
...
...
@@ -115,8 +115,8 @@ def py_conv_scipy(img, kern, mode, subsample):
         for k in xrange(out.shape[1]):
             for s in xrange(img.shape[1]):
                 out[b, k, :, :] += convolve2d(img[b, s, :, :],
-                                              numpy.rot90(kern[k, s, :, :], 2),
+                                              kern[k, s, :, :],
                                               mode)
     return out[:, :, ::subsample[0], ::subsample[1]]
...
@@ -819,37 +819,53 @@ class TestConv2DGPU(unittest.TestCase):
 def _test_dummy():
-    ishape = (1, 1, 7, 7)
-    kshape = (1, 1, 3, 3)
-    mode = 'valid'
-    subsample = (1, 1)
-    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
-    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
-    i = cuda_tensor4()
-    k = cuda_tensor4()
-    #print >> sys.stdout, '_params_allgood trying ', ishape, kshape, mode
-    t2 = None
-    rval = True
-    t0 = time.time()
-    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
-    t1 = time.time()
-    op = theano.sandbox.cuda.blas.GpuConvMM(border_mode=mode)(i, k)
-    f = theano.function([i, k], op, mode=theano_mode)
-    gpuval = f(npy_img, npy_kern)
-    t2 = time.time()
-    gpuval = numpy.asarray(gpuval)
-    print gpuval
-    print '-------------------'
-    print cpuval
+    """
+    input: (batch size, channels, rows, columns)
+    filters: (number of filters, channels, rows, columns)
+    """
+    for bs in range(1, 5):
+        for ch in range(1, 4):
+            for nf in range(1, 4):
+                for rImg in range(5, 9):
+                    for rFlt in range(2, 3):
+                        ishape = (bs, ch, rImg, rImg)
+                        kshape = (nf, ch, rFlt, rFlt)
+                        print "ishape: ", ishape
+                        print "kshape: ", kshape
+                        # ishape = (2, 1, 5, 5)
+                        # kshape = (2, 1, 3, 3)
+                        mode = 'valid'
+                        subsample = (1, 1)
+                        npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
+                        npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
+                        i = cuda_tensor4()
+                        k = cuda_tensor4()
+                        t2 = None
+                        t0 = time.time()
+                        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
+                        t1 = time.time()
+                        op = theano.sandbox.cuda.blas.GpuConvMM(border_mode=mode)(i, k)
+                        f = theano.function([i, k], op, mode=theano_mode)
+                        for k in range(npy_kern.shape[0]):
+                            for s in range(npy_kern.shape[1]):
+                                npy_kern[k, s, :, :] = numpy.rot90(npy_kern[k, s, :, :], 2)
+                        gpuval = f(npy_img, npy_kern)
+                        t2 = time.time()
+                        gpuval = numpy.asarray(gpuval)
+                        rval = numpy.allclose(cpuval, gpuval, rtol=1e-4)
+                        assert (rval == True)
+                        print 'Test Passed'
 
 def benchmark():
...