- added some documentation

- changed conv to corr as suggested by Fred

- added some documentation
4c55bc4b · Arjun Jain · 1e3de2ce · 4c55bc4b · 4c55bc4b · 4c55bc4b
--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -51,8 +51,21 @@ TODO: Give examples for how to use these things! They are pretty complicated.
      implementation.
      Also, there is restrictions on which shape are supported.
+    - :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`
+      This is a GPU-only version of a correlation that computes correlations
+      as `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu>`_ does.
+      For each element in a batch, it first creates a
+      `Toeplitz <http://en.wikipedia.org/wiki/Toeplitz_matrix>`_ matrix in a cuda kernel.
+      Then, it performs a `gemm` call to multiply this Toeplitz matrix and the kernel.
+      It needs extra memory for this, which is the size of the Toeplitz matrix. Precisely,
+      the dimensions of this Toeplitz matrix are equal to
+      (number of channels * filter width * filter height, output width * output height).
+      You can enable it for calls to conv2d by setting 'THEANO_FLAGS=optimizer_including=conv_gemm'
+      in your environment. This is not enabled by default because it
+      uses some extra memory. It doesn't support strides for now and requires square kernels.
 .. autofunction:: theano.tensor.nnet.conv.conv2d
 .. autofunction:: theano.tensor.nnet.Conv3D.conv3D
 .. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
 .. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
+.. autofunction:: theano.sandbox.cuda.blas.GpuCorrMM
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -498,7 +498,7 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
 gpu_ger_inplace = GpuGer(inplace=True)
-class GpuConvMM(GpuOp):
+class GpuCorrMM(GpuOp):
    """
    Author: Arjun Jain
    Implement the caffe convolution
@@ -516,10 +516,10 @@ class GpuConvMM(GpuOp):
        self.pad = pad
        if pad != 0:
            raise NotImplementedError(
-                "GpuConvMM don't implement the pad parameter")
+                "GpuCorrMM don't implement the pad parameter")
        if subsample != (1, 1):
            raise NotImplementedError(
-                "GpuConvMM we don't implement the subsample parameter")
+                "GpuCorrMM we don't implement the subsample parameter")
    def __eq__(self, other):
        return type(self) == type(other) \
@@ -658,7 +658,7 @@ class GpuConvMM(GpuOp):
    }
-    out2 = validMM(%(img)s, %(kern)s, %(out)s, pad);
+    out2 = corrMM(%(img)s, %(kern)s, %(out)s, pad);
    if (out2==NULL){
       %(fail)s
    }

--- a/theano/sandbox/cuda/conv_gemm.cu
+++ b/theano/sandbox/cuda/conv_gemm.cu
@@ -105,7 +105,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
     long batchSize = CudaNdarray_HOST_DIMS(input)[0];
     if (CudaNdarray_HOST_DIMS(input)[2] != CudaNdarray_HOST_DIMS(input)[3]){
       PyErr_Format(PyExc_ValueError,
-                    "GpuConvMM support only square images. Got %dx%d images\n",
+                    "GpuCorrMM support only square images. Got %dx%d images\n",
 		    CudaNdarray_HOST_DIMS(input)[2],
 		    CudaNdarray_HOST_DIMS(input)[3]
 		    );
@@ -113,14 +113,14 @@ CudaNdarray* corrMM(const CudaNdarray *input,
     }
     if (kW != kH){
       PyErr_Format(PyExc_ValueError,
-                    "GpuConvMM support only square kernel. Got %dx%d kernel\n",
+                    "GpuCorrMM support only square kernel. Got %dx%d kernel\n",
 		    kW, kH
 		    );
       return NULL;
     }
     if (CudaNdarray_HOST_DIMS(input)[1]  != CudaNdarray_HOST_DIMS(weight)[1]){
       PyErr_SetString(PyExc_ValueError,
-                    "GpuConvMM images and kernel must have the same stack size\n"
+                    "GpuCorrMM images and kernel must have the same stack size\n"
 		    );
       return NULL;
     }
@@ -136,7 +136,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
 	 outputHeight != CudaNdarray_HOST_DIMS(output)[2] ||
 	 outputWidth != CudaNdarray_HOST_DIMS(output)[3]){
       PyErr_SetString(PyExc_ValueError,
-                    "GpuConvMM outputs parameter don't have the good shape\n"
+                    "GpuCorrMM outputs parameter don't have the good shape\n"
 		    );
       return NULL;
     }
@@ -182,7 +182,7 @@ CudaNdarray* corrMM(const CudaNdarray *input,
                );
  	     if (status != CUBLAS_STATUS_SUCCESS) {
-      	         std::cerr << "!!!! CUBLAS error in GpuConvMM\n";
+      	         std::cerr << "!!!! CUBLAS error in GpuCorrMM\n";
 	      }
      }

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -25,7 +25,7 @@ from theano.sandbox.cuda.basic_ops import (
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
-        gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuConvMM)
+        gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, GpuCorrMM)
 from theano.sandbox.cuda.blas import gpu_gemv_inplace
 from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
 from theano.sandbox.cuda.blas import gpu_ger_inplace
@@ -1292,7 +1292,7 @@ def local_conv_gemm(node):
        img = gpu_contiguous(img)
        kern = kern[:, :, ::-1, ::-1]
        kern = gpu_contiguous(kern)
-        return [GpuConvMM(node.op.border_mode)(img, kern)]
+        return [GpuCorrMM(node.op.border_mode)(img, kern)]
 gpu_optimizer.register("conv_gemm", local_conv_gemm)

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -648,7 +648,7 @@ def test_valid():
        shp[1][2]/shp[4][0] == shp[1][3]/shp[4][1])]
    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5,
-              theano_mode=mode, cls=cuda.blas.GpuConvMM)
+              theano_mode=mode, cls=cuda.blas.GpuCorrMM)
 def test_full():
@@ -713,14 +713,14 @@ def test_full():
 #    exec_conv(version, shapes, verbose, random, 'full')
-    # Test the GpuConvMM version
+    # Test the GpuCorrMM version
    mode = theano_mode.including("conv_gemm")
    shapes = [shp for shp in shapes if shp[1][2] == shp[1][3]]
    shapes = [shp for shp in shapes if shp[0][2] == shp[0][3]]
    shapes = shapes[0:10]
    exec_conv(version, shapes, verbose, random, 'full',
-              theano_mode=mode, cls=cuda.blas.GpuConvMM)
+              theano_mode=mode, cls=cuda.blas.GpuCorrMM)
 def test_subsample():
@@ -856,7 +856,7 @@ def test_gemm():
                            t1 = time.time()
-                            op = theano.sandbox.cuda.blas.GpuConvMM(border_mode=mode)(i, k)
+                            op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode=mode)(i, k)
                            f = theano.function([i, k], op, mode=theano_mode)
                            for k in range(npy_kern.shape[0]):