Merge pull request #5174 from abergeron/cormm_f16

Make corrMM work in float16/64

Merge pull request #5174 from abergeron/cormm_f16
cea45e8b · Frédéric Bastien · GitHub · 29af0e5b · d79d38c1 · cea45e8b
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -173,11 +173,15 @@ class Kernel(object):
    fname: str
        the name of the function wrapper.
        (defaults to name + `_call`)
+    sname: str
+        the name of the scheduled call function
+        (defaults to name + `_scall`)

    """

    def __init__(self, code, params, name, flags,
-                 codevar=None, binvar=None, objvar=None, fname=None):
+                 codevar=None, binvar=None, objvar=None, fname=None,
+                 sname=None):
        self.code = code
        self.params = params
        self.name = name
@@ -194,6 +198,9 @@ class Kernel(object):
        if fname is None:
            fname = name + '_call'
        self.fname = fname
+        if sname is None:
+            sname = name + '_scall'
+        self.sname = sname

    @staticmethod
    def get_flags(*types):
@@ -338,22 +345,30 @@ class GpuKernelBase(object):
        setargs = '\n  '.join(setargs)

        return """
-int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
+int {fname}(unsigned int _nd, size_t *_gdim, size_t *_ldim, size_t _shared,
                  {args}) {{
  {setargs}

-  return GpuKernel_call(&{kname}, nd, ldim, gdim, shared, NULL);
+  return GpuKernel_call(&{kname}, _nd, _ldim, _gdim, _shared, NULL);
 }}
-        """.format(args=args, fname=k.fname, setargs=setargs, kname=k.objvar)

-    def c_support_code(self):
-        return """
-        template <typename T>
-        static T ceil_intdiv(T a, T b)
-        {
-            return (a/b) + ((a % b) ? 1: 0);
-        }
-        """
+int {sname}(unsigned int _nd, size_t *_n, size_t _shared, {args}) {{
+  size_t _ls = 0;
+  size_t _gs = 0;
+  int _err;
+
+  if (_nd != 1) return GA_UNSUPPORTED_ERROR;
+
+  _err = GpuKernel_sched(&{kname}, _n[0], &_ls, &_gs);
+  if (_err != GA_NO_ERROR)
+    return _err;
+
+  {setargs}
+
+  return GpuKernel_call(&{kname}, 1, &_ls, &_gs, _shared, NULL);
+}}
+        """.format(args=args, fname=k.fname, setargs=setargs, sname=k.sname,
+                   kname=k.objvar)

    def c_support_code_apply(self, node, name):
        kernels = self.gpu_kernels(node, name)
@@ -428,7 +443,7 @@ int {fname}(unsigned int nd, size_t *gdim, size_t *ldim, size_t shared,
            The node that we need the cache version for.

        """
-        return (6, self.get_params(node).bin_id)
+        return (7, self.get_params(node).bin_id)


 def forward_string_meth(name):
@@ -466,12 +481,14 @@ class CGpuKernelBase(COp, GpuKernelBase):

    kernel_re = re.compile(r'^#kernel ([a-zA-Z_].*?)$', re.MULTILINE)

-    c_support_code = forward_string_meth('c_support_code')
    c_support_code_apply = forward_string_meth('c_support_code_apply')
    c_support_code_struct = forward_string_meth('c_support_code_struct')
    c_init_code_struct = forward_string_meth('c_init_code_struct')
    c_cleanup_code_struct = forward_string_meth('c_cleanup_code_struct')

+    def c_code_cache_version_apply(self, node):
+        return GpuKernelBase.c_code_cache_version_apply(self, node)
+
    def _type_macros(self, node):
        define_template = "#define %s %s\n"
        undef_template = "#undef %s\n"

--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -414,7 +414,7 @@ gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
 gpugemmbatch_inplace = GpuGemmBatch(inplace=True)


-class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
+class BaseGpuCorrMM(CGpuKernelBase):
    """
    Base class for `GpuCorrMM`, `GpuCorrMM_gradWeights` and
    `GpuCorrMM_gradInputs`. Cannot be used directly.
@@ -429,9 +429,9 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1)).
    """
-
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    _f16_ok = True

    def __init__(self, border_mode="valid", subsample=(1, 1),
                 filter_dilation=(1, 1)):
@@ -489,9 +489,15 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    def get_params(self, node):
        return node.inputs[0].type.context

+    def c_headers(self):
+        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+
    def c_code_cache_version(self):
-        # raise this whenever modifying any of the support_code_files
-        return (0, 2)
+        # Raise this whenever modifying the code below.
+        return (2,)

    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -953,7 +959,7 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
            return [[1], [1], [0], [0]]  # no connection to height, width


-class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
+class BaseGpuCorr3dMM(CGpuKernelBase):
    """
    Base class for `GpuCorr3dMM`, `GpuCorr3dMM_gradWeights` and
    `GpuCorr3dMM_gradInputs`. Cannot be used directly.
@@ -967,10 +973,11 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
        Perform subsampling of the output (default: (1, 1, 1)).
    filter_dilation
        Perform subsampling of the input, also known as dilation (default: (1, 1, 1)).
-    """

+    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_dilation')
+    _f16_ok = True

    def __init__(self, border_mode="valid", subsample=(1, 1, 1),
                 filter_dilation=(1, 1, 1)):
@@ -1028,9 +1035,15 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
    def get_params(self, node):
        return node.inputs[0].type.context

+    def c_headers(self):
+        return ["<gpuarray/array.h>", "<gpuarray/blas.h>", "gpuarray_helper.h"]
+
+    def c_header_dirs(self):
+        return [os.path.dirname(__file__)]
+
    def c_code_cache_version(self):
-        # raise this whenever modifying any of the support_code_files
-        return (0, 2)
+        # raise this whenever modifying the code below.
+        return (2,)

    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):

--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
@@ -236,11 +236,9 @@ KERNEL void col2im3d_kernel(const ga_size n,
  }
 }

-
-
 #section support_code_struct

-int im3d2col(const size_t max_threads_dim,
+int im3d2col(
    gpudata * data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t kernel_h, const size_t kernel_w, const size_t kernel_d,
@@ -257,13 +255,10 @@ int im3d2col(const size_t max_threads_dim,
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_kernel_d) / stride_d + 1;
  size_t num_kernels = channels * height_col * width_col * depth_col;
-  size_t threads_per_block = max_threads_dim;
-  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
-  GpuKernel *kernel;
-  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
-    err = dilated_im3d2col_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
+    err = dilated_im3d2col_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
@@ -273,10 +268,9 @@ int im3d2col(const size_t max_threads_dim,
                     "gpuarray error: dilated_im3d2col_kernel: %s.",
                     GpuKernel_error(&k_dilated_im3d2col_kernel, err));
    }
-  }
-  else{
-    err = im3d2col_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  } else {
+    err = im3d2col_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
@@ -290,7 +284,7 @@ int im3d2col(const size_t max_threads_dim,
  return err;
 }

-int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
+int col2im3d(gpudata * data_col, const size_t channels,
    const size_t height, const size_t width, const size_t depth,
    const size_t patch_h, const size_t patch_w, const size_t patch_d,
    const size_t dilation_h, const size_t dilation_w, const size_t dilation_d,
@@ -304,14 +298,12 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
  size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  size_t depth_col = (depth + 2 * pad_d - dil_patch_d) / stride_d + 1;
  size_t num_kernels = channels * height * width * depth;
-  size_t threads_per_block = max_threads_dim;
-  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  int err;
-  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
-    err = dilated_col2im3d_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  if (dilation_h != 1 || dilation_w != 1 || dilation_d != 1) {
+    err = dilated_col2im3d_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
@@ -323,8 +315,8 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
    }
  }
  else{
-    err = col2im3d_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+    err = col2im3d_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
      height_col, width_col, depth_col, data_im, data_im_offset);
@@ -460,15 +452,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        return NULL;
    }

-    // Get the max threads per blocks
-    size_t max_threads_dim;
-    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
-    if (err != GA_NO_ERROR){
-        PyErr_Format(PyExc_RuntimeError,
-                     "Could not fetch max_threads_dim.");
-        return NULL;
-    }
-
    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;
@@ -492,8 +475,6 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
-    const DTYPE_INPUT_0 one = 1.0f;
-    const DTYPE_INPUT_0 zero = 0.0f;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
@@ -502,24 +483,46 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
-            err = im3d2col(max_threads_dim,
-                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
-                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
-                           padH, padW, padD, dH, dW, dD, col->ga.data);
+            err = im3d2col(
+              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
+              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
-            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                N_, M_, K_, one,
-                                col->ga.data, 0, N_,
-                                weight->ga.data, 0, K_,
-                                zero,
-                                top->ga.data, n * top_stride, N_);
+            switch (col->ga.typecode) {
+            case GA_FLOAT:
+              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            case GA_DOUBLE:
+              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            case GA_HALF:
+              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            default:
+              err = GA_UNSUPPORTED_ERROR;
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorr3dMM encountered an error running sgemm.\n");
+                             "GpuCorr3dMM forward encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
@@ -531,10 +534,10 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im3d2col
-            err = im3d2col(max_threads_dim,
-                           bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
-                           bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
-                           padH, padW, padD, dH, dW, dD, col->ga.data);
+            err = im3d2col(
+              bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+              bottomWidth, bottomDepth, kH, kW, kD, dilH, dilW, dilD,
+              padH, padW, padD, dH, dW, dD, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
@@ -543,15 +546,37 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
-                                K_, M_, N_, one,
-                                col->ga.data, 0, N_,
-                                top->ga.data, n * top_stride, N_,
-                                (n == 0) ? zero : one,
-                                weight->ga.data, 0, K_);
+            switch (col->ga.typecode) {
+            case GA_FLOAT:
+              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            case GA_DOUBLE:
+              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            case GA_HALF:
+              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            default:
+              err = GA_UNSUPPORTED_ERROR;
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorr3dMM encountered an error running sgemm.\n");
+                             "GpuCorr3dMM grad weights encountered an error running gemm.");
                Py_DECREF(col);
                return NULL;
            }
@@ -562,29 +587,50 @@ PyGpuArrayObject* corr3dMM(PyGpuArrayObject *const bottom,
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
-            // gemm into columns
+          // gemm into columns
+          switch (top->ga.typecode) {
+          case GA_FLOAT:
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, one,
+                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
-                                zero,
+                                0,
                                col->ga.data, 0, N_);
-            if (err != GA_NO_ERROR) {
-                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorr3dMM encountered an error running sgemm.\n");
-                Py_DECREF(col);
-                return NULL;
-            }
-            // col2im3d back to the data
-            err = col2im3d(max_threads_dim,
-                           col->ga.data, nChannels,
-                           bottomHeight, bottomWidth, bottomDepth,
-                           kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
-                           dH, dW, dD, bottom->ga.data, n * bottom_stride);
-            if (err != GA_NO_ERROR) {
-                Py_DECREF(col);
-                return NULL;
-            }
+            break;
+          case GA_DOUBLE:
+            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
+                                N_, K_, M_, 1,
+                                top->ga.data, n * top_stride, N_,
+                                weight->ga.data, 0, K_,
+                                0,
+                                col->ga.data, 0, N_);
+            break;
+          case GA_HALF:
+            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
+                                N_, K_, M_, 1,
+                                top->ga.data, n * top_stride, N_,
+                                weight->ga.data, 0, K_,
+                                0,
+                                col->ga.data, 0, N_);
+            break;
+          default:
+            err = GA_UNSUPPORTED_ERROR;
+          }
+          if (err != GA_NO_ERROR) {
+            PyErr_Format(PyExc_RuntimeError,
+                         "GpuCorr3dMM grad inputs encountered an error running gemm.");
+            Py_DECREF(col);
+            return NULL;
+          }
+          // col2im3d back to the data
+          err = col2im3d(col->ga.data, nChannels,
+                         bottomHeight, bottomWidth, bottomDepth,
+                         kH, kW, kD, dilH, dilW, dilD, padH, padW, padD,
+                         dH, dW, dD, bottom->ga.data, n * bottom_stride);
+          if (err != GA_NO_ERROR) {
+            Py_DECREF(col);
+            return NULL;
+          }
        }
    }
    // Free temporary columns

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -195,8 +195,7 @@ KERNEL void col2im_kernel(const ga_size n,

 #section support_code_struct

-int im2col(const size_t max_threads_dim,
-    gpudata * data_im, const size_t data_im_offset, const size_t channels,
+int im2col(gpudata *data_im, const size_t data_im_offset, const size_t channels,
    const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w,
@@ -209,13 +208,10 @@ int im2col(const size_t max_threads_dim,
  size_t height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  size_t num_kernels = channels * height_col * width_col;
-  size_t threads_per_block = max_threads_dim;
-  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  int err;
-  GpuKernel *kernel;
-  if(dilation_h != 1 || dilation_w != 1){
-    err = dilated_im2col_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  if (dilation_h != 1 || dilation_w != 1) {
+    err = dilated_im2col_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
      width_col, data_col);
@@ -224,10 +220,9 @@ int im2col(const size_t max_threads_dim,
                     "gpuarray error: dilated_im2col_kernel: %s.",
                     GpuKernel_error(&k_dilated_im2col_kernel, err));
    }
-  }
-  else{
-    err = im2col_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  } else {
+    err = im2col_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, height_col,
      width_col, data_col);
@@ -240,7 +235,7 @@ int im2col(const size_t max_threads_dim,
  return err;
 }

-int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channels,
+int col2im(gpudata * data_col, const size_t channels,
    const size_t height, const size_t width, const size_t patch_h, const size_t patch_w,
    const size_t dilation_h, const size_t dilation_w,
    const size_t pad_h, const size_t pad_w, const size_t stride_h,
@@ -250,14 +245,12 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
  size_t height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  size_t width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  size_t num_kernels = channels * height * width;
-  size_t threads_per_block = max_threads_dim;
-  size_t n_blocks = (num_kernels + threads_per_block - 1) / threads_per_block;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.
  int err;
-  if(dilation_h != 1 || dilation_w != 1){
-    err = dilated_col2im_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  if (dilation_h != 1 || dilation_w != 1) {
+    err = dilated_col2im_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_col, height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im, data_im_offset);
@@ -266,10 +259,9 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
                     "gpuarray error: dilated_col2im_kernel: %s.",
                     GpuKernel_error(&k_dilated_col2im_kernel, err));
    }
-  }
-  else{
-    err = col2im_kernel_call(
-      1, &n_blocks, &threads_per_block, 0,
+  } else {
+    err = col2im_kernel_scall(
+      1, &num_kernels, 0,
      num_kernels, data_col, height, width, channels, patch_h, patch_w,
      pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im, data_im_offset);
@@ -393,15 +385,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        return NULL;
    }

-    // Get the max threads per blocks
-    size_t max_threads_dim;
-    err = gpucontext_property(bottom->context->ctx, GA_CTX_PROP_MAXLSIZE, &max_threads_dim);
-    if (err != GA_NO_ERROR){
-        PyErr_Format(PyExc_RuntimeError,
-                     "Could not fetch max_threads_dim.");
-        return NULL;
-    }
-
    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
@@ -411,8 +394,7 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
-    if (NULL == col)
-    {
+    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
@@ -425,8 +407,6 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
-    const DTYPE_INPUT_0 one = 1.0f;
-    const DTYPE_INPUT_0 zero = 0.0f;

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
@@ -435,8 +415,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(max_threads_dim,
-                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+            err = im2col(bottom->ga.data, n * bottom_stride,
+                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
@@ -444,15 +424,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                return NULL;
            }
            // Second, gemm
-            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
-                                N_, M_, K_, one,
-                                col->ga.data, 0, N_,
-                                weight->ga.data, 0, K_,
-                                zero,
-                                top->ga.data, n * top_stride, N_);
+            switch (col->ga.typecode) {
+            case GA_FLOAT:
+              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            case GA_DOUBLE:
+              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            case GA_HALF:
+              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
+                                  N_, M_, K_, 1,
+                                  col->ga.data, 0, N_,
+                                  weight->ga.data, 0, K_,
+                                  0,
+                                  top->ga.data, n * top_stride, N_);
+              break;
+            default:
+              err = GA_UNSUPPORTED_ERROR;
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorrMM encountered an error running sgemm.\n");
+                             "GpuCorrMM forward encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
@@ -464,8 +466,8 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
-            err = im2col(max_threads_dim,
-                         bottom->ga.data, n * bottom_stride, nChannels, bottomHeight,
+            err = im2col(bottom->ga.data, n * bottom_stride,
+                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
@@ -476,15 +478,37 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
-            err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
-                                K_, M_, N_, one,
-                                col->ga.data, 0, N_,
-                                top->ga.data, n * top_stride, N_,
-                                (n == 0) ? zero : one,
-                                weight->ga.data, 0, K_);
+            switch (col->ga.typecode) {
+            case GA_FLOAT:
+              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            case GA_DOUBLE:
+              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            case GA_HALF:
+              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
+                                  K_, M_, N_, 1,
+                                  col->ga.data, 0, N_,
+                                  top->ga.data, n * top_stride, N_,
+                                  (n == 0) ? 0 : 1,
+                                  weight->ga.data, 0, K_);
+              break;
+            default:
+                err = GA_UNSUPPORTED_ERROR;
+            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorrMM encountered an error running sgemm.\n");
+                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
@@ -496,21 +520,42 @@ PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
+          switch (top->ga.typecode) {
+          case GA_FLOAT:
            err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
-                                N_, K_, M_, one,
+                                N_, K_, M_, 1,
+                                top->ga.data, n * top_stride, N_,
+                                weight->ga.data, 0, K_,
+                                0,
+                                col->ga.data, 0, N_);
+            break;
+          case GA_DOUBLE:
+            err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
+                                N_, K_, M_, 1,
+                                top->ga.data, n * top_stride, N_,
+                                weight->ga.data, 0, K_,
+                                0,
+                                col->ga.data, 0, N_);
+            break;
+          case GA_HALF:
+            err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
+                                N_, K_, M_, 1,
                                top->ga.data, n * top_stride, N_,
                                weight->ga.data, 0, K_,
-                                zero,
+                                0,
                                col->ga.data, 0, N_);
+            break;
+          default:
+            err = GA_UNSUPPORTED_ERROR;
+          }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
-                        "GpuCorrMM encountered an error running sgemm.\n");
+                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
-            err = col2im(max_threads_dim,
-                         col->ga.data, nChannels, bottomHeight, bottomWidth,
+            err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
                         dH, dW, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -613,6 +613,15 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray/types.h>']

+    def c_support_code(self):  # C++ source for ceil_intdiv(a, b): a/b, plus 1 when a % b != 0
+        return """
+        template <typename T>
+        static T ceil_intdiv(T a, T b)
+        {
+            return (a/b) + ((a % b) ? 1: 0);
+        }
+        """
+
    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out

--- a/theano/gpuarray/neighbours.py
+++ b/theano/gpuarray/neighbours.py
@@ -242,6 +242,15 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
                              flags=flags, objvar=k_var))
        return kernels

+    def c_support_code(self):  # C++ source for ceil_intdiv(a, b): a/b, plus 1 when a % b != 0
+        return """
+        template <typename T>
+        static T ceil_intdiv(T a, T b)
+        {
+            return (a/b) + ((a % b) ? 1: 0);
+        }
+        """
+
    def c_code(self, node, name, inp, out, sub):
        dtype_ten4 = node.inputs[0].dtype
        dtype_neib_shape = node.inputs[1].dtype

--- a/theano/gpuarray/tests/config.py
+++ b/theano/gpuarray/tests/config.py
 from __future__ import absolute_import, print_function, division
 from nose.plugins.skip import SkipTest

+import theano.tensor
 import theano.gpuarray

 if theano.gpuarray.pygpu is None:
@@ -21,3 +22,10 @@ if theano.config.mode == 'FAST_COMPILE':
 else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
+
+
+# Cast a float16 reference input to float32; any other dtype is returned as is
+def ref_cast(x):
+    if x.type.dtype == 'float16':  # checks the dtype of x itself
+        x = theano.tensor.cast(x, 'float32')
+    return x
--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -17,7 +17,7 @@ from .. import dnn
 from ..basic_ops import GpuAllocEmpty
 from ..type import gpuarray_shared_constructor

-from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
+from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
 from . import test_nnet
 from .rnn_support import Model, GRU, LSTM, WrapperLayer

@@ -33,13 +33,6 @@ def set_precision(floatX):
    return precision


-# If using float16, cast reference input to float32
-def ref_cast(x):
-    if theano.config.floatX == 'float16':
-        x = T.cast(x, 'float32')
-    return x
-
-
 def test_dnn_conv_desc_merge():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -3,13 +3,14 @@ import unittest
 import numpy

 import theano
+from theano import config
 from theano.tests import unittest_tools as utt

 from theano.tensor.nnet.corr import CorrMM, CorrMM_gradWeights, CorrMM_gradInputs

 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
-from .config import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, ref_cast


 class TestCorrMM(unittest.TestCase):
@@ -22,15 +23,16 @@ class TestCorrMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        conv_ref = CorrMM(border_mode=border_mode,
                          filter_dilation=filter_dilation,
-                          subsample=subsample)(inputs, filters)
+                          subsample=subsample)(ref_cast(inputs),
+                                               ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)

        conv = GpuCorrMM(border_mode=border_mode,
@@ -120,20 +122,20 @@ class TestCorrMM(unittest.TestCase):
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))

        if (subsample == (1, 1)):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
-                inputs, dCdH)
+                ref_cast(inputs), ref_cast(dCdH))
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
-                inputs, dCdH, shape=shape)
+                ref_cast(inputs), ref_cast(dCdH), shape=shape)
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)

@@ -167,8 +169,8 @@ class TestCorrMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

@@ -178,12 +180,13 @@ class TestCorrMM(unittest.TestCase):

        if (subsample == (1, 1)):
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs))
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = CorrMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs, shape=bottom_shape)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs),
+                shape=bottom_shape)
            conv_gemm = GpuCorrMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)


--- a/theano/gpuarray/tests/test_gemmcorr3d.py
+++ b/theano/gpuarray/tests/test_gemmcorr3d.py
@@ -3,13 +3,14 @@ import unittest
 import numpy

 import theano
+from theano import config
 from theano.tests import unittest_tools as utt

 from theano.tensor.nnet.corr3d import Corr3dMM, Corr3dMM_gradWeights, Corr3dMM_gradInputs

 from ..type import gpuarray_shared_constructor
 from ..blas import GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs
-from .config import mode_with_gpu, mode_without_gpu
+from .config import mode_with_gpu, mode_without_gpu, ref_cast


 class TestCorr3dMM(unittest.TestCase):
@@ -22,15 +23,15 @@ class TestCorr3dMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        conv_ref = Corr3dMM(border_mode=border_mode,
                            filter_dilation=filter_dilation,
-                            subsample=subsample)(inputs, filters)
+                            subsample=subsample)(ref_cast(inputs), ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)

        conv = GpuCorr3dMM(border_mode=border_mode,
@@ -120,20 +121,20 @@ class TestCorr3dMM(unittest.TestCase):
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 4, 1, 2, 3)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        dCdH_val = numpy.random.random(dCdH_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        dCdH_val = numpy.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(numpy.array(filters_shape[2:]))

        if (subsample == (1, 1, 1)):
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
-                inputs, dCdH)
+                ref_cast(inputs), ref_cast(dCdH))
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH)
        else:
            conv_ref = Corr3dMM_gradWeights(subsample=subsample)(
-                inputs, dCdH, shape=shape)
+                ref_cast(inputs), ref_cast(dCdH), shape=shape)
            conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape)

@@ -167,8 +168,8 @@ class TestCorr3dMM(unittest.TestCase):
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

-        inputs_val = numpy.random.random(inputs_shape).astype('float32')
-        filters_val = numpy.random.random(filters_shape).astype('float32')
+        inputs_val = numpy.random.random(inputs_shape).astype(config.floatX)
+        filters_val = numpy.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

@@ -179,12 +180,12 @@ class TestCorr3dMM(unittest.TestCase):

        if (subsample == (1, 1, 1)):
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs))
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = Corr3dMM_gradInputs(subsample=subsample)(
-                kern=filters, topgrad=inputs, shape=bottom_shape)
+                kern=ref_cast(filters), topgrad=ref_cast(inputs), shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)