提交 bb53ed07，作者：Pascal Lamblin

Merge pull request #2664 from f0k/gpu-alloc-empty

GpuAllocEmpty for dnn_conv()
...@@ -3257,38 +3257,13 @@ class GpuSplit(tensor.Split, GpuOp): ...@@ -3257,38 +3257,13 @@ class GpuSplit(tensor.Split, GpuOp):
return Apply(self, [x] + node.inputs[1:], outs) return Apply(self, [x] + node.inputs[1:], outs)
class GpuAlloc(GpuOp): class GpuAllocEmpty(GpuOp):
"""Implement Alloc on the gpu. """Implement Alloc on the gpu, but without initializing memory."""
__props__ = ()
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
def __init__(self, memset_0=False):
self.memset_0 = memset_0
def __eq__(self, other):
return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
return hash(type(self)) ^ hash(self.memset_0)
def __str__(self):
#Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): @staticmethod
#if their is unneeded transfert generated by the next line def validate_shape(shape):
#the optimizer will remove them.
v = as_cuda_ndarray_variable(value)
sh = [tensor.as_tensor_variable(s) for s in shape] sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = [] bcast = []
for s in sh: for s in sh:
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
...@@ -3300,23 +3275,26 @@ class GpuAlloc(GpuOp): ...@@ -3300,23 +3275,26 @@ class GpuAlloc(GpuOp):
const_shp = None const_shp = None
bcast.append(numpy.all(1 == const_shp)) bcast.append(numpy.all(1 == const_shp))
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast) otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
return Apply(self, [v] + sh, [otype()]) output = otype()
return sh, output
def make_node(self, *shape):
    """Build an Apply node yielding an uninitialized GPU array of `shape`."""
    shape_vars, output = self.validate_shape(shape)
    # The buffer is left uninitialized (garbage), so approximate comparison
    # against it must always pass.
    output.values_eq_approx = tensor.type.values_eq_approx_always_true
    return Apply(self, shape_vars, [output])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
v = inputs[0] sh = tuple([int(i) for i in inputs])
sh = tuple([int(i) for i in inputs[1:]])
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
# XXX: We could implement and call CudaNdarray.empty(sh) instead.
out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh) out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inputs, out_, sub): def c_code(self, node, name, inputs, out_, sub):
out, = out_ out, = out_
fail = sub['fail'] fail = sub['fail']
value = inputs[0] shps = inputs
shps = inputs[1:]
nd = len(shps) nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals() str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps): for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals() str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
...@@ -3340,6 +3318,65 @@ class GpuAlloc(GpuOp): ...@@ -3340,6 +3318,65 @@ class GpuAlloc(GpuOp):
%(fail)s; %(fail)s;
} }
} }
""" % locals()
return str
def infer_shape(self, node, input_shapes):
    # The op's inputs are the shape scalars themselves (see make_node), so
    # they directly describe the single output's shape.
    return [node.inputs]
def c_code_cache_version(self):
    """Version tag used to invalidate cached compiled C code."""
    return (1,)
def do_constant_folding(self, node):
    """Disable constant folding: the output is uninitialized memory, so
    folding it to a constant would bake in garbage values."""
    return False
# Shared, parameterless instance used wherever an uninitialized GPU
# allocation node is needed (e.g. by dnn_conv and the gradient ops below).
gpu_alloc_empty = GpuAllocEmpty()
class GpuAlloc(GpuAllocEmpty):
    """Implement Alloc on the gpu.

    Reuses GpuAllocEmpty's allocation machinery, then fills the buffer
    with a value. The memset_0 param is an optimization. When True, we
    call cudaMemset that is faster.
    """

    __props__ = ('memset_0',)
def __init__(self, memset_0=False):
    # memset_0=True lets the C code zero-fill the buffer via cudaMemset,
    # which is faster than broadcasting a zero value into it.
    self.memset_0 = memset_0
def __str__(self):
    """Render the op name; mention memset_0 only when it is enabled,
    to prevent confusion in printed graphs."""
    name = self.__class__.__name__
    if not self.memset_0:
        return name
    return "%s{memset_0=%s}" % (name, self.memset_0)
def make_node(self, value, *shape):
    """Build an Apply node that broadcasts `value` over the symbolic shape."""
    # Any superfluous host->GPU transfer introduced by this conversion is
    # removed later by the optimizer.
    gpu_value = as_cuda_ndarray_variable(value)
    shape_vars, output = self.validate_shape(shape)
    return Apply(self, [gpu_value] + shape_vars, [output])
def perform(self, node, inputs, out_):
    """Allocate the output buffer (delegated to GpuAllocEmpty) and fill it."""
    fill_value = inputs[0]
    shape_inputs = inputs[1:]
    # The parent class handles (re)allocation of the output storage.
    super(GpuAlloc, self).perform(node, shape_inputs, out_)
    storage, = out_
    # Broadcast the fill value across the whole buffer.
    storage[0][...] = fill_value
def c_code(self, node, name, inputs, out_, sub):
    # the super class (GpuAllocEmpty) allocates memory, we fill it
    value = inputs[0]
    shps = inputs[1:]
    # NOTE(review): super(GpuAllocEmpty, self) starts the MRO search *after*
    # GpuAllocEmpty, so this dispatches past GpuAllocEmpty.c_code — the very
    # method that emits the allocation code. It likely should be
    # super(GpuAlloc, self), matching perform() above — confirm.
    str = super(GpuAllocEmpty, self).c_code(node, name, shps, out_, sub)
    out, = out_
    fail = sub['fail']
    # Baked into the generated C as a compile-time flag for the memset path.
    memset_0 = int(self.memset_0)
str += """
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s)) if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{ {
cudaError_t err = cudaMemset(%(out)s->devdata, 0, cudaError_t err = cudaMemset(%(out)s->devdata, 0,
...@@ -3369,10 +3406,6 @@ class GpuAlloc(GpuOp): ...@@ -3369,10 +3406,6 @@ class GpuAlloc(GpuOp):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [node.inputs[1:]] return [node.inputs[1:]]
def grad(self, inputs, grads):
gout, = grads
return [None for i in inputs]
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (9,)
......
...@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp ...@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
gpu_alloc) gpu_alloc_empty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
...@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp): ...@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp):
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc) d_img = GpuDnnConvGradI()(kerns, top, gpu_alloc_empty(*img.shape), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp):
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc(_zero.clone(), shape_i(kerns, 1, fgraph), out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape) conv_mode='cross')(img.shape, out.shape)
...@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc(_zero.clone(), shape_i(img, 0, fgraph), out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3) shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape) conv_mode=conv_mode)(out.shape, kerns.shape)
...@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = gpu_alloc(_zero.clone(), out = gpu_alloc_empty(*out_shp)
out_shp[0], out_shp[1],
out_shp[2], out_shp[3])
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
......
...@@ -630,6 +630,10 @@ def values_eq_approx_remove_inf_nan(a, b): ...@@ -630,6 +630,10 @@ def values_eq_approx_remove_inf_nan(a, b):
return TensorType.values_eq_approx(a, b, True, True) return TensorType.values_eq_approx(a, b, True, True)
def values_eq_approx_always_true(a, b):
    """Approximate-equality predicate that accepts any pair of values.

    Used for outputs whose contents are deliberately undefined (e.g.
    uninitialized allocations), where no comparison should ever fail.
    """
    return True
# Register TensorType C code for ViewOp. # Register TensorType C code for ViewOp.
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
TensorType, TensorType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论