提交 ad8571e0，作者：f0k

Add GpuAllocEmpty and use it in dnn_conv()

上级 0838ae4d
......@@ -3257,37 +3257,19 @@ class GpuSplit(tensor.Split, GpuOp):
return Apply(self, [x] + node.inputs[1:], outs)
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
The memset_0 param is an optimization. When True, we call
cudaMalloc that is faster.
"""
def __init__(self, memset_0=False):
self.memset_0 = memset_0
class GpuAllocEmpty(GpuOp):
    """Implement Alloc on the gpu, but without initializing memory.

    The op takes only shape inputs (one integer scalar per dimension)
    and returns a CudaNdarray of that shape whose contents are not
    promised to be initialized.
    """

    def __eq__(self, other):
        # The op carries no parameters, so all instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        # Must stay consistent with __eq__: hash on the type alone.
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__
def make_node(self, value, *shape):
#if their is unneeded transfert generated by the next line
#the optimizer will remove them.
v = as_cuda_ndarray_variable(value)
def make_node(self, *shape):
sh = [tensor.as_tensor_variable(s) for s in shape]
if v.ndim != len(shape):
value = tensor.shape_padleft(value, len(shape) - v.ndim)
bcast = []
for s in sh:
......@@ -3300,23 +3282,20 @@ class GpuAlloc(GpuOp):
const_shp = None
bcast.append(numpy.all(1 == const_shp))
otype = CudaNdarrayType(dtype='float32', broadcastable=bcast)
return Apply(self, [v] + sh, [otype()])
return Apply(self, sh, [otype()])
def perform(self, node, inputs, out_):
    """Python fallback: allocate a CudaNdarray of the requested shape.

    `inputs` holds only the shape scalars -- this op takes no fill value.
    """
    out, = out_
    sh = tuple([int(i) for i in inputs])
    # Reuse the previously allocated output when the shape is unchanged.
    if out[0] is None or out[0].shape != sh:
        # XXX: We could implement and call CudaNdarray.empty(sh) instead;
        # zeros() initializes the memory, which this op does not require.
        out[0] = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(sh)
def c_code(self, node, name, inputs, out_, sub):
out, = out_
fail = sub['fail']
value = inputs[0]
shps = inputs[1:]
shps = inputs
nd = len(shps)
memset_0 = int(self.memset_0)
str = "int dims[%(nd)s];\n" % locals()
for idx, sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
......@@ -3340,6 +3319,75 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
""" % locals()
return str
def infer_shape(self, node, input_shapes):
    # The op's inputs *are* the output shape (one scalar per dimension).
    return [node.inputs]

def grad(self, inputs, grads):
    # The unpacking asserts there is exactly one output gradient.  The op
    # is not differentiable w.r.t. its (integer) shape inputs.
    gout, = grads
    return [None for i in inputs]

def c_code_cache_version(self):
    return (1,)

def do_constant_folding(self, node):
    # XXX: anything needed here?
    return False
# Module-level singleton: GpuAllocEmpty has no parameters, so one shared
# instance suffices for all uses.
gpu_alloc_empty = GpuAllocEmpty()
class GpuAlloc(GpuAllocEmpty):
    """Implement Alloc on the gpu.

    The memset_0 param is an optimization. When True, we call
    cudaMemset that is faster.
    """

    def __init__(self, memset_0=False):
        # When True, zero-filling is done with cudaMemset in the C code.
        self.memset_0 = memset_0

    def __eq__(self, other):
        # Unlike the parent, instances differ by their fill strategy.
        return type(self) == type(other) and self.memset_0 == other.memset_0

    def __hash__(self):
        return hash(type(self)) ^ hash(self.memset_0)

    def __str__(self):
        # Hide the memset parameter when not used to prevent confusion.
        if self.memset_0:
            s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
        else:
            s = self.__class__.__name__
        return s
def make_node(self, value, *shape):
    """Build the Apply node: the fill value first, then the shape scalars."""
    # Let GpuAllocEmpty compute the broadcast pattern / output type from
    # the shape inputs, then rebuild the Apply with the value prepended.
    node = super(GpuAlloc, self).make_node(*shape)
    # Detach the output from the parent's Apply so it can be attached to
    # the new Apply built below.
    node.outputs[0].owner = None
    # If there are unneeded transfers generated by the next line,
    # the optimizer will remove them.
    v = as_cuda_ndarray_variable(value)
    if v.ndim != len(shape):
        # Bug fix: rebind `v`.  The original assigned the padded result to
        # `value` and then discarded it, so the un-padded `v` was used below.
        v = tensor.shape_padleft(v, len(shape) - v.ndim)
    return Apply(self, [v] + node.inputs, node.outputs)
def perform(self, node, inputs, out_):
    """Allocate via the parent, then fill with the value.

    inputs[0] is the fill value; inputs[1:] are the shape scalars.
    """
    # Bug fix: `v` was previously undefined (NameError), and the parent's
    # perform received the full `inputs` -- it applies int() to every
    # element, which would fail on the fill value.  Pass the shapes only.
    v = inputs[0]
    # the super class (GpuAllocEmpty) allocates memory, we fill it
    super(GpuAlloc, self).perform(node, inputs[1:], out_)
    out, = out_
    out[0][...] = v  # broadcast v to fill us up
def c_code(self, node, name, inputs, out_, sub):
# the super class (GpuAllocEmpty) allocates memory, we fill it
value = inputs[0]
shps = inputs[1:]
str = super(GpuAllocEmpty, self).c_code(node, name, shps, out_, sub)
out, = out_
fail = sub['fail']
memset_0 = int(self.memset_0)
str += """
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
......@@ -3367,10 +3415,6 @@ class GpuAlloc(GpuOp):
def infer_shape(self, node, input_shapes):
    # The output shape is given by the shape inputs; input 0 is the
    # fill value and contributes nothing to the shape.
    return [node.inputs[1:]]

def grad(self, inputs, grads):
    # The unpacking asserts exactly one output gradient; the gradient is
    # not implemented for this op (None for every input).
    gout, = grads
    return [None for i in inputs]

def c_code_cache_version(self):
    return (7,)
......
......@@ -17,7 +17,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu,
gpu_contiguous, HostFromGpu,
gpu_alloc)
gpu_alloc_empty)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -443,8 +443,8 @@ class GpuDnnConv(DnnBase, COp):
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_img = GpuDnnConvGradI()(kerns, top, img, desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -519,8 +519,8 @@ class GpuDnnConvGradW(DnnBase, COp):
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, img.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_img = GpuDnnConvGradI()(kerns, top, img, desc)
d_top = GpuDnnConv()(img, kerns, top, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -586,8 +586,8 @@ class GpuDnnConvGradI(DnnBase, COp):
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, kerns.zeros_like(), desc)
d_top = GpuDnnConv()(img, kerns, top.zeros_like(), desc)
d_kerns = GpuDnnConvGradW()(img, top, kerns, desc)
d_top = GpuDnnConv()(img, kerns, top, desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -675,7 +675,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = gpu_alloc(_zero.clone(), shape_i(kerns, 1, fgraph),
out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(img.shape, out.shape)
......@@ -692,7 +692,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = gpu_alloc(_zero.clone(), shape_i(img, 0, fgraph),
out = gpu_alloc_empty(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(out.shape, kerns.shape)
......@@ -709,9 +709,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc(_zero.clone(),
out_shp[0], out_shp[1],
out_shp[2], out_shp[3])
out = gpu_alloc_empty(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论