reorganize code

24b77b44 · Nicolas Ballas · Pascal Lamblin · a3e94b40 · 24b77b44 · 24b77b44
--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i
 from theano.tensor.nnet import SoftmaxGrad
 from theano.tensor.signal.downsample import (
    DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
+from theano.tensor.opt import register_specialize_device
+from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           host_from_gpu,
@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
+from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
+                                                AbstractConv2d_gradWeights,
+                                                AbstractConv2d_gradInputs)
+from theano.tensor.opt import register_specialize_device
 def dnn_available():
    if dnn_available.avail is None:
@@ -2439,3 +2448,44 @@ if True:
                gpu_contiguous(ins[1])
            )
            return [out.dimshuffle(0, 1)]
+### AbstractConv optimizations: replace abstract conv nodes that have GPU inputs with dnn_* calls
+@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
+def local_conv2d_cudnn(node):  # returns a one-element replacement list, or None for "no rewrite"
+    inp1 = node.inputs[0]
+    inp2 = node.inputs[1]
+    if not isinstance(inp1.type, CudaNdarrayType) or \
+            not isinstance(inp2.type, CudaNdarrayType):
+        return None  # only rewrite when both leading inputs are CudaNdarrayType variables
+    if not dnn_available():
+        return None  # dnn_available() reported false; do not rewrite
+    if node.op.filters_flip:
+        conv_mode = 'conv'  # op flips its filters -> pass conv_mode='conv' to the dnn_* helpers
+    else:
+        conv_mode = 'cross'  # op does not flip -> pass conv_mode='cross'
+    if (isinstance(node.op, AbstractConv2d)):  # forward op
+        rval = dnn_conv(inp1, inp2,
+                        border_mode=node.op.border_mode,
+                        subsample=node.op.subsample,
+                        direction_hint='forward',
+                        conv_mode = conv_mode)
+        return [rval]
+    if (isinstance(node.op, AbstractConv2d_gradWeights)):  # gradient w.r.t. the weights
+        shape = (inp2.shape[1], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])  # presumably (out chans, in chans, kernel rows, kernel cols) -- TODO confirm
+        rval = dnn_gradweight(inp1, inp2, shape,
+                              border_mode=node.op.border_mode,
+                              subsample=node.op.subsample,
+                              conv_mode = conv_mode)
+        return [rval]
+    if (isinstance(node.op, AbstractConv2d_gradInputs)):  # gradient w.r.t. the inputs; any other op falls through and returns None implicitly
+        shape = (inp2.shape[0], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])  # presumably (batch, in chans, image rows, image cols) -- TODO confirm
+        rval = dnn_gradinput(inp1, inp2, shape,
+                             border_mode=node.op.border_mode,
+                             subsample=node.op.subsample,
+                             conv_mode = conv_mode)
+        return [rval]
+register_specialize_device(local_conv2d_cudnn, 'cudnn')  # registered with the 'cudnn' tag
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -75,6 +75,12 @@ from theano.tensor import slinalg
 from theano.tensor.nnet.Conv3D import Conv3D
 from theano.tests.breakpoint import PdbBreakpoint
+from theano.tensor.nnet.abstract_conv2d import (BaseAbstractConv2d, AbstractConv2d,
+                                                AbstractConv2d_gradWeights,
+                                                AbstractConv2d_gradInputs)
+from theano.tensor.opt import register_specialize_device
 try:
    # We need to be able to import this file even if cuda isn't avail.
    from theano.sandbox.cuda import device_properties
@@ -2619,3 +2625,157 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
 import theano.sandbox.cuda.extra_ops
+### Move-to-GPU optimization for the abstract convolution ops
+@local_optimizer([gpu_from_host,
+                  AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
+def local_conv2d_gpu_conv(node):
+    """
+    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
+    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
+    """
+    if isinstance(node.op, GpuFromHost):  # case 1: a transfer node whose input may come from an abstract conv
+        host_input = node.inputs[0]
+        if host_input.owner and  isinstance(host_input.owner.op, BaseAbstractConv2d):
+            conv = host_input.owner.op
+            inps = list(host_input.owner.inputs)  # copy so the original node's input list is not mutated
+            inps[0] = as_cuda_ndarray_variable(inps[0])  # only the first two inputs are converted;
+            inps[1] = as_cuda_ndarray_variable(inps[1])  # any further inputs are passed through unchanged
+            out = conv(*inps)  # re-apply the same op instance to the converted inputs
+            # out is on the GPU because both inputs are.
+            out = theano.tensor.patternbroadcast(out,
+                                                 node.outputs[0].broadcastable)  # match the broadcast pattern of the output being replaced
+            out.values_eq_approx = values_eq_approx_high_tol  # attach values_eq_approx_high_tol to the new output
+            return [out]
+    if isinstance(node.op, BaseAbstractConv2d):  # case 2: the node is itself an abstract conv
+        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
+        inp1 = node.inputs[0]
+        inp2 = node.inputs[1]
+        if ((isinstance(inp1.type, CudaNdarrayType) and
+             isinstance(inp2.type, CudaNdarrayType))):
+            # Both inputs are already directly on the GPU, nothing to do
+            return  # bare return -> None, i.e. no rewrite
+        inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
+                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))  # GPU-typed, or produced by a HostFromGpu node
+        inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
+                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))  # same test for the second input
+        if inp1_on_gpu or inp2_on_gpu:  # a single GPU-side input is enough to trigger the rewrite
+            conv = node.op
+            inps = list(node.inputs)  # copy so node.inputs itself is not mutated
+            inps[0] = as_cuda_ndarray_variable(inps[0])
+            inps[1] = as_cuda_ndarray_variable(inps[1])
+            out = conv(*inps)
+            # out is on the GPU because both inputs are.
+            out = theano.tensor.patternbroadcast(
+                out,
+                node.outputs[0].broadcastable)
+            out.values_eq_approx = values_eq_approx_high_tol
+            # If the original output was on CPU, we have to transfer it
+            if isinstance(node.outputs[0].type, tensor.TensorType):
+                return [tensor.as_tensor_variable(out)]
+            else:
+                return [out]
+register_opt()(local_conv2d_gpu_conv)  # registered through register_opt() with its default arguments
+### GpuCorrMM optimizations (registered under the 'conv_gemm' tag)
+@local_optimizer([AbstractConv2d])
+def local_conv2d_corrmm(node):  # returns [replacement] or None for "no rewrite"
+    img, kern = node.inputs  # the node must have exactly two inputs for this unpacking to succeed
+    if (not isinstance(img.type, CudaNdarrayType) or
+            not isinstance(kern.type, CudaNdarrayType)):
+        return None  # both inputs must already be CudaNdarrayType variables
+    border_mode = node.op.border_mode
+    subsample = node.op.subsample
+    if (border_mode == 'full') and (subsample == (1, 1)):  # full mode with unit subsample goes through GpuCorrMM_gradInputs
+        if not node.op.filters_flip:  # note: flips when the op does NOT flip, the opposite of the else-branch below
+            kern = kern[:, :, ::-1, ::-1]
+        # need to dimshuffle the kernel for full convolution
+        kern = kern.dimshuffle(1, 0, 2, 3)
+        # call GpuCorrMM_gradInputs
+        rval = GpuCorrMM_gradInputs('valid', subsample)(
+                gpu_contiguous(kern), gpu_contiguous(img))
+    else:
+        # need to flip the kernel if necessary
+        if node.op.filters_flip:
+            kern = kern[:, :, ::-1, ::-1]
+        # By default use GpuCorrMM
+        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
+                                                 gpu_contiguous(kern))
+        # call GpuCorrMM_gradWeights if good
+        # (the latter is faster if batchsize * kernelHeight * kernelWidth
+        # is larger than inputChannels * outputHeight * outputWidth.
+        # GpuConv does not always store information on the batchsize and
+        # channels, though, so we only use what information we have.)
+        if ((subsample == (1,1)) and
+            (node.op.imshp is not None) and
+            (None not in node.op.imshp[-2:]) and
+            (node.op.kshp is not None) and
+            (None not in node.op.kshp)):
+            # we know the kernel and output size
+            prod1 = node.op.kshp[0] * node.op.kshp[1]
+            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
+                     (node.op.imshp[-1] - node.op.kshp[1] + 1))  # (rows - krows + 1) * (cols - kcols + 1)
+            if ((node.op.bsize is not None) and
+                    (len(node.op.imshp) == 3) and
+                    (node.op.imshp[0] is not None)):  # NOTE(review): only taken when imshp has exactly 3 entries -- confirm imshp layout
+                # we also know batchsize and input channels
+                prod1 *= node.op.bsize
+                prod2 *= node.op.imshp[0]
+            # compare to decide
+            if prod1 > prod2:  # overrides the GpuCorrMM result computed above
+                # (we need to wrap the result in as_cuda_ndarray_variable,
+                # because we are not allowed to replace a CudaNdarray with
+                # a DimShuffle instance in a graph optimization)
+                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
+                    GpuCorrMM_gradWeights(border_mode, subsample)(
+                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
+                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
+                    ).dimshuffle(1, 0, 2, 3))
+    return [rval]
+register_specialize_device(local_conv2d_corrmm, 'conv_gemm')  # registered with the 'conv_gemm' tag
+@local_optimizer([AbstractConv2d_gradWeights])
+def local_conv2d_gradweight_corrmm(node):  # returns [replacement] or None for "no rewrite"
+    img, topgrad, shape = node.inputs  # the node must have exactly three inputs
+    if not isinstance(img.type, CudaNdarrayType) or \
+            not isinstance(topgrad.type, CudaNdarrayType):
+        return None  # only rewrite when img and topgrad are CudaNdarrayType variables (shape is not checked)
+    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
+                                 subsample=node.op.subsample)(
+        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
+    if node.op.filters_flip:
+        rval = rval[:, :, ::-1, ::-1]  # reverse the last two axes of the result when the op flips filters
+    rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)  # match the broadcast pattern of the output being replaced
+    rval = as_cuda_ndarray_variable(rval)  # presumably guarantees a GPU-typed replacement after the slicing above -- TODO confirm
+    return [rval]
+register_specialize_device(local_conv2d_gradweight_corrmm, 'conv_gemm')  # registered with the 'conv_gemm' tag
+@local_optimizer([AbstractConv2d_gradInputs])
+def local_conv2d_gradinputs_corrmm(node):  # returns [replacement] or None for "no rewrite"
+    kern, topgrad, shape = node.inputs  # the node must have exactly three inputs
+    if not isinstance(kern.type, CudaNdarrayType) or \
+            not isinstance(topgrad.type, CudaNdarrayType):
+        return None  # only rewrite when kern and topgrad are CudaNdarrayType variables (shape is not checked)
+    if node.op.filters_flip:
+        kern = kern[:, :, ::-1, ::-1]  # reverse the last two axes of the kernel before calling the op
+    rval =  GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
+    subsample=node.op.subsample)(
+        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)  # unlike the gradweight variant, the result is returned without patternbroadcast
+    return [rval]
+register_specialize_device(local_conv2d_gradinputs_corrmm, 'conv_gemm')  # registered with the 'conv_gemm' tag
--- a/theano/sandbox/cuda/tests/test_abstractconv.py
+++ b/theano/sandbox/cuda/tests/test_abstractconv.py
--- a/theano/tensor/nnet/abstract_conv2d.py
+++ b/theano/tensor/nnet/abstract_conv2d.py
@@ -15,20 +15,8 @@ from theano.tensor import TensorType
 from theano.gof import Apply, Op
 from theano.gof import local_optimizer
-from theano.sandbox.cuda import register_opt as register_gpu
 from theano.tensor.opt import register_specialize_device
-### Gpu related optimization (to be moved in sandbox/cuda)
-from theano.sandbox.cuda.basic_ops import (
-    as_cuda_ndarray_variable,
-    gpu_contiguous, gpu_from_host, host_from_gpu,
-    GpuFromHost, HostFromGpu
-    )
 from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda.dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
-from theano.sandbox.cuda.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
-from theano.sandbox.cuda.opt import values_eq_approx_high_tol
 ## Cpu implementation
@@ -36,6 +24,7 @@ from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
 from theano.tensor.nnet.ConvGrad3D import convGrad3D
 from theano.tensor.nnet.ConvTransp3D import convTransp3D
 _logger = logging.getLogger("theano.tensor.nnet.conv2d")
@@ -330,202 +319,6 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
    def connection_pattern(self, node):
        return [[1], [1], [0]]  # no connection to height, width
-### Move to Gpu optimization
-@local_optimizer([gpu_from_host,
-                  AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
-def local_conv2d_gpu_conv(node):
-    """
-    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
-    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
-    """
-    if isinstance(node.op, GpuFromHost):
-        host_input = node.inputs[0]
-        if host_input.owner and  isinstance(host_input.owner.op, BaseAbstractConv2d):
-            conv = host_input.owner.op
-            inps = list(host_input.owner.inputs)
-            inps[0] = as_cuda_ndarray_variable(inps[0])
-            inps[1] = as_cuda_ndarray_variable(inps[1])
-            out = conv(*inps)
-            # out is on the GPU because both inputs are.
-            out = theano.tensor.patternbroadcast(out,
-                                                 node.outputs[0].broadcastable)
-            out.values_eq_approx = values_eq_approx_high_tol
-            return [out]
-    if isinstance(node.op, BaseAbstractConv2d):
-        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
-        inp1 = node.inputs[0]
-        inp2 = node.inputs[1]
-        if ((isinstance(inp1.type, CudaNdarrayType) and
-             isinstance(inp2.type, CudaNdarrayType))):
-            # Both inputs are already directly on the GPU, nothing to do
-            return
-        inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
-                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
-        inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
-                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
-        if inp1_on_gpu or inp2_on_gpu:
-            conv = node.op
-            inps = list(node.inputs)
-            inps[0] = as_cuda_ndarray_variable(inps[0])
-            inps[1] = as_cuda_ndarray_variable(inps[1])
-            out = conv(*inps)
-            # out is on the GPU because both inputs are.
-            out = theano.tensor.patternbroadcast(
-                out,
-                node.outputs[0].broadcastable)
-            out.values_eq_approx = values_eq_approx_high_tol
-            # If the original output was on CPU, we have to transfer it
-            if isinstance(node.outputs[0].type, TensorType):
-                return [as_tensor_variable(out)]
-            else:
-                return [out]
-register_gpu()(local_conv2d_gpu_conv)
-### Cudnn Opt
-@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
-def local_conv2d_cudnn(node):
-    inp1 = node.inputs[0]
-    inp2 = node.inputs[1]
-    if not isinstance(inp1.type, CudaNdarrayType) or \
-            not isinstance(inp2.type, CudaNdarrayType):
-        return None
-    if not dnn_available():
-        return None
-    if node.op.filters_flip:
-        conv_mode = 'conv'
-    else:
-        conv_mode = 'cross'
-    if (isinstance(node.op, AbstractConv2d)):
-        rval = dnn_conv(inp1, inp2,
-                        border_mode=node.op.border_mode,
-                        subsample=node.op.subsample,
-                        direction_hint='forward',
-                        conv_mode = conv_mode)
-        return [rval]
-    if (isinstance(node.op, AbstractConv2d_gradWeights)):
-        shape = (inp2.shape[1], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
-        rval = dnn_gradweight(inp1, inp2, shape,
-                              border_mode=node.op.border_mode,
-                              subsample=node.op.subsample,
-                              conv_mode = conv_mode)
-        return [rval]
-    if (isinstance(node.op, AbstractConv2d_gradInputs)):
-        shape = (inp2.shape[0], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
-        rval = dnn_gradinput(inp1, inp2, shape,
-                             border_mode=node.op.border_mode,
-                             subsample=node.op.subsample,
-                             conv_mode = conv_mode)
-        return [rval]
-register_specialize_device(local_conv2d_cudnn, 'cudnn')
-### Corrmm opt
-@local_optimizer([AbstractConv2d])
-def local_conv2d_corrmm(node):
-    img, kern = node.inputs
-    if (not isinstance(img.type, CudaNdarrayType) or
-            not isinstance(kern.type, CudaNdarrayType)):
-        return None
-    border_mode = node.op.border_mode
-    subsample = node.op.subsample
-    if (border_mode == 'full') and (subsample == (1, 1)):
-        if not node.op.filters_flip:
-            kern = kern[:, :, ::-1, ::-1]
-        # need to dimshuffle the kernel for full convolution
-        kern = kern.dimshuffle(1, 0, 2, 3)
-        # call GpuCorrMM_gradInputs
-        rval = GpuCorrMM_gradInputs('valid', subsample)(
-                gpu_contiguous(kern), gpu_contiguous(img))
-    else:
-        # need to flip the kernel if necessary
-        if node.op.filters_flip:
-            kern = kern[:, :, ::-1, ::-1]
-        # By default use GpuCorrMM
-        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
-                                                 gpu_contiguous(kern))
-        # call GpuCorrMM_gradWeights if good
-        # (the latter is faster if batchsize * kernelHeight * kernelWidth
-        # is larger than inputChannels * outputHeight * outputWidth.
-        # GpuConv does not always store information on the batchsize and
-        # channels, though, so we only use what information we have.)
-        if ((subsample == (1,1)) and
-            (node.op.imshp is not None) and
-            (None not in node.op.imshp[-2:]) and
-            (node.op.kshp is not None) and
-            (None not in node.op.kshp)):
-            # we know the kernel and output size
-            prod1 = node.op.kshp[0] * node.op.kshp[1]
-            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
-                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
-            if ((node.op.bsize is not None) and
-                    (len(node.op.imshp) == 3) and
-                    (node.op.imshp[0] is not None)):
-                # we also know batchsize and input channels
-                prod1 *= node.op.bsize
-                prod2 *= node.op.imshp[0]
-            # compare to decide
-            if prod1 > prod2:
-                # (we need to wrap the result in as_cuda_ndarray_variable,
-                # because we are not allowed to replace a CudaNdarray with
-                # a DimShuffle instance in a graph optimization)
-                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
-                    GpuCorrMM_gradWeights(border_mode, subsample)(
-                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
-                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
-                    ).dimshuffle(1, 0, 2, 3))
-    return [rval]
-register_specialize_device(local_conv2d_corrmm, 'conv_gemm')
-@local_optimizer([AbstractConv2d_gradWeights])
-def local_conv2d_gradweight_corrmm(node):
-    img, topgrad, shape = node.inputs
-    if not isinstance(img.type, CudaNdarrayType) or \
-            not isinstance(topgrad.type, CudaNdarrayType):
-        return None
-    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
-                                 subsample=node.op.subsample)(
-        gpu_contiguous(img), gpu_contiguous(topgrad), shape)
-    if node.op.filters_flip:
-        rval = rval[:, :, ::-1, ::-1]
-    rval = patternbroadcast(rval, node.outputs[0].broadcastable)
-    rval = as_cuda_ndarray_variable(rval)
-    return [rval]
-register_specialize_device(local_conv2d_gradweight_corrmm, 'conv_gemm')
-@local_optimizer([AbstractConv2d_gradInputs])
-def local_conv2d_gradinputs_corrmm(node):
-    kern, topgrad, shape = node.inputs
-    if not isinstance(kern.type, CudaNdarrayType) or \
-            not isinstance(topgrad.type, CudaNdarrayType):
-        return None
-    if node.op.filters_flip:
-        kern = kern[:, :, ::-1, ::-1]
-    rval =  GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
-    subsample=node.op.subsample)(
-        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
-    return [rval]
-register_specialize_device(local_conv2d_gradinputs_corrmm, 'conv_gemm')
 ### Cpu Optmization
 @local_optimizer([AbstractConv2d])
 def local_conv2d_cpu(node):