update optim

dc6c058c · Nicolas Ballas · Pascal Lamblin · 5ae763de · dc6c058c
--- a/theano/tensor/nnet/abstract_conv2d.py
+++ b/theano/tensor/nnet/abstract_conv2d.py
@@ -14,31 +14,25 @@ from theano.tensor import (as_tensor_variable, blas, get_scalar_constant_value,
 from theano.gof import Apply, Op
 from theano.gof import local_optimizer

+from theano.sandbox.cuda import register_opt as register_gpu
+from theano.tensor.opt import register_specialize_device
+
+
+### GPU-related optimizations (to be moved to sandbox/cuda)
 from theano.sandbox.cuda.basic_ops import (
    as_cuda_ndarray_variable,
    gpu_contiguous, gpu_from_host, host_from_gpu,
    GpuFromHost, HostFromGpu
    )
-from theano.sandbox.cuda import gpu_optimizer, register_opt
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda.dnn import dnn_available, dnn_conv
 from theano.sandbox.cuda.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
 from theano.sandbox.cuda.opt import values_eq_approx_high_tol


+## Cpu implementation
 from theano.tensor.nnet import conv2d as cpu_conv2d
-
-imported_scipy_signal = False
-try:
-    # TODO: move these back out to global scope when they no longer
-    # cause an atexit error
-    from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
-    from scipy.signal.sigtools import _convolve2d
-    imported_scipy_signal = True
-except ImportError:
-    pass
-
-_logger = logging.getLogger("theano.tensor.nnet.conv")
+_logger = logging.getLogger("theano.tensor.nnet.conv2d")


 def conv2d(img,
@@ -115,7 +109,7 @@ def conv2d(img,



-class BaseConv2d(Op):
+class BaseAbstractConv2d(Op):
    """Base class for ConvInferace

    FIXME
@@ -178,7 +172,7 @@ class BaseConv2d(Op):



-class Conv2d(BaseConv2d):
+class AbstractConv2d(BaseAbstractConv2d):
    """
    FIXME
    """
@@ -188,7 +182,7 @@ class Conv2d(BaseConv2d):
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1)):
-        super(Conv2d, self).__init__(imshp, kshp, bsize,
+        super(AbstractConv2d, self).__init__(imshp, kshp, bsize,
                                     border_mode, subsample)

    def make_node(self, img, kern):
@@ -200,29 +194,31 @@ class Conv2d(BaseConv2d):
        broadcastable=[img.broadcastable[0],
                       kern.broadcastable[0],
                       False, False]
-        img = as_tensor_variable(img)
-        kern = as_tensor_variable(kern)
-        output = theano.tensor.tensor(dtype=img.type.dtype,
-                                      broadcastable=broadcastable)
+        output = img.type.__class__(dtype=img.type.dtype,
+                                    broadcastable=broadcastable)
        return Apply(self, [img, kern], [output])

    def perform(self, node, inp, out_):
-        raise NotImplementedError('Conv2d theano optimization failed')
+        raise NotImplementedError('AbstractConv2d theano optimization failed')

    def grad(self, inp, grads):
        bottom, weights = inp
        top, = grads
-        d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
-                                     self.border_mode, self.subsample)(
+        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
+                                             self.bsize,
+                                             self.border_mode,
+                                             self.subsample)(
            weights, top, bottom.shape[-2:])
-        d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
-                                       self.border_mode, self.subsample)(
+        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
+                                               self.bsize,
+                                               self.border_mode,
+                                               self.subsample)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights


-class Conv2d_gradWeights(BaseConv2d):
-    """Gradient wrt. filters for `Conv2d`.
+class AbstractConv2d_gradWeights(BaseAbstractConv2d):
+    """Gradient wrt. filters for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
@@ -236,7 +232,7 @@ class Conv2d_gradWeights(BaseConv2d):
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1)):
-        super(Conv2d_gradWeights, self).__init__(imshp, kshp, bsize,
+        super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp, bsize,
                                                 border_mode, subsample)

    def make_node(self, img, topgrad, shape=None):
@@ -255,23 +251,27 @@ class Conv2d_gradWeights(BaseConv2d):
        broadcastable=[topgrad.broadcastable[0],
                       img.broadcastable[0],
                       False, False]
-        img = as_tensor_variable(img)
-        topgrad = as_tensor_variable(topgrad)
-        output = theano.tensor.tensor(dtype=img.type.dtype,
-                                      broadcastable=broadcastable)
+        output = img.type.__class__(dtype=img.type.dtype,
+                                    broadcastable=broadcastable)
        return Apply(self, [img, topgrad] + height_width, [output])

    def perform(self, node, inp, out_):
-        raise NotImplementedError('Conv2d_gradWeight theano optimization failed')
+        raise NotImplementedError('AbstractConv2d_gradWeight theano optimization failed')

    def grad(self, inp, grads):
+        """Return the gradients wrt. the `bottom` and `top` inputs.
+
+        The two optional shape inputs get disconnected gradients.
+        """
        bottom, top = inp[:2]
        weights, = grads
-        d_bottom = Conv2d_gradInputs(self.imshp, self.kshp, self.bsize,
-                                     self.border_mode, self.subsample)(
+        d_bottom = AbstractConv2d_gradInputs(
+            self.imshp, self.kshp, self.bsize,
+            self.border_mode, self.subsample)(
            weights, top, bottom.shape[-2:])
-        d_top = Conv2d(self.imshp, self.kshp, self.bsize,
-                       self.border_mode, self.subsample)(bottom, weights)
+        d_top = AbstractConv2d(self.imshp, self.kshp, self.bsize,
+                               self.border_mode, self.subsample)(
+            bottom, weights)
        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
        return (d_bottom, d_top) + d_height_width

@@ -282,8 +282,8 @@ class Conv2d_gradWeights(BaseConv2d):
            return [[1], [1], [0], [0]]  # no connection to height, width


-class Conv2d_gradInputs(Conv2d):
-    """Gradient wrt. inputs for `Conv2d`.
+class AbstractConv2d_gradInputs(BaseAbstractConv2d):
+    """Gradient wrt. inputs for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
@@ -297,7 +297,7 @@ class Conv2d_gradInputs(Conv2d):
                 bsize=None,
                 border_mode="valid",
                 subsample=(1, 1)):
-        super(Conv2d_gradInputs, self).__init__(imshp, kshp, bsize,
+        super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp, bsize,
                                                border_mode, subsample)

    def make_node(self, kern, topgrad, shape=None):
@@ -312,24 +312,25 @@ class Conv2d_gradInputs(Conv2d):
        broadcastable = [topgrad.type.broadcastable[0],
                         kern.type.broadcastable[1],
                         False, False]
-        kern = as_tensor_variable(kern)
-        topgrad = as_tensor_variable(topgrad)
-        output = theano.tensor.tensor(dtype=kern.type.dtype,
-                                      broadcastable=broadcastable)
+        output = kern.type.__class__(dtype=kern.type.dtype,
+                                     broadcastable=broadcastable)
        return Apply(self, [kern, topgrad] + height_width, [output])


    def perform(self, node, nodename, inp, out_, sub):
-        raise NotImplementedError('Conv2d_gradWeight theano optimization failed')
+        raise NotImplementedError('AbstractConv2d_gradInputs theano optimization failed')

    def grad(self, inp, grads):
+        """Gradients wrt. `weights` and `top`; shapes are disconnected."""
        weights, top = inp[:2]
        bottom, = grads
-        d_weights = Conv2d_gradWeights(self.imshp, self.kshp, self.bsize,
-                                       self.border_mode, self.subsample)(
+        d_weights = AbstractConv2d_gradWeights(
+            self.imshp, self.kshp, self.bsize,
+            self.border_mode, self.subsample)(
            bottom, top, weights.shape[-2:])
-        d_top = Conv2d(self.imshp, self.filter_shape, self.bsize,
-                       self.border_mode, self.subsample)(bottom, weights)
+        d_top = AbstractConv2d(self.imshp, self.kshp, self.bsize,
+                               self.border_mode, self.subsample)(
+            bottom, weights)
        d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
        return (d_weights, d_top) + d_height_width

@@ -340,38 +341,126 @@ class Conv2d_gradInputs(Conv2d):
            return [[1], [1], [0], [0]]  # no connection to height, width


+### These optimizations should be moved to their appropriate files.
+
+### Move to the GPU optimizations.
+### Do not replace the abstract op itself, only move its inputs.
+### The abstract op is replaced later by the device-specialized optimizations.
+@local_optimizer([gpu_from_host, AbstractConv2d,
+                  AbstractConv2d_gradWeights,
+                  AbstractConv2d_gradInputs])
+def local_conv2d_gpu_conv(node):
+    """Move the tensor inputs of an abstract convolution op to the GPU.
+
+    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
+
+    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
+
+    The abstract op itself is kept (rebuilt with the same parameters);
+    only its first two inputs are wrapped in gpu_from_host.
+    The result is tagged with values_eq_approx_high_tol.
+
+    :param node: Apply node of GpuFromHost or of one of the abstract ops.
+    :return: one-element list with the replacement output, or None when
+        the node matches neither pattern.
+    """
+    abstract_ops = (AbstractConv2d,
+                    AbstractConv2d_gradWeights,
+                    AbstractConv2d_gradInputs)
+
+    def move_to_gpu(conv, inputs):
+        # Rebuild the op from its own class with the same parameters.
+        gpu_op = conv.__class__(imshp=conv.imshp,
+                                kshp=conv.kshp,
+                                bsize=conv.bsize,
+                                border_mode=conv.border_mode,
+                                subsample=conv.subsample)
+        # Only the first two inputs are transferred. Whatever follows
+        # them (the shape input of the gradient ops) is passed through
+        # unchanged, and nothing extra is passed when it is absent.
+        return gpu_op(gpu_from_host(inputs[0]),
+                      gpu_from_host(inputs[1]),
+                      *inputs[2:])
+
+    if isinstance(node.op, GpuFromHost):
+        # gpu_from_host(conv) -> gpu_conv(gpu_from_host)
+        host_input = node.inputs[0]
+        if host_input.owner and isinstance(host_input.owner.op, abstract_ops):
+            out = move_to_gpu(host_input.owner.op, host_input.owner.inputs)
+            # NOTE(review): `out` is already built from GPU inputs; confirm
+            # that wrapping it in gpu_from_host once more is intended.
+            out = theano.tensor.patternbroadcast(gpu_from_host(out),
+                                                 node.outputs[0].broadcastable)
+            out.values_eq_approx = values_eq_approx_high_tol
+            return [out]
+
+    if isinstance(node.op, abstract_ops):
+        # conv(host_from_gpu) -> host_from_gpu(gpu_conv)
+        inp1, inp2 = node.inputs[:2]
+        # Only rewrite when at least one input already comes from the GPU.
+        inp1_on_gpu = (inp1.owner and isinstance(inp1.owner.op, HostFromGpu))
+        inp2_on_gpu = (inp2.owner and isinstance(inp2.owner.op, HostFromGpu))
+        if inp1_on_gpu or inp2_on_gpu:
+            out = move_to_gpu(node.op, node.inputs)
+            out = theano.tensor.patternbroadcast(
+                out,
+                node.outputs[0].broadcastable)
+            out.values_eq_approx = values_eq_approx_high_tol
+            # presumably converts the GPU result back to a host tensor so
+            # the replacement keeps the node's original type -- verify.
+            return [as_tensor_variable(out)]
+# We register the optimizer that moves convolutions to the GPU.
+register_gpu()(local_conv2d_gpu_conv)
+

+@local_optimizer([AbstractConv2d,
+                  AbstractConv2d_gradWeights,
+                  AbstractConv2d_gradInputs])
+def local_conv2d_cudnn(node):

-def replace_conv_with_cudnn(convop, inputs):
+    """Replace an abstract convolution whose inputs are on the GPU by cuDNN.
+
+    Returns a one-element list holding the `dnn_conv` output, or None when
+    an input is not a CudaNdarray or cuDNN is not available.
+    """
+    inp1, inp2 = node.inputs[:2]
+    if not (isinstance(inp1.type, CudaNdarrayType) and
+            isinstance(inp2.type, CudaNdarrayType)):
+        return None
    if not dnn_available():
        return None
-
-    inp1, inp2, shape = inputs
-    if (isinstance(convop, Conv2d)):
+    if (isinstance(node.op, AbstractConv2d)):
        rval = dnn_conv(inp1, inp2,
-                        border_mode=convop.border_mode,
-                        subsample=convop.subsample,
+                        border_mode=node.op.border_mode,
+                        subsample=node.op.subsample,
                        direction_hint='forward')
-        return rval
+        return [rval]
-    if (isinstance(convop, Conv2d_gradWeights)):
+    if (isinstance(node.op, AbstractConv2d_gradWeights)):
        rval = dnn_conv(inp1.dimshuffle(1, 0, 2, 3), inp2,
-                        border_mode=convop.border_mode,
-                        subsample=convop.subsample,
+                        border_mode=node.op.border_mode,
+                        subsample=node.op.subsample,
                        direction_hint='bprop weights')
-        return rval
+        return [rval]
-    if (isinstance(convop, Conv2d_gradInputs)):
+    if (isinstance(node.op, AbstractConv2d_gradInputs)):
        rval = dnn_conv(inp1, inp2,
-                        border_mode=convop.border_mode,
-                        subsample=convop.subsample,
+                        border_mode=node.op.border_mode,
+                        subsample=node.op.subsample,
                        direction_hint='bprop inputs')
-        return rval
+        return [rval]
+register_specialize_device()(local_conv2d_cudnn)

-def replace_convforward_with_corrmm(convop, inputs):
-    img, kern, shape = inputs

-    if convop.border_mode in ['full', 'valid']:
-        border_mode = convop.border_mode
-        subsample = convop.subsample
+@local_optimizer([AbstractConv2d])
+def local_conv2d_corrmm(node):
+    """Lower an AbstractConv2d whose inputs are on the GPU to corrmm ops."""
+    img, kern = node.inputs
+    if not (isinstance(img.type, CudaNdarrayType) and
+            isinstance(kern.type, CudaNdarrayType)):
+        return None
+
+    if node.op.border_mode in ['full', 'valid']:
+        border_mode = node.op.border_mode
+        subsample = node.op.subsample
        if (border_mode == 'valid') or (subsample != (1,1)):
            # need to flip the kernel for valid convolution
            kern = kern[:, :, ::-1, ::-1]
@@ -385,20 +474,20 @@ def replace_convforward_with_corrmm(convop, inputs):
            # GpuConv does not always store information on the batchsize and
            # channels, though, so we only use what information we have.)
            if ((subsample == (1,1)) and
-                (convop.imshp is not None) and
-                (None not in convop.imshp[-2:]) and
-                (convop.kshp is not None) and
-                (None not in convop.kshp)):
+                (node.op.imshp is not None) and
+                (None not in node.op.imshp[-2:]) and
+                (node.op.kshp is not None) and
+                (None not in node.op.kshp)):
                # we know the kernel and output size
-                prod1 = convop.kshp[0] * convop.kshp[1]
-                prod2 = ((convop.imshp[-2] - convop.kshp[0] + 1) *
-                         (convop.imshp[-1] - convop.kshp[1] + 1))
-                if ((convop.bsize is not None) and
-                        (len(convop.imshp) == 3) and
-                        (convop.imshp[0] is not None)):
+                prod1 = node.op.kshp[0] * node.op.kshp[1]
+                prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
+                         (node.op.imshp[-1] - node.op.kshp[1] + 1))
+                if ((node.op.bsize is not None) and
+                        (len(node.op.imshp) == 3) and
+                        (node.op.imshp[0] is not None)):
                    # we also know batchsize and input channels
-                    prod1 *= convop.bsize
-                    prod2 *= convop.imshp[0]
+                    prod1 *= node.op.bsize
+                    prod2 *= node.op.imshp[0]
                # compare to decide
                if prod1 > prod2:
                    # (we need to wrap the result in as_cuda_ndarray_variable,
@@ -416,108 +505,41 @@ def replace_convforward_with_corrmm(convop, inputs):
            rval = GpuCorrMM_gradInputs('valid', subsample)(
                    gpu_contiguous(kern), gpu_contiguous(img))
-        return rval
+        return [rval]
+register_specialize_device()(local_conv2d_corrmm)

-def replace_convgradweight_with_corrmm(convop, inputs):
-    img, topgrad, shape = inputs
-    rval = GpuCorrMM_gradWeights(border_mode=convop.border_mode,
-    subsample=convop.subsample)(
+@local_optimizer([AbstractConv2d_gradWeights])
+def local_conv2d_gradweight_corrmm(node):
+    """Lower a GPU AbstractConv2d_gradWeights to GpuCorrMM_gradWeights."""
+    img, topgrad, shape = node.inputs
+    if not (isinstance(img.type, CudaNdarrayType) and
+            isinstance(topgrad.type, CudaNdarrayType)):
+        return None
+    rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
+                                 subsample=node.op.subsample)(
    gpu_contiguous(img), gpu_contiguous(topgrad), shape)
-    return rval
+    return [rval]
+register_specialize_device()(local_conv2d_gradweight_corrmm)

-def replace_convgradinputs_withcorrmm(convop, inputs):
-    kern, topgrad, shape = inputs
-    rval =  GpuCorrMM_gradInputs(border_mode=convop.border_mode,
-    subsample=convop.subsample)(
+@local_optimizer([AbstractConv2d_gradInputs])
+def local_conv2d_gradinputs_corrmm(node):
+    """Lower a GPU AbstractConv2d_gradInputs to GpuCorrMM_gradInputs."""
+    kern, topgrad, shape = node.inputs
+    if not (isinstance(kern.type, CudaNdarrayType) and
+            isinstance(topgrad.type, CudaNdarrayType)):
+        return None
+    rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
+                                subsample=node.op.subsample)(
        gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
-    return rval
+    return [rval]
-
-
-def replace_convop(convop, inputs):
-    """
-    Dispatch based on the convop.optim values
-    """
-    gpu_conv = None
-    if "cudnn" in convop.optim:
-        gpu_conv = replace_conv_with_cudnn(convop, inputs)
-    if gpu_conv is None and "corrmm" in convop.optim:
-        if isinstance(convop, Conv2d):
-            gpu_conv = replace_convforward_with_corrmm(convop, inputs)
-        elif isinstance(convop, Conv2d_gradWeights):
-            gpu_conv = replace_convgradweight_with_corrmm(convop, inputs)
-        elif isinstance(convop, Conv2d_gradInputs):
-            gpu_conv = replace_convgradinputs_withcorrmm(convop, inputs)
-
-    ### FIXME add fft code
-    return gpu_conv
-
-
-
-### move to Gpu optimization
-@local_optimizer([gpu_from_host, Conv2d, Conv2d_gradWeights, Conv2d_gradInputs])
-def local_conv2d_gpu_conv(node):
-    """
-    gpu_from_host(Conv) -> (gpu)_Conv(gpu_from_host)
-
-    Conv(host_from_gpu) -> host_from_gpu((gpu)_Conv)
-    """
-    if isinstance(node.op, GpuFromHost):
-        #gpu_from_host(conv) -> gpu_conv(gpu_from_host)
-        host_input = node.inputs[0]
-        if host_input.owner and \
-                (isinstance(host_input.owner.op, Conv2d) or
-                 isinstance(host_input.owner.op, Conv2d_gradWeights) or
-                 isinstance(host_input.owner.op, Conv2d_gradInputs)):
-
-            conv = host_input.owner.op
-            if len(host_input.owner.inputs) == 3:
-                inp1, inp2, shape = host_input.owner.inputs
-            else:
-                inp1, inp2 = host_input.owner.inputs
-                shape = None
-            out = replace_convop(conv, [gpu_from_host(inp1),
-                                        gpu_from_host(inp2),
-                                        shape])
-            if out is None:
-                return
-            out = theano.tensor.patternbroadcast(gpu_from_host(out),
-                                                 node.outputs[0].broadcastable)
-            out.values_eq_approx = values_eq_approx_high_tol
-            return [out]
-
-    if (isinstance(node.op, Conv2d) or
-        isinstance(node.op, Conv2d_gradWeights) or
-        isinstance(node.op, Conv2d_gradInputs)):
-        #conv(host_from_gpu) -> host_from_gpu(gpu_conv)
-
-        if len(node.inputs) == 3:
-            inp1, inp2, shape = node.inputs
-        else:
-            inp1, inp2 = node.inputs
-            shape = None
-        inp1_on_gpu = (inp1.owner and isinstance(inp1.owner.op, HostFromGpu))
-        inp2_on_gpu = (inp2.owner and isinstance(inp2.owner.op, HostFromGpu))
-        if inp1_on_gpu or inp2_on_gpu:
-            conv = node.op
-            out = replace_convop(conv, [gpu_from_host(inp1),
-                                        gpu_from_host(inp2),
-                                        shape])
-            if out is None:
-                return
-            out = theano.tensor.patternbroadcast(
-                out,
-                node.outputs[0].broadcastable)
-            out.values_eq_approx = values_eq_approx_high_tol
-            return [as_tensor_variable(out)]
-# We register the optimizer that moves convolutions to the GPU.
-register_opt()(local_conv2d_gpu_conv)
+register_specialize_device()(local_conv2d_gradinputs_corrmm)



 ### Cpu Optmization
 ### Desactived focus on GPU optimization first
-# @local_optimizer([Conv2d])
+# @local_optimizer([AbstractConv2d])
 # def local_conv2d(node):
-#     if isinstance(node.op, Conv2d) and not node.on_gpu:
+#     if isinstance(node.op, AbstractConv2d) and not node.on_gpu:
 #         img, kern = node.inputs
 #         rval = cpu_conv2d(img, kern,
 #                           node.op.imshp, node.op.filter_shape,
@@ -526,10 +548,10 @@ register_opt()(local_conv2d_gpu_conv)
 #         return [rval]


-# @local_optimizer([Conv2d_gradWeights])
+# @local_optimizer([AbstractConv2d_gradWeights])
 # def local_conv2d_gradweight_cpu(node):

-#     if not isinstance(node.op, Conv2d_gradWeights) or not node.on_gpu:
+#     if not isinstance(node.op, AbstractConv2d_gradWeights) or not node.on_gpu:
 #         return

 #     img, topgrad = node.inputs
@@ -555,7 +577,7 @@ register_opt()(local_conv2d_gpu_conv)
 #             "stride y are different from 1 and 2, as there is a bug in it.")

 #     if op.imshp is None or op.kshp is None:
-#         raise Exception("Conv2d grad when stride x!=1 or stride y!=1 we must have"
+#         raise Exception("AbstractConv2d grad when stride x!=1 or stride y!=1 we must have"
 #                         " all the optional shape information")

 #     ####### Determine gradient on kernels ########
@@ -604,9 +626,9 @@ register_opt()(local_conv2d_gpu_conv)
 #     return [dw(img, filters)]


-# @local_optimizer([Conv2d_gradInputs])
+# @local_optimizer([AbstractConv2d_gradInputs])
 # def local_conv2d_gradinputs_cpu(node):
-#     if not isinstance(node.op, Conv2d_gradInputs) or not node.on_gpu:
+#     if not isinstance(node.op, AbstractConv2d_gradInputs) or not node.on_gpu:
 #         return

 #     # ####### Determine gradient on inputs ########