提交 24b77b44 authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: Pascal Lamblin

reorganize code

上级 a3e94b40
...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i ...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.opt import register_specialize_device
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -2439,3 +2448,44 @@ if True: ...@@ -2439,3 +2448,44 @@ if True:
gpu_contiguous(ins[1]) gpu_contiguous(ins[1])
) )
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
### AbstractConv Optimizations
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not isinstance(inp1.type, CudaNdarrayType) or \
not isinstance(inp2.type, CudaNdarrayType):
return None
if not dnn_available():
return None
if node.op.filters_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if (isinstance(node.op, AbstractConv2d)):
rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward',
conv_mode = conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv2d_gradWeights)):
shape = (inp2.shape[1], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode = conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv2d_gradInputs)):
shape = (inp2.shape[0], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode = conv_mode)
return [rval]
register_specialize_device(local_conv2d_cudnn, 'cudnn')
...@@ -75,6 +75,12 @@ from theano.tensor import slinalg ...@@ -75,6 +75,12 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv2d import (BaseAbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
try: try:
# We need to be able to import this file even if cuda isn't avail. # We need to be able to import this file even if cuda isn't avail.
from theano.sandbox.cuda import device_properties from theano.sandbox.cuda import device_properties
...@@ -2619,3 +2625,157 @@ optdb.register('local_inplace_gpu_sparse_block_outer', ...@@ -2619,3 +2625,157 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
### Move to Gpu optimization
@local_optimizer([gpu_from_host,
AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
"""
gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, BaseAbstractConv2d):
conv = host_input.owner.op
inps = list(host_input.owner.inputs)
inps[0] = as_cuda_ndarray_variable(inps[0])
inps[1] = as_cuda_ndarray_variable(inps[1])
out = conv(*inps)
# out is on the GPU because both inputs are.
out = theano.tensor.patternbroadcast(out,
node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx_high_tol
return [out]
if isinstance(node.op, BaseAbstractConv2d):
# conv(host_from_gpu) -> host_from_gpu(gpu_conv)
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if ((isinstance(inp1.type, CudaNdarrayType) and
isinstance(inp2.type, CudaNdarrayType))):
# Both inputs are already directly on the GPU, nothing to do
return
inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
(inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
(inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
if inp1_on_gpu or inp2_on_gpu:
conv = node.op
inps = list(node.inputs)
inps[0] = as_cuda_ndarray_variable(inps[0])
inps[1] = as_cuda_ndarray_variable(inps[1])
out = conv(*inps)
# out is on the GPU because both inputs are.
out = theano.tensor.patternbroadcast(
out,
node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx_high_tol
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, tensor.TensorType):
return [tensor.as_tensor_variable(out)]
else:
return [out]
register_opt()(local_conv2d_gpu_conv)
### Corrmm opt
@local_optimizer([AbstractConv2d])
def local_conv2d_corrmm(node):
img, kern = node.inputs
if (not isinstance(img.type, CudaNdarrayType) or
not isinstance(kern.type, CudaNdarrayType)):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
if (border_mode == 'full') and (subsample == (1, 1)):
if not node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img))
else:
# need to flip the kernel if necessary
if node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth
# is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1,1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp)):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
# we also know batchsize and input channels
prod1 *= node.op.bsize
prod2 *= node.op.imshp[0]
# compare to decide
if prod1 > prod2:
# (we need to wrap the result in as_cuda_ndarray_variable,
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
return [rval]
register_specialize_device(local_conv2d_corrmm, 'conv_gemm')
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_corrmm(node):
img, topgrad, shape = node.inputs
if not isinstance(img.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filters_flip:
rval = rval[:, :, ::-1, ::-1]
rval = tensor.patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_cuda_ndarray_variable(rval)
return [rval]
register_specialize_device(local_conv2d_gradweight_corrmm, 'conv_gemm')
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_corrmm(node):
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
if node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
register_specialize_device(local_conv2d_gradinputs_corrmm, 'conv_gemm')
import unittest
import numpy
import copy
import itertools
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from nose.plugins.skip import SkipTest
import theano.tensor.nnet.conv as conv_ref
import theano.tensor.nnet.abstract_conv2d as conv
from theano.sandbox.cuda import float32_shared_constructor as gpu_shared
from theano.compile import shared as cpu_shared
from theano.sandbox.cuda.tests.test_conv_cuda_ndarray import py_conv
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.get_default_mode().excluding('gpu')
class TestConv2d(unittest.TestCase):
def setUp(self):
super(TestConv2d, self).setUp()
self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
self.subsamples = [(1, 1), (2, 2), (2, 4)]
self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
self.filters_flip = [True, False]
def get_output_shape(self, inputs_shape, filters_shape, subsample, border_mode):
if border_mode == "valid":
border_mode = (0, 0)
if border_mode == "full":
border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
batch_size = inputs_shape[0]
num_filters = filters_shape[0]
return (batch_size, num_filters,) \
+ tuple(None if i is None or k is None
else ((i + 2*pad - k) // d + 1)
for i, k, d, pad in zip(inputs_shape[2:], filters_shape[2:],
subsample, border_mode))
def run_fwd(self, inputs_shape, filters_shape, ref=dnn_conv,
subsample=(1, 1), verify_grad=True, mode=mode_without_gpu,
border_mode='valid', filters_flip=True, device='cpu', provide_shape=False):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
if device == 'gpu':
inputs = gpu_shared(inputs_val)
filters = gpu_shared(filters_val)
else:
inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
if filters_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c_ref = ref(inputs, filters,
border_mode=border_mode,
subsample=subsample,
conv_mode = conv_mode)
c = conv.conv2d(inputs, filters,
border_mode=border_mode,
subsample=subsample,
filters_flip=filters_flip,
inputs_shape=imshp,
filters_shape=kshp)
f_ref = theano.function([], c_ref, mode=mode)
f = theano.function([], c, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
if verify_grad:
utt.verify_grad(conv.AbstractConv2d(border_mode="valid", imshp=imshp, kshp=kshp,
bsize=inputs_shape[0], subsample=subsample),
[inputs_val, filters_val],
mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=dnn_gradweight, subsample=(1, 1), filters_flip=True,
verify_grad=True, mode=mode_without_gpu, border_mode='valid',
device='cpu', provide_shape = False):
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
if device == 'gpu':
inputs = gpu_shared(inputs_val)
output = gpu_shared(output_val)
else:
inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
if filters_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filters_flip=filters_flip,
subsample=subsample,
imshp = imshp, kshp = kshp)
c = c(inputs, output, filters_shape[-2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode, subsample=subsample)
return conv_op(inputs_val, output_val, filters_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradweight, [inputs_val, output_val],
mode=mode, eps=1)
def run_gradinput(self, inputs_shape, filters_shape, output_shape, ref=dnn_gradinput,
subsample=(1, 1), filters_flip=True, verify_grad=True, mode=mode_without_gpu,
border_mode='valid', device='cpu', provide_shape = False):
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
if device == 'gpu':
output = gpu_shared(output_val)
filters = gpu_shared(filters_val)
else:
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
if filters_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filters_flip=filters_flip,
imshp=imshp, kshp=kshp)
c = c(filters, output, inputs_shape[-2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode, subsample=subsample)
return conv_op(filters_val, output_val, inputs_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradinputs, [filters_val, output_val],
mode=mode, eps=1)
def test_dnn_conv(self):
if not dnn_available():
return
mode=mode_with_gpu
# provide_shape is not used by the CuDNN impementation
provide_shape = False
for (i, f), s, b, flip in itertools.product(
zip(self.inputs_shapes, self.filters_shapes),
self.subsamples,
self.border_modes,
self.filters_flip):
o = self.get_output_shape(i, f, s, b)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
def test_cormm_conv(self):
mode = mode_with_gpu.excluding('cudnn')
for (i, f), s, b, flip, provide_shape in itertools.product(
zip(self.inputs_shapes, self.filters_shapes),
self.subsamples,
self.border_modes,
self.filters_flip,
[False, True]):
o = self.get_output_shape(i, f, s, b)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
def test_cpu_conv(self):
mode = mode_without_gpu
for (i, f), s, b, flip, provide_shape in itertools.product(
zip(self.inputs_shapes, self.filters_shapes),
self.subsamples,
self.border_modes,
self.filters_flip,
[False, True]):
o = self.get_output_shape(i, f, s, b)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
if not flip:
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if b not in ('valid', 'full'):
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
gradweight_OK = False
gradinput_OK = False
if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
gradweight_OK = False
gradinput_OK = False
if fwd_OK:
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
else:
self.assertRaises(NotImplementedError,
self.run_fwd,
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=False,
mode=mode,
device='cpu',
provide_shape=provide_shape,
border_mode=b,
filters_flip=flip)
if gradweight_OK:
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
else:
self.assertRaises(NotImplementedError,
self.run_gradweight,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
device='cpu',
provide_shape=provide_shape,
border_mode=b,
filters_flip=flip)
if gradinput_OK:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filters_flip=flip)
else:
self.assertRaises(NotImplementedError,
self.run_gradinput,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
device='cpu',
provide_shape=provide_shape,
border_mode=b,
filters_flip=flip)
...@@ -15,20 +15,8 @@ from theano.tensor import TensorType ...@@ -15,20 +15,8 @@ from theano.tensor import TensorType
from theano.gof import Apply, Op from theano.gof import Apply, Op
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda import register_opt as register_gpu
from theano.tensor.opt import register_specialize_device from theano.tensor.opt import register_specialize_device
### Gpu related optimization (to be moved in sandbox/cuda)
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable,
gpu_contiguous, gpu_from_host, host_from_gpu,
GpuFromHost, HostFromGpu
)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
from theano.sandbox.cuda.blas import GpuCorrMM, GpuCorrMM_gradWeights, GpuCorrMM_gradInputs
from theano.sandbox.cuda.opt import values_eq_approx_high_tol
## Cpu implementation ## Cpu implementation
...@@ -36,6 +24,7 @@ from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp ...@@ -36,6 +24,7 @@ from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
from theano.tensor.nnet.ConvGrad3D import convGrad3D from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D from theano.tensor.nnet.ConvTransp3D import convTransp3D
_logger = logging.getLogger("theano.tensor.nnet.conv2d") _logger = logging.getLogger("theano.tensor.nnet.conv2d")
...@@ -330,202 +319,6 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d): ...@@ -330,202 +319,6 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
def connection_pattern(self, node): def connection_pattern(self, node):
return [[1], [1], [0]] # no connection to height, width return [[1], [1], [0]] # no connection to height, width
### Move to Gpu optimization
@local_optimizer([gpu_from_host,
AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
"""
gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, BaseAbstractConv2d):
conv = host_input.owner.op
inps = list(host_input.owner.inputs)
inps[0] = as_cuda_ndarray_variable(inps[0])
inps[1] = as_cuda_ndarray_variable(inps[1])
out = conv(*inps)
# out is on the GPU because both inputs are.
out = theano.tensor.patternbroadcast(out,
node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx_high_tol
return [out]
if isinstance(node.op, BaseAbstractConv2d):
# conv(host_from_gpu) -> host_from_gpu(gpu_conv)
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if ((isinstance(inp1.type, CudaNdarrayType) and
isinstance(inp2.type, CudaNdarrayType))):
# Both inputs are already directly on the GPU, nothing to do
return
inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
(inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
(inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
if inp1_on_gpu or inp2_on_gpu:
conv = node.op
inps = list(node.inputs)
inps[0] = as_cuda_ndarray_variable(inps[0])
inps[1] = as_cuda_ndarray_variable(inps[1])
out = conv(*inps)
# out is on the GPU because both inputs are.
out = theano.tensor.patternbroadcast(
out,
node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx_high_tol
# If the original output was on CPU, we have to transfer it
if isinstance(node.outputs[0].type, TensorType):
return [as_tensor_variable(out)]
else:
return [out]
register_gpu()(local_conv2d_gpu_conv)
### Cudnn Opt
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if not isinstance(inp1.type, CudaNdarrayType) or \
not isinstance(inp2.type, CudaNdarrayType):
return None
if not dnn_available():
return None
if node.op.filters_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if (isinstance(node.op, AbstractConv2d)):
rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
direction_hint='forward',
conv_mode = conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv2d_gradWeights)):
shape = (inp2.shape[1], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode = conv_mode)
return [rval]
if (isinstance(node.op, AbstractConv2d_gradInputs)):
shape = (inp2.shape[0], inp1.shape[1], node.inputs[2][0], node.inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
conv_mode = conv_mode)
return [rval]
register_specialize_device(local_conv2d_cudnn, 'cudnn')
### Corrmm opt
@local_optimizer([AbstractConv2d])
def local_conv2d_corrmm(node):
img, kern = node.inputs
if (not isinstance(img.type, CudaNdarrayType) or
not isinstance(kern.type, CudaNdarrayType)):
return None
border_mode = node.op.border_mode
subsample = node.op.subsample
if (border_mode == 'full') and (subsample == (1, 1)):
if not node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
# need to dimshuffle the kernel for full convolution
kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img))
else:
# need to flip the kernel if necessary
if node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
# By default use GpuCorrMM
rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
gpu_contiguous(kern))
# call GpuCorrMM_gradWeights if good
# (the latter is faster if batchsize * kernelHeight * kernelWidth
# is larger than inputChannels * outputHeight * outputWidth.
# GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.)
if ((subsample == (1,1)) and
(node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and
(None not in node.op.kshp)):
# we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)):
# we also know batchsize and input channels
prod1 *= node.op.bsize
prod2 *= node.op.imshp[0]
# compare to decide
if prod1 > prod2:
# (we need to wrap the result in as_cuda_ndarray_variable,
# because we are not allowed to replace a CudaNdarray with
# a DimShuffle instance in a graph optimization)
rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
GpuCorrMM_gradWeights(border_mode, subsample)(
gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
).dimshuffle(1, 0, 2, 3))
return [rval]
register_specialize_device(local_conv2d_corrmm, 'conv_gemm')
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_corrmm(node):
img, topgrad, shape = node.inputs
if not isinstance(img.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
rval = GpuCorrMM_gradWeights(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
gpu_contiguous(img), gpu_contiguous(topgrad), shape)
if node.op.filters_flip:
rval = rval[:, :, ::-1, ::-1]
rval = patternbroadcast(rval, node.outputs[0].broadcastable)
rval = as_cuda_ndarray_variable(rval)
return [rval]
register_specialize_device(local_conv2d_gradweight_corrmm, 'conv_gemm')
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_corrmm(node):
kern, topgrad, shape = node.inputs
if not isinstance(kern.type, CudaNdarrayType) or \
not isinstance(topgrad.type, CudaNdarrayType):
return None
if node.op.filters_flip:
kern = kern[:, :, ::-1, ::-1]
rval = GpuCorrMM_gradInputs(border_mode=node.op.border_mode,
subsample=node.op.subsample)(
gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
return [rval]
register_specialize_device(local_conv2d_gradinputs_corrmm, 'conv_gemm')
### Cpu Optmization ### Cpu Optmization
@local_optimizer([AbstractConv2d]) @local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node): def local_conv2d_cpu(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论