提交 54fe4a7f 作者: Chiheb Trabelsi

opt.py has been modified in order to respect the flake8 style.

上级 1a3948cc
...@@ -10,22 +10,32 @@ import warnings ...@@ -10,22 +10,32 @@ import warnings
import numpy import numpy
from six.moves import reduce, xrange from six.moves import reduce, xrange
from . import dnn
import theano import theano
from theano import scalar as scal from theano import scalar as scal
from theano import config, tensor, gof from theano import config, tensor, gof
import theano.ifelse import theano.ifelse
import theano.tensor.signal.pool
import theano.tensor.nnet
import theano.tensor.nnet.neighbours
# Convolution
from theano.tensor.nnet import conv
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
# Pooling
import theano.tensor.signal.pool as pool
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, TopoOptimizer, toolbox) Optimizer, TopoOptimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous, gpu_eye, gpu_contiguous,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
GpuContiguous, GpuContiguous,
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce,
GpuFlatten, gpu_flatten, gpu_flatten,
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
...@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')( ...@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance # This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization. # moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date. # Hopefully, we can keep this list up to date.
import theano.tensor.signal.pool
import theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu = [ cpu_ops_moved_to_gpu = [
tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm, tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp, tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
...@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node): ...@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
if y.ndim == 2: if y.ndim == 2:
y_ = y_.dimshuffle(0, 1, "x") y_ = y_.dimshuffle(0, 1, "x")
z = GpuBatchedDot()(as_cuda_ndarray_variable(x_), z = GpuBatchedDot()(as_cuda_ndarray_variable(x_),
as_cuda_ndarray_variable(y_)) as_cuda_ndarray_variable(y_))
# unpad z shape # unpad z shape
if x.ndim == 2: if x.ndim == 2:
z = z.dimshuffle(0, *range(2, z.ndim)) z = z.dimshuffle(0, *range(2, z.ndim))
...@@ -850,8 +858,8 @@ def local_gpu_careduce(node): ...@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
if x.type == node.outputs[0].type: if x.type == node.outputs[0].type:
return [x] return [x]
elif (all([c != "output" and isinstance(c.op, GpuFromHost) elif (all([c != "output" and isinstance(c.op, GpuFromHost)
for c, i in node.outputs[0].clients]) for c, i in node.outputs[0].clients]) and
and x.owner and x.owner.op.__class__ in x.owner and x.owner.op.__class__ in
cpu_ops_moved_to_gpu): cpu_ops_moved_to_gpu):
# It is not always good to transfer the reduction to # It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the # the GPU when the clients are on the GPU but not the
...@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node): ...@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
...@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node): ...@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
return [gpu_flatten(host_input.owner.inputs[0], outdim)( return [gpu_flatten(host_input.owner.inputs[0], outdim)(
as_cuda_ndarray_variable(host_input.owner.inputs[0]))] as_cuda_ndarray_variable(host_input.owner.inputs[0]))]
if isinstance(node.op, tensor.Flatten): if isinstance(node.op, tensor.Flatten):
x, = node.inputs x, shp = node.inputs
outdim = node.op.outdim
if x.owner and isinstance(x.owner.op, HostFromGpu): if x.owner and isinstance(x.owner.op, HostFromGpu):
outdim = node.op.outdim outdim = node.op.outdim
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
...@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node): ...@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
*coords)] *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and if (x.owner and x.dtype == "float32" and
isinstance(x.owner.op, HostFromGpu) and isinstance(x.owner.op, HostFromGpu)):
x.dtype == "float32"):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and # And it is a shared var or an input of the graph.
isinstance(gpu_x.owner.op, GpuFromHost) and not(gpu_x.owner.inputs[0].owner) and
# And it is a shared var or an input of the graph. isinstance(gpu_x.owner.op, GpuFromHost)):
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or isinstance(n.op, GpuOp) if any([n == 'output' or isinstance(n.op, GpuOp)
...@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
...@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
else: else:
...@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node): ...@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32 # Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast # output even if y is float64, so we can downcast
# y to put it on GPU # y to put it on GPU
elif type(node.op) == tensor.IncSubtensor and \ elif (type(node.op) == tensor.IncSubtensor and
node.inputs[0].dtype == "float32": node.inputs[0].dtype == "float32"):
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
assert isinstance(x.type, tensor.TensorType) assert isinstance(x.type, tensor.TensorType)
assert isinstance(y.type, tensor.TensorType) assert isinstance(y.type, tensor.TensorType)
...@@ -1346,8 +1349,6 @@ def cast(x, dtype): ...@@ -1346,8 +1349,6 @@ def cast(x, dtype):
cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype))) cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
return cast_op(x) return cast_op(x)
import theano.tensor.nnet
@register_opt() @register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
...@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node): ...@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
return False return False
# Convolution
from theano.tensor.nnet import conv
def _gpu_conv_to_fftconv(node): def _gpu_conv_to_fftconv(node):
# shared helper function for local_conv_fft_valid and local_conv_fft_full. # shared helper function for local_conv_fft_valid and local_conv_fft_full.
# we import conv2d_fft locally to avoid pycuda warnings # we import conv2d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv2d_fft from theano.sandbox.cuda.fftconv import conv2d_fft
kwargs = {'border_mode': node.op.border_mode} kwargs = {'border_mode': node.op.border_mode}
if (node.op.imshp is not None and if (node.op.imshp is not None and node.op.imshp[-1] % 2 == 1 and
node.op.imshp[-1] is not None and node.op.imshp[-1] is not None):
node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True kwargs['pad_last_dim'] = True
# If the user supplied the full nonsymbolic image_shape and # If the user supplied the full nonsymbolic image_shape and
...@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node): ...@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_fft_valid(node): def local_conv_fft_valid(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.border_mode == 'valid' and if (node.op.border_mode == 'valid' and node.op.fft_opt and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1)):
node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return False return False
...@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node): ...@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_fft_full(node): def local_conv_fft_full(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.border_mode == 'full' and if (node.op.border_mode == 'full' and node.op.fft_opt and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1)):
node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return return
...@@ -1586,7 +1580,7 @@ def local_gpu_conv(node): ...@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
node.op.border_mode in ['full', 'valid']): node.op.border_mode in ['full', 'valid']):
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
...@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10, ...@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
'conv_fft') 'conv_fft')
# cuDNN is the second, but only registered if cuDNN is available. # cuDNN is the second, but only registered if cuDNN is available.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'. # It can be disabled by excluding 'conv_dnn' or 'cudnn'.
from . import dnn
# We can't check at import if dnn is available, so we must always # We can't check at import if dnn is available, so we must always
# register it. This do not cause problem as if it is not avail, the # register it. This do not cause problem as if it is not avail, the
# opt will do nothing. # opt will do nothing.
...@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
shapes = ((node.op.bsize,) + node.op.imshp, shapes = ((node.op.bsize,) + node.op.imshp,
(node.op.nkern, nchannels) + node.op.kshp) (node.op.nkern, nchannels) + node.op.kshp)
for (var, shape) in zip(vars, shapes): for (var, shape) in zip(vars, shapes):
if ((var in inputs) and if ((var in inputs) and (shape is not None) and
(shape is not None) and not any(s is None for s in shape)):
not any(s is None for s in shape)):
result[var] = theano.shared( result[var] = theano.shared(
# TODO: Use var.type.filter when cuda_ndarray.filter # TODO: Use var.type.filter when cuda_ndarray.filter
...@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node): ...@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
gpu_optimizer.register("conv3d_fft", local_conv3d_fft) gpu_optimizer.register("conv3d_fft", local_conv3d_fft)
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
@local_optimizer([ConvGrad3D]) @local_optimizer([ConvGrad3D])
def local_convgrad3d_fft(node): def local_convgrad3d_fft(node):
...@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node): ...@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
return False return False
if (isinstance(node.op, ConvGrad3D) and if (isinstance(node.op, ConvGrad3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
...@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node): ...@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft) gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft)
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
@local_optimizer([ConvTransp3D]) @local_optimizer([ConvTransp3D])
def local_convtransp3d_fft(node): def local_convtransp3d_fft(node):
...@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node): ...@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
return False return False
if (isinstance(node.op, ConvTransp3D) and if (isinstance(node.op, ConvTransp3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
# Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t) # Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
...@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node): ...@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm) gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
# Pooling
import theano.tensor.signal.pool as pool
@register_opt() @register_opt()
@local_optimizer([pool.Pool]) @local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if (isinstance(node.op, pool.Pool) if (isinstance(node.op, pool.Pool) and
and node.op.ds == node.op.st): node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
...@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node): ...@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
@register_opt() @register_opt()
@local_optimizer([pool.MaxPoolGrad]) @local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if (isinstance(node.op, pool.MaxPoolGrad) and if (isinstance(node.op, pool.MaxPoolGrad) and node.op.ds == node.op.st):
node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if (node.op.padding != (0, 0) or if (node.op.padding != (0, 0) or
node.op.mode != 'max' or node.op.mode != 'max' or
node.op.st != node.op.ds): node.op.st != node.op.ds):
return return
x, z, gz = node.inputs x, z, gz = node.inputs
...@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node): ...@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
as_cuda_ndarray_variable(gx)))] as_cuda_ndarray_variable(gx)))]
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
@register_opt() @register_opt()
@local_optimizer([tensor.Join]) @local_optimizer([tensor.Join])
def local_gpu_join(node): def local_gpu_join(node):
...@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node): ...@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
if isinstance(node.op, GpuAlloc) and not node.op.memset_0: if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
inp = node.inputs[0] inp = node.inputs[0]
if (isinstance(inp, CudaNdarrayConstant) and if (isinstance(inp, CudaNdarrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_out = GpuAlloc(memset_0=True)(*node.inputs)
old_bcast = node.outputs[0].type.broadcastable old_bcast = node.outputs[0].type.broadcastable
...@@ -2308,8 +2287,9 @@ def local_gpu_eye(node): ...@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"): host_input.owner.op.dtype == "float32"):
if tensor.extract_constant(host_input.owner.inputs[2]) != 0: if tensor.extract_constant(host_input.owner.inputs[2]) != 0:
return return
return [gpu_eye(*host_input.owner.inputs)] return [gpu_eye(*host_input.owner.inputs)]
...@@ -2324,7 +2304,7 @@ def local_gpu_eye(node): ...@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
return as_cuda_ndarray_variable(x) return as_cuda_ndarray_variable(x)
else: else:
...@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def tensor_to_cuda(x): def tensor_to_cuda(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
y = CudaNdarrayType(broadcastable=x.type.broadcastable)() y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
if x.name: if x.name:
...@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node): ...@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1): len(host_input.owner.outputs) == 1):
# Note that we are not doing the right thing here !! # Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one # This is because the local optimizer expects only one
...@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node): ...@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
return _outputs return _outputs
# scan(host_from_gpu) -> host_from_gpu(GPUscan) # scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan and
and not node.op.info['gpu']): not node.op.info['gpu']):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node): ...@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs # call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)( rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
else: else:
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
...@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node): ...@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
# GpuConv does not always store information on the batchsize and # GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.) # channels, though, so we only use what information we have.)
if ((subsample == (1, 1)) and if ((subsample == (1, 1)) and
(node.op.imshp is not None) and (node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and (None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and (node.op.kshp is not None) and
(None not in node.op.kshp) and (None not in node.op.kshp) and
border_mode != "half"): border_mode != "half"):
# we know the kernel and output size # we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论