提交 a24fd9bb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4570 from Sentient07/new_graph2gpu

New graph2gpu
...@@ -402,6 +402,14 @@ class Shape_i(gof.Op): ...@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [()] return [()]
def connection_pattern(self, node):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return [[False]]
def grad(self, inp, grads): def grad(self, inp, grads):
return [theano.gradient.grad_not_implemented( return [theano.gradient.grad_not_implemented(
op=self, x_pos=0, x=inp[0], op=self, x_pos=0, x=inp[0],
...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None): ...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return var.shape[i] return var.shape[i]
def shape_i_op(i):
key = i
if key not in shape_i_op.cache:
shape_i_op.cache[key] = Shape_i(i)
return shape_i_op.cache[key]
shape_i_op.cache = {}
def register_shape_i_c_code(typ, code, check_input, version=()): def register_shape_i_c_code(typ, code, check_input, version=()):
""" """
Tell Shape_i how to generate C code for a Theano Type. Tell Shape_i how to generate C code for a Theano Type.
......
...@@ -54,7 +54,7 @@ def _atexit_print_fn(): ...@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w') destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file, ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops, n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
......
...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, n_created, o) in count_opt[::-1]: for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % ( print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream) t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % ( print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream) not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1]))) not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]: for (t, o) in not_used[::-1]:
......
...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name): ...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return gpu_from_host(context_name)(x)
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
...@@ -544,7 +544,7 @@ class HostFromGpu(Op): ...@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [GpuFromHost(inputs[0].type.context_name)(gz)] return [gpu_from_host(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -647,6 +647,14 @@ class GpuFromHost(Op): ...@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return (9,) return (9,)
# Caching GPUAlloc
def gpu_from_host(ctx):
if ctx not in gpu_alloc.cache:
gpu_from_host.cache[ctx] = GpuFromHost(ctx)
return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op): class GpuToGpu(Op):
""" """
Transfer data between GPUs. Transfer data between GPUs.
...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return True return True
# Caching GPUAlloc
def gpu_alloc(ctx, memset_0=False):
key = (ctx, memset_0)
if key not in gpu_alloc.cache:
gpu_alloc.cache[key] = GpuAlloc(ctx, memset_0)
return gpu_alloc.cache[key]
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
""" """
Allocate uninitialized memory on the GPU. Allocate uninitialized memory on the GPU.
...@@ -956,6 +973,14 @@ def empty_like(var): ...@@ -956,6 +973,14 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
key = (dtype, ctx)
if key not in gpu_alloc_empty.cache:
gpu_alloc_empty.cache[key] = GpuAllocEmpty(dtype, ctx)
return gpu_alloc_empty.cache[key]
gpu_alloc_empty.cache = {}
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Return a C contiguous version of the input. Return a C contiguous version of the input.
...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def make_node(self, x, shp): def make_node(self, x, shp):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name) x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable, broadcastable=res.broadcastable,
......
...@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp ...@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic from theano.gof.type import CDataType, Generic
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d, from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
...@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d, ...@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import ( from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad) Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu from . import pygpu
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType from .type import get_context, gpu_context_type, list_contexts
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, empty_like) gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter from .opt import (gpu_seqopt, register_opt, conv_groupopt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
...@@ -94,6 +97,7 @@ def dnn_present(): ...@@ -94,6 +97,7 @@ def dnn_present():
if config.dnn.enabled == "False": if config.dnn.enabled == "False":
dnn_present.msg = "Disabled by dnn.enabled flag" dnn_present.msg = "Disabled by dnn.enabled flag"
dnn_present.avail = False dnn_present.avail = False
return False
if pygpu is None: if pygpu is None:
dnn_present.msg = "PyGPU not available" dnn_present.msg = "PyGPU not available"
...@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp): ...@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version()) return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
def gpu_dnn_conv_desc(border_mode, subsample=(1, 1), conv_mode='conv',
precision="float32"):
key = (border_mode, subsample, conv_mode, precision)
if key not in gpu_dnn_conv_desc.cache:
gpu_dnn_conv_desc.cache[key] = GpuDnnConvDesc(border_mode,
subsample,
conv_mode,
precision)
return gpu_dnn_conv_desc.cache[key]
gpu_dnn_conv_desc.cache = {}
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64')) _zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64')) _one = constant(numpy.asarray(1.0, dtype='float64'))
...@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase): ...@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc) d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc) d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase): ...@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv.cache:
gpu_dnn_conv.cache[key] = GpuDnnConv(algo, inplace)
return gpu_dnn_conv.cache[key]
gpu_dnn_conv.cache = {}
class GpuDnnConvGradW(DnnBase): class GpuDnnConvGradW(DnnBase):
""" """
...@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase): ...@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase):
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc) d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc) d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase): ...@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradW(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv_gradW.cache:
gpu_dnn_conv_gradW.cache[key] = GpuDnnConvGradW(inplace, algo)
return gpu_dnn_conv_gradW.cache[key]
gpu_dnn_conv_gradW.cache = {}
class GpuDnnConvGradI(DnnBase): class GpuDnnConvGradI(DnnBase):
""" """
...@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase):
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc) d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc) d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase): ...@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradI(algo=None, inplace=False):
key = (algo, inplace)
if key not in gpu_dnn_conv_gradI.cache:
gpu_dnn_conv_gradI.cache[key] = GpuDnnConvGradI(inplace, algo)
return gpu_dnn_conv_gradI.cache[key]
gpu_dnn_conv_gradI.cache = {}
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None): algo=None, precision=None):
...@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype, ctx_name)( out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3) shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc) conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
...@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph), out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc) return gpu_dnn_conv_gradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy # contig_version will return a gpu_contiguous copy
# if the img contains negative strides # if the img contains negative strides
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, # We can use Shape_i and bypass the infer_shape here as this is on
desc_op.border_mode, # the input of node and it will always be present.
desc_op.subsample) ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp) kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
return GpuDnnConv(algo=algo)(img, kerns, out, desc) out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
img = gpu_contiguous(img) img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad) topgrad = gpu_contiguous(topgrad)
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp) conv_mode=conv_mode)(kerns_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp) out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc) return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
...@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad) topgrad = gpu_contiguous(topgrad)
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp) out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc) return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
...@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights, @register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn(node): def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(node.op, (AbstractConv2d, if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))): AbstractConv2d_gradInputs))):
return None return
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (node.op.filter_dilation != (1, 1)): if (op.filter_dilation != (1, 1)):
return None return None
if not isinstance(inp1.type, GpuArrayType): inp1 = inputs[0]
return None inp2 = inputs[1]
if not dnn_available(inp1.type.context_name): if not dnn_available(inp1.type.context_name):
raise_no_cudnn() raise_no_cudnn()
if node.op.filter_flip: if op.filter_flip:
conv_mode = 'conv' conv_mode = 'conv'
else: else:
conv_mode = 'cross' conv_mode = 'cross'
if isinstance(node.op, AbstractConv2d): if isinstance(op, AbstractConv2d):
rval = dnn_conv(inp1, inp2, rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
direction_hint='forward!', direction_hint='forward!',
conv_mode=conv_mode) conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradWeights): elif isinstance(op, AbstractConv2d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1], shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1]) inputs[2][0], inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape, rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradInputs): elif isinstance(op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1], shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1]) inputs[2][0], inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape, rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode, border_mode=op.border_mode,
subsample=node.op.subsample, subsample=op.subsample,
conv_mode=conv_mode) conv_mode=conv_mode)
return [rval] return [rval]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
ctx = infer_context_name(*node.inputs)
if not isinstance(node.inputs[0].type, GpuArrayType):
return
return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn', conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20, local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run', 'fast_compile', 'fast_run',
...@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn', ...@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn',
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs): def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs): def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs): def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace', ...@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([Pool]) @op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name): @register_opt2([Pool], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
img, = node.inputs img, = inputs
img = as_gpuarray_variable(img, ctx_name) img = as_gpuarray_variable(img, ctx_name)
ds = node.op.ds ds = op.ds
stride = node.op.st stride = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode) return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name): @register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out, out_grad = node.inputs inp, out, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name) out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp), return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out), gpu_contiguous(out),
...@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name): ...@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name): @register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn() raise_no_cudnn()
if not node.op.ignore_border: if not op.ignore_border:
return return
inp, out_grad = node.inputs inp, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name) inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name) out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds ds = op.ds
st = node.op.st st = op.st
pad = node.op.padding pad = op.padding
mode = node.op.mode mode = op.mode
cg = gpu_contiguous(out_grad) cg = gpu_contiguous(out_grad)
...@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node): ...@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax]) @op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name): @register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax # Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0] inp = inputs[0]
if inp.ndim != 2: if inp.ndim != 2:
return return
if not dnn_available(ctx_name) or version(raises=False) < 3000: if not dnn_available(ctx_name) or version(raises=False) < 3000:
...@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') ...@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile') @register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name): @register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name): if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad") raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = [] ins = []
for n in node.inputs: for n in inputs:
n = as_gpuarray_variable(n, ctx_name) n = as_gpuarray_variable(n, ctx_name)
if n.ndim != 2: if n.ndim != 2:
return return
......
...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels return kernels
# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None, acc_dtype=None,
pre_scalar_op=None):
key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
pre_scalar_op)
if key not in gpu_ca_reduce_cuda.cache:
gpu_ca_reduce_cuda.cache[key] = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
acc_dtype, pre_scalar_op)
return gpu_ca_reduce_cuda.cache[key]
gpu_ca_reduce_cuda.cache = {}
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
""" """
CAReduce that reuse the python code from gpuarray. CAReduce that reuse the python code from gpuarray.
......
...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division ...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
infer_context_name, GpuFromHost) from .opt import register_opt, op_lifter, register_opt2
from .opt import register_opt as register_gpu_opt, op_lifter
class GpuCumsum(GpuKernelBase, Op): class GpuCumsum(GpuKernelBase, Op):
...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cumsum on 1D, 2D and\
...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): @register_opt2([CumsumOp], 'fast_compile')
if node.inputs[0].dtype == 'float32': def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
axis = node.op.axis if inputs[0].dtype == 'float32':
x = node.inputs[0] axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
if axis is None and x.ndim > 1: x = as_gpuarray_variable(x, ctx_name)
x = x.flatten()
x = GpuFromHost(ctx_name)(x) if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumsum`` assume array has been flattened if needed. # ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None: if axis is None:
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
register_gpu_opt()(use_gpu_cumsumop)
...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType ...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType) from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft import theano.tensor.fft
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
...@@ -373,10 +373,12 @@ def _unitary(norm): ...@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available: if scikits_cuda_available:
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp]) @op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name): @register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
return curfft_op return curfft_op
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp]) @op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name): @register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
return cuirfft_op return cuirfft_op
...@@ -14,7 +14,7 @@ from theano.gof import Op ...@@ -14,7 +14,7 @@ from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial( ...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return (1,) return (1,)
@register_opt() @register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -13,7 +13,7 @@ except ImportError: ...@@ -13,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType
...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, ctx)
@register_opt('fast_compile')
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): @register_opt2([Images2Neibs], 'fast_compile')
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: def local_gpua_images2neibs(op, context_name, inputs, outputs):
return GpuImages2Neibs(node.op.mode) if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs)
...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant ...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty, from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name) infer_context_name, gpu_alloc_empty)
from .type import gpu_context_type from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return '\n'.join(codel) return '\n'.join(codel)
@opt.register_opt() @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = getattr(outputs[0], 'fgraph', None)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -3,6 +3,8 @@ import copy ...@@ -3,6 +3,8 @@ import copy
import numpy import numpy
import logging import logging
import pdb import pdb
import time
from six import iteritems
from six.moves import xrange from six.moves import xrange
import theano import theano
...@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config ...@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config
from theano.compile import optdb from theano.compile import optdb
from theano.compile.ops import shape_i from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer, from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox) SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.gof.optdb import LocalGroupDB from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
...@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin, gpu_alloc_empty, gpu_alloc, gpu_from_host)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace, gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace) gpugemv_no_inplace, gpugemv_inplace)
...@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, ...@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias, gpu_softmax) gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
...@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt") ...@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB):
    """
    Retrieves the list local optimizers based on the optimizer flag's value
    from EquilibriumOptimizer by calling the method query.
    """

    def query(self, *tags, **kwtags):
        # Delegate the tag-based lookup to the EquilibriumDB and wrap the
        # selected local optimizers in a GraphToGPU optimizer instance.
        equilibrium = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(equilibrium.local_optimizers_all,
                          equilibrium.local_optimizers_map)
gpu_seqopt = SequenceDB() gpu_seqopt = SequenceDB()
# Don't register this right now # Don't register this right now
conv_groupopt = LocalGroupDB() conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts" conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1, gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
...@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs): ...@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs):
return f return f
def register_opt2(tracks, *tags, **kwargs):
    '''
    Decorator for the new GraphToGPU optimizer.
    Takes an extra parameter(Op) compared to register_opt decorator.

    Parameters
    ----------
    tracks : List of Op class Or Op instance or None
        The Node's Op to which optimization is being applied.
    tags : String
        The optimization tag to which the optimizer will be registered.
    kwargs : dict
        Optional; ``name`` overrides the registered optimizer name
        (defaults to the decorated function's ``__name__``).

    '''
    def f(local_opt):
        # Use pop with a default: the original ``(kwargs and
        # kwargs.pop('name'))`` raised KeyError whenever kwargs held any
        # key other than 'name'.
        name = kwargs.pop('name', None) or local_opt.__name__
        opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f
def register_inplace(*tags, **kwargs): def register_inplace(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
...@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert', ...@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert',
def safe_to_gpu(x, ctx_name): def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(ctx_name)(x) return gpu_from_host(ctx_name)(x)
else: else:
return x return x
...@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False): ...@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name context_name = i.owner.inputs[0].type.context_name
replace = True replace = True
break break
if not replace: if not replace:
# We replace if *all* clients are on the GPU # We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients] clients = [c for o in node.outputs for c in o.clients]
...@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False): ...@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs # the context was derived from the outputs
for i in node.inputs: for i in node.inputs:
i.tag.context_name = context_name i.tag.context_name = context_name
new_op = maker(node, context_name)
new_op = maker(node.op, context_name, node.inputs, node.outputs)
# This is needed as sometimes new_op inherits from OP. # This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
...@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer): ...@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer):
target = getattr(input.tag, 'target', None) target = getattr(input.tag, 'target', None)
if target == 'cpu': if target == 'cpu':
continue continue
# Do not move *int* scalar to the GPU.
if (isinstance(input.type, tensor.TensorType) and
input.ndim == 0 and 'int' in input.dtype):
continue
try: try:
new_input = host_from_gpu(GpuFromHost(target)(input)) new_input = host_from_gpu(gpu_from_host(target)(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError: except TypeError:
...@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), ...@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
    """
    Transfer the graph as a whole to GPU instead of transfering node by node.

    Parameters
    ----------
    local_optimizers_all : List or SortedSet
        The local optimizations to apply to a node.
    local_optimizers_map : Dict
        Dictionary object containing the mapping of Op to list of
        LocalOptimizers.

    """

    def __init__(self, local_optimizers_all, local_optimizers_map):
        self.local_optimizers_all = local_optimizers_all
        self.local_optimizers_map = local_optimizers_map

    def add_requirements(self, fgraph):
        # Needed for the final replace_all_validate() call in apply().
        fgraph.attach_feature(toolbox.ReplaceValidate())

    def apply(self, fgraph):
        """Rebuild ``fgraph`` on the GPU and swap the outputs in one pass.

        Returns the profile tuple
        ``(self, toposort_timing, time_opts, node_created, process_count)``
        consumed by print_profile/merge_profile.
        """
        mapping = {}  # old variable -> its (possibly GPU) replacement
        time_opts = {}
        node_created = {}
        process_count = {}
        t_topo = time.time()
        topo = fgraph.toposort()
        toposort_timing = time.time() - t_topo

        # Building a new graph
        # Iterating through inputs of graph
        target = infer_context_name(*fgraph.inputs)
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
            if (isinstance(i.type, tensor.TensorType) and
                    (i.ndim > 0 or 'int' not in i.dtype)):
                mapping[i] = i.transfer(getattr(i.tag, 'target', target))
            else:
                mapping[i] = i
        for i in fgraph.variables:
            if isinstance(i, theano.Constant):
                mapping[i] = i

        # Pre-seed the profile counters so unused optimizers still show up.
        for node in topo:
            for lopt in (self.local_optimizers_map.get(node.op, []) +
                         self.local_optimizers_map.get(type(node.op), []) +
                         self.local_optimizers_all):
                process_count.setdefault(lopt, 0)
                time_opts.setdefault(lopt, 0)
                node_created.setdefault(lopt, 0)

        for node in topo:
            if isinstance(node.op, HostFromGpu):
                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
            context_name = None
            for i in [mapping[i] for i in node.inputs]:
                if isinstance(i.type, GpuArrayType):
                    context_name = i.type.context_name
                    move_to_GPU = True
                    break
            if (not move_to_GPU and
                isinstance(node.op, (theano.tensor.Alloc,
                                     theano.tensor.AllocEmpty,
                                     theano.tensor.basic.Eye))):
                # If the Alloc[Empty] have a client that will be moved
                # to the GPU, we should move the Alloc* on the GPU.
                # We approximate this by supposing that if we have an
                # optimization for one of the clients op, then we will
                # move the client to the GPU.
                for c, _ in node.outputs[0].clients:
                    if (c != 'output' and
                        (self.local_optimizers_map.get(c.op, []) +
                         self.local_optimizers_map.get(type(c.op), []))):
                        move_to_GPU = True
            new_ops = None

            # Apply the lifter
            if move_to_GPU:
                for lopt in (self.local_optimizers_map.get(node.op, []) +
                             self.local_optimizers_map.get(type(node.op), []) +
                             self.local_optimizers_all):
                    t_opt = time.time()
                    new_ops = lopt.transform(node.op, context_name,
                                             [mapping[i] for i in node.inputs],
                                             node.outputs)
                    time_opts[lopt] += time.time() - t_opt
                    if new_ops:
                        process_count[lopt] += 1
                        break

            # Normalize whatever the lifter returned into a list of outputs.
            outputs = []
            if isinstance(new_ops, theano.Op):
                outputs = new_ops(*[mapping[i] for i in node.inputs],
                                  return_list=True)
            elif not new_ops:
                # No lifter applied: clone the node with the mapped inputs.
                newnode = node.clone_with_new_inputs(
                    [mapping.get(i) for i in node.inputs])
                outputs = newnode.outputs
            elif isinstance(new_ops, (tuple, list)):
                outputs = new_ops
            elif isinstance(new_ops, theano.Variable):
                outputs = [new_ops]
            if new_ops:
                node_created[lopt] += len(
                    graph.ops([mapping[i] for i in node.inputs], outputs))

            # The length check used to be asserted on every loop iteration;
            # checking it once before iterating is sufficient.
            if outputs and node.outputs:
                assert len(outputs) == len(node.outputs)
            for new_o, old_o in zip(outputs, node.outputs):
                mapping[old_o] = new_o

        new_nodes = []
        for o in fgraph.outputs:
            new_o = mapping[o]
            if new_o.type != o.type:
                assert isinstance(o.type, tensor.TensorType)
                assert isinstance(new_o.type, GpuArrayType)

                # This condition is needed in the case one input is an
                # output of the graph. Without this, it would
                # introduce cycle as we don't replace correctly that
                # case. It would also add extra transfer to/from the
                # gpu.
                if (new_o.owner and
                        isinstance(new_o.owner.op, GpuFromHost) and
                        new_o.owner.inputs[0].type == o.type):
                    new_o = new_o.owner.inputs[0]
                else:
                    new_o = safe_to_cpu(new_o)
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
                                    reason=self.__class__.__name__)

        return (self, toposort_timing, time_opts, node_created, process_count)

    @staticmethod
    def print_profile(stream, prof, level=0):
        """Pretty-print the profile tuple returned by ``apply``."""
        (opt, toposort_timing, time_opts, node_created, process_count) = prof
        blanc = ('    ' * level)
        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)

        print(blanc, getattr(opt, "name",
                             getattr(opt, "__name__", "")), file=stream)

        print(blanc, "  time io_toposort %.3fs" % toposort_timing,
              file=stream)

        # The original called time_opts.iteritems(), a Python-2-only dict
        # method; use six.iteritems as done elsewhere in this module.
        s = sum([v for k, v in iteritems(time_opts)])
        print(blanc, "Total time taken by local optimizers %.3fs " % s,
              file=stream)

        count_opt = []
        not_used = []
        not_used_time = 0

        for o, count in iteritems(process_count):
            if count > 0:
                count_opt.append((time_opts[o], count,
                                  node_created[o], o))
            else:
                not_used.append((time_opts[o], o))
                not_used_time += time_opts[o]

        if count_opt:
            print(blanc,
                  '  times - times applied - Node created - name:',
                  file=stream)
            count_opt.sort()
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, '  %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
            print(blanc, '  %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:
                if t > 0:
                    # Skip opt that have 0 times, they probably wasn't even tried.
                    print(blanc + "  ", '  %.3fs - %s' % (t, o), file=stream)
            print(file=stream)

    @staticmethod
    def merge_profile(prof1, prof2):
        """Combine two profile tuples into one (used by the profiler)."""
        # (opt, toposort_timing, time_opts, node_created, process_count) = prof1
        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
            prof2[0].local_optimizers_all)

        def merge_dict(d1, d2):
            """
            merge 2 dicts by adding the values.
            """
            d = d1.copy()
            for k, v in iteritems(d2):
                if k in d:
                    d[k] += v
                else:
                    d[k] = v
            return d

        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                          prof2[0].local_optimizers_map)
        new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
        toposort_timing = prof1[1] + prof2[1]
        time_opts = merge_dict(prof1[2], prof2[2])
        node_created = merge_dict(prof1[3], prof2[3])
        process_count = merge_dict(prof1[4], prof2[4])
        return (new_opt,
                toposort_timing,
                time_opts,
                node_created,
                process_count)
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu]) @local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node): def local_cut_gpu_transfers(node):
# gpu[ab] -> host -> gpub # gpu[ab] -> host -> gpub
...@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([tensor.Alloc]) @local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node): def local_gpua_alloc2(node):
""" """
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...) Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
...@@ -292,22 +560,23 @@ def local_gpuaalloc2(node): ...@@ -292,22 +560,23 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc] i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]) for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)): for c, idx in node.outputs[0].clients)):
return [host_from_gpu(GpuAlloc(None)(*node.inputs))] return [host_from_gpu(gpu_alloc(None)(*node.inputs))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpuaalloc(node, context_name): @register_opt2([tensor.Alloc], 'fast_compile')
return GpuAlloc(context_name)(*node.inputs) def local_gpua_alloc(op, context_name, inputs, outputs):
return gpu_alloc(context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty]) @op_lifter([tensor.AllocEmpty])
def local_gpuaallocempty(node, context_name): @register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpua_alloc_empty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return GpuAllocEmpty(context_name=context_name, return gpu_alloc_empty(context_name, **op._props_dict())
**node.op._props_dict())(*node.inputs)
@register_opt() @register_opt()
...@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node): ...@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True) new_op = gpu_alloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)] return [new_op(*node.inputs)]
...@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty): if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs) context_name = infer_context_name(*node.inputs)
z = numpy.asarray(0, dtype=node.outputs[0].dtype) z = numpy.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc()(as_gpuarray_variable(z, context_name), return [gpu_alloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)] *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace. # After move to gpu and merge2, before inplace.
...@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous]) @op_lifter([tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node, context_name): @register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpua_contiguous(op, context_name, inputs, outputs):
return gpu_contiguous return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node, context_name): @register_opt2([tensor.Reshape], 'fast_compile')
op = node.op def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
...@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name): ...@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node, context_name): @register_opt2([tensor.Rebroadcast], 'fast_compile')
return node.op(as_gpuarray_variable(node.inputs[0], context_name)) def local_gpua_rebroadcast(op, context_name, inputs, outputs):
return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node, context_name): @register_opt2([tensor.Flatten], 'fast_compile')
op = node.op def local_gpua_flatten(op, context_name, inputs, outputs):
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
shp = [node.inputs[0].shape[i] for i in range(op.outdim - 1)] shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1] shp += [-1]
res = GpuReshape(op.outdim, None) res = GpuReshape(op.outdim, None)
o = res(node.inputs[0], theano.tensor.as_tensor_variable(shp)) o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o return o
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node, context_name): @register_opt2([tensor.Elemwise], 'fast_compile')
op = node.op def local_gpua_elemwise(op, context_name, inputs, outputs):
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
if name: if name:
name = 'Gpu' + name name = 'Gpu' + name
if len(node.outputs) > 1: if len(outputs) > 1:
return return
res = GpuElemwise(scal_op, name=name, res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern), inplace_pattern=copy.copy(op.inplace_pattern),
...@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name): ...@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name):
# Only transfer the computation on the gpu if the output dtype is # Only transfer the computation on the gpu if the output dtype is
# floating point. Else, give up on the transfer to the gpu. # floating point. Else, give up on the transfer to the gpu.
out_dtype = node.outputs[0].dtype out_dtype = outputs[0].dtype
if out_dtype not in ['float16', 'float32', 'float64']: if out_dtype not in ['float16', 'float32', 'float64']:
return return
# Transfer the inputs on the GPU and cast them to the right dtype. # Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs = [] new_inputs = []
for inp in node.inputs: for inp in inputs:
if inp.dtype != out_dtype: if inp.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype))) gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name))) new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name)))
...@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name): ...@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name):
# Perform the exponent on the gpu and transfer the output back to the # Perform the exponent on the gpu and transfer the output back to the
# cpu. # cpu.
gpu_output = res(*new_inputs) gpu_output = res(*new_inputs)
cpu_output = host_from_gpu(gpu_output) return [gpu_output]
return [cpu_output]
else: else:
return res return res
...@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node, context_name): @register_opt2([tensor.DimShuffle], 'fast_compile')
return GpuDimShuffle(node.op.input_broadcastable, def local_gpua_dimshuffle(op, context_name, inputs, outputs):
node.op.new_order) return GpuDimShuffle(op.input_broadcastable,
op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node, context_name): @register_opt2([tensor.SpecifyShape], 'fast_compile')
if isinstance(node.inputs[0].type, GpuArrayType): def local_gpua_specifyShape(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
return return
inp = [as_gpuarray_variable(node.inputs[0], context_name)] return local_gpua_specifyShape_graph(op, context_name, inputs, outputs)
inp += node.inputs[1:]
@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
inp = [as_gpuarray_variable(inputs[0], context_name)]
inp += inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node, context_name): def local_gpua_shape(op, context_name, inputs, outputs):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(inputs[0].type, GpuArrayType):
return return
return [as_gpuarray_variable(node.inputs[0], context_name).shape] return local_gpua_shape_graph(op, context_name, inputs, outputs)
# Fixed the track: ``tensor.compile.ops.Shape`` does not exist
# (``theano.tensor`` has no ``compile`` attribute); the op_lifter
# registration for local_gpua_shape above tracks ``theano.compile.ops.Shape``.
@register_opt2([theano.compile.ops.Shape], 'fast_compile')
def local_gpua_shape_graph(op, context_name, inputs, outputs):
    """GraphToGPU path: compute the shape from the GPU-transferred input."""
    return [as_gpuarray_variable(inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda): ...@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node, context_name): @register_opt2([tensor.printing.Print], 'fast_compile')
x, = node.inputs def local_gpua_print_op(op, context_name, inputs, outputs):
x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name) gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op new_op.old_op = op
return new_op(gpu_x) return new_op(gpu_x)
...@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([IfElse]) @op_lifter([IfElse])
def local_gpua_lazy_ifelse(node, context_name): @register_opt2([IfElse], 'fast_compile')
if node.op.gpu: def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
if op.gpu:
return return
c = node.inputs[0] c = inputs[0]
inps = [] inps = []
for v in node.inputs[1:]: for v in inputs[1:]:
if isinstance(v.type, (tensor.TensorType, GpuArrayType)): if isinstance(v.type, tensor.TensorType):
inps.append(as_gpuarray_variable(v, context_name)) inps.append(as_gpuarray_variable(v, context_name))
else: else:
inps.append(v) inps.append(v)
return IfElse(node.op.n_outs, gpu=True)(c, *inps, return_list=True) return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node, context_name): @register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(op, context_name, inputs, outputs):
return gpu_join return gpu_join
@register_opt('fast_compile') @register_opt('fast_compile')
@local_optimizer([GpuJoin]) @local_optimizer([GpuJoin])
def local_gpuajoin_1(node): def local_gpua_join_1(node):
# join of a single element # join of a single element
if (isinstance(node.op, GpuJoin) and if (isinstance(node.op, GpuJoin) and
len(node.inputs) == 2): len(node.inputs) == 2):
...@@ -600,14 +885,16 @@ def local_gpuajoin_1(node): ...@@ -600,14 +885,16 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
def local_gpua_split(node, context_name): @register_opt2([tensor.Split], 'fast_compile')
return GpuSplit(node.op.len_splits) def local_gpua_split(op, context_name, inputs, outputs):
# TODO use props
return GpuSplit(op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node, context_name): def local_gpua_subtensor(op, context_name, inputs, outputs):
x = node.inputs[0] x = inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
...@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name): ...@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType) if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs]) for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]): for n, _ in outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(op.idx_list)
@register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
# We need different code as the condition is different as inputs
# aren't the same.
x = inputs[0]
# We don't want to move the subtensor to the GPU if the inputs is
# on the CPU and the only client of the CPU node is this
# subtensor. This allow to have a smaller transfer.
if (x.owner and isinstance(x.owner.op, GpuFromHost)):
cpu_x = x.owner.inputs[0]
# And it is a shared var or an input of the graph.
# and is used by only 1 node.
# x is in the new graph, so we can't tests its number of clients.
if not cpu_x.owner and len(cpu_x.clients) == 1:
c = outputs[0].clients
# If the subtensor have only 1 client, do it on the CPU.
# We let the other optimization to take care to move the
# next node or not.
if len(c) == 1:
return
return GpuSubtensor(op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node, context_name): @register_opt2([tensor.IncSubtensor], 'fast_compile')
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace, def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
node.op.set_instead_of_inc, op = GpuIncSubtensor(op.idx_list, op.inplace,
node.op.destroyhandler_tolerate_aliased) op.set_instead_of_inc,
ret = op(*node.inputs) op.destroyhandler_tolerate_aliased)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True) ret = op(*inputs)
val = getattr(outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val ret.tag.nan_guard_mode_check = val
return ret return ret
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node, context_name): @register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name): @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = inputs
# Gpu Ops needs both inputs to have the same dtype # Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype): if (x.type.dtype != y.type.dtype):
...@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name): ...@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if y.type.dtype != dtype: if y.type.dtype != dtype:
y = tensor.cast(y, dtype) y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) compute_capability = int(context.bin_id[-2])
...@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node): ...@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node, context_name): @register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, def local_gpua_careduce(op, context_name, inputs, outputs):
scalar.Maximum, scalar.Minimum)): if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
ctx = get_context(context_name) ctx = get_context(context_name)
if ctx.kind == b'opencl': if ctx.kind == b'opencl':
op = GpuCAReduceCPY op2 = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
elif ctx.kind == b'cuda': elif ctx.kind == b'cuda':
op = GpuCAReduceCuda op2 = GpuCAReduceCuda
else: else:
return False return False
x, = node.inputs x, = inputs
greduce = op( greduce = op2(
node.op.scalar_op, axis=node.op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op2 is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([ gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])): as_gpuarray_variable(x, context_name)])):
return greduce return greduce
...@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name): ...@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and # to make them a single dimension, do the reduction, and
# then reshape to get them back. # then reshape to get them back.
if node.op.axis is None: if op.axis is None:
reduce_mask = [1] * x.type.ndim reduce_mask = [1] * x.type.ndim
else: else:
reduce_mask = [0] * x.type.ndim reduce_mask = [0] * x.type.ndim
for a in node.op.axis: for a in op.axis:
assert reduce_mask[a] == 0 assert reduce_mask[a] == 0
reduce_mask[a] = 1 reduce_mask[a] = 1
...@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name): ...@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name):
for idx, m in enumerate(new_mask): for idx, m in enumerate(new_mask):
if m == 1: if m == 1:
new_axis.append(idx) new_axis.append(idx)
greduce = op( greduce = op2(
node.op.scalar_op, op.scalar_op,
axis=new_axis, reduce_mask=new_mask, axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None), dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name) gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
...@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name): ...@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name):
# be None # be None
reshaped_gpu_inputs = [gpu_reshaped_x] reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs): if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu( reduce_reshaped_x = greduce(gpu_reshaped_x)
greduce(gpu_reshaped_x))
if reduce_reshaped_x.ndim != node.outputs[0].ndim: if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = [] out_shp = []
for i in range(x.ndim): for i in range(x.ndim):
if i not in node.op.axis: if i not in op.axis:
out_shp.append(shape_i(x, i)) out_shp.append(shape_i(x, i))
unreshaped_reduce = reduce_reshaped_x.reshape( unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x,
tensor.stack(out_shp)) tensor.stack(out_shp))
else: else:
unreshaped_reduce = reduce_reshaped_x unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce] return [unreshaped_reduce]
...@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name): ...@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node, context_name): @register_opt2([tensor.blas.Gemv], 'fast_compile')
if node.op.inplace: def local_gpua_gemv(op, context_name, inputs, outputs):
if op.inplace:
return gpugemv_inplace return gpugemv_inplace
else: else:
return gpugemv_no_inplace return gpugemv_no_inplace
...@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name): ...@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node, context_name): @register_opt2([tensor.blas.Gemm], 'fast_compile')
if node.op.inplace: def local_gpua_gemm(op, context_name, inputs, outputs):
if op.inplace:
return gpugemm_inplace return gpugemm_inplace
else: else:
return gpugemm_no_inplace return gpugemm_no_inplace
...@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name): ...@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name): @register_opt2([tensor.blas.BatchedDot], 'fast_compile')
a, b = node.inputs def local_gpua_gemmbatch(op, context_name, inputs, outputs):
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2]) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name): @register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs, outputs):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up " "cuda 7.5 is not available. Updating could speed up "
"your code.") "your code.")
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = outputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=context_name)( C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph)) shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0) return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
...@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name): ...@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name):
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs): def local_gpua_gemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs): def local_gpua_gemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4) @alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs): def local_gpua_gemmbatch_alpha_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0) @output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs): def local_gpua_gemmbatch_output_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)] return [gpugemmbatch_no_inplace(*inputs)]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name): @register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
return GpuGer(inplace=node.op.destructive) def local_gpua_ger(op, context_name, inputs, outputs):
return GpuGer(inplace=op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node, context_name): @register_opt2([tensor.blas.Dot22], 'fast_compile')
def local_gpua_dot22(op, context_name, inputs, outputs):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar]) @op_lifter([tensor.blas.Dot22Scalar])
def local_gpua_dot22scalar(node, context_name): @register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
x, y, a = node.inputs def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1]) z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)] return [gpugemm_no_inplace(z, a, x, y, 0)]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node, context_name): @register_opt2([tensor.basic.Eye], 'fast_compile')
return GpuEye(dtype=node.op.dtype, context_name=context_name) def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name): @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name): @register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_1hot_with_bias_dx return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node, context_name): @register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs, outputs):
return gpu_softmax return gpu_softmax
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node, context_name): @register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
return gpu_softmax_with_bias return gpu_softmax_with_bias
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(node, context_name): def local_gpua_assert(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
# Check if input nodes are already on the GPU
if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0], return local_gpua_assert_graph(op, context_name, inputs, outputs)
context_name),
*node.inputs[1:]))]
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_gpua_assert_graph(op, context_name, inputs, outputs):
return [op(as_gpuarray_variable(inputs[0], context_name),
*inputs[1:])]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([ConvOp]) @op_lifter([ConvOp])
def local_error_convop(node, context_name): @register_opt2([ConvOp], 'fast_compile')
def local_gpua_error_convop(op, context_name, inputs, outputs):
assert False, """ assert False, """
ConvOp does not work with the gpuarray backend. ConvOp does not work with the gpuarray backend.
...@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d() ...@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockGemv]) @op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name): @register_opt2([SparseBlockGemv], 'fast_compile')
if node.op.inplace: def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_gemv_inplace return gpu_sparse_block_gemv_inplace
else: else:
return gpu_sparse_block_gemv return gpu_sparse_block_gemv
...@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name): ...@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([SparseBlockOuter]) @op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name): @register_opt2([SparseBlockOuter], 'fast_compile')
if node.op.inplace: def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_outer_inplace return gpu_sparse_block_outer_inplace
else: else:
return gpu_sparse_block_outer return gpu_sparse_block_outer
...@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node): ...@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere # This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile') @register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d, @op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs]) AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name): def local_gpua_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(node.outputs[0].type, GpuArrayType): if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU. # Don't handle this node here, it's already on the GPU.
return return
inps = list(node.inputs) return local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs)
inps[0] = as_gpuarray_variable(node.inputs[0],
@register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
inps = list(inputs)
inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name) context_name=context_name)
inps[1] = as_gpuarray_variable(node.inputs[1], inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name) context_name=context_name)
return [node.op(*inps)] return [op(*inps)]
# Register this here so that it goes after the abstract lifting # Register this here so that it goes after the abstract lifting
register_opt('fast_compile')(conv_groupopt) register_opt('fast_compile')(conv_groupopt)
...@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node): ...@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node):
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op, return [gpu_ca_reduce_cuda(scalar_op=op.scalar_op,
axis=op.axis, axis=op.axis,
reduce_mask=op.reduce_mask, reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
@local_optimizer(None) @local_optimizer(None)
...@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node, context_name): @register_opt2([scan_op.Scan], 'fast_compile')
info = copy.deepcopy(node.op.info) def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
info = copy.deepcopy(op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
info['gpua'] = True info['gpua'] = True
nw_ins = [node.inputs[0]] nw_ins = [inputs[0]]
e = (1 + e = (1 +
node.op.n_seqs + op.n_seqs +
node.op.n_mit_mot + op.n_mit_mot +
node.op.n_mit_sot + op.n_mit_sot +
node.op.n_sit_sot + op.n_sit_sot +
node.op.n_shared_outs) op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(op.inputs,
(safe_to_cpu(x) for x in scan_ins)))) (safe_to_cpu(x) for x in scan_ins))))
# We need to construct the hash here, because scan # We need to construct the hash here, because scan
......
...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer ...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if not dnn_available(test_ctx_name): if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg) raise SkipTest(dnn_available.msg)
mode = mode_with_gpu mode = mode_with_gpu
if fd != (1, 1): if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation") raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
......
...@@ -396,7 +396,7 @@ def test_gpueye(): ...@@ -396,7 +396,7 @@ def test_gpueye():
k_symb = numpy.asarray(0) k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype) out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb], f = theano.function([N_symb, M_symb],
out, T.stack(out),
mode=mode_with_gpu) mode=mode_with_gpu)
result = numpy.asarray(f(N, M)) result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype)) assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
......
...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0(): ...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32') ones = numpy.ones((2,), dtype='float32')
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a = tensor.alloc(z, i) a = tensor.alloc(z, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a = tensor.alloc(z, i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (numpy.asarray(f(6)) == 0).all() assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 # Test with 0
...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty(): ...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar() ii = theano.tensor.iscalar()
# Test with vector # Test with vector
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i) a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with vector
# Should be moved
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
f = theano.function([i, ii], a, mode=mode_with_gpu) f = theano.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor(): ...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo]) # Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op(): ...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out = theano.tensor.tanh(ms).dot(ms.T) out = theano.tensor.tanh(ms).dot(ms.T)
mode_local_assert = mode_with_gpu.including("assert_no_cpu_op") mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise") mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
old = theano.config.assert_no_cpu_op old = theano.config.assert_no_cpu_op
old2 = theano.config.on_opt_error old2 = theano.config.on_opt_error
......
...@@ -233,7 +233,7 @@ class GpuArrayType(Type): ...@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.gpuarray import GpuFromHost from theano.gpuarray.basic_ops import gpu_from_host
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name) other = other._as_GpuArrayVariable(self.context_name)
...@@ -265,7 +265,7 @@ class GpuArrayType(Type): ...@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return GpuFromHost(self.context_name)(other) return gpu_from_host(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b, force_same_dtype=True): def values_eq(a, b, force_same_dtype=True):
......
...@@ -24,10 +24,11 @@ from . import multinomial ...@@ -24,10 +24,11 @@ from . import multinomial
import theano.sandbox.cuda import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available: if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object): ...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return final_samples return final_samples
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
if (type(op) == mrg_uniform and
isinstance(inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(inputs[0],
op.output_type.ndim,
op.output_type.dtype,
inputs[1])
return [outs[0], host_from_gpua(outs[1])]
@register_gpua('fast_compile') @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
# TODO : need description for function context_name = infer_context_name(*node.inputs)
if (type(node.op) == mrg_uniform and return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return d return d
visited.add(out) visited.add(out)
from theano.sandbox import cuda from theano.sandbox import cuda
from theano import gpuarray from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType
if out == x: if out == x:
if isinstance(x.type, cuda.CudaNdarrayType): if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy) d[out] = cuda.gpu_from_host(x_copy)
else: else:
assert isinstance(x.type, gpuarray.GpuArrayType) assert isinstance(x.type, GpuArrayType)
d[out] = gpuarray.GpuFromHost(x.type.context_name)(x_copy) d[out] = gpu_from_host(x.type.context_name)(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif (gpuarray.pygpu_activated and elif (pygpu_activated and
out.owner.op == gpuarray.host_from_gpu and out.owner.op == host_from_gpu and
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
......
...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True, ...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v = v.owner.inputs[0] v = v.owner.inputs[0]
continue continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i): elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant): i = v.owner.op.i
return numpy.asarray( inp = v.owner.inputs[0]
v.owner.inputs[0].data.shape[v.owner.op.i]) if isinstance(inp, Constant):
return numpy.asarray(inp.data.shape[i])
# The shape of a broadcastable dimension is 1
if (hasattr(inp.type, 'broadcastable') and
inp.type.broadcastable[i]):
return numpy.asarray(1)
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would # fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op): ...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh = [as_tensor_variable(s) for s in shape] sh = [as_tensor_variable(s) for s in shape]
bcast = [] bcast = []
for i, s in enumerate(sh): for i, s in enumerate(sh):
if config.exception_verbosity == 'high': def err_str():
s_as_str = '\n' + min_informative_str(s) if config.exception_verbosity == 'high':
else: return '\n' + min_informative_str(s)
s_as_str = str(s) else:
return str(s)
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
s_as_str = err_str()
raise TypeError('Shape arguments to Alloc must be integers, ' raise TypeError('Shape arguments to Alloc must be integers, '
'but argument %s is not for apply node: %s' % 'but argument %s is not for apply node: %s' %
(i, s_as_str)) (i, s_as_str))
if s.ndim != 0: if s.ndim != 0:
s_as_str = err_str()
raise TypeError( raise TypeError(
"Each shape dimension to Alloc must be a scalar, ", "Each shape dimension to Alloc must be a scalar, ",
'but dimension %s have %d dimensions for apply node: %s' % 'but dimension %s have %d dimensions for apply node: %s' %
......
...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
""" """
bsize, imshp = image_shape[0], image_shape[2:] bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:] nkern, kshp = kernel_shape[0], kernel_shape[2:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int') filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis( out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], imshp[i], kshp[i], border_mode[i],
...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad = border_mode pad = border_mode
if pad < 0: if pad < 0:
raise ValueError("border_mode must be >= 0") raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if pad == 0:
out_shp = (image_shape - dil_kernel_shape)
else:
out_shp = (image_shape + 2 * pad - dil_kernel_shape)
if subsample != 1:
out_shp = out_shp // subsample
out_shp = out_shp + 1
return out_shp return out_shp
......
...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase): ...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3 assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c) s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4 assert get_scalar_constant_value(s) == 4
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
def test_elemwise(self): def test_elemwise(self):
# We test only for a few elemwise, the list of all supported # We test only for a few elemwise, the list of all supported
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论