Commit a24fd9bb, authored by Frédéric Bastien, committed by GitHub

Merge pull request #4570 from Sentient07/new_graph2gpu

New graph2gpu
......@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def infer_shape(self, node, input_shapes):
return [()]
def connection_pattern(self, node):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return [[False]]
def grad(self, inp, grads):
return [theano.gradient.grad_not_implemented(
op=self, x_pos=0, x=inp[0],
......@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return var.shape[i]
def shape_i_op(i):
    """Return a cached ``Shape_i`` Op for dimension index *i*.

    Reusing one Op instance per index lets graph comparison and merge
    optimizations recognize identical shape nodes.
    """
    cached = shape_i_op.cache.get(i)
    if cached is None:
        cached = Shape_i(i)
        shape_i_op.cache[i] = cached
    return cached
# Cache of Shape_i instances, keyed by the dimension index.
shape_i_op.cache = {}
def register_shape_i_c_code(typ, code, check_input, version=()):
"""
Tell Shape_i how to generate C code for a Theano Type.
......
......@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0:
if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply)
......
......@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]:
......
......@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x)
return gpu_from_host(context_name)(x)
# Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'):
......@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads):
gz, = grads
return [GpuFromHost(inputs[0].type.context_name)(gz)]
return [gpu_from_host(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return (9,)
# Caching GpuFromHost
def gpu_from_host(ctx):
    """Return a cached ``GpuFromHost`` Op for the given context.

    Parameters
    ----------
    ctx : context name
        The GPU context the host-to-GPU transfer Op targets.
    """
    # BUG FIX: the membership test previously read ``gpu_alloc.cache`` while
    # insertion wrote to ``gpu_from_host.cache``: the lookup and the store
    # used different dicts, so the cache could never hit.
    if ctx not in gpu_from_host.cache:
        gpu_from_host.cache[ctx] = GpuFromHost(ctx)
    return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op):
"""
Transfer data between GPUs.
......@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return True
# Caching GPUAlloc
def gpu_alloc(ctx, memset_0=False):
    """Return a cached ``GpuAlloc`` Op for ``(ctx, memset_0)``.

    Parameters
    ----------
    ctx : context name
        The GPU context the allocation targets.
    memset_0 : bool
        Forwarded to ``GpuAlloc``.
    """
    key = (ctx, memset_0)
    try:
        return gpu_alloc.cache[key]
    except KeyError:
        op = GpuAlloc(ctx, memset_0)
        gpu_alloc.cache[key] = op
        return op
# Cache of GpuAlloc instances, keyed by (context, memset_0).
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, Alloc):
"""
Allocate uninitialized memory on the GPU.
......@@ -956,6 +973,14 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
    """Return a cached ``GpuAllocEmpty`` Op for ``(dtype, ctx)``.

    Parameters
    ----------
    ctx : context name
        The GPU context the allocation targets.
    dtype : str
        The dtype of the uninitialized output.
    """
    # The cache key order (dtype, ctx) is internal only; it just has to be
    # used consistently for lookup and insertion.
    key = (dtype, ctx)
    cached = gpu_alloc_empty.cache.get(key)
    if cached is None:
        cached = GpuAllocEmpty(dtype, ctx)
        gpu_alloc_empty.cache[key] = cached
    return cached
gpu_alloc_empty.cache = {}
class GpuContiguous(Op):
"""
Return a C contiguous version of the input.
......@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def make_node(self, x, shp):
ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable,
......
......@@ -14,7 +14,7 @@ from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.compile.ops import shape_i, shape_i_op
from theano.tensor.nnet import LogSoftmax, SoftmaxGrad
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
......@@ -23,15 +23,18 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .type import get_context, gpu_context_type, list_contexts
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, empty_like)
gpu_contiguous, gpu_alloc_empty,
empty_like, GpuArrayType)
from .elemwise import GpuElemwise
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt import (gpu_seqopt, register_opt, conv_groupopt,
op_lifter, register_opt2)
from .opt_util import alpha_merge, output_merge, inplace_allocempty
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_BWD_FILTER
......@@ -94,6 +97,7 @@ def dnn_present():
if config.dnn.enabled == "False":
dnn_present.msg = "Disabled by dnn.enabled flag"
dnn_present.avail = False
return False
if pygpu is None:
dnn_present.msg = "PyGPU not available"
......@@ -370,6 +374,19 @@ class GpuDnnConvDesc(COp):
def c_code_cache_version(self):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
def gpu_dnn_conv_desc(border_mode, subsample=(1, 1), conv_mode='conv',
                      precision="float32"):
    """Return a cached ``GpuDnnConvDesc`` Op for the given parameters.

    All four parameters are forwarded positionally to ``GpuDnnConvDesc``;
    one Op instance is shared per distinct parameter combination.
    """
    key = (border_mode, subsample, conv_mode, precision)
    desc = gpu_dnn_conv_desc.cache.get(key)
    if desc is None:
        desc = GpuDnnConvDesc(border_mode, subsample, conv_mode, precision)
        gpu_dnn_conv_desc.cache[key] = desc
    return desc
gpu_dnn_conv_desc.cache = {}
# scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float64'))
_one = constant(numpy.asarray(1.0, dtype='float64'))
......@@ -526,8 +543,8 @@ class GpuDnnConv(DnnBase):
top = gpu_contiguous(top)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -564,6 +581,14 @@ class GpuDnnConv(DnnBase):
return [shape[2]]
def gpu_dnn_conv(algo=None, inplace=False):
    """Return a cached ``GpuDnnConv`` Op for ``(algo, inplace)``."""
    key = (algo, inplace)
    try:
        return gpu_dnn_conv.cache[key]
    except KeyError:
        op = GpuDnnConv(algo, inplace)
        gpu_dnn_conv.cache[key] = op
        return op
# Cache of GpuDnnConv instances, keyed by (algo, inplace).
gpu_dnn_conv.cache = {}
class GpuDnnConvGradW(DnnBase):
"""
......@@ -608,8 +633,8 @@ class GpuDnnConvGradW(DnnBase):
kerns = gpu_contiguous(kerns)
d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc)
d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -686,6 +711,14 @@ class GpuDnnConvGradW(DnnBase):
return [shape[2]]
def gpu_dnn_conv_gradW(algo=None, inplace=False):
    """Return a cached ``GpuDnnConvGradW`` Op for ``(algo, inplace)``.

    NOTE(review): the constructor is called as ``GpuDnnConvGradW(inplace,
    algo)`` -- positional order opposite to this wrapper's signature.  This
    assumes ``GpuDnnConvGradW.__init__`` takes ``(inplace, algo)``; confirm
    against the class definition (not visible here).
    """
    key = (algo, inplace)
    if key not in gpu_dnn_conv_gradW.cache:
        gpu_dnn_conv_gradW.cache[key] = GpuDnnConvGradW(inplace, algo)
    return gpu_dnn_conv_gradW.cache[key]
gpu_dnn_conv_gradW.cache = {}
class GpuDnnConvGradI(DnnBase):
"""
......@@ -741,8 +774,8 @@ class GpuDnnConvGradI(DnnBase):
img = gpu_contiguous(img)
d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc)
d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
......@@ -823,6 +856,14 @@ class GpuDnnConvGradI(DnnBase):
return [shape[2]]
def gpu_dnn_conv_gradI(algo=None, inplace=False):
    """Return a cached ``GpuDnnConvGradI`` Op for ``(algo, inplace)``.

    NOTE(review): the constructor is called as ``GpuDnnConvGradI(inplace,
    algo)`` -- positional order opposite to this wrapper's signature.  This
    assumes ``GpuDnnConvGradI.__init__`` takes ``(inplace, algo)``; confirm
    against the class definition (not visible here).
    """
    key = (algo, inplace)
    if key not in gpu_dnn_conv_gradI.cache:
        gpu_dnn_conv_gradI.cache[key] = GpuDnnConvGradI(inplace, algo)
    return gpu_dnn_conv_gradI.cache[key]
gpu_dnn_conv_gradI.cache = {}
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None):
......@@ -896,12 +937,12 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype, ctx_name)(
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and
......@@ -914,26 +955,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph),
shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc)
return gpu_dnn_conv_gradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy
# if the img contains negative strides
img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode,
desc_op.subsample)
out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
# We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present.
ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
out_shp = get_conv_output_shape(ishape, kshape,
desc_op.border_mode,
desc_op.subsample)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
......@@ -944,10 +989,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
img = gpu_contiguous(img)
topgrad = gpu_contiguous(topgrad)
kerns_shp = as_tensor_variable(kerns_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp)
out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp)
return GpuDnnConvGradW()(img, topgrad, out, desc)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
......@@ -958,10 +1003,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
kerns = gpu_contiguous(kerns)
topgrad = gpu_contiguous(topgrad)
img_shp = as_tensor_variable(img_shp)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp)
return GpuDnnConvGradI()(kerns, topgrad, out, desc)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(kerns.shape)
out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op):
......@@ -1382,53 +1427,59 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()])
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
@register_opt2([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if (not isinstance(op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return
if (node.op.filter_dilation != (1, 1)):
if (op.filter_dilation != (1, 1)):
return None
if not isinstance(inp1.type, GpuArrayType):
return None
inp1 = inputs[0]
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
if node.op.filter_flip:
if op.filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
if isinstance(node.op, AbstractConv2d):
if isinstance(op, AbstractConv2d):
rval = dnn_conv(inp1, inp2,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
border_mode=op.border_mode,
subsample=op.subsample,
direction_hint='forward!',
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradWeights):
elif isinstance(op, AbstractConv2d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
inputs[2][0], inputs[2][1])
rval = dnn_gradweight(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
border_mode=op.border_mode,
subsample=op.subsample,
conv_mode=conv_mode)
if isinstance(node.op, AbstractConv2d_gradInputs):
elif isinstance(op, AbstractConv2d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1],
node.inputs[2][0], node.inputs[2][1])
inputs[2][0], inputs[2][1])
rval = dnn_gradinput(inp1, inp2, shape,
border_mode=node.op.border_mode,
subsample=node.op.subsample,
border_mode=op.border_mode,
subsample=op.subsample,
conv_mode=conv_mode)
return [rval]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Node-based wrapper around the graph-based cuDNN conv optimizer.

    Only fires when the first input already lives on the GPU; otherwise
    returns None so other lifters can handle the node.
    """
    ctx = infer_context_name(*node.inputs)
    if not isinstance(node.inputs[0].type, GpuArrayType):
        return
    # Delegate to the shared implementation used by the GraphToGPU pass.
    return local_abstractconv_cudnn_graph(node.op, ctx, node.inputs, node.outputs)
conv_groupopt.register('local_abstractconv_cudnn',
local_abstractconv_cudnn, 20,
'fast_compile', 'fast_run',
......@@ -1437,17 +1488,17 @@ conv_groupopt.register('local_abstractconv_cudnn',
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
return [gpu_dnn_conv(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
return [gpu_dnn_conv_gradW(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
return [gpu_dnn_conv_gradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
......@@ -1460,73 +1511,75 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([Pool])
def local_pool_dnn_alternative(node, ctx_name):
@register_opt2([Pool], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
if not node.op.ignore_border:
if not op.ignore_border:
return
img, = node.inputs
img, = inputs
img = as_gpuarray_variable(img, ctx_name)
ds = node.op.ds
stride = node.op.st
pad = node.op.padding
mode = node.op.mode
ds = op.ds
stride = op.st
pad = op.padding
mode = op.mode
return dnn_pool(gpu_contiguous(img), ds, stride=stride, pad=pad, mode=mode)
@register_opt('cudnn', 'fast_compile')
@op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt2([MaxPoolGrad], 'fast_compile', 'cudnn')
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
if not node.op.ignore_border:
if not op.ignore_border:
return
inp, out, out_grad = node.inputs
inp, out, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds
st = node.op.st
pad = node.op.padding
mode = node.op.mode
ds = op.ds
st = op.st
pad = op.padding
mode = op.mode
return GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
gpu_contiguous(out),
......@@ -1538,18 +1591,19 @@ def local_pool_dnn_grad_stride(node, ctx_name):
@register_opt('cudnn', 'fast_compile')
@op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node, ctx_name):
@register_opt2([AveragePoolGrad], 'fast_compile', 'cudnn')
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
if not node.op.ignore_border:
if not op.ignore_border:
return
inp, out_grad = node.inputs
inp, out_grad = inputs
inp = as_gpuarray_variable(inp, ctx_name)
out_grad = as_gpuarray_variable(out_grad, ctx_name)
ds = node.op.ds
st = node.op.st
pad = node.op.padding
mode = node.op.mode
ds = op.ds
st = op.st
pad = op.padding
mode = op.mode
cg = gpu_contiguous(out_grad)
......@@ -1591,9 +1645,10 @@ def local_log_softmax_dnn(node):
@register_opt('cudnn', 'fast_compile')
@op_lifter([LogSoftmax])
def local_logsoftmax_to_dnn(node, ctx_name):
@register_opt2([LogSoftmax], 'fast_compile', 'cudnn')
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
# Transform the input in the format expected by GpuDnnSoftmax
inp = node.inputs[0]
inp = inputs[0]
if inp.ndim != 2:
return
if not dnn_available(ctx_name) or version(raises=False) < 3000:
......@@ -1629,11 +1684,12 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node, ctx_name):
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad")
ins = []
for n in node.inputs:
for n in inputs:
n = as_gpuarray_variable(n, ctx_name)
if n.ndim != 2:
return
......
......@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels
# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None, acc_dtype=None,
                       pre_scalar_op=None):
    """Return a cached ``GpuCAReduceCuda`` Op for the given parameters.

    All parameters are forwarded positionally to ``GpuCAReduceCuda``;
    one Op instance is shared per distinct parameter combination.
    """
    key = (scalar_op, axis, reduce_mask, dtype, acc_dtype, pre_scalar_op)
    try:
        return gpu_ca_reduce_cuda.cache[key]
    except KeyError:
        op = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
                             acc_dtype, pre_scalar_op)
        gpu_ca_reduce_cuda.cache[key] = op
        return op
gpu_ca_reduce_cuda.cache = {}
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
"""
CAReduce that reuse the python code from gpuarray.
......
......@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import os
from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp
from .basic_ops import infer_context_name
try:
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name, GpuFromHost)
from .opt import register_opt as register_gpu_opt, op_lifter
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
from .opt import register_opt, op_lifter, register_opt2
class GpuCumsum(GpuKernelBase, Op):
......@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\
......@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name):
if node.inputs[0].dtype == 'float32':
axis = node.op.axis
x = node.inputs[0]
@register_opt2([CumsumOp], 'fast_compile')
def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
if inputs[0].dtype == 'float32':
axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None
if axis is None and x.ndim > 1:
x = x.flatten()
x = as_gpuarray_variable(x, ctx_name)
x = GpuFromHost(ctx_name)(x)
if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None:
axis = 0
return GpuCumsum(axis)(x)
register_gpu_opt()(use_gpu_cumsumop)
......@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft
from .opt import register_opt, op_lifter
from .opt import register_opt, op_lifter, register_opt2
try:
import pygpu
......@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available:
@register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name):
@register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
return curfft_op
@register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name):
@register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
return cuirfft_op
......@@ -14,7 +14,7 @@ from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter
from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType
......@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return (1,)
@register_opt()
@register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name):
@register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function
if len(node.inputs) == 2:
p, u = node.inputs
if len(inputs) == 2:
p, u = inputs
n_samples = 1
else:
p, u, n_samples = node.inputs
p, u, n_samples = inputs
try:
if get_scalar_constant_value(n_samples) != 1:
return None
except NotScalarConstantError:
return None
m, = node.outputs
m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype)
gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u))
......@@ -13,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter
from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType
......@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op.perform(self, node, inp, out, ctx)
@register_opt('fast_compile')
@op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name):
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(node.op.mode)
register_gpu_opt()(use_gpu_images2neibs)
@register_opt2([Images2Neibs], 'fast_compile')
def local_gpua_images2neibs(op, context_name, inputs, outputs):
if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode)
......@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
infer_context_name, gpu_alloc_empty)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge
......@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return '\n'.join(codel)
@opt.register_opt()
@opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name):
@opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None:
return
A = node.inputs[0]
B = node.inputs[1]
A = inputs[0]
B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
fgraph = getattr(outputs[0], 'fgraph', None)
C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
......
......@@ -3,6 +3,8 @@ import copy
import numpy
import logging
import pdb
import time
from six import iteritems
from six.moves import xrange
import theano
......@@ -10,9 +12,11 @@ from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, toolbox)
SequenceDB, Optimizer, DB, toolbox, graph)
from theano.gof.opt import NavigatorOptimizer
from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse
from theano.misc.ordered_set import OrderedSet
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
......@@ -32,7 +36,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
GpuEye, gpu_join, GpuJoin, gpu_alloc_empty, gpu_alloc, gpu_from_host)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemmbatch_no_inplace,
gpugemv_no_inplace, gpugemv_inplace)
......@@ -44,7 +48,7 @@ from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY)
GpuCAReduceCPY, gpu_ca_reduce_cuda)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1,
......@@ -57,12 +61,31 @@ _logger = logging.getLogger("theano.gpuarray.opt")
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
# Not used for an EquilibriumOptimizer. It has the "tracks" that we need for GraphToGPUDB.
gpu_optimizer2 = EquilibriumDB()
class GraphToGPUDB(DB):
    """
    Build a ``GraphToGPU`` optimizer from the registered local optimizers.

    ``query`` forwards its tags to ``gpu_optimizer2`` (an EquilibriumDB used
    only as a registry here) and wraps the resulting local optimizers in a
    single ``GraphToGPU`` pass.
    """

    def query(self, *tags, **kwtags):
        opt = gpu_optimizer2.query(*tags, **kwtags)
        return GraphToGPU(opt.local_optimizers_all, opt.local_optimizers_map)
gpu_seqopt = SequenceDB()
# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"
gpu_seqopt.register('gpuarray_graph_optimization', GraphToGPUDB(), -0.5,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
......@@ -82,6 +105,28 @@ def register_opt(*tags, **kwargs):
return f
def register_opt2(tracks, *tags, **kwargs):
    """
    Decorator for the new GraphToGPU optimizer.

    Takes an extra parameter (the tracked Ops) compared to the
    ``register_opt`` decorator.

    Parameters
    ----------
    tracks : list of Op class, or Op instance, or None
        The Node's Op to which optimization is being applied.
    tags : str
        The optimization tag to which the optimizer will be registered.
    """
    def f(local_opt):
        # Mirrors register_opt: take 'name' from kwargs when given, else use
        # the function's own name.  NOTE(review): any other keyword argument
        # would make kwargs.pop('name') raise KeyError -- presumably only
        # 'name' is ever passed; confirm against callers.
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
        opt = theano.gof.local_optimizer(tracks)(local_opt)
        gpu_optimizer2.register(name, opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f
def register_inplace(*tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
......@@ -102,7 +147,7 @@ gpu_optimizer.register('local_remove_all_assert',
def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(ctx_name)(x)
return gpu_from_host(ctx_name)(x)
else:
return x
......@@ -135,6 +180,7 @@ def op_lifter(OP, cuda_only=False):
context_name = i.owner.inputs[0].type.context_name
replace = True
break
if not replace:
# We replace if *all* clients are on the GPU
clients = [c for o in node.outputs for c in o.clients]
......@@ -158,7 +204,9 @@ def op_lifter(OP, cuda_only=False):
# the context was derived from the outputs
for i in node.inputs:
i.tag.context_name = context_name
new_op = maker(node, context_name)
new_op = maker(node.op, context_name, node.inputs, node.outputs)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
......@@ -195,9 +243,13 @@ class InputToGpuOptimizer(Optimizer):
target = getattr(input.tag, 'target', None)
if target == 'cpu':
continue
# Do not move *int* scalar to the GPU.
if (isinstance(input.type, tensor.TensorType) and
input.ndim == 0 and 'int' in input.dtype):
continue
try:
new_input = host_from_gpu(GpuFromHost(target)(input))
new_input = host_from_gpu(gpu_from_host(target)(input))
fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer")
except TypeError:
......@@ -215,6 +267,222 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(NavigatorOptimizer):
    """
    Transfer the graph as a whole to GPU instead of transfering node by node.

    Parameters
    ----------
    local_optimizers_all : List or SortedSet
        The local optimizations to apply to a node.
    local_optimizers_map : Dict
        Dictionary object containing the mapping of Op to list of
        LocalOptimizers.

    """

    def __init__(self, local_optimizers_all, local_optimizers_map):
        self.local_optimizers_all = local_optimizers_all
        self.local_optimizers_map = local_optimizers_map

    def add_requirements(self, fgraph):
        # ReplaceValidate provides fgraph.replace_all_validate(), which
        # apply() uses to swap the graph outputs at the end.
        fgraph.attach_feature(toolbox.ReplaceValidate())

    def apply(self, fgraph):
        """Rebuild ``fgraph`` with as much of it as possible on the GPU.

        Returns the profile tuple consumed by ``print_profile`` /
        ``merge_profile``: ``(self, toposort_timing, time_opts,
        node_created, process_count)``.
        """
        # Maps each variable of the original graph to its replacement in
        # the new (partially GPU) graph.
        mapping = {}
        time_opts = {}
        node_created = {}
        process_count = {}
        t_topo = time.time()
        topo = fgraph.toposort()
        toposort_timing = time.time() - t_topo

        # Building a new graph
        # Iterating through inputs of graph
        target = infer_context_name(*fgraph.inputs)
        for i in fgraph.inputs:
            # Do not move *int* scalar to the GPU.
            if (isinstance(i.type, tensor.TensorType) and
                    (i.ndim > 0 or 'int' not in i.dtype)):
                mapping[i] = i.transfer(getattr(i.tag, 'target', target))
            else:
                mapping[i] = i
        # Constants stay where they are; optimizers transfer them on demand.
        for i in fgraph.variables:
            if isinstance(i, theano.Constant):
                mapping[i] = i

        # Pre-seed the profile counters so every optimizer that *could*
        # apply to this graph appears in the profile, even when unused.
        for node in topo:
            for lopt in (self.local_optimizers_map.get(node.op, []) +
                         self.local_optimizers_map.get(type(node.op), []) +
                         self.local_optimizers_all):
                process_count.setdefault(lopt, 0)
                time_opts.setdefault(lopt, 0)
                node_created.setdefault(lopt, 0)

        for node in topo:
            if isinstance(node.op, HostFromGpu):
                # The transfer becomes a no-op in the new graph: the old
                # CPU output aliases the (mapped) GPU input.
                mapping[node.outputs[0]] = mapping[node.inputs[0]]
                continue

            # Move only if any of the inputs are on the GPU.
            move_to_GPU = False
            context_name = None
            for i in [mapping[i] for i in node.inputs]:
                if isinstance(i.type, GpuArrayType):
                    context_name = i.type.context_name
                    move_to_GPU = True
                    break
            if (not move_to_GPU and
                    isinstance(node.op, (theano.tensor.Alloc,
                                         theano.tensor.AllocEmpty,
                                         theano.tensor.basic.Eye))):
                # If the Alloc[Empty] have a client that will be moved
                # to the GPU, we should move the Alloc* on the GPU.
                # We approximate this by supposing that if we have an
                # optimization for one of the clients op, then we will
                # move the client to the GPU.
                for c, _ in node.outputs[0].clients:
                    if (c != 'output' and
                        (self.local_optimizers_map.get(c.op, []) +
                         self.local_optimizers_map.get(type(c.op), []))):
                        move_to_GPU = True

            new_ops = None
            lopt = None
            # Apply the lifter
            if move_to_GPU:
                for lopt in (self.local_optimizers_map.get(node.op, []) +
                             self.local_optimizers_map.get(type(node.op), []) +
                             self.local_optimizers_all):
                    t_opt = time.time()
                    new_ops = lopt.transform(node.op, context_name,
                                             [mapping[i] for i in node.inputs],
                                             node.outputs)
                    time_opts[lopt] += time.time() - t_opt
                    if new_ops:
                        process_count[lopt] += 1
                        break

            # Normalize the optimizer's return value (an Op, a Variable,
            # a list/tuple of Variables, or None) to a list of outputs.
            outputs = []
            if isinstance(new_ops, theano.Op):
                outputs = new_ops(*[mapping[i] for i in node.inputs],
                                  return_list=True)
            elif not new_ops:
                # No optimizer applied: clone the node onto the new graph.
                newnode = node.clone_with_new_inputs(
                    [mapping.get(i) for i in node.inputs])
                outputs = newnode.outputs
            elif isinstance(new_ops, (tuple, list)):
                outputs = new_ops
            elif isinstance(new_ops, theano.Variable):
                outputs = [new_ops]

            if new_ops:
                node_created[lopt] += len(graph.ops(
                    [mapping[i] for i in node.inputs], outputs))

            # A replacement must cover every output of the node.  Check
            # before the loop: the old in-loop assert was silently
            # skipped when zip() produced nothing.
            assert len(outputs) == len(node.outputs)
            for new_o, old_o in zip(outputs, node.outputs):
                mapping[old_o] = new_o

        new_nodes = []
        for o in fgraph.outputs:
            new_o = mapping[o]
            if new_o.type != o.type:
                assert isinstance(o.type, tensor.TensorType)
                assert isinstance(new_o.type, GpuArrayType)

                # This condition is needed in the case one input is an
                # output of the graph. Without this, it would
                # introduce cycle as we don't replace correctly that
                # case. It would also add extra transfer to/from the
                # gpu.
                if (new_o.owner and
                        isinstance(new_o.owner.op, GpuFromHost) and
                        new_o.owner.inputs[0].type == o.type):
                    new_o = new_o.owner.inputs[0]
                else:
                    new_o = safe_to_cpu(new_o)
            new_nodes.append(new_o)
        fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes),
                                    reason=self.__class__.__name__)

        return (self, toposort_timing, time_opts, node_created, process_count)

    @staticmethod
    def print_profile(stream, prof, level=0):
        """Pretty-print to ``stream`` a profile tuple returned by ``apply``."""
        (opt, toposort_timing, time_opts, node_created, process_count) = prof
        blanc = (' ' * level)
        print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)

        print(blanc, getattr(opt, "name",
                             getattr(opt, "__name__", "")), file=stream)

        print(blanc, " time io_toposort %.3fs" % toposort_timing, file=stream)

        # Use the six-style iteritems() helper (as the rest of this file
        # does); time_opts.iteritems() is Python-2-only.
        s = sum([v for k, v in iteritems(time_opts)])
        print(blanc, "Total time taken by local optimizers %.3fs " % s,
              file=stream)

        count_opt = []
        not_used = []
        not_used_time = 0

        for o, count in iteritems(process_count):
            if count > 0:
                count_opt.append((time_opts[o], count,
                                  node_created[o], o))
            else:
                not_used.append((time_opts[o], o))
                not_used_time += time_opts[o]

        if count_opt:
            print(blanc,
                  ' times - times applied - Node created - name:',
                  file=stream)
            count_opt.sort()
            for (t, count, n_created, o) in count_opt[::-1]:
                print(blanc, ' %.3fs - %d - %d - %s' % (
                    t, count, n_created, o), file=stream)
            print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
                not_used_time, len(not_used)), file=stream)
            not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
            for (t, o) in not_used[::-1]:
                if t > 0:
                    # Skip opts with 0 accumulated time: they probably
                    # weren't even tried.
                    print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
            print(file=stream)

    @staticmethod
    def merge_profile(prof1, prof2):
        """Merge two profile tuples (as returned by ``apply``) into one."""
        # prof layout: (opt, toposort_timing, time_opts, node_created,
        #               process_count)
        local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
            prof2[0].local_optimizers_all)

        def merge_dict(d1, d2):
            """
            merge 2 dicts by adding the values.
            """
            d = d1.copy()
            for k, v in iteritems(d2):
                if k in d:
                    d[k] += v
                else:
                    d[k] = v
            return d

        local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
                                          prof2[0].local_optimizers_map)
        new_opt = GraphToGPU(local_optimizers, local_optimizers_map)
        toposort_timing = prof1[1] + prof2[1]
        time_opts = merge_dict(prof1[2], prof2[2])
        node_created = merge_dict(prof1[3], prof2[3])
        process_count = merge_dict(prof1[4], prof2[4])
        return (new_opt,
                toposort_timing,
                time_opts,
                node_created,
                process_count)
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node):
# gpu[ab] -> host -> gpub
......@@ -273,7 +541,7 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt('fast_compile')
@local_optimizer([tensor.Alloc])
def local_gpuaalloc2(node):
def local_gpua_alloc2(node):
"""
Join(axis, {Alloc or HostFromGPU}, ...) -> Join(axis, GpuAlloc, Alloc, ...)
......@@ -292,22 +560,23 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)):
return [host_from_gpu(GpuAlloc(None)(*node.inputs))]
return [host_from_gpu(gpu_alloc(None)(*node.inputs))]
@register_opt('fast_compile')
@op_lifter([tensor.Alloc])
def local_gpuaalloc(node, context_name):
return GpuAlloc(context_name)(*node.inputs)
@register_opt2([tensor.Alloc], 'fast_compile')
def local_gpua_alloc(op, context_name, inputs, outputs):
return gpu_alloc(context_name)
@register_opt('fast_compile')
@op_lifter([tensor.AllocEmpty])
def local_gpuaallocempty(node, context_name):
@register_opt2([tensor.AllocEmpty], 'fast_compile')
def local_gpua_alloc_empty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the
# CPU op props.
return GpuAllocEmpty(context_name=context_name,
**node.op._props_dict())(*node.inputs)
return gpu_alloc_empty(context_name, **op._props_dict())
@register_opt()
......@@ -318,7 +587,7 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()):
new_op = GpuAlloc(node.op.context_name, memset_0=True)
new_op = gpu_alloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)]
......@@ -328,8 +597,8 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs)
z = numpy.asarray(0, dtype=node.outputs[0].dtype)
return [GpuAlloc()(as_gpuarray_variable(z, context_name),
*node.inputs)]
return [gpu_alloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
# After move to gpu and merge2, before inplace.
......@@ -352,14 +621,15 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node, context_name):
@register_opt2([tensor.extra_ops.CpuContiguous], 'fast_compile')
def local_gpua_contiguous(op, context_name, inputs, outputs):
return gpu_contiguous
@register_opt('fast_compile')
@op_lifter([tensor.Reshape])
def local_gpureshape(node, context_name):
op = node.op
@register_opt2([tensor.Reshape], 'fast_compile')
def local_gpua_reshape(op, context_name, inputs, outputs):
name = op.name
if name:
name = 'Gpu' + name
......@@ -369,32 +639,33 @@ def local_gpureshape(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node, context_name):
return node.op(as_gpuarray_variable(node.inputs[0], context_name))
@register_opt2([tensor.Rebroadcast], 'fast_compile')
def local_gpua_rebroadcast(op, context_name, inputs, outputs):
return op(as_gpuarray_variable(inputs[0], context_name))
@register_opt('fast_compile')
@op_lifter([tensor.Flatten])
def local_gpuflatten(node, context_name):
op = node.op
@register_opt2([tensor.Flatten], 'fast_compile')
def local_gpua_flatten(op, context_name, inputs, outputs):
shp = []
if op.outdim != 1:
shp = [node.inputs[0].shape[i] for i in range(op.outdim - 1)]
shp = [inputs[0].shape[i] for i in range(op.outdim - 1)]
shp += [-1]
res = GpuReshape(op.outdim, None)
o = res(node.inputs[0], theano.tensor.as_tensor_variable(shp))
o = res(inputs[0], theano.tensor.as_tensor_variable(shp))
return o
@register_opt('fast_compile')
@op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node, context_name):
op = node.op
@register_opt2([tensor.Elemwise], 'fast_compile')
def local_gpua_elemwise(op, context_name, inputs, outputs):
scal_op = op.scalar_op
name = op.name
if name:
name = 'Gpu' + name
if len(node.outputs) > 1:
if len(outputs) > 1:
return
res = GpuElemwise(scal_op, name=name,
inplace_pattern=copy.copy(op.inplace_pattern),
......@@ -407,13 +678,13 @@ def local_gpu_elemwise(node, context_name):
# Only transfer the computation on the gpu if the output dtype is
# floating point. Else, give up on the transfer to the gpu.
out_dtype = node.outputs[0].dtype
out_dtype = outputs[0].dtype
if out_dtype not in ['float16', 'float32', 'float64']:
return
# Transfer the inputs on the GPU and cast them to the right dtype.
new_inputs = []
for inp in node.inputs:
for inp in inputs:
if inp.dtype != out_dtype:
gpu_cast_op = GpuElemwise(Cast(Scalar(out_dtype)))
new_inputs.append(gpu_cast_op(as_gpuarray_variable(inp, context_name)))
......@@ -423,8 +694,7 @@ def local_gpu_elemwise(node, context_name):
# Perform the exponent on the gpu and transfer the output back to the
# cpu.
gpu_output = res(*new_inputs)
cpu_output = host_from_gpu(gpu_output)
return [cpu_output]
return [gpu_output]
else:
return res
......@@ -461,29 +731,41 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile')
@op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node, context_name):
return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order)
@register_opt2([tensor.DimShuffle], 'fast_compile')
def local_gpua_dimshuffle(op, context_name, inputs, outputs):
return GpuDimShuffle(op.input_broadcastable,
op.new_order)
@register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType):
@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
return
inp = [as_gpuarray_variable(node.inputs[0], context_name)]
inp += node.inputs[1:]
return local_gpua_specifyShape_graph(op, context_name, inputs, outputs)
@register_opt2([tensor.SpecifyShape], 'fast_compile')
def local_gpua_specifyShape_graph(op, context_name, inputs, outputs):
    """Rebuild SpecifyShape with its tensor argument moved to the GPU.

    The remaining inputs (the shape scalars) are passed through unchanged.
    """
    gpu_data = as_gpuarray_variable(inputs[0], context_name)
    new_args = [gpu_data] + list(inputs[1:])
    return tensor.specify_shape(*new_args)
@register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node, context_name):
def local_gpua_shape(op, context_name, inputs, outputs):
# op_lifter will call this opt too frequently as the output is
# always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType):
if isinstance(inputs[0].type, GpuArrayType):
return
return [as_gpuarray_variable(node.inputs[0], context_name).shape]
return local_gpua_shape_graph(op, context_name, inputs, outputs)
# Register on theano.compile.ops.Shape, matching the op_lifter version of
# local_gpua_shape above; ``tensor.compile`` is not a valid attribute and
# would fail at import time.
@register_opt2([theano.compile.ops.Shape], 'fast_compile')
def local_gpua_shape_graph(op, context_name, inputs, outputs):
    """Return the shape of the input after transferring it to the GPU."""
    return [as_gpuarray_variable(inputs[0], context_name).shape]
def gpu_print_wrapper(op, cnda):
......@@ -492,11 +774,12 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile')
@op_lifter([tensor.printing.Print])
def local_gpu_print_op(node, context_name):
x, = node.inputs
@register_opt2([tensor.printing.Print], 'fast_compile')
def local_gpua_print_op(op, context_name, inputs, outputs):
x, = inputs
gpu_x = as_gpuarray_variable(x, context_name=context_name)
new_op = node.op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = node.op
new_op = op.__class__(global_fn=gpu_print_wrapper)
new_op.old_op = op
return new_op(gpu_x)
......@@ -570,28 +853,30 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile')
@op_lifter([IfElse])
def local_gpua_lazy_ifelse(node, context_name):
if node.op.gpu:
@register_opt2([IfElse], 'fast_compile')
def local_gpua_lazy_ifelse(op, context_name, inputs, outputs):
if op.gpu:
return
c = node.inputs[0]
c = inputs[0]
inps = []
for v in node.inputs[1:]:
if isinstance(v.type, (tensor.TensorType, GpuArrayType)):
for v in inputs[1:]:
if isinstance(v.type, tensor.TensorType):
inps.append(as_gpuarray_variable(v, context_name))
else:
inps.append(v)
return IfElse(node.op.n_outs, gpu=True)(c, *inps, return_list=True)
return IfElse(op.n_outs, gpu=True)(c, *inps, return_list=True)
@register_opt('fast_compile')
@op_lifter([tensor.Join])
def local_gpua_join(node, context_name):
@register_opt2([tensor.Join], 'fast_compile')
def local_gpua_join(op, context_name, inputs, outputs):
return gpu_join
@register_opt('fast_compile')
@local_optimizer([GpuJoin])
def local_gpuajoin_1(node):
def local_gpua_join_1(node):
# join of a single element
if (isinstance(node.op, GpuJoin) and
len(node.inputs) == 2):
......@@ -600,14 +885,16 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile')
@op_lifter([tensor.Split])
def local_gpua_split(node, context_name):
return GpuSplit(node.op.len_splits)
@register_opt2([tensor.Split], 'fast_compile')
def local_gpua_split(op, context_name, inputs, outputs):
# TODO use props
return GpuSplit(op.len_splits)
@register_opt('fast_compile')
@op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node, context_name):
x = node.inputs[0]
def local_gpua_subtensor(op, context_name, inputs, outputs):
x = inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0]
if (gpu_x.owner and
......@@ -617,41 +904,68 @@ def local_gpua_subtensor(node, context_name):
if len(x.clients) == 1:
if any([n == 'output' or any([isinstance(v.type, GpuArrayType)
for v in n.inputs + n.outputs])
for n, _ in node.outputs[0].clients]):
for n, _ in outputs[0].clients]):
return
else:
return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list)
return [host_from_gpu(gpu_x.owner.op(outputs[0]))]
return GpuSubtensor(op.idx_list)
@register_opt2([tensor.Subtensor], 'fast_compile')
def local_gpua_subtensor_graph(op, context_name, inputs, outputs):
    """Lift Subtensor to the GPU, unless slicing on the CPU is cheaper.

    This differs from the op_lifter version because here the inputs come
    from the new (partially transferred) graph, so the conditions differ.
    """
    data = inputs[0]
    # Keep the subtensor on the CPU when it slices a freshly transferred
    # value whose CPU source feeds nothing else: slicing first means a
    # smaller host->GPU transfer.
    owner = data.owner
    if owner is not None and isinstance(owner.op, GpuFromHost):
        cpu_src = owner.inputs[0]
        # cpu_src must be a shared variable or graph input (no owner)
        # used by only one node; ``data`` belongs to the new graph, so
        # its own client count cannot be inspected.
        lone_source = cpu_src.owner is None and len(cpu_src.clients) == 1
        if lone_source and len(outputs[0].clients) == 1:
            # Single client: do the subtensor on the CPU and let the
            # other optimizations decide whether to move the next node.
            return None
    return GpuSubtensor(op.idx_list)
@register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node, context_name):
op = GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
ret = op(*node.inputs)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
@register_opt2([tensor.IncSubtensor], 'fast_compile')
def local_gpua_inc_subtensor(op, context_name, inputs, outputs):
op = GpuIncSubtensor(op.idx_list, op.inplace,
op.set_instead_of_inc,
op.destroyhandler_tolerate_aliased)
ret = op(*inputs)
val = getattr(outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val
return ret
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node, context_name):
@register_opt2([tensor.AdvancedSubtensor1], 'fast_compile')
def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
return GpuAdvancedSubtensor1()
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node, context_name):
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
context = get_context(context_name)
# This is disabled on non-cuda contexts
if context.kind != b'cuda':
return None
x, y, ilist = node.inputs
x, y, ilist = inputs
# Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype):
......@@ -661,7 +975,7 @@ def local_gpua_advanced_incsubtensor(node, context_name):
if y.type.dtype != dtype:
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc
set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2])
......@@ -684,29 +998,31 @@ def local_advincsub1_gpua_inplace(node):
@register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node, context_name):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
@register_opt2([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod], 'fast_compile')
def local_gpua_careduce(op, context_name, inputs, outputs):
if isinstance(op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)):
ctx = get_context(context_name)
if ctx.kind == b'opencl':
op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]:
op2 = GpuCAReduceCPY
if op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code.
return
elif ctx.kind == b'cuda':
op = GpuCAReduceCuda
op2 = GpuCAReduceCuda
else:
return False
x, = node.inputs
x, = inputs
greduce = op(
node.op.scalar_op, axis=node.op.axis,
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
greduce = op2(
op.scalar_op, axis=op.axis,
dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(op, 'acc_dtype', None))
gvar = greduce(x)
# We need to have the make node called, otherwise the mask can
# be None
if (op is GpuCAReduceCPY or
if (op2 is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([
as_gpuarray_variable(x, context_name)])):
return greduce
......@@ -717,11 +1033,11 @@ def local_gpua_careduce(node, context_name):
# to make them a single dimension, do the reduction, and
# then reshape to get them back.
if node.op.axis is None:
if op.axis is None:
reduce_mask = [1] * x.type.ndim
else:
reduce_mask = [0] * x.type.ndim
for a in node.op.axis:
for a in op.axis:
assert reduce_mask[a] == 0
reduce_mask[a] = 1
......@@ -737,11 +1053,11 @@ def local_gpua_careduce(node, context_name):
for idx, m in enumerate(new_mask):
if m == 1:
new_axis.append(idx)
greduce = op(
node.op.scalar_op,
greduce = op2(
op.scalar_op,
axis=new_axis, reduce_mask=new_mask,
dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None))
dtype=getattr(op, 'dtype', None),
acc_dtype=getattr(op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = as_gpuarray_variable(reshaped_x, context_name)
......@@ -750,16 +1066,15 @@ def local_gpua_careduce(node, context_name):
# be None
reshaped_gpu_inputs = [gpu_reshaped_x]
if greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu(
greduce(gpu_reshaped_x))
reduce_reshaped_x = greduce(gpu_reshaped_x)
if reduce_reshaped_x.ndim != node.outputs[0].ndim:
if reduce_reshaped_x.ndim != outputs[0].ndim:
out_shp = []
for i in range(x.ndim):
if i not in node.op.axis:
if i not in op.axis:
out_shp.append(shape_i(x, i))
unreshaped_reduce = reduce_reshaped_x.reshape(
tensor.stack(out_shp))
unreshaped_reduce = GpuReshape(len(out_shp))(reduce_reshaped_x,
tensor.stack(out_shp))
else:
unreshaped_reduce = reduce_reshaped_x
return [unreshaped_reduce]
......@@ -767,8 +1082,9 @@ def local_gpua_careduce(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node, context_name):
if node.op.inplace:
@register_opt2([tensor.blas.Gemv], 'fast_compile')
def local_gpua_gemv(op, context_name, inputs, outputs):
if op.inplace:
return gpugemv_inplace
else:
return gpugemv_no_inplace
......@@ -776,8 +1092,9 @@ def local_gpua_gemv(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node, context_name):
if node.op.inplace:
@register_opt2([tensor.blas.Gemm], 'fast_compile')
def local_gpua_gemm(op, context_name, inputs, outputs):
if op.inplace:
return gpugemm_inplace
else:
return gpugemm_no_inplace
......@@ -785,27 +1102,29 @@ def local_gpua_gemm(node, context_name):
@register_opt('fast_compile')
@op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name):
a, b = node.inputs
@register_opt2([tensor.blas.BatchedDot], 'fast_compile')
def local_gpua_gemmbatch(op, context_name, inputs, outputs):
a, b = inputs
c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node, context_name):
@register_opt2([tensor.basic.Dot], 'fast_compile')
def local_gpua_hgemm(op, context_name, inputs, outputs):
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since "
"cuda 7.5 is not available. Updating could speed up "
"your code.")
return
A = node.inputs[0]
B = node.inputs[1]
A = inputs[0]
B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16', context_name=context_name)(
fgraph = outputs[0].fgraph
C = gpu_alloc_empty(context_name, dtype='float16')(
shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
......@@ -813,95 +1132,106 @@ def local_gpua_hgemm(node, context_name):
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs):
def local_gpua_gemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs):
def local_gpua_gemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)]
@register_opt()
@alpha_merge(GpuGemmBatch, alpha_in=1, beta_in=4)
def local_gpuagemmbatch_alpha_merge(node, *inputs):
def local_gpua_gemmbatch_alpha_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)]
@register_opt()
@output_merge(GpuGemmBatch, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemmbatch_output_merge(node, *inputs):
def local_gpua_gemmbatch_output_merge(node, *inputs):
return [gpugemmbatch_no_inplace(*inputs)]
@register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node, context_name):
return GpuGer(inplace=node.op.destructive)
@register_opt2([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer], 'fast_compile')
def local_gpua_ger(op, context_name, inputs, outputs):
return GpuGer(inplace=op.destructive)
@register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node, context_name):
@register_opt2([tensor.blas.Dot22], 'fast_compile')
def local_gpua_dot22(op, context_name, inputs, outputs):
return gpu_dot22
@register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22Scalar])
def local_gpua_dot22scalar(node, context_name):
x, y, a = node.inputs
@register_opt2([tensor.blas.Dot22Scalar], 'fast_compile')
def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs
x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name)
z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)]
@register_opt('fast_compile')
@op_lifter([tensor.basic.Eye])
def local_gpua_eye(node, context_name):
return GpuEye(dtype=node.op.dtype, context_name=context_name)
@register_opt2([tensor.basic.Eye], 'fast_compile')
def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
def local_gpua_crossentropysoftmaxargmax1hotwithbias(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_argmax_1hot_with_bias
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
@register_opt2([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], 'fast_compile')
def local_gpua_crossentropysoftmax1hotwithbiasdx(op, context_name, inputs, outputs):
return gpu_crossentropy_softmax_1hot_with_bias_dx
@register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node, context_name):
@register_opt2([tensor.nnet.Softmax], 'fast_compile')
def local_gpua_softmax(op, context_name, inputs, outputs):
return gpu_softmax
@register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node, context_name):
@register_opt2([tensor.nnet.SoftmaxWithBias], 'fast_compile')
def local_gpua_softmaxwithbias(op, context_name, inputs, outputs):
return gpu_softmax_with_bias
@register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert])
def local_assert(node, context_name):
# Check if input nodes are already on the GPU
if isinstance(node.inputs[0].type, GpuArrayType):
def local_gpua_assert(op, context_name, inputs, outputs):
if isinstance(inputs[0].type, GpuArrayType):
return
return [host_from_gpu(node.op(as_gpuarray_variable(node.inputs[0],
context_name),
*node.inputs[1:]))]
return local_gpua_assert_graph(op, context_name, inputs, outputs)
@register_opt2([theano.tensor.opt.Assert], 'fast_compile')
def local_gpua_assert_graph(op, context_name, inputs, outputs):
    """Re-apply the Assert op with its checked value moved to the GPU."""
    value = inputs[0]
    conditions = inputs[1:]
    gpu_value = as_gpuarray_variable(value, context_name)
    return [op(gpu_value, *conditions)]
@register_opt('fast_compile')
@op_lifter([ConvOp])
def local_error_convop(node, context_name):
@register_opt2([ConvOp], 'fast_compile')
def local_gpua_error_convop(op, context_name, inputs, outputs):
assert False, """
ConvOp does not work with the gpuarray backend.
......@@ -912,8 +1242,9 @@ theano.tensor.nnet.conv2d()
@register_opt('fast_compile')
@op_lifter([SparseBlockGemv])
def local_lift_sparseblockgemv(node, context_name):
if node.op.inplace:
@register_opt2([SparseBlockGemv], 'fast_compile')
def local_gpua_sparseblockgemv(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_gemv_inplace
else:
return gpu_sparse_block_gemv
......@@ -921,8 +1252,9 @@ def local_lift_sparseblockgemv(node, context_name):
@register_opt('fast_compile')
@op_lifter([SparseBlockOuter])
def local_lift_sparseblockouter(node, context_name):
if node.op.inplace:
@register_opt2([SparseBlockOuter], 'fast_compile')
def local_gpua_sparseblockouter(op, context_name, inputs, outputs):
if op.inplace:
return gpu_sparse_block_outer_inplace
else:
return gpu_sparse_block_outer
......@@ -943,20 +1275,27 @@ def local_inplace_sparseblockouter(node):
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@register_opt('fast_compile', 'conv_dnn', 'cudnn')
@op_lifter([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name):
if isinstance(node.outputs[0].type, GpuArrayType):
def local_gpua_abstractconv2d(op, context_name, inputs, outputs):
if isinstance(outputs[0].type, GpuArrayType):
# Don't handle this node here, it's already on the GPU.
return
inps = list(node.inputs)
inps[0] = as_gpuarray_variable(node.inputs[0],
return local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs)
@register_opt2([AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs], 'fast_compile', 'conv_dnn', 'cudnn')
def local_gpua_lift_abstractconv2d_graph(op, context_name, inputs, outputs):
inps = list(inputs)
inps[0] = as_gpuarray_variable(inputs[0],
context_name=context_name)
inps[1] = as_gpuarray_variable(node.inputs[1],
inps[1] = as_gpuarray_variable(inputs[1],
context_name=context_name)
return [node.op(*inps)]
return [op(*inps)]
# Register this here so that it goes after the abstract lifting
register_opt('fast_compile')(conv_groupopt)
......@@ -980,10 +1319,10 @@ def local_gpu_elemwise_careduce(node):
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op
inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op,
axis=op.axis,
reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)]
return [gpu_ca_reduce_cuda(scalar_op=op.scalar_op,
axis=op.axis,
reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)]
@local_optimizer(None)
......@@ -1063,35 +1402,36 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan])
def local_scan_to_gpua(node, context_name):
info = copy.deepcopy(node.op.info)
@register_opt2([scan_op.Scan], 'fast_compile')
def local_gpua_scan_to_gpua(op, context_name, inputs, outputs):
info = copy.deepcopy(op.info)
if info.get('gpua', False):
return
info['gpua'] = True
nw_ins = [node.inputs[0]]
nw_ins = [inputs[0]]
e = (1 +
node.op.n_seqs +
node.op.n_mit_mot +
node.op.n_mit_sot +
node.op.n_sit_sot +
node.op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]]
op.n_seqs +
op.n_mit_mot +
op.n_mit_sot +
op.n_sit_sot +
op.n_shared_outs)
nw_ins += [safe_to_gpu(x, context_name) for x in inputs[1:e]]
b = e
e = e + node.op.n_nit_sot
nw_ins += node.inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs]
e = e + op.n_nit_sot
nw_ins += inputs[b:e]
nw_ins += [safe_to_gpu(x, context_name) for x in inputs[e:]]
scan_ins = [tensor_to_gpu(x, context_name) for x in op.inputs]
# The inner output corresponding to the looping condition should not be
# moved to the gpu
if node.op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]]
if op.info['as_while']:
scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs[:-1]]
scan_outs += [op.outputs[-1]]
else:
scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs]
scan_outs = [safe_to_gpu(x, context_name) for x in op.outputs]
scan_outs = scan_utils.clone(
scan_outs,
replace=list(zip(node.op.inputs,
replace=list(zip(op.inputs,
(safe_to_cpu(x) for x in scan_ins))))
# We need to construct the hash here, because scan
......
......@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
......@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
return opt
......
......@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg)
mode = mode_with_gpu
if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b,
......
......@@ -396,7 +396,7 @@ def test_gpueye():
k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb],
out,
T.stack(out),
mode=mode_with_gpu)
result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
......
......@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32')
# Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a = tensor.alloc(z, i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0
assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a = tensor.alloc(z, i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0
......@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar()
# Test with vector
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with vector
# Should be moved
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with matrix
a = tensor.AllocEmpty('float32')(i, ii)
f = theano.function([i, ii], a, mode=mode_with_gpu)
f = theano.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4)
......@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo])
# Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
def test_local_gpu_elemwise():
......@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out = theano.tensor.tanh(ms).dot(ms.T)
mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise")
mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
old = theano.config.assert_no_cpu_op
old2 = theano.config.on_opt_error
......
......@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data
def filter_variable(self, other, allow_convert=True):
from theano.gpuarray import GpuFromHost
from theano.gpuarray.basic_ops import gpu_from_host
if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name)
......@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str(self.broadcastable)))
other = other2
return GpuFromHost(self.context_name)(other)
return gpu_from_host(self.context_name)(other)
@staticmethod
def values_eq(a, b, force_same_dtype=True):
......
......@@ -24,10 +24,11 @@ from . import multinomial
import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType,
......@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return final_samples
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
if (type(op) == mrg_uniform and
isinstance(inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(inputs[0],
op.output_type.ndim,
op.output_type.dtype,
inputs[1])
return [outs[0], host_from_gpua(outs[1])]
@register_gpua('fast_compile')
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
# TODO : need description for function
if (type(node.op) == mrg_uniform and
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
context_name = infer_context_name(*node.inputs)
return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
......@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return d
visited.add(out)
from theano.sandbox import cuda
from theano import gpuarray
from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType
if out == x:
if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy)
else:
assert isinstance(x.type, gpuarray.GpuArrayType)
d[out] = gpuarray.GpuFromHost(x.type.context_name)(x_copy)
assert isinstance(x.type, GpuArrayType)
d[out] = gpu_from_host(x.type.context_name)(x_copy)
return d
elif out.owner is None:
return d
......@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy)
return d
elif (gpuarray.pygpu_activated and
out.owner.op == gpuarray.host_from_gpu and
elif (pygpu_activated and
out.owner.op == host_from_gpu and
out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy)
return d
......
......@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v = v.owner.inputs[0]
continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant):
return numpy.asarray(
v.owner.inputs[0].data.shape[v.owner.op.i])
i = v.owner.op.i
inp = v.owner.inputs[0]
if isinstance(inp, Constant):
return numpy.asarray(inp.data.shape[i])
# The shape of a broadcastable dimension is 1
if (hasattr(inp.type, 'broadcastable') and
inp.type.broadcastable[i]):
return numpy.asarray(1)
# Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow.
......@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh = [as_tensor_variable(s) for s in shape]
bcast = []
for i, s in enumerate(sh):
if config.exception_verbosity == 'high':
s_as_str = '\n' + min_informative_str(s)
else:
s_as_str = str(s)
def err_str():
if config.exception_verbosity == 'high':
return '\n' + min_informative_str(s)
else:
return str(s)
if s.type.dtype[:3] not in ('int', 'uin'):
s_as_str = err_str()
raise TypeError('Shape arguments to Alloc must be integers, '
'but argument %s is not for apply node: %s' %
(i, s_as_str))
if s.ndim != 0:
s_as_str = err_str()
raise TypeError(
"Each shape dimension to Alloc must be a scalar, ",
'but dimension %s have %d dimensions for apply node: %s' %
......
......@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
"""
bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:]
if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i],
......@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad = border_mode
if pad < 0:
raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if pad == 0:
out_shp = (image_shape - dil_kernel_shape)
else:
out_shp = (image_shape + 2 * pad - dil_kernel_shape)
if subsample != 1:
out_shp = out_shp // subsample
out_shp = out_shp + 1
return out_shp
......
......@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
def test_elemwise(self):
# We test only for a few elemwise, the list of all supported
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论