提交 4814cd99 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3482 from abergeron/multi_gpu_new2

Multi-gpu support
...@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): ...@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
if (config.device.startswith('cuda') or if (config.device.startswith('cuda') or
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')): config.init_gpu_device.startswith('opencl') or
config.contexts != ''):
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr # Use config.numpy to call numpy.seterr
......
...@@ -111,6 +111,29 @@ AddConfigVar( ...@@ -111,6 +111,29 @@ AddConfigVar(
BoolParam(False, allow_override=False), BoolParam(False, allow_override=False),
in_c_key=False) in_c_key=False)
class ContextsParam(ConfigParam):
    """Config parameter holding the multi-gpu context map.

    The value is either the empty string (no extra contexts) or a
    semicolon-separated list of 'name->device' entries; the filter only
    validates the syntax and returns the string unchanged.
    """
    def __init__(self):
        def filter(val):
            # The empty default is always accepted as-is.
            if val == '':
                return val
            # Each ';'-separated entry must contain exactly one '->'.
            for entry in val.split(';'):
                if len(entry.split('->')) != 2:
                    raise ValueError("Malformed context map: %s" % (entry,))
            return val
        # Default '', validated by filter, not overridable per-call.
        ConfigParam.__init__(self, '', filter, False)
AddConfigVar(
'contexts',
"""
Context map for multi-gpu operation. Format is a
semicolon-separated list of names and device names in the
'name->dev_name' format. An example that would map name 'test' to
device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
"test->cuda0;test2->opencl0:0".
""", ContextsParam(), in_c_key=False)
AddConfigVar( AddConfigVar(
'print_active_device', 'print_active_device',
"Print active device at when the GPU device is initialized.", "Print active device at when the GPU device is initialized.",
......
#! /usr/bin/env python
"""
This file compare the runtime of two independent dot products on one
and two GPU to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import time
import numpy
import theano
from theano.sandbox.gpuarray import init_dev
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor as shared
from theano.sandbox.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
    """Benchmark two independent (1024, 1024) dot products on one context
    versus spread over two contexts, printing the wall-clock time of each.

    dev1, dev2 : pygpu device names (e.g. 'cuda0', 'cuda1'), mapped to the
    context names 'ctx1' and 'ctx2' respectively.
    """
    init_dev(dev1, 'ctx1')
    init_dev(dev2, 'ctx2')

    def rand_shared(ctx):
        # Fresh random 1024x1024 float32 matrix allocated on context `ctx`.
        return shared(numpy.random.randn(1024, 1024).astype('float32'),
                      context_name=ctx)

    a1, b1, c1, d1 = [rand_shared('ctx1') for _ in range(4)]
    a2, b2 = [rand_shared('ctx2') for _ in range(2)]

    # f1 keeps both products on ctx1; f2 spreads them across both contexts.
    f1 = theano.function([], [gpu_dot22(a1, b1), gpu_dot22(c1, d1)])
    f2 = theano.function([], [gpu_dot22(a1, b1), gpu_dot22(a2, b2)])

    def bench(fn, label):
        # Warm-up call so compilation/allocation costs are not timed.
        res = fn()
        res[0].sync(), res[1].sync()
        res = None
        start = time.time()
        res = fn()
        # sync() makes sure the asynchronous GPU work is finished
        # before we stop the clock.
        res[0].sync(), res[1].sync()
        stop = time.time()
        res = None
        print(label % (stop - start,))

    bench(f1, "one ctx %f")
    bench(f2, "two ctx %f")
if __name__ == '__main__':
    import sys
    # Expect exactly two positional arguments: the two device names
    # (e.g. 'cuda0' 'cuda1').
    if len(sys.argv) != 3:
        # Fixed grammar of the user-facing error message
        # ("require" -> "requires").
        raise ValueError("This script requires two device names.")
    main(sys.argv[1], sys.argv[2])
...@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp): ...@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, tensor.TensorType): return self(ev)
return [gpu_from_host(ev)]
else:
return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
...@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp): ...@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, CudaNdarrayType): self(ev)
return [host_from_gpu(ev)]
else:
return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
......
...@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node): ...@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
return False return False
def typeInfer(node):
    # Adapter for ScanInplaceOptimizer's new `typeInfer` callback: it is
    # called with a node but this implementation ignores it and always
    # returns the module-level `typeConstructor` (defined elsewhere in
    # this file — not visible in this chunk).
    return typeConstructor
optdb.register('gpu_scanOp_make_inplace', optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor, scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
gpu_flag=True), gpu_flag=True),
75, 75,
'gpu', 'gpu',
......
...@@ -21,26 +21,30 @@ except ImportError: ...@@ -21,26 +21,30 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context)
from . import opt, nerv from . import opt, nerv
def init_dev(dev): def init_dev(dev, name=None):
if pygpu.gpuarray.api_version() != (-10000, 0): if pygpu.gpuarray.api_version() != (-10000, 0):
raise RuntimeError("Wrong API version for gpuarray:", raise RuntimeError("Wrong API version for gpuarray:",
pygpu.gpuarray.api_version(), pygpu.gpuarray.api_version(),
"Make sure Theano and libgpuarray/pygpu " "Make sure Theano and libgpuarray/pygpu "
"are in sync.") "are in sync.")
global pygpu_activated global pygpu_activated
context = pygpu.init(dev) if dev not in init_dev.devmap:
pygpu.set_default_context(context) init_dev.devmap[dev] = pygpu.init(dev)
context = init_dev.devmap[dev]
# This will map the context name to the real context object.
reg_context(name, context)
pygpu_activated = True pygpu_activated = True
if config.print_active_device: if config.print_active_device:
print("Using device %s: %s" % (dev, context.devname), file=sys.stderr) print("Mapped name %s to device %s: %s" % (name, dev, context.devname),
# remember the active device file=sys.stderr)
init_dev.device = dev
init_dev.device = None # This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
if pygpu: if pygpu:
try: try:
...@@ -52,11 +56,21 @@ if pygpu: ...@@ -52,11 +56,21 @@ if pygpu:
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif (config.init_gpu_device.startswith('cuda') or elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')): config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu':
raise ValueError('you must set device=cpu to use init_gpu_device.')
if config.contexts != '':
print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
init_dev(config.init_gpu_device) init_dev(config.init_gpu_device)
if config.contexts != '':
for n, d in (c.split('->') for c in config.contexts.split(';')):
init_dev(d.strip(), n.strip())
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost, from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu) GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
from .basic_ops import host_from_gpu, gpu_from_host from .basic_ops import host_from_gpu, GpuFromHost
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
from .subtensor import (GpuSubtensor, GpuIncSubtensor, from .subtensor import (GpuSubtensor, GpuIncSubtensor,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
...@@ -67,5 +81,6 @@ else: ...@@ -67,5 +81,6 @@ else:
if (config.init_gpu_device.startswith('cuda') or if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.device.startswith('cuda')): config.device.startswith('cuda') or
config.contexts != ''):
error("pygpu was configured but could not be imported", exc_info=True) error("pygpu was configured but could not be imported", exc_info=True)
...@@ -9,7 +9,9 @@ from theano.tensor.basic import Alloc, Join, Split ...@@ -9,7 +9,9 @@ from theano.tensor.basic import Alloc, Join, Split
from theano.gof import HideC from theano.gof import HideC
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.compat import PY3
from collections import deque
from six import string_types from six import string_types
from six.moves import xrange from six.moves import xrange
...@@ -19,27 +21,83 @@ try: ...@@ -19,27 +21,83 @@ try:
except ImportError: except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType, GpuArrayConstant, gpu_context_type, get_context
from .fp16_help import write_w from .fp16_help import write_w
def as_gpuarray_variable(x): def as_gpuarray_variable(x, context_name):
if getattr(x, 'owner', None): # If this is already some form of variable, try to avoid an extra transfer
if isinstance(x.owner.op, HostFromGpu): if isinstance(x, Variable):
return x.owner.inputs[0] while True:
elif (isinstance(x.owner.op, GpuFromHost) and # If we are already a GpuArrayVariable in the right context
x.owner.inputs[0].owner and # then there is nothing to do.
isinstance(x.owner.inputs[0].owner.op, HostFromGpu)): if (isinstance(x.type, GpuArrayType) and
return x.owner.inputs[0].owner.inputs[0] x.type.context_name == context_name):
return x
# If x is the result of a transfer, try to dig through.
if getattr(x, 'owner', None):
if isinstance(x.owner.op, HostFromGpu):
x = x.owner.inputs[0]
continue
if isinstance(x.owner.op, GpuFromHost):
x = x.owner.inputs[0]
continue
if isinstance(x.owner.op, GpuToGpu):
x = x.owner.inputs[0]
continue
# If none of the conditions where met, then continue with
# the rest of the body
break
# If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x)
# Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable() return x._as_GpuArrayVariable(context_name)
# TODO we need to have the cuda -> gpu path taken care of.
tensor_x = tensor.as_tensor_variable(x) # If it didn't work try for a constant
return gpu_from_host(tensor_x) ctx = get_context(context_name)
if isinstance(x, gpuarray.GpuArray):
if x.context.ptr != ctx.ptr:
x = x.transfer(ctx)
def as_gpuarray(x): x = gpuarray.asarray(x, context=ctx)
return gpuarray.array(x, copy=False)
bcast = [(s == 1) for s in x.shape]
return GpuArrayConstant(GpuArrayType(dtype=x.dtype,
broadcastable=bcast,
context_name=context_name),
x)
def infer_context_name(*variables):
    """Infer the context name to use from the inputs given.

    Breadth-first search over the given variables (in order) and, for
    single-input ops, their inputs. The first context found wins; if no
    context can be determined, return None (the default context).
    """
    # FIFO queue: the given variables are examined in order, and any
    # discovered producer inputs are examined after them.
    queue = deque(variables)
    while queue:
        v = queue.popleft()
        # A GPU variable directly carries its context name.
        if isinstance(v.type, GpuArrayType):
            return v.type.context_name
        # A tag annotation can also force a context.
        if hasattr(v.tag, 'context_name'):
            return v.tag.context_name
        if v.owner:
            # The CPU result of a GPU->host transfer inherits the
            # context of the transferred GPU variable.
            if isinstance(v.owner.op, HostFromGpu):
                return v.owner.inputs[0].type.context_name
            # Only dig through single-input ops to avoid ambiguity.
            if len(v.owner.inputs) == 1:
                queue.extend(v.owner.inputs)
    # No context found: None selects the default context.
    return None
class Kernel(object): class Kernel(object):
...@@ -111,10 +169,12 @@ class Kernel(object): ...@@ -111,10 +169,12 @@ class Kernel(object):
class GpuKernelBase(object): class GpuKernelBase(object):
context_type = gpu_context_type
def gpu_kernels(self, node, name): def gpu_kernels(self, node, name):
""" """
This is the method to override. This should return an iterable of Kernel This is the method to override. This should return an iterable
objects that describe the kernels this op will need. of Kernel objects that describe the kernels this op will need.
""" """
raise MethodNotDefined('gpu_kernels') raise MethodNotDefined('gpu_kernels')
...@@ -126,8 +186,9 @@ class GpuKernelBase(object): ...@@ -126,8 +186,9 @@ class GpuKernelBase(object):
o = [] o = []
return o + ['gpuarray/types.h'] return o + ['gpuarray/types.h']
def _generate_kernel_bin(self, k): def _generate_kernel_bin(self, k, ctx):
gk = gpuarray.GpuKernel(k.code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(k.code, k.name, k.params, context=ctx,
**k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
return ("""static const char %(bname)s[] = { %(bcode)s };""" % return ("""static const char %(bname)s[] = { %(bcode)s };""" %
...@@ -140,7 +201,7 @@ class GpuKernelBase(object): ...@@ -140,7 +201,7 @@ class GpuKernelBase(object):
dict(cname=k.codevar, code=code)) dict(cname=k.codevar, code=code))
def _generate_kernel_vars(self, k): def _generate_kernel_vars(self, k):
return """static GpuKernel %(kname)s;""" % dict(kname=k.objvar) return """GpuKernel %(kname)s;""" % dict(kname=k.objvar)
def c_support_code(self): def c_support_code(self):
return """ return """
...@@ -153,46 +214,62 @@ class GpuKernelBase(object): ...@@ -153,46 +214,62 @@ class GpuKernelBase(object):
def c_support_code_apply(self, node, name): def c_support_code_apply(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
bins = '\n'.join(self._generate_kernel_bin(k) for k in kernels) ctx = self.get_context(node)
bins = '\n'.join(self._generate_kernel_bin(k, ctx) for k in kernels)
codes = '\n'.join(self._generate_kernel_code(k) for k in kernels) codes = '\n'.join(self._generate_kernel_code(k) for k in kernels)
vars = '\n'.join(self._generate_kernel_vars(k) for k in kernels) return '\n'.join([bins, codes])
return '\n'.join([bins, codes, vars])
def _generate_kernel_init(self, k, err): def c_support_code_struct(self, node, name):
if PY3: kernels = self.gpu_kernels(node, name)
error_out = "NULL" return '\n'.join(self._generate_kernel_vars(k) for k in kernels)
else:
error_out = "" def _generate_zeros(self, k):
return """memset(&%(v)s, 0, sizeof(%(v)s));""" % dict(v=k.objvar)
def _generate_kernel_init(self, k, fail, ctx):
return """{ return """{
int err;
int types[%(numargs)u] = {%(types)s}; int types[%(numargs)u] = {%(types)s};
const char *bcode = %(bvar)s; const char *bcode = %(bvar)s;
size_t sz = sizeof(%(bvar)s); size_t sz = sizeof(%(bvar)s);
PyGpuContextObject *c = pygpu_default_context(); if (GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1, &bcode, &sz,
if (GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &bcode, &sz, "%(kname)s", "%(kname)s", %(numargs)u, types, GA_USE_BINARY, NULL)
%(numargs)u, types, GA_USE_BINARY, NULL) != GA_NO_ERROR) { != GA_NO_ERROR) {
if ((%(err)s = GpuKernel_init(&%(ovar)s, c->ops, c->ctx, 1, &%(cname)s, if ((err = GpuKernel_init(&%(ovar)s, %(ctx)s->ops, %(ctx)s->ctx, 1,
NULL, "%(kname)s", %(numargs)u, types, &%(cname)s, NULL, "%(kname)s", %(numargs)u,
%(flags)s, NULL)) != GA_NO_ERROR) { types, %(flags)s, NULL)) != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s", PyErr_Format(PyExc_RuntimeError, "GpuKernel_init error %%d: %%s",
%(err)s, Gpu_error(c->ops, c->ctx, %(err)s)); err, Gpu_error(%(ctx)s->ops, %(ctx)s->ctx, err));
return %(error_out)s; %(fail)s
} }
} }
}""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar, }""" % dict(numargs=len(k.params), types=k._get_c_types(), bvar=k.binvar,
ovar=k.objvar, kname=k.name, err=err, cname=k.codevar, ovar=k.objvar, kname=k.name, cname=k.codevar,
flags=k._get_c_flags(), error_out=error_out) flags=k._get_c_flags(), fail=fail, ctx=ctx)
def c_init_code_struct(self, node, name, sub):
ctx = sub['context']
kernels = self.gpu_kernels(node, name)
inits_0 = '\n'.join(self._generate_zeros(k) for k in kernels)
inits = '\n'.join(self._generate_kernel_init(k, sub['fail'], ctx)
for k in kernels)
return '\n'.join([inits_0, inits])
def c_init_code_apply(self, node, name): def _generate_kernel_cleanup(self, k):
err = 'err_' + name return "GpuKernel_clear(&%(ovar)s);" % dict(ovar=k.objvar)
def c_cleanup_code_struct(self, node, name):
kernels = self.gpu_kernels(node, name) kernels = self.gpu_kernels(node, name)
inits = '\n'.join(self._generate_kernel_init(k, err) for k in kernels) cleanups = '\n'.join(self._generate_kernel_cleanup(k) for k in kernels)
return ("int %(err)s;\n" % dict(err=err)) + inits return cleanups
def _GpuKernelBase_version(self): # This is a shorthand for if your op only has a fixed version
ctx = gpuarray.get_default_context() # You can reimplement it, but make sure to call kernel_version()
return (2, ctx.kind, ctx.devname) def c_code_cache_version_apply(self, node):
return (self.c_code_cache_version(), self.kernel_version(node))
GpuKernelBase_version = property(_GpuKernelBase_version) def kernel_version(self, node):
return (3, node.get_context().bin_id)
class HostFromGpu(Op): class HostFromGpu(Op):
...@@ -259,50 +336,52 @@ class HostFromGpu(Op): ...@@ -259,50 +336,52 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [gpu_from_host(gz)] return [GpuFromHost(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, tensor.TensorType): return self(ev)
return [gpu_from_host(ev)]
else:
return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(Op):
__props__ = () __props__ = ('context_name',)
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, context_name):
self.context_name = context_name
def __str__(self): def __str__(self):
return 'GpuFromHost(gpuarray)' return 'GpuFromHost<%s>' % (self.context_name,)
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable, return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
context_name=self.context_name,
dtype=x.dtype)()]) dtype=x.dtype)()])
def perform(self, node, inp, out): def get_context(self, node):
return get_context(self.context_name)
def perform(self, node, inp, out, ctx):
x, = inp x, = inp
z, = out z, = out
z[0] = gpuarray.array(x) z[0] = gpuarray.array(x, context=ctx)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [host_from_gpu(as_gpuarray_variable(gz))] return [host_from_gpu(as_gpuarray_variable(
gz, context_name=self.context_name))]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, GpuArrayType): return self(ev)
return [host_from_gpu(ev)]
else:
return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
...@@ -319,19 +398,67 @@ class GpuFromHost(Op): ...@@ -319,19 +398,67 @@ class GpuFromHost(Op):
PyArray_NDIM(%(name)s_tmp), PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp), (size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp), (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
pygpu_default_context(), %(ctx)s,
Py_None); Py_None);
Py_DECREF(%(name)s_tmp); Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL) if (%(out)s == NULL) {
%(fail)s %(fail)s
""" % {'name': name, 'inp': inputs[0], }
""" % {'name': name, 'inp': inputs[0], 'ctx': sub['context'],
'out': outputs[0], 'fail': sub['fail']} 'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (7,)
gpu_from_host = GpuFromHost() class GpuToGpu(Op):
__props__ = ('context_name',)
_f16_ok = True
context_type = gpu_context_type
    def __init__(self, context_name):
        # Name of the destination context for the transfer (presumably a
        # name registered with reg_context() — confirm against init_dev).
        self.context_name = context_name
    def __str__(self):
        # Include the destination context in the name, e.g. "GpuToGpu<ctx2>".
        return 'GpuToGpu<%s>' % (self.context_name,)
    def make_node(self, x):
        """Build the Apply node transferring GPU variable `x` to this
        op's context; dtype and broadcast pattern are preserved."""
        # Only GPU variables are accepted; host tensors go through
        # GpuFromHost instead.
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
                                              context_name=self.context_name,
                                              dtype=x.dtype)()])
    def get_context(self, node):
        # Resolve the stored context name to the real context object.
        return get_context(self.context_name)
    def perform(self, node, inp, out, ctx):
        """Python fallback: copy the input array onto context `ctx`."""
        x, = inp
        z, = out
        # pygpu handles the device-to-device transfer.
        z[0] = x.transfer(ctx)
    def grad(self, inputs, grads):
        gz, = grads
        # The gradient is transferred back to the input's own context.
        return [GpuToGpu(inputs[0].type.context_name)(gz)]
    def R_op(self, inputs, eval_points):
        # The R-operator of a transfer is the transfer itself.
        # NOTE(review): this returns a bare Variable; R_op implementations
        # conventionally return a list of outputs — confirm callers accept
        # the non-list form.
        return self(eval_points[0])
    def infer_shape(self, node, xshp):
        # A transfer never changes the shape.
        return xshp
    def c_code(self, node, name, inputs, outputs, sub):
        # C implementation: pygpu_transfer() copies the input GpuArray onto
        # the context provided by the struct (sub['context']).
        return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_transfer(%(inp)s, %(ctx)s, 0);
        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % {'inp': inputs[0], 'ctx': sub['context'],
               'out': outputs[0], 'fail': sub['fail']}
    def c_code_cache_version(self):
        # Bump this whenever the generated C code above changes.
        return (0,)
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
...@@ -339,28 +466,35 @@ class GpuAlloc(HideC, Alloc): ...@@ -339,28 +466,35 @@ class GpuAlloc(HideC, Alloc):
Parameters Parameters
---------- ----------
memset_0 context_name : str
The name of the context in which to allocate memory
memset_0 : bool
It's only an optimized version. True, it means the It's only an optimized version. True, it means the
value is always 0, so the c code call memset as it is faster. value is always 0, so the c code call memset as it is faster.
""" """
__props__ = ('memset_0',) __props__ = ('memset_0', 'context_name')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, memset_0=False): def __init__(self, context_name, memset_0=False):
self.context_name = context_name
self.memset_0 = memset_0 self.memset_0 = memset_0
def get_context(self, node):
return get_context(self.context_name)
def __str__(self): def __str__(self):
# Hide the memset parameter when not used to prevent confusion. # Hide the memset parameter when not used to prevent confusion.
if self.memset_0: if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0) m = "{memset_0=True}"
else: else:
s = self.__class__.__name__ m = ""
return s return "%s<%s>%s" % (self.__class__.__name__, self.context_name, m)
def make_node(self, value, *shape): def make_node(self, value, *shape):
value = as_gpuarray_variable(value) value = as_gpuarray_variable(value, context_name=self.context_name)
sh, bcast = self.validate_shape(shape) sh, bcast = self.validate_shape(shape)
if value.ndim > len(sh): if value.ndim > len(sh):
TypeError("The GpuAlloc value to use has more dimensions " TypeError("The GpuAlloc value to use has more dimensions "
...@@ -371,15 +505,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -371,15 +505,15 @@ class GpuAlloc(HideC, Alloc):
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>'] return ['<numpy_compat.h>']
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs, ctx):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
sh = tuple(map(int, inputs[1:])) sh = tuple(map(int, inputs[1:]))
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
if self.memset_0: if self.memset_0:
out[0] = gpuarray.zeros(sh, dtype=v.dtype) out[0] = gpuarray.zeros(sh, dtype=v.dtype, context=ctx)
else: else:
out[0] = gpuarray.empty(sh, dtype=v.dtype) out[0] = gpuarray.empty(sh, dtype=v.dtype, context=ctx)
out[0][...] = v out[0][...] = v
else: else:
out[0][...] = v out[0][...] = v
...@@ -414,7 +548,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -414,7 +548,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
...@@ -423,7 +557,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -423,7 +557,7 @@ class GpuAlloc(HideC, Alloc):
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
...@@ -434,9 +568,9 @@ class GpuAlloc(HideC, Alloc): ...@@ -434,9 +568,9 @@ class GpuAlloc(HideC, Alloc):
if (err != GA_NO_ERROR) if (err != GA_NO_ERROR)
{ {
PyErr_Format(PyExc_MemoryError, PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d" "GpuAlloc: Error memsetting %%llu"
" element of device memory to 0.", " element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s)); (unsigned long long)PyGpuArray_SIZE(%(zz)s));
%(fail)s; %(fail)s;
} }
} }
...@@ -446,7 +580,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -446,7 +580,7 @@ class GpuAlloc(HideC, Alloc):
%(fail)s %(fail)s
} }
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['context'],
fail=sub['fail'], memset_0=memset_0) fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync: if config.gpuarray.sync:
...@@ -455,7 +589,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -455,7 +589,7 @@ class GpuAlloc(HideC, Alloc):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas from . import subtensor, blas
...@@ -488,29 +622,32 @@ class GpuAlloc(HideC, Alloc): ...@@ -488,29 +622,32 @@ class GpuAlloc(HideC, Alloc):
return True return True
gpu_alloc = GpuAlloc()
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
__props__ = ('dtype',) __props__ = ('dtype', 'context_name')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def __init__(self, dtype): def __init__(self, dtype, context_name):
self.dtype = dtype self.dtype = dtype
self.context_name = context_name
def get_context(self, node):
return get_context(self.context_name)
def make_node(self, *shape): def make_node(self, *shape):
sh, bcast = self.validate_shape(shape) sh, bcast = self.validate_shape(shape)
output = GpuArrayType(dtype=self.dtype, broadcastable=bcast)() output = GpuArrayType(dtype=self.dtype, broadcastable=bcast,
context_name=self.context_name)()
output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true output.tag.values_eq_approx = tensor.type.values_eq_approx_always_true
# The outut can contain nan/inf. # The outut can contain nan/inf.
output.type.filter_checks_isfinite = False output.type.filter_checks_isfinite = False
return Apply(self, sh, [output]) return Apply(self, sh, [output])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_, ctx):
out = out_[0] out = out_[0]
sh = [int(i) for i in inputs] sh = [int(i) for i in inputs]
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
out[0] = pygpu.empty(sh, dtype=self.dtype) out[0] = pygpu.empty(sh, dtype=self.dtype, context=ctx)
# if out[0] is the right shape, we just return it # if out[0] is the right shape, we just return it
def c_headers(self): def c_headers(self):
...@@ -536,16 +673,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0]; ...@@ -536,16 +673,16 @@ shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
code.append(""" code.append("""
if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER, if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
pygpu_default_context())) { %(ctx)s)) {
%(fail)s %(fail)s
} }
""" % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype), """ % dict(zz=zz, ndim=ndim, type=gpuarray.dtype_to_typecode(self.dtype),
fail=fail)) fail=fail, ctx=sub['context']))
return ''.join(code) return ''.join(code)
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
return False return False
...@@ -559,7 +696,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER, ...@@ -559,7 +696,7 @@ if (theano_prep_output(&%(zz)s, %(ndim)s, shape, %(type)s, GA_C_ORDER,
def empty_like(var): def empty_like(var):
return GpuAllocEmpty(var.type.dtype)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
class GpuContiguous(Op): class GpuContiguous(Op):
...@@ -568,7 +705,6 @@ class GpuContiguous(Op): ...@@ -568,7 +705,6 @@ class GpuContiguous(Op):
not already c contiguous. not already c contiguous.
""" """
__props__ = () __props__ = ()
view_map = {0: [0]} view_map = {0: [0]}
_f16_ok = True _f16_ok = True
...@@ -576,12 +712,13 @@ class GpuContiguous(Op): ...@@ -576,12 +712,13 @@ class GpuContiguous(Op):
def grad(self, inputs, dout): def grad(self, inputs, dout):
x, = inputs x, = inputs
dout, = dout dout, = dout
dout = as_gpuarray_variable(dout) dout = as_gpuarray_variable(dout, context_name=infer_context_name(x))
return [dout] return [dout]
def make_node(self, input): def make_node(self, input):
input = as_gpuarray_variable(input) input = as_gpuarray_variable(input,
context_name=infer_context_name(input))
return Apply(self, [input], [input.type()]) return Apply(self, [input], [input.type()])
def c_headers(self): def c_headers(self):
...@@ -633,10 +770,12 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -633,10 +770,12 @@ class GpuReshape(HideC, tensor.Reshape):
# __hash__, __eq__, __str__ come from tensor.Reshape # __hash__, __eq__, __str__ come from tensor.Reshape
def make_node(self, x, shp): def make_node(self, x, shp):
x = as_gpuarray_variable(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable) broadcastable=res.broadcastable,
context_name=ctx_name)
return Apply(self, [x, shp], [otype()]) return Apply(self, [x, shp], [otype()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
...@@ -744,22 +883,30 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -744,22 +883,30 @@ class GpuReshape(HideC, tensor.Reshape):
class GpuJoin(HideC, Join): class GpuJoin(HideC, Join):
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
def make_node(self, axis, *tensors): def make_node(self, axis, *tensors):
node = Join.make_node(self, axis, *tensors) node = Join.make_node(self, axis, *tensors)
return Apply(self, [node.inputs[0]] + list(map(as_gpuarray_variable, ctx_name = infer_context_name(*tensors)
tensors)),
def agv(v):
return as_gpuarray_variable(v, context_name=ctx_name)
return Apply(self, [node.inputs[0]] + list(map(agv, tensors)),
[GpuArrayType(broadcastable=node.outputs[0].broadcastable, [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
dtype=node.outputs[0].dtype)()]) dtype=node.outputs[0].dtype,
context_name=ctx_name)()])
def get_context(self, node):
return node.outputs[0].type.context
def perform(self, node, axis_and_tensors, out_): def perform(self, node, axis_and_tensors, out_, ctx):
out, = out_ out, = out_
axis = int(axis_and_tensors[0]) axis = int(axis_and_tensors[0])
tensors = axis_and_tensors[1:] tensors = axis_and_tensors[1:]
out[0] = pygpu.concatenate(tensors, axis=axis).astype( out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
node.outputs[0].dtype) node.outputs[0].dtype)
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -793,15 +940,14 @@ if (axis < 0) { ...@@ -793,15 +940,14 @@ if (axis < 0) {
} }
%(out)s = pygpu_concatenate(als, %(n)s, axis, %(out)s = pygpu_concatenate(als, %(n)s, axis,
%(restype)s, (PyObject *)&PyGpuArrayType, %(restype)s, (PyObject *)&PyGpuArrayType,
pygpu_default_context()); %(ctx)s);
} }
PyMem_Free(als); PyMem_Free(als);
if (%(out)s == NULL) if (%(out)s == NULL)
%(fail)s %(fail)s
""" % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0], """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list), axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
restype=restype) restype=restype, ctx=sub['context'])
gpu_join = GpuJoin() gpu_join = GpuJoin()
...@@ -809,21 +955,26 @@ gpu_join = GpuJoin() ...@@ -809,21 +955,26 @@ gpu_join = GpuJoin()
class GpuSplit(HideC, Split): class GpuSplit(HideC, Split):
def make_node(self, x, axis, splits): def make_node(self, x, axis, splits):
node = Split.make_node(self, x, axis, splits) node = Split.make_node(self, x, axis, splits)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable)() outs = [GpuArrayType(dtype=o.dtype, broadcastable=o.broadcastable,
context_name=x.type.context_name)()
for o in node.outputs] for o in node.outputs]
return Apply(self, [x] + node.inputs[1:], outs) return Apply(self, [x] + node.inputs[1:], outs)
# we reuse the perform of the CPU op, which is suitable # we reuse the perform of the CPU op, which is suitable
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
__props__ = ('dtype',) __props__ = ('dtype', 'context_name')
_f16_ok = True _f16_ok = True
def __init__(self, dtype=None): def __init__(self, dtype=None, context_name=None):
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
self.dtype = dtype self.dtype = dtype
self.context_name = context_name
def get_context(self, node):
return get_context(self.context_name)
def make_node(self, n, m, k): def make_node(self, n, m, k):
n = tensor.as_tensor_variable(n) n = tensor.as_tensor_variable(n)
...@@ -833,7 +984,8 @@ class GpuEye(GpuKernelBase, Op): ...@@ -833,7 +984,8 @@ class GpuEye(GpuKernelBase, Op):
assert m.ndim == 0 assert m.ndim == 0
assert k.ndim == 0 assert k.ndim == 0
otype = GpuArrayType(dtype=self.dtype, otype = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False)) broadcastable=(False, False),
context_name=self.context_name)
# k != 0 isn't implemented on the GPU yet. # k != 0 isn't implemented on the GPU yet.
assert tensor.get_scalar_constant_value(k) == 0 assert tensor.get_scalar_constant_value(k) == 0
...@@ -866,6 +1018,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -866,6 +1018,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
n, m = inp n, m = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync) sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
...@@ -882,7 +1035,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -882,7 +1035,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
%(z)s = pygpu_zeros(2, dims, %(z)s = pygpu_zeros(2, dims,
%(typecode)s, %(typecode)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (%(z)s == NULL) { if (%(z)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -908,4 +1061,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -908,4 +1061,4 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (4, self.GpuKernelBase_version) return (5,)
import os.path import os.path
from theano import Apply, config from theano import Apply, config, Op
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import LocalOptGroup
from theano.tensor.basic import as_tensor_variable from theano.tensor.basic import as_tensor_variable
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt_util import inplace_allocempty
try: try:
import pygpu import pygpu
...@@ -18,7 +19,7 @@ except ImportError as e: ...@@ -18,7 +19,7 @@ except ImportError as e:
pass pass
class BlasOp(HideC): class BlasOp(Op):
def c_headers(self): def c_headers(self):
return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>'] return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']
...@@ -28,34 +29,27 @@ class BlasOp(HideC): ...@@ -28,34 +29,27 @@ class BlasOp(HideC):
def c_init_code(self): def c_init_code(self):
return ['import_pygpu__blas();'] return ['import_pygpu__blas();']
def c_support_code(self):
return """ class GpuGemv(BlasOp):
PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out, __props__ = ('inplace',)
PyGpuArrayObject *y) {
if (out && def __init__(self, inplace=False):
GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) && self.inplace = inplace
theano_size_check(out, PyGpuArray_NDIM(y), if self.inplace:
PyGpuArray_DIMS(y), self.destroy_map = {0: [0]}
y->ga.typecode)) {
if (pygpu_move(out, y)) {
Py_XDECREF(out);
return NULL;
}
} else {
Py_XDECREF(out);
out = pygpu_copy(y, GA_ANY_ORDER);
}
return out;
}
"""
class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta): def make_node(self, y, alpha, A, x, beta):
Gemv.make_node(self, y, alpha, A, x, beta) ctx_name = infer_context_name(y, A, x)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype assert A.dtype == x.dtype == y.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
...@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
if self.inplace: if self.inplace:
code = """ code = """
if (%(y)s->ga.strides[0] <= 0) { if (%(y)s->ga.strides[0] <= 0) {
%(out)s = gpublas_try_copy(%(out)s, %(y)s); %(out)s = theano_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
""" % vars """ % vars
else: else:
code = """ code = """
%(out)s = gpublas_try_copy(%(out)s, %(y)s); %(out)s = theano_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv): ...@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm): class GpuGemm(BlasOp):
__props__ = ('inplace',)
_f16_ok = True _f16_ok = True
def __init__(self, inplace=False):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha) alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta) beta = as_tensor_variable(beta)
A = as_gpuarray_variable(A) assert alpha.ndim == 0
B = as_gpuarray_variable(B) assert beta.ndim == 0
C = as_gpuarray_variable(C) assert A.ndim == 2
assert B.ndim == 2
assert C.ndim == 2
assert A.dtype == B.dtype == C.dtype assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
...@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
if self.inplace: if self.inplace:
code = """ code = """
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) { if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = gpublas_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
""" % vars """ % vars
else: else:
code = """ code = """
%(out)s = gpublas_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm): ...@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger): class GpuGer(BlasOp):
__props__ = ('inplace',)
def __init__(self, inplace=False):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def make_node(self, A, alpha, x, y): def make_node(self, A, alpha, x, y):
Ger.make_node(self, A, alpha, x, y) ctx_name = infer_context_name(A, x, y)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype assert A.dtype == x.dtype == y.dtype
return Apply(self, [A, alpha, x, y], [A.type()]) return Apply(self, [A, alpha, x, y], [A.type()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
A, alpha, x, y = inp A, alpha, x, y = inp
inplace = self.destructive inplace = self.inplace
if inplace and not A.flags.forc: if inplace and not A.flags.forc:
inplace = False inplace = False
out[0][0] = blas.ger(alpha, x, y, A, out[0][0] = blas.ger(alpha, x, y, A,
...@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger): ...@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3], vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
fail=sub['fail'], name=name) fail=sub['fail'], name=name)
if self.destructive: if self.inplace:
code = """ code = """
if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) { if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
%(out)s = gpublas_try_copy(%(out)s, %(A)s); %(out)s = theano_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger): ...@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
""" % vars """ % vars
else: else:
code = """ code = """
%(out)s = gpublas_try_copy(%(out)s, %(A)s); %(out)s = theano_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
...@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger): ...@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
gpuger_no_inplace = GpuGer(inplace=False)
gpuger_inplace = GpuGer(inplace=True)
gpuger_no_inplace = GpuGer(destructive=False)
gpuger_inplace = GpuGer(destructive=True)
class GpuDot22(BlasOp):
__props__ = ()
class GpuDot22(BlasOp, Dot22):
def make_node(self, x, y): def make_node(self, x, y):
Dot22.make_node(self, x, y) ctx_name = infer_context_name(x, y)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y, ctx_name)
assert x.ndim == 2
assert y.ndim == 2
assert x.dtype == y.dtype assert x.dtype == y.dtype
return Apply(self, [x, y], [x.type()]) return Apply(self, [x, y], [x.type()])
...@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22): ...@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
dims[1] = PyGpuArray_DIMS(%(B)s)[1]; dims[1] = PyGpuArray_DIMS(%(B)s)[1];
if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER, if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
pygpu_default_context())) { %(A)s->context)) {
%(fail)s %(fail)s
} }
...@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22): ...@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
@local_optimizer([gpugemv_no_inplace], inplace=True) @inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node): def local_inplace_gpuagemv(node, inputs):
if node.op == gpugemv_no_inplace: return [gpugemv_inplace(*inputs)]
return [gpugemv_inplace(*node.inputs)]
@local_optimizer([gpugemm_no_inplace], inplace=True) @inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node): def local_inplace_gpuagemm(node, inputs):
if node.op == gpugemm_no_inplace: return [gpugemm_inplace(*inputs)]
inputs = list(node.inputs)
C = inputs[0]
if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
len(C.clients) > 1):
inputs[0] = C.owner.op(*C.owner.inputs)
return [gpugemm_inplace(*inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True) @inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node): def local_inplace_gpuager(node, inputs):
if node.op == gpuger_no_inplace: return [gpuger_inplace(*inputs)]
return [gpuger_inplace(*node.inputs)]
gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv, gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
local_inplace_gpuagemm, local_inplace_gpuagemm,
......
...@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -134,7 +134,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const int out_size_byte = out_size*sizeof(float); const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %llud columns!", " %d kernel columns, but the kernel we received had %llu columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]); THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -217,13 +217,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i, nb_split=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1], nb_split);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s)," "INFO: impl 'conv_patch' failed (%s),"
...@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -307,21 +300,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i" "INFO: used 'conv_patch_stack' version with nb_split=%i"
...@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -334,21 +312,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s)," "INFO: impl 'conv_patch_stack' failed (%s),"
...@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -394,12 +357,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s)," "INFO: impl 'conv_rows' failed (%s),"
...@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -428,19 +385,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], n_blocks[2]);
GpuKernel *k = NULL; GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0; k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else { } else {
//fprintf(stderr, "using true version\n");
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0; k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
} }
...@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -460,23 +408,11 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s)," "INFO: impl 'conv_rows_stack' failed (%s),"
...@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -543,12 +479,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
work_complete = true; work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with" "INFO: used 'conv_rows_stack2' version %s with"
...@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -558,12 +488,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i version=%d\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1],(version==9?2:3));
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s)," "INFO: impl 'conv_rows_stack2' failed (%s),"
...@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -680,13 +604,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i, "
"n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i,"
" nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version" "INFO: used 'conv_patch_stack_reduce' version"
...@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -697,14 +614,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i,"
" nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s)," "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
...@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -714,7 +623,7 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} // else no good nb_splits was found } // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if ((version==6||version==-1) &&
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
...@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -782,12 +691,6 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0],
n_reduce_buf, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s)," "INFO: impl 'conv_valid_row_reduce' failed (%s),"
...@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -805,43 +708,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
(size_t)256), (size_t)256),
(size_t)1, (size_t)1}; (size_t)1, (size_t)1};
if (1) if (verbose)
{ fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1)
fprintf(stderr, " img : %i %llu %i %i %p "
"%lld %lld %lld %lld\n",
nbatch, (unsigned long long)stack_len, img_len, img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p "
"%lld %lld %lld %lld\n",
nkern, nstack, kern_len, kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose>1)
fprintf(stderr, " out : %llu %llu %i %i %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
out_len, out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks[0], threads_per_block[0]);
}
void *kernel_params[] = { void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
...@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1113,15 +981,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
if (err == GA_NO_ERROR) if (err == GA_NO_ERROR)
{ {
if (verbose>1)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i, shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'" "INFO: used 'conv_full_patch_stack_padded'"
...@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1131,15 +990,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, threads_per_block[2]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,shmem_sz=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads_per_block[0], threads_per_block[1], threads_per_block[2],
n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1] * threads_per_block[2],
out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s" "INFO: impl 'conv_full_patch_stack_padded' %s %s"
...@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1179,12 +1029,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s)," "INFO: impl 'conv_full_patch' failed (%s),"
...@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1225,12 +1069,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1], shmem_sz,
threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'" fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
...@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1276,12 +1114,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr,
"threads_per_block[0]=%i, threads_per_block[1]=%i, n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], threads_per_block[1], n_blocks[0], n_blocks[1],
shmem_sz, threads_per_block[0] * threads_per_block[1]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
GpuKernel_error(k, err)); GpuKernel_error(k, err));
...@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1298,55 +1130,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
(size_t)256), (size_t)256),
(size_t)1, (size_t)1}; (size_t)1, (size_t)1};
if (0)
{
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose)
fprintf(stderr, " img : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(void *)(cuda_get_ptr(img->ga.data) + img->ga.offset),
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
if (verbose)
fprintf(stderr, " kern: %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(void *)(cuda_get_ptr(kern->ga.data) + kern->ga.offset),
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
if (verbose)
fprintf(stderr, " out : %llu %llu %llu %llu %p "
"%lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)out_len,
(unsigned long long)out_wid,
(void *)(cuda_get_ptr(out->ga.data) + out->ga.offset),
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks[0], threads_per_block[0]);
if (verbose)
fprintf(stderr, " subsample params: %llu %llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
void *kernel_params[] = { void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len, (void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid, (void *)&img_len, (void *)&img_wid,
...@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1377,11 +1160,6 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr, "threads_per_block[0]=%i, threads_per_block[1]=%i,"
" n_blocks[0]=%i, n_blocks[1]=%i,"
" shmem_sz=%i, nb_threads=%i\n",
threads_per_block[0], 1, n_blocks[0], 1, 0, threads_per_block[0]);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s)," fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
...@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -1465,7 +1243,7 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
rval = pygpu_zeros(4, out_dim, rval = pygpu_zeros(4, out_dim,
img->ga.typecode, GA_C_ORDER, img->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); img->context, Py_None);
//rval might be null //rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
...@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern, ...@@ -1488,14 +1266,3 @@ PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
} }
return (PyObject*)rval; return (PyObject*)rval;
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
import copy import copy
import os import os
import theano from theano import gof
from theano import config, gof
try: try:
from pygpu import gpuarray from pygpu import gpuarray
...@@ -10,7 +9,8 @@ except ImportError: ...@@ -10,7 +9,8 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from theano.gof import utils from theano.gof import utils
...@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
them. them.
""" """
__props__ = ('border_mode', 'subsample', 'logical_img_hw',
'logical_kern_hw', 'logical_kern_align_top', 'version',
'verbose', 'kshp', 'imshp', 'max_threads_dim0')
@staticmethod @staticmethod
def logical_output_shape_2d(imshp, kshp, mode): def logical_output_shape_2d(imshp, kshp, mode):
...@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1 return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
raise ValueError(mode) raise ValueError(mode)
def __init__(self, border_mode, def __init__(self, border_mode, subsample=(1, 1),
subsample=(1, 1), logical_img_hw=None, logical_kern_hw=None,
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True, logical_kern_align_top=True,
version=-1, version=-1, direction_hint=None,
direction_hint=None, verbose=0, kshp=None, imshp=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None, max_threads_dim0=None,
nkern=None, nkern=None, bsize=None, fft_opt=True):
bsize=None,
fft_opt=True):
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
if logical_img_hw is not None: if logical_img_hw is not None:
...@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
self.bsize = bsize self.bsize = bsize
self.fft_opt = fft_opt self.fft_opt = fft_opt
def __eq__(self, other):
return type(self) == type(other) \
and self.border_mode == other.border_mode \
and self.subsample == other.subsample \
and self.logical_img_hw == other.logical_img_hw \
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top \
and self.version == other.version \
and self.verbose == other.verbose \
and self.kshp == other.kshp\
and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, "imshp"): if not hasattr(self, "imshp"):
...@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
if not hasattr(self, "fft_opt"): if not hasattr(self, "fft_opt"):
self.fft_opt = True self.fft_opt = True
def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and
# hash(-2)==-2 in python!
return hash(type(self)) \
^ hash(self.border_mode) \
^ hash(self.subsample) \
^ hash(self.logical_img_hw) \
^ hash(self.logical_kern_hw) \
^ hash(self.logical_kern_align_top) \
^ self.version \
^ hash(self.verbose) \
^ hash(self.kshp)\
^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.logical_img_hw),
str(self.logical_kern_hw),
str(self.logical_kern_align_top),
str(self.imshp),
str(self.kshp))
def make_node(self, img, kern): def make_node(self, img, kern):
if img.dtype != "float32" or kern.dtype != "float32": if img.dtype != "float32" or kern.dtype != "float32":
raise NotImplementedError("GpuConv currently only work" raise NotImplementedError("GpuConv currently only work"
...@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4: if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
img = as_gpuarray_variable(img) ctx_name = infer_context_name(img, kern)
kern = as_gpuarray_variable(kern) img = as_gpuarray_variable(img, ctx_name)
kern = as_gpuarray_variable(kern, ctx_name)
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False] False, False]
out = GpuArrayType(img.dtype, broadcastable)() out = GpuArrayType(img.dtype, broadcastable, context_name=ctx_name)()
return gof.Apply(self, [img, kern], [out]) return gof.Apply(self, [img, kern], [out])
def get_context(self, node):
return node.inputs[0].type.context
def flops(self, inputs, outputs): def flops(self, inputs, outputs):
""" """
Useful with the hack in profilemode to print the MFlops. Useful with the hack in profilemode to print the MFlops.
...@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node) node_ = copy.copy(node)
assert node.op is node_.op assert node.op is node_.op
if config.gpuarray.sync:
raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
if node_.op.max_threads_dim0 is None: if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
device_id = cuda.use.device_number
if device_id is None:
cuda.use("gpu",
force=False,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False,
test_driver=True)
device_id = cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0']
return super(GpuConv, node_.op).make_thunk(node_, storage_map, return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling) compute_map, no_recycling)
...@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 22) return (0, 23)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
if node.inputs[0].type.context.kind != "cuda":
raise NotImplementedError("GpuConv only works for cuda devices")
img, kern = inp img, kern = inp
out, = out_ out, = out_
dx = self.subsample[0] dx = self.subsample[0]
...@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
""" % locals() """ % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read() code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]]) for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags) gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin) bcode = ','.join(hex(ord(c)) for c in bin)
...@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {%(bcode)s}; static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s"; static const char *conv_code = "%(code)s";
""" % locals() """ % locals()
for k in kernels: return mod
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read() def c_support_code_struct(self, node, name):
mod = GpuKernelBase.c_support_code_struct(self, node, name)
with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
mod += f.read()
return mod return mod
@utils.memoize @utils.memoize
......
...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64 //Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit. //bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread) if (thread_id < nb_thread)
{ {
const float * my_src_ptr = (const float *)( const float * my_src_ptr = (const float *)(
((uintptr_t)src) & COALESCED_ALIGN); ((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id; my_src_ptr += thread_id;
while (my_src_ptr < src + N) while (my_src_ptr < src + N)
{ {
......
...@@ -15,8 +15,9 @@ from theano.tensor.nnet import SoftmaxGrad ...@@ -15,8 +15,9 @@ from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from . import pygpu, init_dev from . import pygpu
from .basic_ops import (as_gpuarray_variable, from .type import get_context, gpu_context_type, list_contexts
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu, gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like) GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
...@@ -29,28 +30,14 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter ...@@ -29,28 +30,14 @@ from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge, inplace_allocempty from .opt_util import alpha_merge, output_merge, inplace_allocempty
def dnn_available(): def _dnn_check_compile():
if dnn_available.avail is not None:
return dnn_available.avail
if pygpu is None:
dnn_available.msg = "PyGPU not available"
dnn_available.avail = False
return False
if not init_dev.device.startswith('cuda'):
dnn_available.msg = "Not on a CUDA device. Got %s." % init_dev.device
dnn_available.avail = False
return False
# This is a hack because bin_id is in the from of
# "sm_<major><minor>" for cuda devices.
if pygpu.get_default_context().bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False
preambule = """ preambule = """
#include <stdio.h> #include <stdio.h>
#include <cudnn.h> #include <cudnn.h>
#include <cudnn_helper.h> #include <cudnn_helper.h>
""" """
# No need for the context in here since we won't execute that code
body = """ body = """
cudnnHandle_t _handle = NULL; cudnnHandle_t _handle = NULL;
cudnnStatus_t err; cudnnStatus_t err;
...@@ -70,35 +57,71 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -70,35 +57,71 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags( avail, out, err = GCC_compiler.try_flags(
params, preambule=preambule, body=body, params, preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
dnn_available.avail = comp if not avail:
if not dnn_available.avail: return False, ("Theano cannot compile with cuDNN. "
dnn_available.msg = ( "We got this error:\n" + str(err))
"Theano cannot compile with cuDNN. We got this error:\n" + return True, None
str(err))
else:
# If we can compile, check that we can import and run. def _dnn_check_version():
v = version() v = version()
if v < 2000: if v < 2000:
dnn_available.avail = False return False, (
dnn_available.msg = ( "You have an old release of CuDNN (or a release candidate) "
"You have an old release of CuDNN (or a release candidate) " "that isn't supported. Please update to at least v2 final "
"that isn't supported. Please update to at least v2 final " "version.")
"version.") if v >= 3000 and v < 3007:
raise RuntimeError(dnn_available.msg) return False, (
if v >= 3000 and v < 3007: "You have installed a release candidate of CuDNN v3. This "
dnn_available.avail = False "isn't supported. Please update to v3 final version.")
dnn_available.msg = (
"You have installed a release candidate of CuDNN v3. This " return True, None
"isn't supported. Please update to v3 final version.")
raise RuntimeError(dnn_available.msg)
def dnn_present():
return dnn_available.avail if dnn_present.avail is not None:
return dnn_present.avail
dnn_available.avail = None
if pygpu is None:
dnn_present.msg = "PyGPU not available"
dnn_present.avail = False
return False
dnn_present.avail, dnn_present.msg = _dnn_check_compile()
if dnn_present.avail:
dnn_present.avail, dnn_present.msg = _dnn_check_version()
if not dnn_present.avail:
raise RuntimeError(dnn_present.msg)
return dnn_present.avail
dnn_present.avail = None
dnn_present.msg = None
def dnn_available(context_name):
if not dnn_present():
dnn_available.msg = dnn_present.msg
return False
ctx = get_context(context_name)
if not ctx.kind == 'cuda':
dnn_available.msg = "Not on a CUDA device."
return False
# This is a hack because bin_id is in the from of
# "<something>_<major><minor>" for cuda devices.
if ctx.bin_id[:-2] < '30':
dnn_available.msg = "Device not supported by cuDNN"
return False
return True
dnn_available.msg = None dnn_available.msg = None
...@@ -110,6 +133,10 @@ class DnnBase(COp): ...@@ -110,6 +133,10 @@ class DnnBase(COp):
# dnn does not know about broadcasting, so we do not need to assert # dnn does not know about broadcasting, so we do not need to assert
# the input broadcasting pattern. # the input broadcasting pattern.
check_broadcast = False check_broadcast = False
context_type = gpu_context_type
def get_context(self, node):
return node.outputs[0].type.context
def __init__(self, files=None, c_func=None): def __init__(self, files=None, c_func=None):
if files is None: if files is None:
...@@ -181,7 +208,7 @@ def version(): ...@@ -181,7 +208,7 @@ def version():
This also does a check that the header version matches the runtime version. This also does a check that the header version matches the runtime version.
""" """
if not dnn_available(): if not dnn_present():
raise Exception( raise Exception(
"We can't determine the cudnn version as it is not available", "We can't determine the cudnn version as it is not available",
dnn_available.msg) dnn_available.msg)
...@@ -390,9 +417,10 @@ class GpuDnnConv(DnnBase): ...@@ -390,9 +417,10 @@ class GpuDnnConv(DnnBase):
return defs return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) ctx_name = infer_context_name(img, kern, output)
kern = as_gpuarray_variable(kern) img = as_gpuarray_variable(img, ctx_name)
output = as_gpuarray_variable(output) kern = as_gpuarray_variable(kern, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if img.type.ndim not in (4, 5): if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D or 5D tensor') raise TypeError('img must be 4D or 5D tensor')
if kern.type.ndim not in (4, 5): if kern.type.ndim not in (4, 5):
...@@ -574,9 +602,10 @@ class GpuDnnConvGradW(DnnBase): ...@@ -574,9 +602,10 @@ class GpuDnnConvGradW(DnnBase):
return defs return defs
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) ctx_name = infer_context_name(img, topgrad, output)
topgrad = as_gpuarray_variable(topgrad) img = as_gpuarray_variable(img, ctx_name)
output = as_gpuarray_variable(output) topgrad = as_gpuarray_variable(topgrad, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if img.type.ndim not in (4, 5): if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D or 5D tensor') raise TypeError('img must be 4D or 5D tensor')
if topgrad.type.ndim not in (4, 5): if topgrad.type.ndim not in (4, 5):
...@@ -689,9 +718,10 @@ class GpuDnnConvGradI(DnnBase): ...@@ -689,9 +718,10 @@ class GpuDnnConvGradI(DnnBase):
return defs return defs
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_gpuarray_variable(kern) ctx_name = infer_context_name(kern, topgrad, output)
topgrad = as_gpuarray_variable(topgrad) kern = as_gpuarray_variable(kern, ctx_name)
output = as_gpuarray_variable(output) topgrad = as_gpuarray_variable(topgrad, ctx_name)
output = as_gpuarray_variable(output, ctx_name)
if kern.type.ndim not in (4, 5): if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D or 5D tensor') raise TypeError('kern must be 4D or 5D tensor')
if topgrad.type.ndim not in (4, 5): if topgrad.type.ndim not in (4, 5):
...@@ -770,6 +800,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -770,6 +800,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
warnings.warn("workmem is deprecated, use algo instead", stacklevel=2) warnings.warn("workmem is deprecated, use algo instead", stacklevel=2)
algo = workmem algo = workmem
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
ctx_name = infer_context_name(img, kerns)
if (border_mode == 'valid' and subsample == (1, 1) and if (border_mode == 'valid' and subsample == (1, 1) and
direction_hint == 'bprop weights'): direction_hint == 'bprop weights'):
# Special case: We are asked to use GpuDnnConvGradW. We need to set # Special case: We are asked to use GpuDnnConvGradW. We need to set
...@@ -782,12 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -782,12 +813,13 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3)) kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1 shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1 shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
out = GpuAllocEmpty(img.dtype)(shape_i(kerns, 1, fgraph), out = GpuAllocEmpty(img.dtype, ctx_name)(
shape_i(img, 1, fgraph), shape2, shape3) shape_i(kerns, 1, fgraph),
shape_i(img, 1, fgraph), shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross')(out.shape) conv_mode='cross')(out.shape)
conv = GpuDnnConvGradW()(img, kerns, out, desc) conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3)) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
direction_hint != 'forward!'): direction_hint != 'forward!'):
...@@ -799,9 +831,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -799,9 +831,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode = 'cross' if conv_mode == 'conv' else 'conv' conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1 shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1 shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
out = GpuAllocEmpty(img.dtype)(shape_i(img, 0, fgraph), out = GpuAllocEmpty(img.dtype, ctx_name)(shape_i(img, 0, fgraph),
shape_i(kerns, 1, fgraph), shape_i(kerns, 1, fgraph),
shape2, shape3) shape2, shape3)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode)(kerns.shape) conv_mode=conv_mode)(kerns.shape)
return GpuDnnConvGradI()(kerns, img, out, desc) return GpuDnnConvGradI()(kerns, img, out, desc)
...@@ -817,7 +849,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -817,7 +849,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape, out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp) out = GpuAllocEmpty(img.dtype, ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
...@@ -948,7 +980,7 @@ class GpuDnnPool(DnnBase): ...@@ -948,7 +980,7 @@ class GpuDnnPool(DnnBase):
DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)") DnnBase.__init__(self, ["dnn_pool.c"], "APPLY_SPECIFIC(dnn_pool)")
def make_node(self, img, desc): def make_node(self, img, desc):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img, infer_context_name(img))
if desc.owner is not None: if desc.owner is not None:
e_ndim = desc.owner.op.get_ndim() + 2 e_ndim = desc.owner.op.get_ndim() + 2
...@@ -1002,7 +1034,7 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1002,7 +1034,7 @@ class GpuDnnPoolGrad(DnnBase):
The input of the pooling. The input of the pooling.
out out
The output of the pooling in the forward. The output of the pooling in the forward.
inp_grad out_grad
Same size as out, but is the corresponding gradient information. Same size as out, but is the corresponding gradient information.
desc desc
The pooling descriptor. The pooling descriptor.
...@@ -1016,9 +1048,10 @@ class GpuDnnPoolGrad(DnnBase): ...@@ -1016,9 +1048,10 @@ class GpuDnnPoolGrad(DnnBase):
"APPLY_SPECIFIC(dnn_pool_grad)") "APPLY_SPECIFIC(dnn_pool_grad)")
def make_node(self, inp, out, out_grad, desc): def make_node(self, inp, out, out_grad, desc):
inp = as_gpuarray_variable(inp) ctx_name = infer_context_name(inp, out, out_grad)
out_grad = as_gpuarray_variable(out_grad) inp = as_gpuarray_variable(inp, ctx_name)
out = as_gpuarray_variable(out) out_grad = as_gpuarray_variable(out_grad, ctx_name)
out = as_gpuarray_variable(out, ctx_name)
if desc.owner is not None: if desc.owner is not None:
nd = desc.owner.op.get_ndim() + 2 nd = desc.owner.op.get_ndim() + 2
...@@ -1147,7 +1180,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase): ...@@ -1147,7 +1180,7 @@ class GpuDnnSoftmax(GpuDnnSoftmaxBase):
c_func = "APPLY_SPECIFIC(softmax)" c_func = "APPLY_SPECIFIC(softmax)"
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
assert x.ndim == 4 assert x.ndim == 4
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
...@@ -1181,8 +1214,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1181,8 +1214,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
c_func = "APPLY_SPECIFIC(softmax_grad)" c_func = "APPLY_SPECIFIC(softmax_grad)"
def make_node(self, dy, sm): def make_node(self, dy, sm):
dy = as_gpuarray_variable(dy) ctx_name = infer_context_name(dy, sm)
sm = as_gpuarray_variable(sm) dy = as_gpuarray_variable(dy, ctx_name)
sm = as_gpuarray_variable(sm, ctx_name)
assert dy.ndim == 4 assert dy.ndim == 4
assert sm.ndim == 4 assert sm.ndim == 4
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
...@@ -1191,9 +1225,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1191,9 +1225,9 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
# @register_opt('cudnn') # this optimizer is registered in opt.py instead. # @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
if node.op.border_mode not in ['full', 'valid']: if node.op.border_mode not in ['full', 'valid']:
return return
img, kern = node.inputs img, kern = node.inputs
...@@ -1211,9 +1245,9 @@ def local_conv_dnn(node): ...@@ -1211,9 +1245,9 @@ def local_conv_dnn(node):
# because for some input/kernel shape configurations, this is faster. # because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn_alternative(node): def local_conv_dnn_alternative(node):
if not dnn_available():
return
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if not dnn_available(node.outputs[0].type.context_name):
return
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
if border_mode not in ['full', 'valid'] or subsample != (1, 1): if border_mode not in ['full', 'valid'] or subsample != (1, 1):
...@@ -1304,8 +1338,8 @@ def local_dnn_convi_output_merge(node, *inputs): ...@@ -1304,8 +1338,8 @@ def local_dnn_convi_output_merge(node, *inputs):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([DownsampleFactorMax]) @op_lifter([DownsampleFactorMax])
def local_pool_dnn_alternative(node): def local_pool_dnn_alternative(node, ctx_name):
if not dnn_available(): if not dnn_available(ctx_name):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
...@@ -1320,8 +1354,8 @@ def local_pool_dnn_alternative(node): ...@@ -1320,8 +1354,8 @@ def local_pool_dnn_alternative(node):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([MaxPoolGrad]) @op_lifter([MaxPoolGrad])
def local_pool_dnn_grad_stride(node): def local_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(): if not dnn_available(ctx_name):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
...@@ -1340,8 +1374,8 @@ def local_pool_dnn_grad_stride(node): ...@@ -1340,8 +1374,8 @@ def local_pool_dnn_grad_stride(node):
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([AveragePoolGrad]) @op_lifter([AveragePoolGrad])
def local_avg_pool_dnn_grad_stride(node): def local_avg_pool_dnn_grad_stride(node, ctx_name):
if not dnn_available(): if not dnn_available(ctx_name):
return return
if not node.op.ignore_border: if not node.op.ignore_border:
return return
...@@ -1363,22 +1397,23 @@ def local_avg_pool_dnn_grad_stride(node): ...@@ -1363,22 +1397,23 @@ def local_avg_pool_dnn_grad_stride(node):
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuSoftmax]) @local_optimizer([GpuSoftmax])
def local_softmax_dnn(node): def local_softmax_dnn(node):
if not dnn_available():
return
if isinstance(node.op, GpuSoftmax): if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name):
return
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x') ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins) ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins) out = GpuDnnSoftmax('accurate', 'channel')(ins)
out = as_gpuarray_variable(out.dimshuffle(0, 1)) out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
return [out] return [out]
@register_opt('cudnn') @register_opt('cudnn')
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
def local_log_softmax_dnn(node): def local_log_softmax_dnn(node):
if not dnn_available() or version() < 3000: if version() < 3000:
# No log-softmax before cudnn v3 # No log-softmax before cudnn v3
return return
# This looks for GpuDnnSoftmax so we know that we have cudnn.
if (isinstance(node.op, GpuElemwise) and if (isinstance(node.op, GpuElemwise) and
isinstance(node.op.scalar_op, Log) and isinstance(node.op.scalar_op, Log) and
node.inputs[0].owner and node.inputs[0].owner and
...@@ -1392,24 +1427,25 @@ def local_log_softmax_dnn(node): ...@@ -1392,24 +1427,25 @@ def local_log_softmax_dnn(node):
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
""" """
Raise a RuntimeError if cudnn can't be used. Raise a error if cudnn can't be used.
""" """
if not dnn_available(): for c in list_contexts():
# Make an assert error as we want Theano to fail, not if not dnn_available(c):
# just skip this optimization. # Make an assert error as we want Theano to fail, not
raise AssertionError( # just skip this optimization.
"cuDNN optimization was enabled, but Theano was not able" raise AssertionError(
" to use it. We got this error: \n" + "cuDNN optimization was enabled, but Theano was not able "
dnn_available.msg) "to use it for context " + c + ". We got this error: \n" +
dnn_available.msg)
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@op_lifter([SoftmaxGrad]) @op_lifter([SoftmaxGrad])
def local_softmax_dnn_grad(node): def local_softmax_dnn_grad(node, ctx_name):
if not dnn_available(): if not dnn_available(ctx_name):
return return
ins = [] ins = []
for n in node.inputs: for n in node.inputs:
......
...@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle); ...@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct #section init_code_struct
{ {
cuda_enter(pygpu_default_context()->ctx); cuda_enter(CONTEXT->ctx);
cudnnStatus_t err; cudnnStatus_t err;
APPLY_SPECIFIC(_handle) = NULL; APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) { if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s", PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(pygpu_default_context()->ctx); cuda_exit(CONTEXT->ctx);
FAIL; FAIL;
} }
cuda_exit(pygpu_default_context()->ctx); cuda_exit(CONTEXT->ctx);
} }
...@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om, PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, double alpha, double beta,
PyGpuArrayObject **output) { PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
......
...@@ -4,12 +4,12 @@ int ...@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) { double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......
...@@ -4,12 +4,12 @@ int ...@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km, PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc, cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) { double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
......
...@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI ...@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc, cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **out) { PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err; cudnnStatus_t err;
size_t dims[5]; size_t dims[5];
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported."); PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......
...@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out, PyGpuArrayObject *out,
PyGpuArrayObject *out_grad, PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc, cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **inp_grad) { PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnStatus_t err; cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported."); PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
...@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp), if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode, PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) { GA_C_ORDER, c) != 0) {
return 1; return 1;
} }
......
...@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL) ...@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x, int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out) { PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err; cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0) if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1; return 1;
......
...@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL) ...@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy, int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm, PyGpuArrayObject *sm,
PyGpuArrayObject **dx) { PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnStatus_t err; cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0) if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1; return 1;
......
...@@ -20,8 +20,8 @@ try: ...@@ -20,8 +20,8 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, HideC, from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
GpuKernelBase, Kernel) infer_context_name)
from .type import GpuArrayType from .type import GpuArrayType
from .fp16_help import load_w, write_w from .fp16_help import load_w, write_w
...@@ -37,7 +37,7 @@ def make_argument(v, name): ...@@ -37,7 +37,7 @@ def make_argument(v, name):
return ArrayArg(numpy.dtype(v.type.dtype), name) return ArrayArg(numpy.dtype(v.type.dtype), name)
def ensure_allocated(storage, shape, dtype): def ensure_allocated(storage, shape, dtype, ctx):
odat = storage[0] odat = storage[0]
if odat is not None: if odat is not None:
if odat.shape != shape: if odat.shape != shape:
...@@ -45,7 +45,7 @@ def ensure_allocated(storage, shape, dtype): ...@@ -45,7 +45,7 @@ def ensure_allocated(storage, shape, dtype):
# we have to allocate output storage. # we have to allocate output storage.
odat = None odat = None
if odat is None: if odat is None:
odat = pygpu.empty(shape, dtype=dtype) odat = pygpu.empty(shape, dtype=dtype, context=ctx)
storage[0] = odat storage[0] = odat
return odat return odat
...@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -67,12 +67,14 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items) return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def make_node(self, *inputs): def make_node(self, *inputs):
ctx_name = infer_context_name(*inputs)
res = Elemwise.make_node(self, *inputs) res = Elemwise.make_node(self, *inputs)
outputs = [GpuArrayType(broadcastable=o.type.broadcastable, outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
context_name=ctx_name,
dtype=o.type.dtype)() for o in res.outputs] dtype=o.type.dtype)() for o in res.outputs]
if len(outputs) > 1: if len(outputs) > 1:
raise NotImplementedError() raise NotImplementedError()
inputs = [as_gpuarray_variable(i) for i in inputs] inputs = [as_gpuarray_variable(i, ctx_name) for i in inputs]
node = Apply(self, inputs, outputs) node = Apply(self, inputs, outputs)
# Try to generate the kernel to catch SupportCodeErrors # Try to generate the kernel to catch SupportCodeErrors
...@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -99,6 +101,9 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return node return node
def get_context(self, node):
return node.inputs[0].type.context
def generate_kernel(self, node, nodename): def generate_kernel(self, node, nodename):
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = [make_argument(i, 'i%d' % (n,)) for n, i in
enumerate(node.inputs)] enumerate(node.inputs)]
...@@ -168,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -168,7 +173,8 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps + outs, kop, preamble=support_code) return ElemwiseKernel(self.get_context(node), inps + outs, kop,
preamble=support_code)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -177,8 +183,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -177,8 +183,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return self.scalar_op.c_support_code() return self.scalar_op.c_support_code()
def _gpu_kernel_code(self, node, nodename): def _gpu_kernel_code(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
# This is useless by itself, but will serve an eventual c_code # This is useless by itself, but will serve an eventual c_code
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
...@@ -191,8 +195,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -191,8 +195,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
return '\n'.join(res) return '\n'.join(res)
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
src = self._gpu_kernel_code(node, nodename) src = self._gpu_kernel_code(node, nodename)
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
params = ['uintp'] params = ['uintp']
...@@ -214,12 +216,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -214,12 +216,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
objvar='elem_%d_%s' % (nd, nodename))] objvar='elem_%d_%s' % (nd, nodename))]
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl': if node.inputs[0].type.context.kind != 'cuda':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
nd = node.outputs[0].ndim nd = node.outputs[0].ndim
fail = sub["fail"] fail = sub["fail"]
initial_dims = ','.join('1' for i in xrange(nd)) initial_dims = ','.join('1' for i in xrange(nd))
opname = str(self.scalar_op) opname = str(self.scalar_op)
ctx = sub['context']
# check that all inputs have valid dimensions # check that all inputs have valid dimensions
emitted_inames = {} emitted_inames = {}
...@@ -264,11 +267,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -264,11 +267,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if iname in emitted_inames: if iname in emitted_inames:
continue continue
code += """ code += """
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != PyGpuArray_NDIM(%(iname)s)) if (%(nd)s != PyGpuArray_NDIM(%(iname)s))
{ {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"need %(nd)s dims, not %%i", "need %(nd)s dims, not %%u",
PyGpuArray_NDIM(%(iname)s)); PyGpuArray_NDIM(%(iname)s));
%(fail)s; %(fail)s;
} }
...@@ -279,14 +281,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -279,14 +281,13 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyGpuArray_DIMS(%(iname)s)[i] == 1)) && PyGpuArray_DIMS(%(iname)s)[i] == 1)) &&
(dims[i] != PyGpuArray_DIMS(%(iname)s)[i])) (dims[i] != PyGpuArray_DIMS(%(iname)s)[i]))
{ {
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input" "GpuElemwise. Input dimension mis-match. Input"
" %(idx)d (indices start at 0) has shape[%%i] == %%i" " %(idx)d (indices start at 0) has shape[%%d] == %%llu"
", but the output's size on that axis is %%i.", ", but the output's size on that axis is %%llu.",
i, i,
PyGpuArray_DIMS(%(iname)s)[i], (unsigned long long)PyGpuArray_DIMS(%(iname)s)[i],
dims[i] (unsigned long long)dims[i]
); );
%(fail)s; %(fail)s;
} }
...@@ -314,15 +315,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -314,15 +315,11 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
{ {
%(oname)s = pygpu_empty(%(nd)d, dims, %(oname)s = pygpu_empty(%(nd)d, dims,
%(typecode)s, GA_C_ORDER, %(typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(oname)s) { if (!%(oname)s) {
//TODO, this check don't seam good. %(fail)s
//TODO, set exception?
%(fail)s
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
else: else:
input_idx = self.inplace_pattern[idx] input_idx = self.inplace_pattern[idx]
...@@ -337,19 +334,17 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -337,19 +334,17 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"GpuElemwise. Output dimension mis-match. Output" "GpuElemwise. Output dimension mis-match. Output"
" %(idx)d (indices start at 0), working inplace" " %(idx)d (indices start at 0), working inplace"
" on input %(input_idx)s, has shape[%%i] == %%i" " on input %(input_idx)s, has shape[%%i] == %%llu"
", but the output's size on that axis is %%i.", ", but the output's size on that axis is %%llu.",
i, i,
PyGpuArray_DIMS(%(oname)s)[i], (unsigned long long)PyGpuArray_DIMS(%(oname)s)[i],
dims[i] (unsigned long long)dims[i]
); );
Py_DECREF(%(oname)s); Py_DECREF(%(oname)s);
%(oname)s = NULL; %(oname)s = NULL;
%(fail)s; %(fail)s;
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals() """ % locals()
z = outputs[0] z = outputs[0]
code += """numEls = PyGpuArray_SIZE(%(z)s); code += """numEls = PyGpuArray_SIZE(%(z)s);
...@@ -367,7 +362,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -367,7 +362,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if (threads_per_block * n_blocks < numEls) if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (size_t) 256); threads_per_block = std::min(numEls/n_blocks, (size_t) 256);
//std::cerr << "calling callkernel returned\\n";
""" % locals() """ % locals()
kname = 'elem_%d_%s' % (nd, name) kname = 'elem_%d_%s' % (nd, name)
...@@ -407,7 +401,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -407,7 +401,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
""" % locals() """ % locals()
return str(code) return str(code)
def perform(self, node, inputs, output_storage): def perform(self, node, inputs, output_storage, ctx):
# Try to reuse the kernel from a previous call to hopefully # Try to reuse the kernel from a previous call to hopefully
# avoid recompiling # avoid recompiling
if not hasattr(node, '_cache_elemwise_k'): if not hasattr(node, '_cache_elemwise_k'):
...@@ -428,7 +422,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -428,7 +422,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
if n in self.inplace_pattern: if n in self.inplace_pattern:
stor[0] = inputs[self.inplace_pattern[n]] stor[0] = inputs[self.inplace_pattern[n]]
else: else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype)) args.append(ensure_allocated(stor, out_shape, out.type.dtype, ctx))
node._cache_elemwise_k(*args, broadcast=True) node._cache_elemwise_k(*args, broadcast=True)
if config.gpuarray.sync: if config.gpuarray.sync:
...@@ -453,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -453,10 +447,12 @@ class GpuDimShuffle(HideC, DimShuffle):
_f16_ok = True _f16_ok = True
def make_node(self, input): def make_node(self, input):
ctx_name = infer_context_name(input)
res = DimShuffle.make_node(self, input) res = DimShuffle.make_node(self, input)
otype = GpuArrayType(dtype=res.outputs[0].type.dtype, otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
broadcastable=res.outputs[0].type.broadcastable) broadcastable=res.outputs[0].type.broadcastable,
input = as_gpuarray_variable(input) context_name=ctx_name)
input = as_gpuarray_variable(input, ctx_name)
return Apply(self, [input], [otype()]) return Apply(self, [input], [otype()])
def __str__(self): def __str__(self):
...@@ -588,7 +584,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -588,7 +584,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
quite possible that the GPU might be slower for some cases. quite possible that the GPU might be slower for some cases.
""" """
__props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
'pre_scalar_op')
_f16_ok = True _f16_ok = True
def __init__(self, scalar_op, axis=None, def __init__(self, scalar_op, axis=None,
...@@ -607,24 +604,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -607,24 +604,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if pre_scalar_op: if pre_scalar_op:
assert pre_scalar_op.nin == 1 assert pre_scalar_op.nin == 1
def __eq__(self, other):
return (type(self) == type(other) and
self.axis == other.axis and
self.reduce_mask == other.reduce_mask and
self.dtype == other.dtype and
self.acc_dtype == other.acc_dtype and
self.scalar_op == other.scalar_op and
self.pre_scalar_op == other.pre_scalar_op)
def __hash__(self):
return (hash(type(self)) ^
hash(self.axis) ^
hash(self.reduce_mask) ^
hash(self.dtype) ^
hash(self.acc_dtype) ^
hash(type(self.scalar_op)) ^
hash(type(self.pre_scalar_op)))
def __str__(self): def __str__(self):
pre = "" pre = ""
if self.pre_scalar_op: if self.pre_scalar_op:
...@@ -641,7 +620,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -641,7 +620,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
self.pre_scalar_op = None self.pre_scalar_op = None
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
if x.type.context.kind != 'cuda':
raise TypeError("GpuCAReduceCuda doesn't work for non-cuda devices")
ret = super(GpuCAReduceCuda, self).make_node(x) ret = super(GpuCAReduceCuda, self).make_node(x)
self = copy.copy(self) self = copy.copy(self)
self.axis = ret.op.axis self.axis = ret.op.axis
...@@ -666,9 +647,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -666,9 +647,13 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
"complex" in self._acc_dtype(x.dtype)): "complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction") raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype, return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()]) ret.outputs[0].type.broadcastable,
context_name=x.type.context_name)()])
def perform(self, node, inp, out): def get_context(self, node):
return node.inputs[0].type.context
def perform(self, node, inp, out, ctx):
raise MethodNotDefined("") raise MethodNotDefined("")
def supports_c_code(self, inputs): def supports_c_code(self, inputs):
...@@ -698,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -698,7 +683,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))] inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))] out = ['fake_output_name_%d' % i for i in xrange(len(node.outputs))]
sub = {'fail': 'fake failure code'} sub = {'fail': 'fake failure code', 'context': 'fake context'}
try: try:
self.c_code(node, name, inp, out, sub) self.c_code(node, name, inp, out, sub)
...@@ -733,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -733,7 +718,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s) if (PyGpuArray_NDIM(%(x)s) != %(nd_in)s)
{ {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
"required nd=%(nd_in)s, got nd=%%i", PyGpuArray_NDIM(%(x)s)); "required nd=%(nd_in)s, got nd=%%u", PyGpuArray_NDIM(%(x)s));
%(fail)s; %(fail)s;
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -791,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -791,7 +776,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(%(nd_out)s, new_dims, %(z)s = pygpu_empty(%(nd_out)s, new_dims,
%(out_typecode)s, GA_C_ORDER, %(out_typecode)s, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (NULL == %(z)s) if (NULL == %(z)s)
{ {
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...@@ -1338,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1338,8 +1323,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)%(z)s->ga.data, (void *)%(z)s->ga.data,
(void *)&%(z)s->ga.offset}; (void *)&%(z)s->ga.offset};
if (verbose) printf("running kernel_reduce_ccontig_%(name)s" if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads=%%lu, size=%%lu, ndim=%%d\\n", " n_threads=%%llu, size=%%llu, ndim=%%u\\n",
n_threads,numEls, n_threads, numEls,
PyGpuArray_NDIM(%(x)s)); PyGpuArray_NDIM(%(x)s));
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 1, &n_threads, &n_blocks, n_shared, kernel_params);
...@@ -1521,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1521,9 +1506,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1}; size_t n_blocks[3] = {1, std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 4096), 1};
if (verbose) { if (verbose) {
fprintf(stderr, fprintf(stderr,
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n", "running kernel_reduce_10_%(name)s n_blocks=(%%llu,%%llu)\\n",
n_blocks[0], (unsigned long long)n_blocks[0],
n_blocks[1]); (unsigned long long)n_blocks[1]);
} }
assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]); assert(PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0]; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads[0];
...@@ -1911,12 +1896,17 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1911,12 +1896,17 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [17] # the version corresponding to the c code in this Op version = [18] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
version.extend(self.scalar_op.c_code_cache_version()) scalar_node = Apply(
self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
version.extend(self.kernel_version(node))
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
...@@ -2644,7 +2634,6 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2644,7 +2634,6 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
Too slow for now as it only have a python interface. Too slow for now as it only have a python interface.
""" """
def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None): def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
if not hasattr(scalar_op, 'identity'): if not hasattr(scalar_op, 'identity'):
raise ValueError("No identity on scalar op") raise ValueError("No identity on scalar op")
...@@ -2658,10 +2647,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2658,10 +2647,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
return "GpuReduce{%s}%s" % (self.scalar_op, ax) return "GpuReduce{%s}%s" % (self.scalar_op, ax)
def make_node(self, input): def make_node(self, input):
ctx_name = infer_context_name(input)
res = CAReduceDtype.make_node(self, input) res = CAReduceDtype.make_node(self, input)
input = as_gpuarray_variable(input) input = as_gpuarray_variable(input, ctx_name)
otype = GpuArrayType(dtype=res.outputs[0].dtype, otype = GpuArrayType(dtype=res.outputs[0].dtype,
broadcastable=res.outputs[0].broadcastable) broadcastable=res.outputs[0].broadcastable,
context_name=ctx_name)
if res.op.axis is not None: if res.op.axis is not None:
redux = [] redux = []
...@@ -2673,11 +2664,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2673,11 +2664,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
return Apply(res.op, [input], [otype()]) return Apply(res.op, [input], [otype()])
def get_context(self, node):
return node.outputs[0].type.context
def make_thunk(self, node, storage_map, compute_map, no_recycling): def make_thunk(self, node, storage_map, compute_map, no_recycling):
# cache the kernel object # cache the kernel object
self.get_kernel_cache(node) self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map, return super(GpuCAReduceCPY, self).make_thunk(
compute_map, no_recycling) node, storage_map, compute_map, no_recycling)
def get_kernel_cache(self, node): def get_kernel_cache(self, node):
attr = '@cache_reduction_k' attr = '@cache_reduction_k'
...@@ -2776,33 +2770,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2776,33 +2770,33 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
j += 1 j += 1
code += """ code += """
if (need_out) { if (need_out) {
%(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, pygpu_default_context(), Py_None); %(output)s = pygpu_empty(%(nd_out)s, out_dims, %(out_type)s, GA_C_ORDER, %(ctx)s, Py_None);
if (!%(output)s) { if (!%(output)s) {
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'], """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else: else:
code += """ code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) { if (%(output)s == NULL || %(output)s->ga.nd != 0) {
Py_XDECREF(%(output)s); Py_XDECREF(%(output)s);
%(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER, %(output)s = pygpu_empty(0, NULL, %(out_type)s, GA_C_ORDER,
pygpu_default_context(), Py_None); %(ctx)s, Py_None);
if (!%(output)s) { if (!%(output)s) {
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype: if acc_dtype != node.outputs[0].type.dtype:
code += """ code += """
tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions, tmp = pygpu_empty(%(output)s->ga.nd, %(output)s->ga.dimensions,
%(acc_type)s, GA_C_ORDER, pygpu_default_context(), %(acc_type)s, GA_C_ORDER, %(ctx)s, Py_None);
Py_None);
if (!tmp) %(fail)s if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'], ctx=sub['context'],
acc_type=dtype_to_typecode(acc_dtype)) acc_type=dtype_to_typecode(acc_dtype))
else: else:
code += """ code += """
...@@ -2893,12 +2887,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2893,12 +2887,12 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
reduce_expr = "a * b" reduce_expr = "a * b"
else: else:
raise NotImplementedError() raise NotImplementedError()
return ReductionKernel(pygpu.get_default_context(), odtype, return ReductionKernel(node.inputs[0].type.context, odtype,
self.scalar_op.identity, reduce_expr, redux, self.scalar_op.identity, reduce_expr, redux,
arguments=[make_argument(node.inputs[0], 'a')], arguments=[make_argument(node.inputs[0], 'a')],
init_nd=node.inputs[0].ndim) init_nd=node.inputs[0].ndim)
def perform(self, node, inp, out): def perform(self, node, inp, out, ctx):
input, = inp input, = inp
output, = out output, = out
...@@ -2912,6 +2906,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2912,6 +2906,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
copy=False, dtype=node.outputs[0].type.dtype) copy=False, dtype=node.outputs[0].type.dtype)
else: else:
output[0] = pygpu.gpuarray.array(input, copy=True, output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype) dtype=node.outputs[0].type.dtype,
context=ctx)
# To allow reloading old pickled files # To allow reloading old pickled files
GpuCAReduce = GpuCAReduceCPY GpuCAReduce = GpuCAReduceCPY
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
/* Why do we need this? */ /* Why do we need this? */
size_t dim = 2048 * 32; size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(), rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
Py_None); Py_None);
if (rand_buf == NULL) { if (rand_buf == NULL) {
FAIL; FAIL;
...@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf; ...@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;
int gemm16(PyGpuArrayObject *C, float alpha, int gemm16(PyGpuArrayObject *C, float alpha,
PyGpuArrayObject *A, PyGpuArrayObject *B, PyGpuArrayObject *A, PyGpuArrayObject *B,
float beta, PyGpuArrayObject **out) { float beta, PyGpuArrayObject **out,
PyGpuContextObject *c) {
PyGpuArrayObject *_A = NULL; PyGpuArrayObject *_A = NULL;
PyGpuArrayObject *_B = NULL; PyGpuArrayObject *_B = NULL;
GpuKernel *gk; GpuKernel *gk;
......
...@@ -10,7 +10,8 @@ try: ...@@ -10,7 +10,8 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType from .type import GpuArrayType
...@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
self.mode = mode self.mode = mode
def make_node(self, ten4, neib_shape, neib_step): def make_node(self, ten4, neib_shape, neib_step):
ten4 = as_gpuarray_variable(ten4) ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
neib_shape = T.as_tensor_variable(neib_shape) neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step) neib_step = T.as_tensor_variable(neib_step)
...@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return Apply(self, [ten4, neib_shape, neib_step], return Apply(self, [ten4, neib_shape, neib_step],
[GpuArrayType(broadcastable=(False, False), [GpuArrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype,
context_name=ten4.type.context_name)()])
def get_context(self, node):
return node.inputs[0].type.context
def c_code_cache_version(self): def c_code_cache_version(self):
return (11,) return (11,)
...@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp_less" kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename k_var = "k_multi_warp_less_" + nodename
code = """ code = """
//a version that use less register but don't work in all case. // a version that uses less registers but doesn't work in all cases.
KERNEL void %(kname)s( KERNEL void %(kname)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
...@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return kernels return kernels
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype dtype_neib_step = node.inputs[2].dtype
...@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ten4, neib_shape, neib_step = inp ten4, neib_shape, neib_step = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
mode = self.mode mode = self.mode
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dims[0] = z_dim0; dims[0] = z_dim0;
dims[1] = z_dim1; dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s, %(z)s = pygpu_empty(2, dims, %(typecode_z)s,
GA_C_ORDER, pygpu_default_context(), GA_C_ORDER, %(ctx)s, Py_None);
Py_None);
if (!%(z)s) if (!%(z)s)
{ {
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:" PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
...@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node): def use_gpu_images2neibs(node, context_name):
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(node.op.mode) return GpuImages2Neibs(node.op.mode)
......
...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp ...@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty) from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
try: try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
...@@ -43,6 +43,7 @@ def ensure_float(val, name): ...@@ -43,6 +43,7 @@ def ensure_float(val, name):
class Gemm16(COp): class Gemm16(COp):
__props__ = ('relu', 'inplace') __props__ = ('relu', 'inplace')
_f16_ok = True _f16_ok = True
context_type = gpu_context_type
KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32', KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32', 'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
'tn_128x128', 'tn_128x64', 'tn_128x32', 'tn_128x128', 'tn_128x64', 'tn_128x32',
...@@ -61,10 +62,11 @@ class Gemm16(COp): ...@@ -61,10 +62,11 @@ class Gemm16(COp):
def make_node(self, C, alpha, A, B, beta): def make_node(self, C, alpha, A, B, beta):
if GPUTensor is None: if GPUTensor is None:
raise RuntimeError("Can't use Gemm16: nervanagpu not found") raise RuntimeError("Can't use Gemm16: nervanagpu not found")
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C, ctx_name)
alpha = ensure_float(alpha, 'alpha') alpha = ensure_float(alpha, 'alpha')
beta = ensure_float(beta, 'beta') beta = ensure_float(beta, 'beta')
...@@ -73,27 +75,8 @@ class Gemm16(COp): ...@@ -73,27 +75,8 @@ class Gemm16(COp):
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def get_context(self, node):
ensure_pycuda_context() return node.inputs[0].type.context
C, alpha, A, B, beta = inputs
# The nervana code does not support the case where both inputs
# are trans, so we need to copy one if them if that is the
# case. We copy the smaller one.
if A.flags.f_contiguous and B.flags.f_contiguous:
if A.size < B.size:
A = A.copy()
else:
B = B.copy()
inplace = self.inplace
if inplace and not C.flags.c_contiguous:
inplace = False
if not inplace:
C = C.copy()
At = to_gputensor(A)
Bt = to_gputensor(B)
Ct = to_gputensor(C)
nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
outputs[0][0] = C
def c_headers(self): def c_headers(self):
return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h', return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz, ...@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name)) codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
codel.append("const char *bcode;") codel.append("const char *bcode;")
codel.append("size_t sz;") codel.append("size_t sz;")
codel.append("PyGpuContextObject *c = pygpu_default_context();") codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, " codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, " "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};") "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
...@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz, ...@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
@opt.register_opt() @opt.register_opt()
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node): def local_dot_to_gemm16(node, ctx_name):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = node.inputs[0]
...@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node): ...@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')( C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -10,7 +10,8 @@ try: ...@@ -10,7 +10,8 @@ try:
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel) from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .type import GpuArrayType from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel, from .kernel_codegen import (nvcc_kernel,
inline_softmax, inline_softmax,
...@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
nin = 3 nin = 3
nout = 3 nout = 3
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
# N.B. won't work when we don't cast y_idx to float anymore ctx_name = infer_context_name(x, b, y_idx)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, ctx_name)
b = as_gpuarray_variable(b) b = as_gpuarray_variable(b, ctx_name)
y_idx = as_gpuarray_variable(y_idx) y_idx = as_gpuarray_variable(y_idx, ctx_name)
nll = GpuArrayType(x.type.dtype, nll = GpuArrayType(x.type.dtype,
y_idx.type.broadcastable)() y_idx.type.broadcastable,
context_name=ctx_name)()
sm = x.type() sm = x.type()
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
def get_context(self, node):
return node.inputs[0].type.context
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError('cuda only')
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype) typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype) typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
...@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
dtype_am = node.outputs[2].dtype dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__ classname = self.__class__.__name__
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals() k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{ {
Py_XDECREF(%(nll)s); Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_x)s, %(typecode_x)s, GA_C_ORDER, %(ctx)s,
GA_C_ORDER, Py_None);
pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(nll)s) {
%(fail)s %(fail)s
} }
...@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{ {
Py_XDECREF(%(sm)s); Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s), %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode_b)s, %(typecode_b)s, GA_C_ORDER,
GA_C_ORDER, %(ctx)s, Py_None);
pygpu_default_context(), Py_None);
if(!%(sm)s) if(!%(sm)s)
{ {
PyErr_SetString(PyExc_MemoryError, PyErr_SetString(PyExc_MemoryError,
...@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{ {
Py_XDECREF(%(am)s); Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_y_idx)s, %(typecode_y_idx)s, GA_C_ORDER,
GA_C_ORDER, %(ctx)s, Py_None);
pygpu_default_context(), Py_None);
if(!%(am)s) if(!%(am)s)
{ {
PyErr_SetString(PyExc_MemoryError, PyErr_SetString(PyExc_MemoryError,
...@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
Gradient wrt x of the CrossentropySoftmax1Hot Op. Gradient wrt x of the CrossentropySoftmax1Hot Op.
""" """
nin = 3 nin = 3
nout = 1 nout = 1
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
def make_node(self, dnll, sm, y_idx): def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll) ctx_name = infer_context_name(dnll, sm, y_idx)
sm = as_gpuarray_variable(sm) dnll = as_gpuarray_variable(dnll, ctx_name)
y_idx = as_gpuarray_variable(y_idx) sm = as_gpuarray_variable(sm, ctx_name)
y_idx = as_gpuarray_variable(y_idx, ctx_name)
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def get_context(self, node):
return node.inputs[0].type.context
def c_code_cache_version(self): def c_code_cache_version(self):
return (11,) return (11,)
...@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
...@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
...@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
{ {
Py_XDECREF(%(dx)s); Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s), %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s, %(typecode_dx)s, GA_C_ORDER,
GA_C_ORDER, %(ctx)s, Py_None);
pygpu_default_context(), Py_None);
if (!%(dx)s) { if (!%(dx)s) {
%(fail)s %(fail)s
} }
...@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
def make_node(self, x): def make_node(self, x):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x, infer_context_name(x))
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
def get_context(self, node):
return node.inputs[0].type.context
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return shape return shape
...@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
work_x = work_dtype(dtype_x) work_x = work_dtype(dtype_x)
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
...@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
x, = inp x, = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg); PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s), %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s, %(typecode)s, GA_C_ORDER,
GA_C_ORDER, %(ctx)s, Py_None);
pygpu_default_context(), Py_None);
if (!%(z)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
...@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuKernelBase, Op): class GpuSoftmaxWithBias(GpuKernelBase, Op):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
nin = 2 nin = 2
nout = 1 nout = 1
__props__ = () __props__ = ()
_f16_ok = True _f16_ok = True
def make_node(self, x, b): def make_node(self, x, b):
x = as_gpuarray_variable(x) ctx_name = infer_context_name(x, b)
b = as_gpuarray_variable(b) x = as_gpuarray_variable(x, ctx_name)
b = as_gpuarray_variable(b, ctx_name)
return Apply(self, [x, b], [x.type()]) return Apply(self, [x, b], [x.type()])
def get_context(self, node):
return node.inputs[0].type.context
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[0]] return [shape[0]]
...@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError('cuda only')
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype dtype_z = node.outputs[0].dtype
...@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
x, b = inp x, b = inp
z, = out z, = out
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
err_check = """ err_check = """
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg); PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
...@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s), %(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s, %(typecode)s, GA_C_ORDER,
GA_C_ORDER, %(ctx)s, Py_None);
pygpu_default_context(), Py_None);
if (!%(z)s) { if (!%(z)s) {
%(fail)s %(fail)s
} }
......
...@@ -3,11 +3,6 @@ import numpy ...@@ -3,11 +3,6 @@ import numpy
import logging import logging
from six.moves import xrange from six.moves import xrange
try:
import pygpu
except ImportError:
pass
import theano import theano
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
...@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt ...@@ -22,12 +17,12 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet.conv import ConvOp from theano.tensor.nnet.conv import ConvOp
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from .type import GpuArrayType, GpuArrayConstant from .type import GpuArrayType, GpuArrayConstant, get_context
from .basic_ops import (as_gpuarray_variable, from .basic_ops import (as_gpuarray_variable, infer_context_name,
host_from_gpu, gpu_from_host, host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace) gpugemm_no_inplace)
...@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert', ...@@ -79,9 +74,9 @@ gpu_optimizer.register('local_remove_all_assert',
'unsafe') 'unsafe')
def safe_to_gpu(x): def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return gpu_from_host(x) return GpuFromHost(ctx_name)(x)
else: else:
return x return x
...@@ -102,28 +97,53 @@ def op_lifter(OP, cuda_only=False): ...@@ -102,28 +97,53 @@ def op_lifter(OP, cuda_only=False):
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
dev = theano.sandbox.gpuarray.init_dev.device
if cuda_only and not dev.startswith('cuda'):
return
if type(node.op) in OP: if type(node.op) in OP:
# Either one of our inputs is on the gpu or # Either one of our inputs is on the gpu or
# all of our client are on the gpu # all of our clients are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu replace = False
for i in node.inputs]) or # TODO: Maybe set context_name with infer_context_name()?
all([c != 'output' and c.op == gpu_from_host context_name = None
for c, idx in node.outputs[0].clients])): # We replace if any input is a host_from_gpu
new_op = maker(node) for i in node.inputs:
# This is needed as sometimes new_op inherit from OP. if i.owner and i.owner.op == host_from_gpu:
if new_op and new_op != node.op: context_name = i.owner.inputs[0].type.context_name
if isinstance(new_op, theano.Op): replace = True
return [safe_to_cpu(o) for o in break
new_op(*node.inputs, return_list=True)] if not replace:
elif isinstance(new_op, (tuple, list)): # We replace if *all* clients are on the GPU
return [safe_to_cpu(o) for o in new_op] clients = [c for o in node.outputs for c in o.clients]
else: # suppose it is a variable on the GPU replace = len(clients) != 0
return [host_from_gpu(new_op)] for c, idx in clients:
if (c == 'output' or
not isinstance(c.op, GpuFromHost)):
replace = False
# TODO: check that the clients want the same context?
if replace:
# All clients are GpuFromHost and we have at least one
context_name = clients[0][0].op.context_name
# Check if we should replace
if (not replace or
(cuda_only and
get_context(context_name).kind != 'cuda')):
return False
new_op = maker(node, context_name)
# This is needed as sometimes new_op inherits from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
# tag the inputs with the context in case
# the context was derived from the outputs
def tag(i, ctx):
i.tag.context_name = ctx
return i
inputs = [tag(i, context_name) for i in node.inputs]
return [safe_to_cpu(o) for o in
new_op(*inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
return False return False
local_opt.__name__ = maker.__name__ local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt) return local_optimizer(OP)(local_opt)
...@@ -146,35 +166,81 @@ class InputToGpuOptimizer(Optimizer): ...@@ -146,35 +166,81 @@ class InputToGpuOptimizer(Optimizer):
if (len(input.clients) == 1 and if (len(input.clients) == 1 and
(input.clients[0][0] == 'output' or (input.clients[0][0] == 'output' or
input.clients[0][0].op == gpu_from_host)): isinstance(input.clients[0][0].op, GpuFromHost))):
continue continue
ctx_name = getattr(input.tag, 'context_name', None)
try: try:
new_input = host_from_gpu(gpu_from_host(input)) new_input = host_from_gpu(GpuFromHost(ctx_name)(input))
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError: except TypeError:
# This could fail if the inputs are not TensorTypes # This could fail if the inputs are not TensorTypes
pass pass
except ValueError:
# If there is no context tag and no default context
# then it stays on the CPU
if not hasattr(input.tag, 'context_name'):
raise
pass
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([gpu_from_host, host_from_gpu]) @local_optimizer([GpuFromHost, GpuToGpu, host_from_gpu])
def local_cut_gpu_host_gpu(node): def local_cut_gpu_transfers(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): # gpu[ab] -> host -> gpub
return [node.inputs[0].owner.inputs[0]] if (isinstance(node.op, GpuFromHost) and
if tensor.opt.opt.check_chain(node, host_from_gpu, gpu_from_host): node.inputs[0].owner and
return [node.inputs[0].owner.inputs[0]] node.inputs[0].owner.op == host_from_gpu):
return False other = node.inputs[0].owner.inputs[0]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu, if node.op.context_name == other.type.context_name:
return [other]
else:
return [GpuToGpu(node.op.context_name)(other)]
# ? -> gpua -> host
elif (node.op == host_from_gpu and
node.inputs[0].owner):
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [n2.inputs[0]]
# gpub ->
if isinstance(n2.op, GpuToGpu):
return [host_from_gpu(n2.inputs[0])]
# ? -> gpua -> gpub
elif isinstance(node.op, GpuToGpu):
# Transfer within same context
if node.inputs[0].type.context_name == node.op.context_name:
return [node.inputs[0]]
if node.inputs[0].owner:
n2 = node.inputs[0].owner
# host ->
if isinstance(n2.op, GpuFromHost):
return [GpuFromHost(node.op.context_name)(n2.inputs[0])]
# gpuc ->
if isinstance(n2.op, GpuToGpu):
if node.op.context_name == n2.inputs[0].type.context_name:
return [n2.inputs[0]]
else:
return [node.op(n2.inputs[0])]
gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'inplace', 'gpuarray') 'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers', gpu_cut_copies.register('cut_gpua_constant_transfers',
tensor.opt.constant_folding, tensor.opt.constant_folding,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
optdb['canonicalize'].register('local_cut_gpua_host_gpua', optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, local_cut_gpu_transfers,
'fast_compile', 'fast_run', 'gpuarray') 'fast_compile', 'fast_run', 'gpuarray')
...@@ -187,6 +253,11 @@ def local_gpuaalloc2(node): ...@@ -187,6 +253,11 @@ def local_gpuaalloc2(node):
Moves an alloc that is an input to join to the gpu. Moves an alloc that is an input to join to the gpu.
""" """
try:
get_context(None)
except ValueError:
# If there is no default context then we do not perform the move here.
return
if (isinstance(node.op, tensor.Alloc) and if (isinstance(node.op, tensor.Alloc) and
all(c != 'output' and all(c != 'output' and
c.op == tensor.join and c.op == tensor.join and
...@@ -194,23 +265,13 @@ def local_gpuaalloc2(node): ...@@ -194,23 +265,13 @@ def local_gpuaalloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc] i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]) for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)): for c, idx in node.outputs[0].clients)):
return [host_from_gpu(gpu_alloc(*node.inputs))] return [host_from_gpu(GpuAlloc(None)(*node.inputs))]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
def local_gpuaalloc(node): def local_gpuaalloc(node, context_name):
new_out = gpu_alloc(*node.inputs) return GpuAlloc(context_name)(*node.inputs)
# We need to hide new broadcastable dimensions because
# ReplaceValidate doesn't like when they change.
if new_out.broadcastable != node.outputs[0].broadcastable:
# but if a dim is suddenly not broadcastable anymore then that's a bug
for b_old, b_new in zip(node.outputs[0].broadcastable,
new_out.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out,
node.outputs[0].broadcastable)
return (new_out,)
@register_opt() @register_opt()
...@@ -221,8 +282,8 @@ def local_gpualloc_memset_0(node): ...@@ -221,8 +282,8 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_out] return [new_op(*node.inputs)]
@register_opt() @register_opt()
...@@ -240,7 +301,7 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -240,7 +301,7 @@ def local_gpu_contiguous_gpu_contiguous(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node, context_name):
op = node.op op = node.op
name = op.name name = op.name
if name: if name:
...@@ -251,14 +312,14 @@ def local_gpureshape(node): ...@@ -251,14 +312,14 @@ def local_gpureshape(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Rebroadcast]) @op_lifter([tensor.Rebroadcast])
def local_gpu_rebroadcast(node): def local_gpu_rebroadcast(node, context_name):
if isinstance(node.inputs[0].owner.op, HostFromGpu): if isinstance(node.inputs[0].owner.op, HostFromGpu):
return node.op(node.inputs[0].owner.inputs[0]) return node.op(node.inputs[0].owner.inputs[0])
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Flatten]) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node, context_name):
op = node.op op = node.op
shp = [] shp = []
if op.outdim != 1: if op.outdim != 1:
...@@ -271,7 +332,7 @@ def local_gpuflatten(node): ...@@ -271,7 +332,7 @@ def local_gpuflatten(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Elemwise]) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node, context_name):
op = node.op op = node.op
scal_op = op.scalar_op scal_op = op.scalar_op
name = op.name name = op.name
...@@ -344,28 +405,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -344,28 +405,28 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.DimShuffle]) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node): def local_gpua_dimshuffle(node, context_name):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.SpecifyShape]) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node): def local_gpua_specifyShape(node, context_name):
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
inp = [gpu_from_host(node.inputs[0])] + node.inputs[1:] inp = [GpuFromHost(context_name)(node.inputs[0])] + node.inputs[1:]
return tensor.specify_shape(*inp) return tensor.specify_shape(*inp)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.compile.ops.Shape]) @op_lifter([theano.compile.ops.Shape])
def local_gpua_shape(node): def local_gpua_shape(node, context_name):
# op_lifter will call this opt too frequently as the output is # op_lifter will call this opt too frequently as the output is
# always on the CPU. # always on the CPU.
if isinstance(node.inputs[0].type, GpuArrayType): if isinstance(node.inputs[0].type, GpuArrayType):
return return
return [gpu_from_host(node.inputs[0]).shape] return [GpuFromHost(context_name)(node.inputs[0]).shape]
def gpu_print_wrapper(op, cnda): def gpu_print_wrapper(op, cnda):
...@@ -374,7 +435,7 @@ def gpu_print_wrapper(op, cnda): ...@@ -374,7 +435,7 @@ def gpu_print_wrapper(op, cnda):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.printing.Print]) @op_lifter([tensor.printing.Print])
def local_gpu_print_op(node): def local_gpu_print_op(node, context_name):
x, = node.inputs x, = node.inputs
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
new_op = node.op.__class__(global_fn=gpu_print_wrapper) new_op = node.op.__class__(global_fn=gpu_print_wrapper)
...@@ -404,9 +465,14 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -404,9 +465,14 @@ def local_gpu_pdbbreakpoint_op(node):
input_is_from_gpu = (inp.owner and input_is_from_gpu = (inp.owner and
isinstance(inp.owner.op, HostFromGpu)) isinstance(inp.owner.op, HostFromGpu))
output_goes_to_gpu = any([c[0] != "output" and output_goes_to_gpu = False
isinstance(c[0].op, GpuFromHost) for c in out.clients:
for c in out.clients]) if c == 'output':
continue
if isinstance(c[0].op, GpuFromHost):
output_goes_to_gpu = True
context_name = c[0].op.context_name
break
if input_is_from_gpu: if input_is_from_gpu:
# The op should be applied on the GPU version of the input # The op should be applied on the GPU version of the input
...@@ -415,7 +481,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -415,7 +481,7 @@ def local_gpu_pdbbreakpoint_op(node):
elif output_goes_to_gpu: elif output_goes_to_gpu:
# The input should be transfered to the gpu # The input should be transfered to the gpu
new_inputs.append(gpu_from_host(inp)) new_inputs.append(GpuFromHost(context_name)(inp))
input_transfered.append(True) input_transfered.append(True)
else: else:
...@@ -447,7 +513,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -447,7 +513,7 @@ def local_gpu_pdbbreakpoint_op(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Join]) @op_lifter([tensor.Join])
def local_gpua_join(node): def local_gpua_join(node, context_name):
return gpu_join return gpu_join
...@@ -462,13 +528,13 @@ def local_gpuajoin_1(node): ...@@ -462,13 +528,13 @@ def local_gpuajoin_1(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Split]) @op_lifter([tensor.Split])
def local_gpua_split(node): def local_gpua_split(node, context_name):
return GpuSplit(node.op.len_splits) return GpuSplit(node.op.len_splits)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node, context_name):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
...@@ -482,14 +548,14 @@ def local_gpua_subtensor(node): ...@@ -482,14 +548,14 @@ def local_gpua_subtensor(node):
for n, _ in node.outputs[0].clients]): for n, _ in node.outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_from_host(node.outputs[0]))] return [host_from_gpu(gpu_x.owner.op(node.outputs[0]))]
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.IncSubtensor]) @op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node): def local_gpua_incsubtensor(node, context_name):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace, return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc, node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
...@@ -497,16 +563,16 @@ def local_gpua_incsubtensor(node): ...@@ -497,16 +563,16 @@ def local_gpua_incsubtensor(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1]) @op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node): def local_gpua_advanced_subtensor(node, context_name):
return GpuAdvancedSubtensor1() return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node, context_name):
# This optimization is disabled if cuda is not active # This is disabled on non-cuda contexts
if pygpu.get_default_context().kind != "cuda": if get_context(context_name).kind != 'cuda':
return None return None
x, y, ilist = node.inputs x, y, ilist = node.inputs
...@@ -535,17 +601,19 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -535,17 +601,19 @@ def local_gpua_advanced_incsubtensor(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod]) @op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
def local_gpua_careduce(node): def local_gpua_careduce(node, context_name):
if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul, if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
scalar.Maximum, scalar.Minimum)): scalar.Maximum, scalar.Minimum)):
dev = theano.sandbox.gpuarray.init_dev.device ctx = get_context(context_name)
if dev.startswith('opencl'): if ctx.kind == 'opencl':
op = GpuCAReduceCPY op = GpuCAReduceCPY
if node.op.scalar_op not in [scalar.add, scalar.mul]: if node.op.scalar_op not in [scalar.add, scalar.mul]:
# We don't support yet all reduction with cpy code. # We don't support yet all reduction with cpy code.
return return
else: elif ctx.kind == 'cuda':
op = GpuCAReduceCuda op = GpuCAReduceCuda
else:
return False
x, = node.inputs x, = node.inputs
greduce = op( greduce = op(
...@@ -556,7 +624,7 @@ def local_gpua_careduce(node): ...@@ -556,7 +624,7 @@ def local_gpua_careduce(node):
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
if (op is GpuCAReduceCPY or if (op is GpuCAReduceCPY or
gvar.owner.op.supports_c_code([gpu_from_host(x)])): gvar.owner.op.supports_c_code([GpuFromHost(context_name)(x)])):
return greduce return greduce
else: else:
# Try to make a simpler pattern based on reshaping # Try to make a simpler pattern based on reshaping
...@@ -596,7 +664,7 @@ def local_gpua_careduce(node): ...@@ -596,7 +664,7 @@ def local_gpua_careduce(node):
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
reshaped_x = x.reshape(tensor.stack(new_in_shp)) reshaped_x = x.reshape(tensor.stack(new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x) gpu_reshaped_x = GpuFromHost(context_name)(reshaped_x)
gvar = greduce(gpu_reshaped_x) gvar = greduce(gpu_reshaped_x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
...@@ -615,19 +683,19 @@ def local_gpua_careduce(node): ...@@ -615,19 +683,19 @@ def local_gpua_careduce(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv]) @op_lifter([tensor.blas.Gemv, tensor.blas_c.CGemv])
def local_gpua_gemv(node): def local_gpua_gemv(node, context_name):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Gemm]) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node, context_name):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Dot]) @op_lifter([tensor.basic.Dot])
def local_gpua_hgemm(node): def local_gpua_hgemm(node, context_name):
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
_logger.warning("Not performing dot of float16 on the GPU since " _logger.warning("Not performing dot of float16 on the GPU since "
...@@ -639,8 +707,9 @@ def local_gpua_hgemm(node): ...@@ -639,8 +707,9 @@ def local_gpua_hgemm(node):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')(shape_i(A, 0, fgraph), C = GpuAllocEmpty(dtype='float16', context_name=context_name)(
shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph),
shape_i(B, 1, fgraph))
return gpugemm_no_inplace(C, 1.0, A, B, 0.0) return gpugemm_no_inplace(C, 1.0, A, B, 0.0)
...@@ -658,49 +727,49 @@ def local_gpuagemm_output_merge(node, *inputs): ...@@ -658,49 +727,49 @@ def local_gpuagemm_output_merge(node, *inputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer]) @op_lifter([tensor.blas.Ger, tensor.blas_c.CGer, tensor.blas_scipy.ScipyGer])
def local_gpua_ger(node): def local_gpua_ger(node, context_name):
return GpuGer(destructive=node.op.destructive) return GpuGer(inplace=node.op.destructive)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.blas.Dot22]) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node): def local_gpua_dot22(node, context_name):
return gpu_dot22 return gpu_dot22
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.basic.Eye]) @op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node, context_name):
return GpuEye(dtype=node.op.dtype) return GpuEye(dtype=node.op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], cuda_only=True)
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node): def local_gpua_crossentropysoftmaxargmax1hotwithbias(node, context_name):
return GpuCrossentropySoftmaxArgmax1HotWithBias() return GpuCrossentropySoftmaxArgmax1HotWithBias()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx], cuda_only=True)
def local_gpua_crossentropysoftmax1hotwithbiasdx(node): def local_gpua_crossentropysoftmax1hotwithbiasdx(node, context_name):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.Softmax], cuda_only=True) @op_lifter([tensor.nnet.Softmax], cuda_only=True)
def local_gpua_softmax(node): def local_gpua_softmax(node, context_name):
return GpuSoftmax() return GpuSoftmax()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True) @op_lifter([tensor.nnet.SoftmaxWithBias], cuda_only=True)
def local_gpua_softmaxwithbias(node): def local_gpua_softmaxwithbias(node, context_name):
return GpuSoftmaxWithBias() return GpuSoftmaxWithBias()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.opt.Assert]) @op_lifter([theano.tensor.opt.Assert])
def local_assert(node): def local_assert(node, context_name):
if (node.inputs[0].owner and if (node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, HostFromGpu)): isinstance(node.inputs[0].owner.op, HostFromGpu)):
return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0], return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0],
...@@ -708,21 +777,14 @@ def local_assert(node): ...@@ -708,21 +777,14 @@ def local_assert(node):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([gpu_from_host, ConvOp]) @op_lifter([ConvOp])
def local_gpu_conv(node): def local_gpu_conv(node, context_name):
"""
gpu_from_host(conv) -> gpu_conv(gpu_from_host)
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def GpuConvOp_from_ConvOp(op): def GpuConvOp_from_ConvOp(op):
logical_img_hw = None logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp: if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None return None
# print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
...@@ -735,13 +797,10 @@ def local_gpu_conv(node): ...@@ -735,13 +797,10 @@ def local_gpu_conv(node):
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt)
)
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
# this case is not implemented
# return None
rstride = int(numpy.ceil(op.imshp_logical[1] / rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1]))) float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] / cstride = int(numpy.ceil(op.imshp_logical[2] /
...@@ -752,7 +811,7 @@ def local_gpu_conv(node): ...@@ -752,7 +811,7 @@ def local_gpu_conv(node):
img.shape[0], *op.imshp_logical) img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride], img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img) img)
img = gpu_from_host(img) img = GpuFromHost(context_name)(img)
return ret(img, kern) return ret(img, kern)
return make_graph return make_graph
...@@ -779,15 +838,13 @@ def local_gpu_conv(node): ...@@ -779,15 +838,13 @@ def local_gpu_conv(node):
gpu_conv = GpuConvOp_from_ConvOp(node.op) gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None: if gpu_conv is None:
return return
out = gpu_conv(gpu_from_host(img), out = gpu_conv(GpuFromHost(context_name)(img),
gpu_from_host(kern)) GpuFromHost(context_name)(kern))
# in some case the ConvOp broadcast the last 2 dimensions assert isinstance(out.type, GpuArrayType)
# differently then the gpu ConvOp # Make sure to keep the broadcastable pattern of the original
out = tensor.patternbroadcast( # convolution even if we might gain or lose some due to different
host_from_gpu(out), # information at the node level.
node.outputs[0].broadcastable) out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
# op_lifter want the output on the GPU.
out = gpu_from_host(out)
out.values_eq_approx = values_eq_approx out.values_eq_approx = values_eq_approx
return [out] return [out]
...@@ -818,9 +875,10 @@ def local_gpu_elemwise_careduce(node): ...@@ -818,9 +875,10 @@ def local_gpu_elemwise_careduce(node):
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
def tensor_to_gpu(x): def tensor_to_gpu(x, context_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
y = GpuArrayType(broadcastable=x.type.broadcastable, y = GpuArrayType(broadcastable=x.type.broadcastable,
context_name=context_name,
dtype=x.type.dtype)() dtype=x.type.dtype)()
if x.name: if x.name:
y.name = x.name + '[Gpua]' y.name = x.name + '[Gpua]'
...@@ -842,6 +900,7 @@ def gpu_safe_new(x, tag=''): ...@@ -842,6 +900,7 @@ def gpu_safe_new(x, tag=''):
nw_name = x.name + tag nw_name = x.name + tag
else: else:
nw_name = None nw_name = None
if isinstance(x, theano.Constant): if isinstance(x, theano.Constant):
return x.clone() return x.clone()
...@@ -870,7 +929,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -870,7 +929,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
@register_opt('scan', 'fast_compile') @register_opt('scan', 'fast_compile')
@op_lifter([scan_op.Scan]) @op_lifter([scan_op.Scan])
def local_scan_to_gpua(node): def local_scan_to_gpua(node, context_name):
info = copy.deepcopy(node.op.info) info = copy.deepcopy(node.op.info)
if info.get('gpua', False): if info.get('gpua', False):
return return
...@@ -882,20 +941,20 @@ def local_scan_to_gpua(node): ...@@ -882,20 +941,20 @@ def local_scan_to_gpua(node):
node.op.n_mit_sot + node.op.n_mit_sot +
node.op.n_sit_sot + node.op.n_sit_sot +
node.op.n_shared_outs) node.op.n_shared_outs)
nw_ins += [safe_to_gpu(x) for x in node.inputs[1:e]] nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[1:e]]
b = e b = e
e = e + node.op.n_nit_sot e = e + node.op.n_nit_sot
nw_ins += node.inputs[b:e] nw_ins += node.inputs[b:e]
nw_ins += [safe_to_gpu(x) for x in node.inputs[e:]] nw_ins += [safe_to_gpu(x, context_name) for x in node.inputs[e:]]
scan_ins = [tensor_to_gpu(x) for x in node.op.inputs] scan_ins = [tensor_to_gpu(x, context_name) for x in node.op.inputs]
# The inner output corresponding to the looping condition should not be # The inner output corresponding to the looping condition should not be
# moved to the gpu # moved to the gpu
if node.op.info['as_while']: if node.op.info['as_while']:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs[:-1]] scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs[:-1]]
scan_outs += [node.op.outputs[-1]] scan_outs += [node.op.outputs[-1]]
else: else:
scan_outs = [safe_to_gpu(x) for x in node.op.outputs] scan_outs = [safe_to_gpu(x, context_name) for x in node.op.outputs]
scan_outs = scan_utils.clone( scan_outs = scan_utils.clone(
scan_outs, scan_outs,
replace=list(zip(node.op.inputs, replace=list(zip(node.op.inputs,
...@@ -909,12 +968,25 @@ def local_scan_to_gpua(node): ...@@ -909,12 +968,25 @@ def local_scan_to_gpua(node):
_cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, []) _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
info['gpu_hash'] = hash(_cmodule_key) info['gpu_hash'] = hash(_cmodule_key)
def typebuild(dtype, broadcastable, context_name=context_name):
return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
context_name=context_name)
nw_op = scan_op.Scan(scan_ins, scan_outs, info, nw_op = scan_op.Scan(scan_ins, scan_outs, info,
typeConstructor=GpuArrayType).make_node(*nw_ins) typeConstructor=typebuild).make_node(*nw_ins)
return nw_op.outputs return nw_op.outputs
def _scan_type_infer(node):
context_name = infer_context_name(*node.inputs)
def typebuild(dtype, broadcastable, context_name=context_name):
return GpuArrayType(dtype=dtype, broadcastable=broadcastable,
context_name=context_name)
return typebuild
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=GpuArrayType, scan_opt.ScanInplaceOptimizer(typeInfer=_scan_type_infer,
gpua_flag=True), gpua_flag=True),
75, 75,
'gpuarray', 'gpuarray',
......
...@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx): ...@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
function can be as simple as: function can be as simple as:
def maker(node, inputs): def maker(node, inputs):
return node.op.__class__(inplace=True)(*inputs) return [node.op.__class__(inplace=True)(*inputs)]
Parameters Parameters
---------- ----------
...@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx): ...@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype) alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
try:
from pycuda.driver import Context
if not hasattr(Context, 'attach'):
raise ImportError('too old')
except ImportError:
Context = None
pycuda_initialized = False
pycuda_context = None
def ensure_pycuda_context():
global pycuda_context, pycuda_initialized
if not pycuda_initialized:
if Context is None:
raise RuntimeError("PyCUDA not found or too old.")
else:
pycuda_context = Context.attach()
import atexit
atexit.register(pycuda_context.detach)
pycuda_initialized = True
return pycuda_context
from __future__ import print_function from __future__ import print_function
import copy
import os import os
import copy
import numpy import numpy
import theano import theano
from theano import tensor, gof, config from theano import tensor, gof
from theano.gof.utils import MethodNotDefined
from six.moves import StringIO from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace import theano.tensor.inplace
...@@ -19,7 +18,8 @@ except ImportError: ...@@ -19,7 +18,8 @@ except ImportError:
pass pass
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel) from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
...@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
_f16_ok = True _f16_ok = True
def make_node(self, x, *inputs): def make_node(self, x, *inputs):
ctx_name = infer_context_name(x)
rval = tensor.Subtensor.make_node(self, x, *inputs) rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = GpuArrayType(dtype=rval.outputs[0].type.dtype, otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
broadcastable=rval.outputs[0].type.broadcastable) broadcastable=rval.outputs[0].type.broadcastable,
x = as_gpuarray_variable(x) context_name=ctx_name)
x = as_gpuarray_variable(x, ctx_name)
return gof.Apply(self, [x] + rval.inputs[1:], [otype()]) return gof.Apply(self, [x] + rval.inputs[1:], [otype()])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
...@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname) return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs): def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x) ctx_name = infer_context_name(x, y)
y = as_gpuarray_variable(y) x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
op = copy.copy(self) op = copy.copy(self)
ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()]) ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
op.create_iadd_node(ret) op.create_iadd_node(ret)
return ret return ret
def get_context(self, node):
return node.outputs[0].type.context
def create_iadd_node(self, node): def create_iadd_node(self, node):
# We store a iadd_node in the op that contain the info needed # We store a iadd_node in the op that contain the info needed
# for the inplace add. # for the inplace add.
...@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
iadd_node = gop(xview, y).owner iadd_node = gop(xview, y).owner
self.iadd_node = iadd_node self.iadd_node = iadd_node
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_, ctx):
out, = out_ out, = out_
x, y = inputs[:2] x, y = inputs[:2]
indices = list(reversed(inputs[2:])) indices = list(reversed(inputs[2:]))
...@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
%(view_ndim)s, %(view_ndim)s,
dims, dims,
xview_strides, xview_strides,
pygpu_default_context(), %(x)s->context,
1, 1,
(PyObject *)%(x)s, (PyObject *)%(x)s,
(PyObject *)&PyGpuArrayType); (PyObject *)&PyGpuArrayType);
...@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" """
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals() return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_struct(self, node, nodename):
gop = self.iadd_node.op gop = self.iadd_node.op
sub_name = nodename + "_add_to_zview" sub_name = nodename + "_add_to_zview"
ret = gop.c_support_code_apply(self.iadd_node, sub_name) ret = gop.c_support_code_struct(self.iadd_node, sub_name)
ret += """ ret += """
PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst, PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){ PyGpuArrayObject* src){
...@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" % locals() """ % locals()
inputs = ["dst", "src"] inputs = ["dst", "src"]
outputs = ["ret"] outputs = ["ret"]
sub = {"fail": "return NULL;"} sub = {"fail": "return NULL;", "context": "dst->context"}
ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub) ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
ret += """ ret += """
return dst; return ret;
} }
""" """
return ret return ret
...@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
def make_node(self, x, ilist): def make_node(self, x, ilist):
x_ = as_gpuarray_variable(x) ctx_name = infer_context_name(x, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
ilist__ = tensor.as_tensor_variable(ilist) ilist__ = tensor.as_tensor_variable(ilist)
if ilist__.type.dtype[:3] not in ('int', 'uin'): if ilist__.type.dtype[:3] not in ('int', 'uin'):
...@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): ...@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
if ilist__.type.dtype != 'int64': if ilist__.type.dtype != 'int64':
ilist__ = tensor.cast(ilist__, 'int64') ilist__ = tensor.cast(ilist__, 'int64')
ilist_ = as_gpuarray_variable(ilist__) ilist_ = as_gpuarray_variable(ilist__, ctx_name)
if ilist_.type.dtype != 'int64': if ilist_.type.dtype != 'int64':
raise TypeError('index must be int64') raise TypeError('index must be int64')
...@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): ...@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
bcast = ilist_.broadcastable + x_.broadcastable[1:] bcast = ilist_.broadcastable + x_.broadcastable[1:]
return gof.Apply(self, [x_, ilist_], return gof.Apply(self, [x_, ilist_],
[GpuArrayType(dtype=x.dtype, [GpuArrayType(dtype=x.dtype,
context_name=ctx_name,
broadcastable=bcast)()]) broadcastable=bcast)()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
...@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1): ...@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
""" """
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
x_ = as_gpuarray_variable(x) ctx_name = infer_context_name(x, y)
y_ = as_gpuarray_variable(y) x_ = as_gpuarray_variable(x, ctx_name)
y_ = as_gpuarray_variable(y, ctx_name)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
...@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
only avail on compute capability 2.0 and more recent. only avail on compute capability 2.0 and more recent.
""" """
_f16_ok = True _f16_ok = True
def make_node(self, x, y, ilist): def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure """It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long. the index are of type long.
""" """
x_ = as_gpuarray_variable(x) ctx_name = infer_context_name(x, y, ilist)
y_ = as_gpuarray_variable(y) x_ = as_gpuarray_variable(x, ctx_name)
ilist_ = as_gpuarray_variable(ilist) y_ = as_gpuarray_variable(y, ctx_name)
ilist_ = as_gpuarray_variable(ilist, ctx_name)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
...@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1): ...@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_context(self, node):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (6,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<numpy_compat.h>', '<gpuarray_helper.h>',
raise MethodNotDefined('cuda only')
return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
'<gpuarray/types.h>'] '<gpuarray/types.h>']
def c_header_dirs(self): def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl': return [os.path.dirname(__file__)]
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
res = [os.path.dirname(__file__)]
if cuda_root:
res.append(os.path.join(cuda_root, 'include'))
return res
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() ctx = self.get_context(node)
device_properties = theano.sandbox.cuda.device_properties if ctx.kind != 'cuda':
compute_capability = device_properties(active_device_no)['major'] raise NotImplementedError("cuda only")
if ((self.set_instead_of_inc) or if (self.set_instead_of_inc or
(node.inputs[0].ndim != node.inputs[1].ndim) or node.inputs[0].ndim != node.inputs[1].ndim or
(node.inputs[0].ndim != 2) or node.inputs[0].ndim != 2 or
(compute_capability < 2)): ctx.bin_id[-2] < '2'):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
x = inputs[0] x = inputs[0]
...@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
return [Kernel(code=code, name=kname, params=params, return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)] flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename): def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype dtype_ind = node.inputs[2].dtype
...@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
itemsize_out = numpy.dtype(dtype_out).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """ return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self, int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other, PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr) PyGpuArrayObject *indices_arr)
......
from nose.plugins.skip import SkipTest
import theano.sandbox.gpuarray
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
if not theano.sandbox.gpuarray.pygpu_activated:
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available:
cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
test_ctx_name = None
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
...@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc ...@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray from ..type import (GpuArrayType, get_context,
from ..type import (GpuArrayType,
gpuarray_shared_constructor) gpuarray_shared_constructor)
from ..basic_ops import ( from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape, host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous) gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather from pygpu import gpuarray
# import stuff from this file (like mode_with_gpu) to reuse it.
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu,
# as many tests don't work in that setup.
cuda_ndarray.use('gpu',
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
utt.seed_rng() utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed()) rng = numpy.random.RandomState(seed=utt.fetch_seed())
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False, def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None): on_unused_input='raise', name=None):
...@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs): ...@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
cls = kwargs.pop('cls', None) cls = kwargs.pop('cls', None)
if len(kwargs) != 0: if len(kwargs) != 0:
raise TypeError('Unexpected argument %s', list(kwargs.keys())[0]) raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
return gpuarray.array(r, dtype=dtype, cls=cls) return gpuarray.array(r, dtype=dtype, cls=cls,
context=get_context(test_ctx_name))
def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
...@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_all(self): def test_all(self):
if skip: if skip:
from nose.plugins.skip import SkipTest
raise SkipTest(skip) raise SkipTest(skip)
for testname, inputs in iteritems(cases): for testname, inputs in iteritems(cases):
...@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu(): ...@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32') av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av, context=get_context(test_ctx_name))
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], GpuFromHost(test_ctx_name)(a))
fv = f(av) fv = f(av)
assert GpuArrayType.values_eq(fv, gv) assert GpuArrayType.values_eq(fv, gv)
...@@ -218,12 +189,12 @@ def test_transfer_strided(): ...@@ -218,12 +189,12 @@ def test_transfer_strided():
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g') g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 8), dtype='float32') av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av) gv = gpuarray.array(av, context=get_context(test_ctx_name))
av = av[:, ::2] av = av[:, ::2]
gv = gv[:, ::2] gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a)) f = theano.function([a], GpuFromHost(test_ctx_name)(a))
fv = f(av) fv = f(av)
assert GpuArrayType.values_eq(fv, gv) assert GpuArrayType.values_eq(fv, gv)
...@@ -233,14 +204,14 @@ def test_transfer_strided(): ...@@ -233,14 +204,14 @@ def test_transfer_strided():
def gpu_alloc_expected(x, *shp): def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype) g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
g[:] = x g[:] = x
return g return g
GpuAllocTester = makeTester( GpuAllocTester = makeTester(
name="GpuAllocTester", name="GpuAllocTester",
op=alloc, op=alloc,
gpu_op=gpu_alloc, gpu_op=GpuAlloc(test_ctx_name),
cases=dict( cases=dict(
correct01=(rand(), numpy.int32(7)), correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU # just gives a DeepCopyOp with possibly wrong results on the CPU
...@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc): ...@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
dtype = "float32" dtype = "float32"
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(gpuarray_shared_constructor) shared = staticmethod(gpuarray_shared_constructor)
allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()] allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), T.Alloc()]
def test_alloc_empty(): def test_alloc_empty():
for dt in ['float32', 'int8']: for dt in ['float32', 'int8']:
f = theano.function([], GpuAllocEmpty(dt)(2, 3)) f = theano.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
assert len(f.maker.fgraph.apply_nodes) == 1 assert len(f.maker.fgraph.apply_nodes) == 1
out = f() out = f()
assert out.shape == (2, 3) assert out.shape == (2, 3)
assert out.dtype == dt assert out.dtype == dt
f = theano.function([], [GpuAllocEmpty('uint64')(3, 2), f = theano.function([], [GpuAllocEmpty('uint64', test_ctx_name)(3, 2),
GpuAllocEmpty('uint64')(3, 2)]) GpuAllocEmpty('uint64', test_ctx_name)(3, 2)])
out = f() out = f()
assert out[0].shape == (3, 2) assert out[0].shape == (3, 2)
assert out[0].dtype == 'uint64' assert out[0].dtype == 'uint64'
...@@ -284,7 +255,7 @@ def test_alloc_empty(): ...@@ -284,7 +255,7 @@ def test_alloc_empty():
def test_shape(): def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])() x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32') v = gpuarray.zeros((3, 4, 5), dtype='float32', context=get_context(test_ctx_name))
f = theano.function([x], x.shape) f = theano.function([x], x.shape)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert numpy.all(f(v) == (3, 4, 5)) assert numpy.all(f(v) == (3, 4, 5))
...@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i(): ...@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))() ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32') av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
cv = gpuarray.asarray(numpy.random.rand(5, 4), cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32') dtype='float32',
context=get_context(test_ctx_name))
f = theano.function([a], gpu_from_host(a), mode=m) f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
assert gpu_from_host in [x.op assert any(isinstance(x.op, GpuFromHost)
for x in f.maker.fgraph.toposort()] for x in f.maker.fgraph.toposort())
f = theano.function([a], gpu_from_host(a).shape, mode=m) f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert isinstance(topo[0].op, T.opt.Shape_i) assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op, T.opt.Shape_i) assert isinstance(topo[1].op, T.opt.Shape_i)
......
...@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22 ...@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.tests.test_blas import TestGer, BaseGemv from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
from .test_basic_ops import (makeTester, rand, from .config import mode_with_gpu
mode_with_gpu) from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_inplace,
...@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin): ...@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
self.ops = [gpuger_no_inplace, gpuger_inplace] self.ops = [gpuger_no_inplace, gpuger_inplace]
def clone(self, op): def clone(self, op):
return GpuGer(destructive=op.destructive) return GpuGer(inplace=op.inplace)
GpuDot22Tester = makeTester( GpuDot22Tester = makeTester(
......
...@@ -14,8 +14,8 @@ from theano import tensor ...@@ -14,8 +14,8 @@ from theano import tensor
from theano.tests.unittest_tools import seed_rng from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import mode_with_gpu from .config import mode_with_gpu, test_ctx_name
from ..type import GpuArrayType from ..type import GpuArrayType, get_context
from ..conv import GpuConv from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn from theano.sandbox.gpuarray import dnn
...@@ -28,7 +28,7 @@ try: ...@@ -28,7 +28,7 @@ try:
except ImportError: except ImportError:
pass pass
gftensor4 = GpuArrayType('float32', [False] * 4) gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
...@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1 numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange( npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1) numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = pygpu.array(npy_img) img = pygpu.array(npy_img, context=get_context(test_ctx_name))
kern = pygpu.array(npy_kern) kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))
# we take the stride after the transfert as we make c_contiguous # we take the stride after the transfert as we make c_contiguous
# data on the GPU. # data on the GPU.
......
...@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad ...@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
from .test_basic_ops import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet from . import test_nnet
def test_dnn_conv_desc_merge(): def test_dnn_conv_desc_merge():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
kern_shp = T.as_tensor_variable( kern_shp = T.as_tensor_variable(
numpy.asarray([3, 1, 2, 2]).astype('int64')) numpy.asarray([3, 1, 2, 2]).astype('int64'))
...@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge(): ...@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():
def test_dnn_conv_merge(): def test_dnn_conv_merge():
# This test that we merge correctly multiple dnn_conv. # This test that we merge correctly multiple dnn_conv.
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8] img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6] kern_shp = [3, 5, 5, 6]
...@@ -80,7 +80,7 @@ def test_dnn_conv_inplace(): ...@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
GpuAllocEmpty get merged together. GpuAllocEmpty get merged together.
""" """
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8] img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6] kern_shp = [3, 5, 5, 6]
...@@ -105,7 +105,7 @@ def test_dnn_conv_inplace(): ...@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2 assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad w op # Test grad w op
out = GpuAllocEmpty(kern.dtype)(*kern.shape) out = GpuAllocEmpty(kern.dtype, test_ctx_name)(*kern.shape)
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1) o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2) o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu) f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
...@@ -116,7 +116,7 @@ def test_dnn_conv_inplace(): ...@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2 assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad i op # Test grad i op
out = GpuAllocEmpty(img.dtype)(*img.shape) out = GpuAllocEmpty(img.dtype, test_ctx_name)(*img.shape)
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1) o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2) o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu) f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
...@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None, ...@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
def test_pooling(): def test_pooling():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4() x = T.ftensor4()
...@@ -269,7 +269,7 @@ def test_pooling(): ...@@ -269,7 +269,7 @@ def test_pooling():
def test_pooling_opt(): def test_pooling_opt():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
x = T.fmatrix() x = T.fmatrix()
...@@ -318,7 +318,7 @@ def test_dnn_tag(): ...@@ -318,7 +318,7 @@ def test_dnn_tag():
max_pool_2d(x, ds=(2, 2), ignore_border=True), max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn")) mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError): except (AssertionError, RuntimeError):
assert not dnn.dnn_available() assert not dnn.dnn_available(test_ctx_name)
raised = True raised = True
finally: finally:
theano.config.on_opt_error = old theano.config.on_opt_error = old
...@@ -327,7 +327,7 @@ def test_dnn_tag(): ...@@ -327,7 +327,7 @@ def test_dnn_tag():
logging.getLogger('theano').addHandler(theano.logging_default_handler) logging.getLogger('theano').addHandler(theano.logging_default_handler)
if not raised: if not raised:
assert dnn.dnn_available() assert dnn.dnn_available(test_ctx_name)
assert any([isinstance(n.op, dnn.GpuDnnPool) assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()]) for n in f.maker.fgraph.toposort()])
...@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
self.mode = mode_with_gpu self.mode = mode_with_gpu
def test_softmax(self): def test_softmax(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
t = T.ftensor4('t') t = T.ftensor4('t')
rand_tensor = numpy.asarray( rand_tensor = numpy.asarray(
...@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
def test_conv(self): def test_conv(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
...@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
def test_conv_gradw(self): def test_conv_gradw(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
...@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
def test_conv_gradi(self): def test_conv_gradi(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img') img = T.ftensor4('img')
kerns = T.ftensor4('kerns') kerns = T.ftensor4('kerns')
...@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
def test_pool(self): def test_pool(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img') img = T.ftensor4('img')
img_val = numpy.asarray( img_val = numpy.asarray(
...@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
) )
def test_pool_grad(self): def test_pool_grad(self):
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img') img = T.ftensor4('img')
img_grad = T.ftensor4('img_grad') img_grad = T.ftensor4('img_grad')
...@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester): ...@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
# this has been a problem in the past # this has been a problem in the past
def test_dnn_conv_border_mode(): def test_dnn_conv_border_mode():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4() img = T.ftensor4()
kern = T.ftensor4() kern = T.ftensor4()
...@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode(): ...@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():
def test_dnn_conv_alpha_output_merge(): def test_dnn_conv_alpha_output_merge():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4() img = T.ftensor4()
kern = T.ftensor4() kern = T.ftensor4()
...@@ -678,7 +678,7 @@ def test_dnn_conv_grad(): ...@@ -678,7 +678,7 @@ def test_dnn_conv_grad():
def test_version(): def test_version():
if not dnn.dnn_available(): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), int) assert isinstance(dnn.version(), int)
......
...@@ -4,19 +4,19 @@ import theano ...@@ -4,19 +4,19 @@ import theano
from theano import scalar, gof from theano import scalar, gof
from theano.tests.unittest_tools import SkipTest, assert_allclose from theano.tests.unittest_tools import SkipTest, assert_allclose
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle, from theano.tensor.tests import test_elemwise
test_CAReduce, T_reduce_dtype)
from .test_basic_ops import mode_with_gpu, rand_gpuarray from .config import mode_with_gpu, test_ctx_name
from .test_basic_ops import rand_gpuarray
from ..elemwise import (GpuElemwise, GpuDimShuffle, from ..elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduceCuda, GpuCAReduceCPY) GpuCAReduceCuda, GpuCAReduceCPY)
from ..type import GpuArrayType from ..type import GpuArrayType, get_context
from pygpu import ndgpuarray as gpuarray from pygpu import ndgpuarray as gpuarray
# This is acutally a test for GpuElemwise # This is acutally a test for GpuElemwise
class test_gpu_Broadcast(test_Broadcast): class test_gpu_Broadcast(test_elemwise.test_Broadcast):
op = GpuElemwise op = GpuElemwise
type = GpuArrayType type = GpuArrayType
cop = GpuElemwise cop = GpuElemwise
...@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
linkers = [gof.PerformLinker, gof.CLinker] linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self): def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
self.linkers = [gof.PerformLinker] self.linkers = [gof.PerformLinker]
def rand_val(self, shp): def rand_val(self, shp):
...@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
return rand_gpuarray(*shp, **dict(cls=gpuarray)) return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self): def test_c(self):
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests") raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c() super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self): def test_c_inplace(self):
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests") raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace() super(test_gpu_Broadcast, self).test_c_inplace()
...@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast): ...@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
def test_elemwise_pow(): def test_elemwise_pow():
# Test that GpuElemwise(pow) can compile with any combination of integer # Test that GpuElemwise(pow) can compile with any combination of integer
# or float input dtype. # or float input dtype.
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests") raise SkipTest("Cuda specific tests")
dtypes = ["uint8", "uint16", "uint32", "uint64", dtypes = ["uint8", "uint16", "uint32", "uint64",
...@@ -77,11 +73,11 @@ def test_elemwise_pow(): ...@@ -77,11 +73,11 @@ def test_elemwise_pow():
assert_allclose(out, expected_out) assert_allclose(out, expected_out)
class test_GpuDimShuffle(test_DimShuffle): class test_GpuDimShuffle(test_elemwise.test_DimShuffle):
op = GpuDimShuffle op = GpuDimShuffle
class test_GpuCAReduceCPY(test_CAReduce): class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
dtypes = ["float32"] dtypes = ["float32"]
bin_dtypes = ["uint8", "int8"] bin_dtypes = ["uint8", "int8"]
op = GpuCAReduceCPY op = GpuCAReduceCPY
...@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce): ...@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):
def test_infer_shape(self): def test_infer_shape(self):
for dtype in self.dtypes: for dtype in self.dtypes:
test_CAReduce.test_infer_shape(self, dtype) super(test_GpuCAReduceCPY, self).test_infer_shape(dtype)
class test_GpuCAReduceCuda(test_GpuCAReduceCPY): class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
...@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((5, 6), (1, )), ((5, 6), (1, )),
((5, 6), (-1, )), ((5, 6), (-1, )),
((5, 6), (-2, )), ((5, 6), (-2, )),
#((5, 6), ()), #reduce on no axis(copy) isn't implemented # ((5, 6), ()), #reduce on no axis(copy) isn't implemented
#((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented # ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented # ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
((5, 0), None), ((5, 0), None),
((5, 0), (0, )), ((5, 0), (0, )),
((5, 0), (1, )), ((5, 0), (1, )),
#((5, 0), ()), reduce on no axis isn't implemented # ((5, 0), ()), reduce on no axis isn't implemented
#((), None), reduce on no axis isn't implemented # ((), None), reduce on no axis isn't implemented
#((), ()) reduce on no axis isn't implemented # ((), ()) reduce on no axis isn't implemented
# Test all GPU cases implemented # Test all GPU cases implemented
((1, 0), (1,)), ((1, 0), (1,)),
...@@ -158,7 +154,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -158,7 +154,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((0, 0, 0, 0), [0, 1, 2, 3]), ((0, 0, 0, 0), [0, 1, 2, 3]),
((5, 4, 3, 20), [2, 3]), ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]), ((5, 4, 3, 2), [1, 2, 3]), ((5, 4, 3, 20), [2, 3]), ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]), ((5, 4, 3, 2), [1, 2, 3]),
# test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions # test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((4100, 3), [0]), ((3, 4101), [0]), # 10 ((4100, 3), [0]), ((3, 4101), [0]), # 10
((1024, 33), [0]), ((33, 1024), [0]), # 10 ((1024, 33), [0]), ((33, 1024), [0]), # 10
((1025, 33), [0]), ((33, 1025), [0]), # 10 ((1025, 33), [0]), ((33, 1025), [0]), # 10
...@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001 ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110 ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011 ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented # ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111 ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111 ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
...@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# test pattern implemented by reshape # test pattern implemented by reshape
# Skip them as this test the op directly, not the optimization with reshape # Skip them as this test the op directly, not the optimization with reshape
# ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000 # ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
# ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100 # ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
# ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010 # ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001 # ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111 # ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]), # ((5,4,3,10,11),[1,2]),
] ]
op = GpuCAReduceCuda op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul, reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum] scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr pre_scalar_op = None
def test_perform(self): def test_perform(self):
return return
...@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def setUp(self): def setUp(self):
super(test_GpuCAReduceCuda, self).setUp() super(test_GpuCAReduceCuda, self).setUp()
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests") raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype): class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce') mode = mode_with_gpu.excluding('local_cut_useless_reduce')
op = GpuCAReduceCuda op = GpuCAReduceCuda
# Currently we don't support reduction on 0 axis # Currently we don't support reduction on 0 axis
...@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype): ...@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
'float32', 'float64'] 'float32', 'float64']
def setUp(self): def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device if get_context(test_ctx_name).kind != 'cuda':
if not dev.startswith('cuda'):
raise SkipTest("Cuda specific tests") raise SkipTest("Cuda specific tests")
......
from theano.tensor.nnet.tests import test_neighbours from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed.
from .test_basic_ops import mode_with_gpu from .config import mode_with_gpu
from ..neighbours import GpuImages2Neibs from ..neighbours import GpuImages2Neibs
......
...@@ -6,7 +6,7 @@ from theano import function ...@@ -6,7 +6,7 @@ from theano import function
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor import vector, matrix, dot from theano.tensor import vector, matrix, dot
from .test_basic_ops import mode_with_gpu from .config import mode_with_gpu
from ..nerv import Gemm16, nerv from ..nerv import Gemm16, nerv
......
...@@ -7,9 +7,7 @@ import theano ...@@ -7,9 +7,7 @@ import theano
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
# We let that import do the init of the back-end if needed. from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from ..nnet import ( from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
......
...@@ -4,17 +4,16 @@ import theano ...@@ -4,17 +4,16 @@ import theano
from theano import tensor from theano import tensor
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from .. import basic_ops from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import (GpuAlloc, GpuReshape, gpu_alloc, from ..basic_ops import GpuAlloc, GpuReshape, GpuFromHost, host_from_gpu
gpu_from_host, host_from_gpu)
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from .test_basic_ops import rand_gpuarray, mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu, test_ctx_name
def test_local_assert(): def test_local_assert():
...@@ -97,7 +96,7 @@ def test_flatten(): ...@@ -97,7 +96,7 @@ def test_flatten():
def test_reduce(): def test_reduce():
dev = theano.sandbox.gpuarray.init_dev.device kind = get_context(test_ctx_name).kind
for method, param in [('sum', dict(acc_dtype='float32')), for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')), ('prod', dict(acc_dtype='float32')),
...@@ -113,7 +112,7 @@ def test_reduce(): ...@@ -113,7 +112,7 @@ def test_reduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
ops = [type(node.op) for node in topo] ops = [type(node.op) for node in topo]
if dev.startswith('opencl') and method in ["max", "min"]: if kind == 'opencl' and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops) assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else: else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
...@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0(): ...@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32') ones = numpy.ones((2,), dtype='float32')
# Test with 0 # Test with 0
a = gpu_alloc(z, i) a = GpuAlloc(test_ctx_name)(z, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 1 assert len(topo) == 1
...@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0(): ...@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
assert (numpy.asarray(f(6)) == 0).all() assert (numpy.asarray(f(6)) == 0).all()
# Test with 1 # Test with 1
a = gpu_alloc(o, i) a = GpuAlloc(test_ctx_name)(o, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 1 assert len(topo) == 1
...@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0(): ...@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
assert (numpy.asarray(f(6)) == 1).all() assert (numpy.asarray(f(6)) == 1).all()
# Test with 1, 1 # Test with 1, 1
a = gpu_alloc(ones, i) a = GpuAlloc(test_ctx_name)(ones, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 1 assert len(topo) == 1
...@@ -180,7 +179,7 @@ def test_print_op(): ...@@ -180,7 +179,7 @@ def test_print_op():
f = theano.function([b], theano.printing.Print()(b) * 2, f = theano.function([b], theano.printing.Print()(b) * 2,
mode=mode_with_gpu) mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert topo[0].op == gpu_from_host assert isinstance(topo[0].op, GpuFromHost)
assert isinstance(topo[1].op, theano.printing.Print) assert isinstance(topo[1].op, theano.printing.Print)
assert isinstance(topo[2].op, GpuElemwise) assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu assert topo[3].op == host_from_gpu
...@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op(): ...@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x*x).sum() o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu) f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
...@@ -234,7 +233,7 @@ def test_local_gpu_subtensor(): ...@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input # Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix() t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu) f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
...@@ -242,7 +241,7 @@ def test_local_gpu_subtensor(): ...@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input + input as output # Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix() t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu) f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo]) assert any([isinstance(node.op, GpuSubtensor) for node in topo])
...@@ -250,7 +249,7 @@ def test_local_gpu_subtensor(): ...@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
# Test shared forced on CPU end we do computation on the output of # Test shared forced on CPU end we do computation on the output of
# the subtensor. # the subtensor.
t = tensor._shared(numpy.zeros(20, "float32")) t = tensor._shared(numpy.zeros(20, "float32"))
f = theano.function([], t[3:4]+1, mode=mode_with_gpu) f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
...@@ -319,7 +318,7 @@ def test_local_gpu_elemwise(): ...@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[1], a_v * c_v) utt.assert_allclose(out[1], a_v * c_v)
# Test non-contiguous input # Test non-contiguous input
c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32')) c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32'))
f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]), f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
mode=mode_with_gpu) mode=mode_with_gpu)
out = f(a_v, b_v) out = f(a_v, b_v)
......
...@@ -6,10 +6,10 @@ import theano ...@@ -6,10 +6,10 @@ import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.sandbox.rng_mrg import theano.sandbox.rng_mrg
from ..basic_ops import gpu_from_host, GpuFromHost, HostFromGpu from ..basic_ops import GpuFromHost, HostFromGpu
from ..elemwise import GpuElemwise from ..elemwise import GpuElemwise
from .test_basic_ops import mode_with_gpu from .config import mode_with_gpu, test_ctx_name
class T_Scan(TestCase): class T_Scan(TestCase):
...@@ -35,7 +35,7 @@ class T_Scan(TestCase): ...@@ -35,7 +35,7 @@ class T_Scan(TestCase):
go_backwards=False, go_backwards=False,
mode=mode) mode=mode)
output = gpu_from_host(output) output = GpuFromHost(test_ctx_name)(output)
f2 = theano.function([u, x0, W_in, W], f2 = theano.function([u, x0, W_in, W],
output, output,
updates=updates, updates=updates,
...@@ -216,7 +216,7 @@ class T_Scan(TestCase): ...@@ -216,7 +216,7 @@ class T_Scan(TestCase):
dtype='float32') dtype='float32')
vsample = theano.shared(v_vsample) vsample = theano.shared(v_vsample)
trng = theano.sandbox.rng_mrg.MRG_RandomStreams( trng = theano.sandbox.rng_mrg.MRG_RandomStreams(
utt.fetch_seed()) utt.fetch_seed())
def f(vsample_tm1): def f(vsample_tm1):
return trng.binomial(vsample_tm1.shape, n=1, p=0.3, return trng.binomial(vsample_tm1.shape, n=1, p=0.3,
...@@ -238,4 +238,4 @@ class T_Scan(TestCase): ...@@ -238,4 +238,4 @@ class T_Scan(TestCase):
# I leave this to tested by debugmode, this test was anyway # I leave this to tested by debugmode, this test was anyway
# more of does the graph compile kind of test # more of does the graph compile kind of test
t_result = my_f() my_f()
...@@ -11,8 +11,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -11,8 +11,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
from .test_basic_ops import mode_with_gpu from .config import mode_with_gpu
class G_subtensor(test_subtensor.T_subtensor): class G_subtensor(test_subtensor.T_subtensor):
...@@ -46,8 +45,8 @@ def test_advinc_subtensor1(): ...@@ -46,8 +45,8 @@ def test_advinc_subtensor1():
yval[:] = 10 yval[:] = 10
x = shared(xval, name='x') x = shared(xval, name='x')
y = tensor.tensor(dtype='float32', y = tensor.tensor(dtype='float32',
broadcastable=(False,) * len(shp), broadcastable=(False,) * len(shp),
name='y') name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2]) expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu) f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1) assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1)
......
...@@ -14,14 +14,80 @@ try: ...@@ -14,14 +14,80 @@ try:
except ImportError: except ImportError:
pass pass
# Registry mapping user-visible names to GpuContext instances.  Names are
# usually strings (taken from the ``contexts`` config variable), but any
# hashable object is accepted.
_context_reg = {}


def reg_context(name, ctx):
    """
    Register a context by mapping it to a name.

    The context must be of type `GpuContext` and the name can be
    anything hashable (but is usually a string).  Only one context can
    be registered per name and a second registration for a given name
    will raise an error.

    Parameters
    ----------
    name : hashable object
        Name to associate the context with (usually a string).
    ctx : GpuContext
        Context instance.

    Raises
    ------
    ValueError
        If `name` is already registered.
    TypeError
        If `ctx` is not a GpuContext instance.
    """
    # Check the name first so a duplicate registration is always reported
    # as such, even when the new context object is itself invalid.
    if name in _context_reg:
        raise ValueError("context name %s is already defined" % (name,))
    if not isinstance(ctx, gpuarray.GpuContext):
        raise TypeError("context is not GpuContext")
    _context_reg[name] = ctx


def get_context(name):
    """
    Retrieve the context associated with a name.

    Return the context object mapped to `name` that was previously
    registered through :func:`reg_context`.  Trying to get the context
    for an unregistered `name` raises ValueError.

    Parameters
    ----------
    name : hashable object
        Name associated with the context we want (usually a string).
    """
    if name not in _context_reg:
        raise ValueError("context name %s not defined" % (name,))
    return _context_reg[name]


def list_contexts():
    """
    Return an iterable of all the registered context names.
    """
    return _context_reg.keys()


# Private method
def _name_for_ctx(ctx):
    """Return the name `ctx` was registered under (reverse lookup)."""
    # BUG FIX: iterating a dict yields only its keys, so the original
    # ``for k, v in _context_reg:`` tried to unpack each *key* into
    # (k, v).  Use .items() to get the (name, context) pairs.
    for k, v in _context_reg.items():
        if v == ctx:
            return k
    raise ValueError('context is not registered')


# This is a private method for use by the tests only
def _unreg_context(name):
    del _context_reg[name]
class GpuArrayType(Type): class GpuArrayType(Type):
def __init__(self, dtype, broadcastable, name=None): def __init__(self, dtype, broadcastable, context_name=None, name=None):
# In case this was not provided and no global value is available # In case this was not provided and no global value is available
self.dtype = str(dtype) self.dtype = str(dtype)
self.broadcastable = tuple(bool(b) for b in broadcastable) self.broadcastable = tuple(bool(b) for b in broadcastable)
self.ndim = len(self.broadcastable) self.ndim = len(self.broadcastable)
self.name = name self.name = name
self.context_name = context_name
try: try:
self.typecode = gpuarray.dtype_to_typecode(self.dtype) self.typecode = gpuarray.dtype_to_typecode(self.dtype)
except gpuarray.GpuArrayException: except gpuarray.GpuArrayException:
...@@ -34,10 +100,16 @@ class GpuArrayType(Type): ...@@ -34,10 +100,16 @@ class GpuArrayType(Type):
if broadcastable is None: if broadcastable is None:
broadcastable = self.broadcastable broadcastable = self.broadcastable
return self.__class__(dtype=dtype, broadcastable=broadcastable, return self.__class__(dtype=dtype, broadcastable=broadcastable,
name=self.name) context_name=self.context_name, name=self.name)
# This is a property to keep the type pickleable
@property
def context(self):
return get_context(self.context_name)
def __repr__(self): def __repr__(self):
return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable) return "GpuArrayType<%s>(%s, %s)" % (self.context_name, self.dtype,
self.broadcastable)
def filter(self, data, strict=False, allow_downcast=None): def filter(self, data, strict=False, allow_downcast=None):
if (isinstance(data, gpuarray.GpuArray) and if (isinstance(data, gpuarray.GpuArray) and
...@@ -54,25 +126,28 @@ class GpuArrayType(Type): ...@@ -54,25 +126,28 @@ class GpuArrayType(Type):
"got %d (dtype %s)." % "got %d (dtype %s)." %
(self, self.typecode, self.dtype, (self, self.typecode, self.dtype,
data.typecode, str(data.dtype))) data.typecode, str(data.dtype)))
if self.context != data.context:
raise TypeError("data context does not match type context")
# fallthrough to ndim check # fallthrough to ndim check
elif (allow_downcast or elif (allow_downcast or
(allow_downcast is None and (allow_downcast is None and
type(data) == float and type(data) == float and
self.dtype == config.floatX)): self.dtype == config.floatX)):
data = gpuarray.array(data, dtype=self.typecode, copy=False, data = gpuarray.array(data, dtype=self.typecode, copy=False,
ndmin=len(self.broadcastable)) ndmin=len(self.broadcastable),
context=self.context)
else: else:
if not hasattr(data, 'dtype'): if not hasattr(data, 'dtype'):
# This is to convert objects that don't have a dtype # This is to convert objects that don't have a dtype
# (like lists). We anticipate that the type below # (like lists). We anticipate that the type below
# will match and we pass copy=False so it won't make a # will match and we pass copy=False so it won't make a
# second object on the GPU. # second object on the GPU.
data = gpuarray.array(data, copy=False) data = gpuarray.array(data, copy=False, context=self.context)
up_dtype = scalar.upcast(self.dtype, data.dtype) up_dtype = scalar.upcast(self.dtype, data.dtype)
if up_dtype == self.dtype: if up_dtype == self.dtype:
data = gpuarray.array(data, dtype=self.dtype, data = gpuarray.array(data, dtype=self.dtype, copy=False,
copy=False) context=self.context)
else: else:
raise TypeError("%s cannot store a value of dtype %s " raise TypeError("%s cannot store a value of dtype %s "
"without risking loss of precision." % "without risking loss of precision." %
...@@ -90,8 +165,10 @@ class GpuArrayType(Type): ...@@ -90,8 +165,10 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.sandbox.gpuarray import GpuFromHost
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable() other = other._as_GpuArrayVariable(self.context_name)
if not isinstance(other, Variable): if not isinstance(other, Variable):
other = self.Constant(type=self, data=other) other = self.Constant(type=self, data=other)
...@@ -120,7 +197,7 @@ class GpuArrayType(Type): ...@@ -120,7 +197,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return theano.sandbox.gpuarray.basic_ops.gpu_from_host(other) return GpuFromHost(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b): def values_eq(a, b):
...@@ -189,7 +266,8 @@ class GpuArrayType(Type): ...@@ -189,7 +266,8 @@ class GpuArrayType(Type):
return pygpu.gpuarray.may_share_memory(a, b) return pygpu.gpuarray.may_share_memory(a, b)
def value_zeros(self, shape): def value_zeros(self, shape):
return pygpu.gpuarray.zeros(shape, dtype=self.typecode) return pygpu.gpuarray.zeros(shape, dtype=self.typecode,
context=self.context)
def make_variable(self, name=None): def make_variable(self, name=None):
return self.Variable(self, name=name) return self.Variable(self, name=name)
...@@ -197,19 +275,22 @@ class GpuArrayType(Type): ...@@ -197,19 +275,22 @@ class GpuArrayType(Type):
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other) and return (type(self) == type(other) and
self.typecode == other.typecode and self.typecode == other.typecode and
self.broadcastable == other.broadcastable) self.broadcastable == other.broadcastable and
self.context_name == other.context_name)
def convert_variable(self, var): def convert_variable(self, var):
vt = var.type vt = var.type
if (type(self) == type(vt) and if (type(self) == type(vt) and
self.typecode == vt.typecode and self.typecode == vt.typecode and
self.ndim == vt.ndim and self.ndim == vt.ndim and
self.context_name == vt.context_name and
all(sb == ob or ob for sb, ob in zip(self.broadcastable, all(sb == ob or ob for sb, ob in zip(self.broadcastable,
vt.broadcastable))): vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self): def __hash__(self):
return (hash(self.typecode) ^ hash(self.broadcastable)) return hash((type(self), self.typecode, self.broadcastable,
self.context_name))
def dtype_specs(self): def dtype_specs(self):
""" """
...@@ -324,8 +405,12 @@ class _operators(_tensor_py_operators): ...@@ -324,8 +405,12 @@ class _operators(_tensor_py_operators):
from .basic_ops import host_from_gpu from .basic_ops import host_from_gpu
return host_from_gpu(self) return host_from_gpu(self)
def _as_GpuArrayVariable(self): def _as_GpuArrayVariable(self, context_name):
return self if self.type.context_name == context_name:
return self
else:
from .basic_ops import GpuToGpu
return GpuToGpu(context_name)(self)
class GpuArrayVariable(_operators, Variable): class GpuArrayVariable(_operators, Variable):
...@@ -370,7 +455,8 @@ class GpuArraySharedVariable(_operators, SharedVariable): ...@@ -370,7 +455,8 @@ class GpuArraySharedVariable(_operators, SharedVariable):
def set_value(self, value, borrow=False): def set_value(self, value, borrow=False):
if isinstance(value, pygpu.gpuarray.GpuArray): if isinstance(value, pygpu.gpuarray.GpuArray):
value = pygpu.gpuarray.array(value, copy=(not borrow)) value = pygpu.gpuarray.array(value, copy=(not borrow),
context=self.type.context)
self.container.value = value self.container.value = value
def __getitem__(self, *args): def __getitem__(self, *args):
...@@ -382,7 +468,8 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable ...@@ -382,7 +468,8 @@ GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False, def gpuarray_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, allow_downcast=None, borrow=False,
broadcastable=None): broadcastable=None,
context_name=None):
""" """
SharedVariable constructor for GpuArrayType. SharedVariable constructor for GpuArrayType.
...@@ -390,10 +477,20 @@ def gpuarray_shared_constructor(value, name=None, strict=False, ...@@ -390,10 +477,20 @@ def gpuarray_shared_constructor(value, name=None, strict=False,
if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)): if not isinstance(value, (numpy.ndarray, pygpu.gpuarray.GpuArray)):
raise TypeError('ndarray or GpuArray required') raise TypeError('ndarray or GpuArray required')
try:
get_context(context_name)
except ValueError:
# Don't make this a hard error if we attempt to make a shared
# variable while there is no default context.
if context_name is None:
raise TypeError('No default context and no context specified')
raise
if broadcastable is None: if broadcastable is None:
broadcastable = (False,) * value.ndim broadcastable = (False,) * value.ndim
type = GpuArrayType(value.dtype, broadcastable) type = GpuArrayType(value.dtype, broadcastable, context_name=context_name)
deviceval = pygpu.gpuarray.array(value, copy=(not borrow)) deviceval = pygpu.gpuarray.array(value, copy=(not borrow),
context=type.context)
return GpuArraySharedVariable(type=type, value=deviceval, name=name, return GpuArraySharedVariable(type=type, value=deviceval, name=name,
strict=strict) strict=strict)
...@@ -485,3 +582,63 @@ theano.compile.register_specify_shape_c_code( ...@@ -485,3 +582,63 @@ theano.compile.register_specify_shape_c_code(
""", """,
version=1, version=1,
c_support_code_apply='#include <numpy_compat.h>') c_support_code_apply='#include <numpy_compat.h>')
class GpuContextType(Type):
    """
    Theano Type whose values are pygpu ``GpuContext`` instances.

    Used internally to pass a GPU context into C code.  It deliberately
    declares no Variable/Constant subclasses and no ``c_sync`` method, so
    it cannot be used as a normal graph input/output (see the comments
    below).
    """
    def filter(self, data, strict=False, allow_downcast=None):
        # Only genuine GpuContext objects are accepted; no conversion or
        # downcasting is ever attempted, regardless of the flags.
        if not isinstance(data, gpuarray.GpuContext):
            raise TypeError('context is not a GpuContext')
        return data

    def __eq__(self, other):
        # All instances of this type are interchangeable; equality depends
        # only on the class itself.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    @staticmethod
    def values_eq(a, b):
        return a == b

    def c_declare(self, name, sub, check_input=True):
        # C-side storage for the extracted context.
        return "PyGpuContextObject *%s;" % (name,)

    def c_init(self, name, sub):
        return "%s = NULL;" % (name,)

    def c_extract(self, name, sub, check_input=True):
        # Optionally type-check the incoming Python object, then take a
        # new reference to it as a PyGpuContextObject.
        # NOTE(review): the exact whitespace inside these C snippets was
        # lost by the diff rendering; confirm against the committed file.
        if check_input:
            res = """
        if (!PyObject_TypeCheck(py_%(name)s, &PyGpuContextType)) {
          PyErr_SetString(PyExc_TypeError, "expected a GpuContext");
          %(fail)s
        }
        """ % dict(name=name, fail=sub['fail'])
        else:
            res = ""
        return res + """
        %(name)s = (PyGpuContextObject *)py_%(name)s;
        Py_INCREF(%(name)s);
        """ % dict(name=name)

    def c_cleanup(self, name, sub):
        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % dict(name=name)

    # c_sync is intentionally not declared to prevent normal usage

    def c_init_code(self):
        # Initialize the pygpu C API before any generated code runs.
        return ['import_pygpu__gpuarray();']

    def c_headers(self):
        return ['<gpuarray_api.h>']

    def c_header_dirs(self):
        return [pygpu.get_include()]

    def c_code_cache_version(self):
        # Tie the cache version to pygpu's gpuarray API major version so
        # cached modules are invalidated when the C API changes.
        ver = pygpu.gpuarray.api_version()
        return (0, ver[0])

# Variable, Constant, ... not declared

# Singleton instance used wherever a context value is needed.
gpu_context_type = GpuContextType()
...@@ -771,6 +771,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base): ...@@ -771,6 +771,9 @@ class GPUA_mrg_uniform(GpuKernelBase, mrg_uniform_base):
# GpuArray version # GpuArray version
_f16_ok = True _f16_ok = True
def get_context(self, node):
return node.inputs[0].type.context
@classmethod @classmethod
def new(cls, rstate, ndim, dtype, size): def new(cls, rstate, ndim, dtype, size):
v_size = as_tensor_variable(size) v_size = as_tensor_variable(size)
......
...@@ -1014,9 +1014,9 @@ class ScanInplaceOptimizer(Optimizer): ...@@ -1014,9 +1014,9 @@ class ScanInplaceOptimizer(Optimizer):
""" """
def __init__(self, typeConstructor=None, gpu_flag=False, gpua_flag=False): def __init__(self, typeInfer=None, gpu_flag=False, gpua_flag=False):
Optimizer.__init__(self) Optimizer.__init__(self)
self.typeConstructor = typeConstructor self.typeInfer = typeInfer
self.gpu_flag = gpu_flag self.gpu_flag = gpu_flag
self.gpua_flag = gpua_flag self.gpua_flag = gpua_flag
...@@ -1062,10 +1062,15 @@ class ScanInplaceOptimizer(Optimizer): ...@@ -1062,10 +1062,15 @@ class ScanInplaceOptimizer(Optimizer):
ls[idx] = deep_copy_op(ls[idx]) ls[idx] = deep_copy_op(ls[idx])
inputs = ls_begin + ls + ls_end inputs = ls_begin + ls + ls_end
if self.typeInfer is None:
typeConstructor = None
else:
typeConstructor = self.typeInfer(node)
new_op = scan_op.Scan(op.inputs, new_op = scan_op.Scan(op.inputs,
op.outputs, op.outputs,
info, info,
typeConstructor=self.typeConstructor) typeConstructor=typeConstructor)
# Do not call make_node for test_value # Do not call make_node for test_value
new_outs = new_op(*inputs, **dict(return_list=True)) new_outs = new_op(*inputs, **dict(return_list=True))
...@@ -2325,7 +2330,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB() ...@@ -2325,7 +2330,7 @@ scan_eqopt2 = theano.gof.EquilibriumDB()
optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan') optdb.register('scan_eqopt1', scan_eqopt1, .1, 'fast_run', 'scan')
optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan') optdb.register('scan_eqopt2', scan_eqopt2, 1.6, 'fast_run', 'scan')
optdb.register('scanOp_make_inplace', optdb.register('scanOp_make_inplace',
ScanInplaceOptimizer(typeConstructor=None, ScanInplaceOptimizer(typeInfer=None,
gpu_flag=False), gpu_flag=False),
75, 75,
'fast_run', 'fast_run',
......
...@@ -4874,6 +4874,12 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests): ...@@ -4874,6 +4874,12 @@ class T_Scan_Gpuarray(unittest.TestCase, ScanGpuTests):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
from theano.sandbox import gpuarray from theano.sandbox import gpuarray
self.gpu_backend = gpuarray self.gpu_backend = gpuarray
# This is unfortunate, but required
def gpu_from_host(v):
return gpuarray.GpuFromHost(None)(v)
self.gpu_backend.gpu_from_host = gpu_from_host
self.mode_with_gpu = mode_with_opt.including('gpuarray', 'scan') self.mode_with_gpu = mode_with_opt.including('gpuarray', 'scan')
self.mode_with_gpu_nodebug = mode_nodebug.including('gpuarray', 'scan') self.mode_with_gpu_nodebug = mode_nodebug.including('gpuarray', 'scan')
super(T_Scan_Gpuarray, self).__init__(*args, **kwargs) super(T_Scan_Gpuarray, self).__init__(*args, **kwargs)
......
...@@ -158,10 +158,6 @@ whitelist_flake8 = [ ...@@ -158,10 +158,6 @@ whitelist_flake8 = [
"sandbox/linalg/__init__.py", "sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_elemwise.py",
"scan_module/scan_utils.py", "scan_module/scan_utils.py",
"scan_module/scan_views.py", "scan_module/scan_views.py",
"scan_module/scan.py", "scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论