Commit 4814cd99 authored by Pascal Lamblin

Merge pull request #3482 from abergeron/multi_gpu_new2

Multi-gpu support
......@@ -112,7 +112,8 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
if (config.device.startswith('cuda') or
config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
config.init_gpu_device.startswith('opencl') or
config.contexts != ''):
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
......
......@@ -111,6 +111,29 @@ AddConfigVar(
BoolParam(False, allow_override=False),
in_c_key=False)
class ContextsParam(ConfigParam):
    """Config parameter holding a multi-gpu context map.

    The value is either the empty string (no extra contexts) or a
    semicolon-separated list of ``name->device`` entries.
    """

    def __init__(self):
        def validate(value):
            # Empty string is the default: no context map configured.
            if value == '':
                return value
            # Every ';'-separated entry must contain exactly one '->'.
            for entry in value.split(';'):
                parts = entry.split('->')
                if len(parts) != 2:
                    raise ValueError("Malformed context map: %s" % (entry,))
            return value

        ConfigParam.__init__(self, '', validate, False)
# Register the 'contexts' Theano flag. Its value is validated by
# ContextsParam: either '' or a semicolon-separated 'name->device' list.
# in_c_key=False: the setting does not affect generated C code.
AddConfigVar(
'contexts',
"""
Context map for multi-gpu operation. Format is a
semicolon-separated list of names and device names in the
'name->dev_name' format. An example that would map name 'test' to
device 'cuda0' and name 'test2' to device 'opencl0:0' follows:
"test->cuda0;test2->opencl0:0".
""", ContextsParam(), in_c_key=False)
AddConfigVar(
'print_active_device',
"Print active device at when the GPU device is initialized.",
......
#! /usr/bin/env python
"""
This file compares the runtime of two independent dot products on one
and two GPUs to measure the speedup.
This should be 2x if the GPUs are equivalent.
"""
import time
import numpy
import theano
from theano.sandbox.gpuarray import init_dev
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor as shared
from theano.sandbox.gpuarray.blas import gpu_dot22
def main(dev1, dev2):
    """Benchmark two independent dot products on one vs. two contexts.

    dev1/dev2 are device names (e.g. 'cuda0'); they are registered as
    contexts 'ctx1' and 'ctx2' respectively.
    """
    init_dev(dev1, 'ctx1')
    init_dev(dev2, 'ctx2')

    def make_value(ctx):
        # Random 1024x1024 float32 matrix stored on the given context.
        return shared(numpy.random.randn(1024, 1024).astype('float32'),
                      context_name=ctx)

    val1a = make_value('ctx1')
    val1b = make_value('ctx1')
    val1c = make_value('ctx1')
    val1d = make_value('ctx1')
    val2a = make_value('ctx2')
    val2b = make_value('ctx2')

    # f1 runs both products on ctx1; f2 spreads them over ctx1 and ctx2.
    f1 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val1c, val1d)])
    f2 = theano.function([], [gpu_dot22(val1a, val1b),
                              gpu_dot22(val2a, val2b)])

    def timed(fn):
        # One warm-up call, then a single timed call. sync() both
        # outputs so the GPU work is finished before reading the clock.
        res = fn()
        res[0].sync(), res[1].sync()
        res = None
        start = time.time()
        res = fn()
        res[0].sync(), res[1].sync()
        stop = time.time()
        res = None
        return stop - start

    print("one ctx %f" % (timed(f1),))
    print("two ctx %f" % (timed(f2),))
if __name__ == '__main__':
    import sys
    # Expect exactly two positional arguments: the two device names.
    if len(sys.argv) != 3:
        # Fixed grammar of the user-facing message ("require" -> "requires").
        raise ValueError("This script requires two device names.")
    main(sys.argv[1], sys.argv[2])
......@@ -92,10 +92,7 @@ class HostFromGpu(GpuOp):
def R_op(self, inputs, eval_points):
    """R-op of a pure transfer: apply the op itself to the eval point.

    The hunk's leftover pre-merge branches (isinstance checks returning
    lists) were dead code; the merged implementation simply transfers
    the evaluation point the same way the op transfers its input.
    """
    ev, = eval_points
    return self(ev)
def infer_shape(self, node, xshp):
    # A transfer op never changes shape: pass the input shapes through.
    return xshp
......@@ -155,10 +152,7 @@ class GpuFromHost(GpuOp):
def R_op(self, inputs, eval_points):
    """R-op of a pure transfer: apply the op itself to the eval point.

    Bug fix: the merged line was `self(ev)` without `return`, so the
    method always returned None. It must return the transferred value,
    mirroring HostFromGpu.R_op which does `return self(ev)`.
    """
    ev, = eval_points
    return self(ev)
def infer_shape(self, node, xshp):
    # Output shape is identical to the input shape (device transfer only).
    return xshp
......
......@@ -2478,8 +2478,11 @@ def local_gpu_allocempty(node):
return False
def typeInfer(node):
return typeConstructor
optdb.register('gpu_scanOp_make_inplace',
scan_opt.ScanInplaceOptimizer(typeConstructor=typeConstructor,
scan_opt.ScanInplaceOptimizer(typeInfer=typeInfer,
gpu_flag=True),
75,
'gpu',
......
......@@ -21,26 +21,30 @@ except ImportError:
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context)
from . import opt, nerv
def init_dev(dev):
def init_dev(dev, name=None):
if pygpu.gpuarray.api_version() != (-10000, 0):
raise RuntimeError("Wrong API version for gpuarray:",
pygpu.gpuarray.api_version(),
"Make sure Theano and libgpuarray/pygpu "
"are in sync.")
global pygpu_activated
context = pygpu.init(dev)
pygpu.set_default_context(context)
if dev not in init_dev.devmap:
init_dev.devmap[dev] = pygpu.init(dev)
context = init_dev.devmap[dev]
# This will map the context name to the real context object.
reg_context(name, context)
pygpu_activated = True
if config.print_active_device:
print("Using device %s: %s" % (dev, context.devname), file=sys.stderr)
# remember the active device
init_dev.device = dev
print("Mapped name %s to device %s: %s" % (name, dev, context.devname),
file=sys.stderr)
init_dev.device = None
# This maps things like 'cuda0' to the context object on that device.
init_dev.devmap = {}
if pygpu:
try:
......@@ -52,11 +56,21 @@ if pygpu:
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
if config.device != 'cpu':
raise ValueError('you must set device=cpu to use init_gpu_device.')
if config.contexts != '':
print("Using contexts will make init_gpu_device act like device and move all computations by default, which might not be what you want.")
init_dev(config.init_gpu_device)
if config.contexts != '':
for n, d in (c.split('->') for c in config.contexts.split(';')):
init_dev(d.strip(), n.strip())
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
from .basic_ops import host_from_gpu, gpu_from_host
from .basic_ops import host_from_gpu, GpuFromHost
from .elemwise import GpuElemwise
from .subtensor import (GpuSubtensor, GpuIncSubtensor,
GpuAdvancedIncSubtensor1)
......@@ -67,5 +81,6 @@ else:
if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
config.device.startswith('cuda') or
config.contexts != ''):
error("pygpu was configured but could not be imported", exc_info=True)
import os.path
from theano import Apply, config
from theano import Apply, config, Op
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.gof import LocalOptGroup
from theano.tensor.basic import as_tensor_variable
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable, GpuAllocEmpty
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt_util import inplace_allocempty
try:
import pygpu
......@@ -18,7 +19,7 @@ except ImportError as e:
pass
class BlasOp(HideC):
class BlasOp(Op):
def c_headers(self):
return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']
......@@ -28,34 +29,27 @@ class BlasOp(HideC):
def c_init_code(self):
return ['import_pygpu__blas();']
def c_support_code(self):
return """
PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
PyGpuArrayObject *y) {
if (out &&
GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
theano_size_check(out, PyGpuArray_NDIM(y),
PyGpuArray_DIMS(y),
y->ga.typecode)) {
if (pygpu_move(out, y)) {
Py_XDECREF(out);
return NULL;
}
} else {
Py_XDECREF(out);
out = pygpu_copy(y, GA_ANY_ORDER);
}
return out;
}
"""
class GpuGemv(BlasOp):
__props__ = ('inplace',)
def __init__(self, inplace=False):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta):
Gemv.make_node(self, y, alpha, A, x, beta)
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
ctx_name = infer_context_name(y, A, x)
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()])
......@@ -73,7 +67,7 @@ class GpuGemv(BlasOp, Gemv):
if self.inplace:
code = """
if (%(y)s->ga.strides[0] <= 0) {
%(out)s = gpublas_try_copy(%(out)s, %(y)s);
%(out)s = theano_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -85,7 +79,7 @@ class GpuGemv(BlasOp, Gemv):
""" % vars
else:
code = """
%(out)s = gpublas_try_copy(%(out)s, %(y)s);
%(out)s = theano_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -106,21 +100,33 @@ class GpuGemv(BlasOp, Gemv):
return code
def c_code_cache_version(self):
return (3,)
return (4,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm):
class GpuGemm(BlasOp):
__props__ = ('inplace',)
_f16_ok = True
def __init__(self, inplace=False):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def make_node(self, C, alpha, A, B, beta):
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = as_tensor_variable(alpha)
beta = as_tensor_variable(beta)
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
assert alpha.ndim == 0
assert beta.ndim == 0
assert A.ndim == 2
assert B.ndim == 2
assert C.ndim == 2
assert A.dtype == B.dtype == C.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()])
......@@ -138,7 +144,7 @@ class GpuGemm(BlasOp, Gemm):
if self.inplace:
code = """
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = gpublas_try_copy(%(out)s, %(C)s);
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -150,7 +156,7 @@ class GpuGemm(BlasOp, Gemm):
""" % vars
else:
code = """
%(out)s = gpublas_try_copy(%(out)s, %(C)s);
%(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -171,25 +177,36 @@ class GpuGemm(BlasOp, Gemm):
return code
def c_code_cache_version(self):
return (4,)
return (5,)
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
class GpuGer(BlasOp, Ger):
class GpuGer(BlasOp):
__props__ = ('inplace',)
def __init__(self, inplace=False):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0]}
def make_node(self, A, alpha, x, y):
Ger.make_node(self, A, alpha, x, y)
A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
ctx_name = infer_context_name(A, x, y)
A = as_gpuarray_variable(A, ctx_name)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
alpha = as_tensor_variable(alpha)
assert alpha.ndim == 0
assert A.ndim == 2
assert x.ndim == 1
assert y.ndim == 1
assert A.dtype == x.dtype == y.dtype
return Apply(self, [A, alpha, x, y], [A.type()])
def perform(self, node, inp, out):
A, alpha, x, y = inp
inplace = self.destructive
inplace = self.inplace
if inplace and not A.flags.forc:
inplace = False
out[0][0] = blas.ger(alpha, x, y, A,
......@@ -198,10 +215,10 @@ class GpuGer(BlasOp, Ger):
def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], A=inp[0], alpha=inp[1], x=inp[2], y=inp[3],
fail=sub['fail'], name=name)
if self.destructive:
if self.inplace:
code = """
if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
%(out)s = gpublas_try_copy(%(out)s, %(A)s);
%(out)s = theano_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -213,7 +230,7 @@ class GpuGer(BlasOp, Ger):
""" % vars
else:
code = """
%(out)s = gpublas_try_copy(%(out)s, %(A)s);
%(out)s = theano_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -231,18 +248,22 @@ class GpuGer(BlasOp, Ger):
return code
def c_code_cache_version(self):
return (2,)
return (3,)
gpuger_no_inplace = GpuGer(destructive=False)
gpuger_inplace = GpuGer(destructive=True)
gpuger_no_inplace = GpuGer(inplace=False)
gpuger_inplace = GpuGer(inplace=True)
class GpuDot22(BlasOp, Dot22):
class GpuDot22(BlasOp):
__props__ = ()
def make_node(self, x, y):
Dot22.make_node(self, x, y)
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
ctx_name = infer_context_name(x, y)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
assert x.ndim == 2
assert y.ndim == 2
assert x.dtype == y.dtype
return Apply(self, [x, y], [x.type()])
......@@ -268,7 +289,7 @@ class GpuDot22(BlasOp, Dot22):
dims[1] = PyGpuArray_DIMS(%(B)s)[1];
if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
pygpu_default_context())) {
%(A)s->context)) {
%(fail)s
}
......@@ -287,32 +308,24 @@ class GpuDot22(BlasOp, Dot22):
return code
def c_code_cache_version(self):
return (3,)
return (4,)
gpu_dot22 = GpuDot22()
@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace:
return [gpugemv_inplace(*node.inputs)]
@inplace_allocempty(GpuGemv, 0)
def local_inplace_gpuagemv(node, inputs):
return [gpugemv_inplace(*inputs)]
@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
if node.op == gpugemm_no_inplace:
inputs = list(node.inputs)
C = inputs[0]
if (C.owner and isinstance(C.owner.op, GpuAllocEmpty) and
len(C.clients) > 1):
inputs[0] = C.owner.op(*C.owner.inputs)
@inplace_allocempty(GpuGemm, 0)
def local_inplace_gpuagemm(node, inputs):
return [gpugemm_inplace(*inputs)]
@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
if node.op == gpuger_no_inplace:
return [gpuger_inplace(*node.inputs)]
@inplace_allocempty(GpuGer, 0)
def local_inplace_gpuager(node, inputs):
return [gpuger_inplace(*inputs)]
gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
local_inplace_gpuagemm,
......
import copy
import os
import theano
from theano import config, gof
from theano import gof
try:
from pygpu import gpuarray
......@@ -10,7 +9,8 @@ except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from theano.gof import utils
......@@ -58,6 +58,9 @@ class GpuConv(GpuKernelBase, gof.Op):
them.
"""
__props__ = ('border_mode', 'subsample', 'logical_img_hw',
'logical_kern_hw', 'logical_kern_align_top', 'version',
'verbose', 'kshp', 'imshp', 'max_threads_dim0')
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
......@@ -67,20 +70,13 @@ class GpuConv(GpuKernelBase, gof.Op):
return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
raise ValueError(mode)
def __init__(self, border_mode,
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
def __init__(self, border_mode, subsample=(1, 1),
logical_img_hw=None, logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
version=-1, direction_hint=None,
verbose=0, kshp=None, imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
nkern=None, bsize=None, fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
......@@ -108,19 +104,6 @@ class GpuConv(GpuKernelBase, gof.Op):
self.bsize = bsize
self.fft_opt = fft_opt
def __eq__(self, other):
return type(self) == type(other) \
and self.border_mode == other.border_mode \
and self.subsample == other.subsample \
and self.logical_img_hw == other.logical_img_hw \
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top \
and self.version == other.version \
and self.verbose == other.verbose \
and self.kshp == other.kshp\
and self.imshp == other.imshp\
and self.max_threads_dim0 == other.max_threads_dim0
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "imshp"):
......@@ -136,32 +119,6 @@ class GpuConv(GpuKernelBase, gof.Op):
if not hasattr(self, "fft_opt"):
self.fft_opt = True
def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and
# hash(-2)==-2 in python!
return hash(type(self)) \
^ hash(self.border_mode) \
^ hash(self.subsample) \
^ hash(self.logical_img_hw) \
^ hash(self.logical_kern_hw) \
^ hash(self.logical_kern_align_top) \
^ self.version \
^ hash(self.verbose) \
^ hash(self.kshp)\
^ hash(self.imshp)\
^ hash(self.max_threads_dim0)
def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode,
str(self.subsample),
str(self.logical_img_hw),
str(self.logical_kern_hw),
str(self.logical_kern_align_top),
str(self.imshp),
str(self.kshp))
def make_node(self, img, kern):
if img.dtype != "float32" or kern.dtype != "float32":
raise NotImplementedError("GpuConv currently only work"
......@@ -170,13 +127,17 @@ class GpuConv(GpuKernelBase, gof.Op):
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern)
ctx_name = infer_context_name(img, kern)
img = as_gpuarray_variable(img, ctx_name)
kern = as_gpuarray_variable(kern, ctx_name)
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
out = GpuArrayType(img.dtype, broadcastable)()
out = GpuArrayType(img.dtype, broadcastable, context_name=ctx_name)()
return gof.Apply(self, [img, kern], [out])
def get_context(self, node):
return node.inputs[0].type.context
def flops(self, inputs, outputs):
"""
Useful with the hack in profilemode to print the MFlops.
......@@ -202,22 +163,8 @@ class GpuConv(GpuKernelBase, gof.Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
node_ = copy.copy(node)
assert node.op is node_.op
if config.gpuarray.sync:
raise NotImplementedError("GpuConv do not implement gpuarray.sync Theano flag")
if node_.op.max_threads_dim0 is None:
cuda = theano.sandbox.cuda
device_id = cuda.use.device_number
if device_id is None:
cuda.use("gpu",
force=False,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False,
test_driver=True)
device_id = cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
prop = cuda_ndarray.device_properties(device_id)
node_.op.max_threads_dim0 = prop['maxThreadsDim0']
node_.op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
return super(GpuConv, node_.op).make_thunk(node_, storage_map,
compute_map, no_recycling)
......@@ -232,9 +179,11 @@ class GpuConv(GpuKernelBase, gof.Op):
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 22)
return (0, 23)
def c_code(self, node, nodename, inp, out_, sub):
if node.inputs[0].type.context.kind != "cuda":
raise NotImplementedError("GpuConv only works for cuda devices")
img, kern = inp
out, = out_
dx = self.subsample[0]
......@@ -302,7 +251,6 @@ class GpuConv(GpuKernelBase, gof.Op):
""" % locals()
code += "\n".join([open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in ["conv_kernel.cu", "conv_full_kernel.cu"]])
kname = "conv_full_load_everything"
gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
bin = gk._binary
bcode = ','.join(hex(ord(c)) for c in bin)
......@@ -313,9 +261,12 @@ class GpuConv(GpuKernelBase, gof.Op):
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
for k in kernels:
mod += "static GpuKernel " + k.name + '_' + name + ";\n"
mod += open(os.path.join(os.path.split(__file__)[0], "conv.cu")).read()
return mod
def c_support_code_struct(self, node, name):
mod = GpuKernelBase.c_support_code_struct(self, node, name)
with open(os.path.join(os.path.split(__file__)[0], "conv.cu")) as f:
mod += f.read()
return mod
@utils.memoize
......
......@@ -46,7 +46,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64)
......@@ -75,7 +75,7 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
if (thread_id < nb_thread)
{
const float * my_src_ptr = (const float *)(
((uintptr_t)src) & COALESCED_ALIGN);
((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id;
while (my_src_ptr < src + N)
{
......
......@@ -107,14 +107,14 @@ cudnnHandle_t APPLY_SPECIFIC(_handle);
#section init_code_struct
{
cuda_enter(pygpu_default_context()->ctx);
cuda_enter(CONTEXT->ctx);
cudnnStatus_t err;
APPLY_SPECIFIC(_handle) = NULL;
if ((err = cudnnCreate(&APPLY_SPECIFIC(_handle))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(pygpu_default_context()->ctx);
cuda_exit(CONTEXT->ctx);
FAIL;
}
cuda_exit(pygpu_default_context()->ctx);
cuda_exit(CONTEXT->ctx);
}
......@@ -5,12 +5,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArrayObject *om,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta,
PyGpuArrayObject **output) {
PyGpuArrayObject **output,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......
......@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **input) {
double alpha, double beta, PyGpuArrayObject **input,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......
......@@ -4,12 +4,12 @@ int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyGpuArrayObject *km,
cudnnConvolutionDescriptor_t desc,
double alpha, double beta, PyGpuArrayObject **kerns) {
double alpha, double beta, PyGpuArrayObject **kerns,
PyGpuContextObject *c) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
float af = alpha, bf = beta;
void *alpha_p;
void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError,
......
......@@ -29,10 +29,10 @@ if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFI
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **out) {
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
size_t dims[5];
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......
......@@ -53,9 +53,9 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
PyGpuArrayObject *out,
PyGpuArrayObject *out_grad,
cudnnPoolingDescriptor_t desc,
PyGpuArrayObject **inp_grad) {
PyGpuArrayObject **inp_grad,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
......@@ -81,7 +81,7 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
PyGpuArray_DIMS(inp), inp->ga.typecode,
GA_C_ORDER, pygpu_default_context()) != 0) {
GA_C_ORDER, c) != 0) {
return 1;
}
......
......@@ -34,9 +34,9 @@ if (APPLY_SPECIFIC(output) != NULL)
#section support_code_struct
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
PyGpuArrayObject **out) {
PyGpuArrayObject **out,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
return 1;
......
......@@ -45,9 +45,9 @@ if (APPLY_SPECIFIC(dx) != NULL)
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
PyGpuArrayObject *sm,
PyGpuArrayObject **dx) {
PyGpuArrayObject **dx,
PyGpuContextObject *c) {
cudnnStatus_t err;
PyGpuContextObject *c = pygpu_default_context();
if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
return 1;
......
......@@ -2,7 +2,7 @@
/* Why do we need this? */
size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, pygpu_default_context(),
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, CONTEXT,
Py_None);
if (rand_buf == NULL) {
FAIL;
......@@ -14,7 +14,8 @@ PyGpuArrayObject *rand_buf;
int gemm16(PyGpuArrayObject *C, float alpha,
PyGpuArrayObject *A, PyGpuArrayObject *B,
float beta, PyGpuArrayObject **out) {
float beta, PyGpuArrayObject **out,
PyGpuContextObject *c) {
PyGpuArrayObject *_A = NULL;
PyGpuArrayObject *_B = NULL;
GpuKernel *gk;
......
......@@ -10,7 +10,8 @@ try:
except ImportError:
pass
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
......@@ -25,7 +26,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
self.mode = mode
def make_node(self, ten4, neib_shape, neib_step):
ten4 = as_gpuarray_variable(ten4)
ten4 = as_gpuarray_variable(ten4, infer_context_name(ten4))
neib_shape = T.as_tensor_variable(neib_shape)
neib_step = T.as_tensor_variable(neib_step)
......@@ -37,7 +38,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return Apply(self, [ten4, neib_shape, neib_step],
[GpuArrayType(broadcastable=(False, False),
dtype=ten4.type.dtype)()])
dtype=ten4.type.dtype,
context_name=ten4.type.context_name)()])
def get_context(self, node):
return node.inputs[0].type.context
def c_code_cache_version(self):
return (11,)
......@@ -56,7 +61,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
kname = "k_multi_warp_less"
k_var = "k_multi_warp_less_" + nodename
code = """
//a version that use less register but don't work in all case.
// a version that uses less registers but doesn't work in all cases.
KERNEL void %(kname)s(
const int nb_batch,
const int nb_stack,
......@@ -233,6 +238,8 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
return kernels
def c_code(self, node, name, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
dtype_ten4 = node.inputs[0].dtype
dtype_neib_shape = node.inputs[1].dtype
dtype_neib_step = node.inputs[2].dtype
......@@ -243,6 +250,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
ten4, neib_shape, neib_step = inp
z, = out
fail = sub['fail']
ctx = sub['context']
mode = self.mode
err_check = """
if (err != GA_NO_ERROR) {
......@@ -369,8 +377,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dims[0] = z_dim0;
dims[1] = z_dim1;
%(z)s = pygpu_empty(2, dims, %(typecode_z)s,
GA_C_ORDER, pygpu_default_context(),
Py_None);
GA_C_ORDER, %(ctx)s, Py_None);
if (!%(z)s)
{
PyErr_SetString(PyExc_MemoryError, "GpuImages2Neibs:"
......@@ -453,7 +460,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
@op_lifter([Images2Neibs])
def use_gpu_images2neibs(node):
def use_gpu_images2neibs(node, context_name):
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(node.op.mode)
......
......@@ -8,10 +8,10 @@ from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty)
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge
from .pycuda_helper import ensure_pycuda_context
try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
......@@ -43,6 +43,7 @@ def ensure_float(val, name):
class Gemm16(COp):
__props__ = ('relu', 'inplace')
_f16_ok = True
context_type = gpu_context_type
KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
'tn_128x128', 'tn_128x64', 'tn_128x32',
......@@ -61,10 +62,11 @@ class Gemm16(COp):
def make_node(self, C, alpha, A, B, beta):
if GPUTensor is None:
raise RuntimeError("Can't use Gemm16: nervanagpu not found")
ctx_name = infer_context_name(C, A, B)
A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C)
A = as_gpuarray_variable(A, ctx_name)
B = as_gpuarray_variable(B, ctx_name)
C = as_gpuarray_variable(C, ctx_name)
alpha = ensure_float(alpha, 'alpha')
beta = ensure_float(beta, 'beta')
......@@ -73,27 +75,8 @@ class Gemm16(COp):
return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs):
ensure_pycuda_context()
C, alpha, A, B, beta = inputs
# The nervana code does not support the case where both inputs
# are trans, so we need to copy one if them if that is the
# case. We copy the smaller one.
if A.flags.f_contiguous and B.flags.f_contiguous:
if A.size < B.size:
A = A.copy()
else:
B = B.copy()
inplace = self.inplace
if inplace and not C.flags.c_contiguous:
inplace = False
if not inplace:
C = C.copy()
At = to_gputensor(A)
Bt = to_gputensor(B)
Ct = to_gputensor(C)
nerv.dot(At, Bt, Ct, alpha=alpha, beta=beta, relu=False)
outputs[0][0] = C
def get_context(self, node):
return node.inputs[0].type.context
def c_headers(self):
return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
......@@ -145,7 +128,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
codel.append("const char *bcode;")
codel.append("size_t sz;")
codel.append("PyGpuContextObject *c = pygpu_default_context();")
codel.append("PyGpuContextObject *c = %s;" % (sub['context'],))
codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
"GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
"GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
......@@ -162,7 +145,7 @@ if (GpuKernel_init(&k_%(name)s, c->ops, c->ctx, 1, &bcode, &sz,
@opt.register_opt()
@opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node):
def local_dot_to_gemm16(node, ctx_name):
if nerv is None:
return
A = node.inputs[0]
......@@ -170,7 +153,7 @@ def local_dot_to_gemm16(node):
if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph
C = GpuAllocEmpty(dtype='float16')(
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0)
......
......@@ -10,7 +10,8 @@ try:
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
......@@ -23,23 +24,26 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
"""
nin = 3
nout = 3
__props__ = ()
_f16_ok = True
def make_node(self, x, b, y_idx):
# N.B. won't work when we don't cast y_idx to float anymore
x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b)
y_idx = as_gpuarray_variable(y_idx)
ctx_name = infer_context_name(x, b, y_idx)
x = as_gpuarray_variable(x, ctx_name)
b = as_gpuarray_variable(b, ctx_name)
y_idx = as_gpuarray_variable(y_idx, ctx_name)
nll = GpuArrayType(x.type.dtype,
y_idx.type.broadcastable)()
y_idx.type.broadcastable,
context_name=ctx_name)()
sm = x.type()
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def get_context(self, node):
return node.inputs[0].type.context
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -144,6 +148,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
flags=flags, objvar=k_var)]
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError('cuda only')
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
......@@ -163,6 +169,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
dtype_am = node.outputs[2].dtype
classname = self.__class__.__name__
fail = sub['fail']
ctx = sub['context']
k_var = "k_xent_sm_1hot_bias_%(nodename)s" % locals()
err_check = """
if (err != GA_NO_ERROR) {
......@@ -214,9 +221,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_x)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_x)s, GA_C_ORDER, %(ctx)s,
Py_None);
if (!%(nll)s) {
%(fail)s
}
......@@ -229,9 +235,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode_b)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_b)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError,
......@@ -246,9 +251,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
{
Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_y_idx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_y_idx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError,
......@@ -306,18 +310,21 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
Gradient wrt x of the CrossentropySoftmax1Hot Op.
"""
nin = 3
nout = 1
__props__ = ()
_f16_ok = True
def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll)
sm = as_gpuarray_variable(sm)
y_idx = as_gpuarray_variable(y_idx)
ctx_name = infer_context_name(dnll, sm, y_idx)
dnll = as_gpuarray_variable(dnll, ctx_name)
sm = as_gpuarray_variable(sm, ctx_name)
y_idx = as_gpuarray_variable(y_idx, ctx_name)
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def get_context(self, node):
return node.inputs[0].type.context
def c_code_cache_version(self):
return (11,)
......@@ -325,6 +332,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
......@@ -338,6 +347,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
ctx = sub['context']
k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename
err_check = """
if (err != GA_NO_ERROR) {
......@@ -403,9 +413,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode_dx)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(dx)s) {
%(fail)s
}
......@@ -512,14 +521,16 @@ class GpuSoftmax(GpuKernelBase, Op):
Implement Softmax on the gpu.
"""
__props__ = ()
_f16_ok = True
def make_node(self, x):
x = as_gpuarray_variable(x)
x = as_gpuarray_variable(x, infer_context_name(x))
return Apply(self, [x], [x.type()])
def get_context(self, node):
return node.inputs[0].type.context
def infer_shape(self, node, shape):
return shape
......@@ -530,6 +541,8 @@ class GpuSoftmax(GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError("cuda only")
dtype_x = node.inputs[0].dtype
work_x = work_dtype(dtype_x)
dtype_z = node.outputs[0].dtype
......@@ -539,6 +552,7 @@ class GpuSoftmax(GpuKernelBase, Op):
x, = inp
z, = out
fail = sub['fail']
ctx = sub['context']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
......@@ -568,9 +582,8 @@ class GpuSoftmax(GpuKernelBase, Op):
{
Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(z)s) {
%(fail)s
}
......@@ -698,22 +711,25 @@ class GpuSoftmax(GpuKernelBase, Op):
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (GpuKernelBase, Op):
class GpuSoftmaxWithBias(GpuKernelBase, Op):
"""
Implement SoftmaxWithBias on the gpu.
"""
nin = 2
nout = 1
__props__ = ()
_f16_ok = True
def make_node(self, x, b):
x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b)
ctx_name = infer_context_name(x, b)
x = as_gpuarray_variable(x, ctx_name)
b = as_gpuarray_variable(b, ctx_name)
return Apply(self, [x, b], [x.type()])
def get_context(self, node):
return node.inputs[0].type.context
def infer_shape(self, node, shape):
return [shape[0]]
......@@ -724,6 +740,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
if node.inputs[0].type.context.kind != 'cuda':
raise NotImplementedError('cuda only')
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_z = node.outputs[0].dtype
......@@ -735,6 +753,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
x, b = inp
z, = out
fail = sub['fail']
ctx = sub['context']
err_check = """
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, fmt_str, msg);
......@@ -777,9 +796,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
{
Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
%(typecode)s, GA_C_ORDER,
%(ctx)s, Py_None);
if (!%(z)s) {
%(fail)s
}
......
......@@ -294,7 +294,7 @@ def inplace_allocempty(op, idx):
function can be as simple as:
def maker(node, inputs):
return node.op.__class__(inplace=True)(*inputs)
return [node.op.__class__(inplace=True)(*inputs)]
Parameters
----------
......@@ -320,7 +320,8 @@ def inplace_allocempty(op, idx):
if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype,
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs)
return opt
......
# PyCUDA is optional: remember whether a usable Context class is available.
# Versions without ``Context.attach`` are too old to share a CUDA context.
try:
    from pycuda.driver import Context
    if not hasattr(Context, 'attach'):
        raise ImportError('too old')
except ImportError:
    Context = None

# Lazily-created process-wide PyCUDA context and its creation flag.
pycuda_initialized = False
pycuda_context = None


def ensure_pycuda_context():
    """Return the shared PyCUDA context, attaching it on first call.

    Raises
    ------
    RuntimeError
        If PyCUDA is missing or too old (``Context`` is None).
    """
    global pycuda_context, pycuda_initialized
    if pycuda_initialized:
        return pycuda_context
    if Context is None:
        raise RuntimeError("PyCUDA not found or too old.")
    # Attach to the already-active CUDA context and make sure it is
    # released when the interpreter exits.
    pycuda_context = Context.attach()
    import atexit
    atexit.register(pycuda_context.detach)
    pycuda_initialized = True
    return pycuda_context
from __future__ import print_function
import copy
import os
import copy
import numpy
import theano
from theano import tensor, gof, config
from theano.gof.utils import MethodNotDefined
from theano import tensor, gof
from six.moves import StringIO
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
......@@ -19,7 +18,8 @@ except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel)
from .basic_ops import (as_gpuarray_variable, HideC, GpuKernelBase, Kernel,
infer_context_name)
from .elemwise import GpuElemwise
......@@ -27,10 +27,12 @@ class GpuSubtensor(HideC, Subtensor):
_f16_ok = True
def make_node(self, x, *inputs):
ctx_name = infer_context_name(x)
rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
broadcastable=rval.outputs[0].type.broadcastable)
x = as_gpuarray_variable(x)
broadcastable=rval.outputs[0].type.broadcastable,
context_name=ctx_name)
x = as_gpuarray_variable(x, ctx_name)
return gof.Apply(self, [x] + rval.inputs[1:], [otype()])
def perform(self, node, inputs, out_):
......@@ -191,14 +193,18 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
return self.iadd_node.op.gpu_kernels(self.iadd_node, subname)
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
ctx_name = infer_context_name(x, y)
x = as_gpuarray_variable(x, ctx_name)
y = as_gpuarray_variable(y, ctx_name)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
op = copy.copy(self)
ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
op.create_iadd_node(ret)
return ret
def get_context(self, node):
return node.outputs[0].type.context
def create_iadd_node(self, node):
# We store a iadd_node in the op that contain the info needed
# for the inplace add.
......@@ -210,7 +216,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
iadd_node = gop(xview, y).owner
self.iadd_node = iadd_node
def perform(self, node, inputs, out_):
def perform(self, node, inputs, out_, ctx):
out, = out_
x, y = inputs[:2]
indices = list(reversed(inputs[2:]))
......@@ -321,7 +327,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
%(view_ndim)s,
dims,
xview_strides,
pygpu_default_context(),
%(x)s->context,
1,
(PyObject *)%(x)s,
(PyObject *)&PyGpuArrayType);
......@@ -355,10 +361,10 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
"""
return """GpuArray_setarray(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_support_code_apply(self, node, nodename):
def c_support_code_struct(self, node, nodename):
gop = self.iadd_node.op
sub_name = nodename + "_add_to_zview"
ret = gop.c_support_code_apply(self.iadd_node, sub_name)
ret = gop.c_support_code_struct(self.iadd_node, sub_name)
ret += """
PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){
......@@ -366,10 +372,11 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
""" % locals()
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;"}
sub = {"fail": "return NULL;", "context": "dst->context"}
ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
ret += """
return dst;
return ret;
}
"""
return ret
......@@ -399,7 +406,8 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
def make_node(self, x, ilist):
x_ = as_gpuarray_variable(x)
ctx_name = infer_context_name(x, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
ilist__ = tensor.as_tensor_variable(ilist)
if ilist__.type.dtype[:3] not in ('int', 'uin'):
......@@ -407,7 +415,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
if ilist__.type.dtype != 'int64':
ilist__ = tensor.cast(ilist__, 'int64')
ilist_ = as_gpuarray_variable(ilist__)
ilist_ = as_gpuarray_variable(ilist__, ctx_name)
if ilist_.type.dtype != 'int64':
raise TypeError('index must be int64')
......@@ -419,6 +427,7 @@ class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
bcast = ilist_.broadcastable + x_.broadcastable[1:]
return gof.Apply(self, [x_, ilist_],
[GpuArrayType(dtype=x.dtype,
context_name=ctx_name,
broadcastable=bcast)()])
def perform(self, node, inp, out_):
......@@ -475,8 +484,9 @@ class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
def make_node(self, x, y, ilist):
x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y)
ctx_name = infer_context_name(x, y)
x_ = as_gpuarray_variable(x, ctx_name)
y_ = as_gpuarray_variable(y, ctx_name)
ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype
......@@ -567,16 +577,16 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
only avail on compute capability 2.0 and more recent.
"""
_f16_ok = True
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long.
"""
x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y)
ilist_ = as_gpuarray_variable(ilist)
ctx_name = infer_context_name(x, y, ilist)
x_ = as_gpuarray_variable(x, ctx_name)
y_ = as_gpuarray_variable(y, ctx_name)
ilist_ = as_gpuarray_variable(ilist, ctx_name)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim
......@@ -599,32 +609,30 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, GpuAdvancedIncSubtensor1):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def get_context(self, node):
return node.outputs[0].type.context
def perform(self, node, inp, out, ctx):
return super(GpuAdvancedIncSubtensor1_dev20, self).perform(node, inp, out)
def c_code_cache_version(self):
return (6,)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<numpy_compat.h>', '<gpuarray_helper.h>',
return ['<numpy_compat.h>', '<gpuarray_helper.h>',
'<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
res = [os.path.dirname(__file__)]
if cuda_root:
res.append(os.path.join(cuda_root, 'include'))
return res
return [os.path.dirname(__file__)]
def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major']
if ((self.set_instead_of_inc) or
(node.inputs[0].ndim != node.inputs[1].ndim) or
(node.inputs[0].ndim != 2) or
(compute_capability < 2)):
ctx = self.get_context(node)
if ctx.kind != 'cuda':
raise NotImplementedError("cuda only")
if (self.set_instead_of_inc or
node.inputs[0].ndim != node.inputs[1].ndim or
node.inputs[0].ndim != 2 or
ctx.bin_id[-2] < '2'):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
......@@ -754,7 +762,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
return [Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)]
def c_support_code_apply(self, node, nodename):
def c_support_code_struct(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
......@@ -765,7 +773,7 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
itemsize_out = numpy.dtype(dtype_out).itemsize
k_var = "k_vector_add_fast_" + nodename
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_apply(node, nodename) + """
return super(GpuAdvancedIncSubtensor1_dev20, self).c_support_code_struct(node, nodename) + """
int GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr)
......
from nose.plugins.skip import SkipTest
import theano.sandbox.gpuarray
# Skip every test importing this module when pygpu is not installed at all.
if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")
# If the gpuarray back-end is not active yet (tests started without a
# device=cuda* flag), try to initialize it on a cuda device here.
if not theano.sandbox.gpuarray.pygpu_activated:
    import theano.sandbox.cuda as cuda_ndarray
    # Select the old cuda back-end's device first, with all of its
    # automatic behaviors disabled — presumably so both back-ends end up
    # on the same GPU; TODO confirm against init_dev.
    if cuda_ndarray.cuda_available:
        cuda_ndarray.use('gpu', default_to_move_computation_to_gpu=False,
                         move_shared_float32_to_gpu=False,
                         enable_cuda=False)
    theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")
# Context name used by the tests; None appears to select the default
# context (see get_context usage in the test modules).
test_ctx_name = None
# Shared compilation modes: under FAST_COMPILE fall back to FAST_RUN so
# the gpuarray optimizations are actually included.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
......@@ -13,53 +13,22 @@ from theano.tensor.basic import alloc
from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
from ..type import (GpuArrayType, get_context,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu,
# as many tests don't work in that setup.
cuda_ndarray.use('gpu',
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from pygpu import gpuarray
utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
......@@ -88,7 +57,8 @@ def rand_gpuarray(*shape, **kwargs):
cls = kwargs.pop('cls', None)
if len(kwargs) != 0:
raise TypeError('Unexpected argument %s', list(kwargs.keys())[0])
return gpuarray.array(r, dtype=dtype, cls=cls)
return gpuarray.array(r, dtype=dtype, cls=cls,
context=get_context(test_ctx_name))
def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
......@@ -114,6 +84,7 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
def test_all(self):
if skip:
from nose.plugins.skip import SkipTest
raise SkipTest(skip)
for testname, inputs in iteritems(cases):
......@@ -199,9 +170,9 @@ def test_transfer_cpu_gpu():
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
gv = gpuarray.array(av, context=get_context(test_ctx_name))
f = theano.function([a], gpu_from_host(a))
f = theano.function([a], GpuFromHost(test_ctx_name)(a))
fv = f(av)
assert GpuArrayType.values_eq(fv, gv)
......@@ -218,12 +189,12 @@ def test_transfer_strided():
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
av = numpy.asarray(rng.rand(5, 8), dtype='float32')
gv = gpuarray.array(av)
gv = gpuarray.array(av, context=get_context(test_ctx_name))
av = av[:, ::2]
gv = gv[:, ::2]
f = theano.function([a], gpu_from_host(a))
f = theano.function([a], GpuFromHost(test_ctx_name)(a))
fv = f(av)
assert GpuArrayType.values_eq(fv, gv)
......@@ -233,14 +204,14 @@ def test_transfer_strided():
def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype)
g = gpuarray.empty(shp, dtype=x.dtype, context=get_context(test_ctx_name))
g[:] = x
return g
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=alloc,
gpu_op=gpu_alloc,
gpu_op=GpuAlloc(test_ctx_name),
cases=dict(
correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
......@@ -260,19 +231,19 @@ class TestAlloc(test_basic.TestAlloc):
dtype = "float32"
mode = mode_with_gpu
shared = staticmethod(gpuarray_shared_constructor)
allocs = [GpuAlloc(), GpuAlloc(), T.Alloc()]
allocs = [GpuAlloc(test_ctx_name), GpuAlloc(test_ctx_name), T.Alloc()]
def test_alloc_empty():
for dt in ['float32', 'int8']:
f = theano.function([], GpuAllocEmpty(dt)(2, 3))
f = theano.function([], GpuAllocEmpty(dt, context_name=test_ctx_name)(2, 3))
assert len(f.maker.fgraph.apply_nodes) == 1
out = f()
assert out.shape == (2, 3)
assert out.dtype == dt
f = theano.function([], [GpuAllocEmpty('uint64')(3, 2),
GpuAllocEmpty('uint64')(3, 2)])
f = theano.function([], [GpuAllocEmpty('uint64', test_ctx_name)(3, 2),
GpuAllocEmpty('uint64', test_ctx_name)(3, 2)])
out = f()
assert out[0].shape == (3, 2)
assert out[0].dtype == 'uint64'
......@@ -284,7 +255,7 @@ def test_alloc_empty():
def test_shape():
x = GpuArrayType(dtype='float32', broadcastable=[False, False, False])()
v = gpuarray.zeros((3, 4, 5), dtype='float32')
v = gpuarray.zeros((3, 4, 5), dtype='float32', context=get_context(test_ctx_name))
f = theano.function([x], x.shape)
topo = f.maker.fgraph.toposort()
assert numpy.all(f(v) == (3, 4, 5))
......@@ -436,12 +407,13 @@ def test_hostfromgpu_shape_i():
ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32')
dtype='float32',
context=get_context(test_ctx_name))
f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()]
f = theano.function([a], gpu_from_host(a).shape, mode=m)
f = theano.function([a], GpuFromHost(test_ctx_name)(a), mode=m)
assert any(isinstance(x.op, GpuFromHost)
for x in f.maker.fgraph.toposort())
f = theano.function([a], GpuFromHost(test_ctx_name)(a).shape, mode=m)
topo = f.maker.fgraph.toposort()
assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op, T.opt.Shape_i)
......
......@@ -10,8 +10,8 @@ from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
from .test_basic_ops import (makeTester, rand,
mode_with_gpu)
from .config import mode_with_gpu
from .test_basic_ops import makeTester, rand
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace,
......@@ -100,7 +100,7 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
self.ops = [gpuger_no_inplace, gpuger_inplace]
def clone(self, op):
return GpuGer(destructive=op.destructive)
return GpuGer(inplace=op.inplace)
GpuDot22Tester = makeTester(
......
......@@ -14,8 +14,8 @@ from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .test_basic_ops import mode_with_gpu
from ..type import GpuArrayType
from .config import mode_with_gpu, test_ctx_name
from ..type import GpuArrayType, get_context
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
......@@ -28,7 +28,7 @@ try:
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4)
gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)
def py_conv_valid_numpy(img, kern):
......@@ -135,8 +135,8 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = pygpu.array(npy_img)
kern = pygpu.array(npy_kern)
img = pygpu.array(npy_img, context=get_context(test_ctx_name))
kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))
# we take the stride after the transfer as we make c_contiguous
# data on the GPU.
......
......@@ -15,12 +15,12 @@ from theano.tensor.signal.downsample import MaxPoolGrad, AveragePoolGrad
from .. import dnn
from ..basic_ops import GpuAllocEmpty
from .test_basic_ops import mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from . import test_nnet
def test_dnn_conv_desc_merge():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
kern_shp = T.as_tensor_variable(
numpy.asarray([3, 1, 2, 2]).astype('int64'))
......@@ -41,7 +41,7 @@ def test_dnn_conv_desc_merge():
def test_dnn_conv_merge():
# This test that we merge correctly multiple dnn_conv.
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
......@@ -80,7 +80,7 @@ def test_dnn_conv_inplace():
GpuAllocEmpty get merged together.
"""
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img_shp = [2, 5, 6, 8]
kern_shp = [3, 5, 5, 6]
......@@ -105,7 +105,7 @@ def test_dnn_conv_inplace():
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad w op
out = GpuAllocEmpty(kern.dtype)(*kern.shape)
out = GpuAllocEmpty(kern.dtype, test_ctx_name)(*kern.shape)
o1 = dnn.GpuDnnConvGradW()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradW()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
......@@ -116,7 +116,7 @@ def test_dnn_conv_inplace():
assert len([n for n in topo if isinstance(n.op, GpuAllocEmpty)]) == 2
# Test grad i op
out = GpuAllocEmpty(img.dtype)(*img.shape)
out = GpuAllocEmpty(img.dtype, test_ctx_name)(*img.shape)
o1 = dnn.GpuDnnConvGradI()(img, kern, out, desc1)
o2 = dnn.GpuDnnConvGradI()(img, kern, out, desc2)
f = theano.function([img, kern], [o1, o2], mode=mode_with_gpu)
......@@ -163,7 +163,7 @@ def pool_2d_i2n(input, ds=(2, 2), strides=None,
def test_pooling():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
x = T.ftensor4()
......@@ -269,7 +269,7 @@ def test_pooling():
def test_pooling_opt():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
x = T.fmatrix()
......@@ -318,7 +318,7 @@ def test_dnn_tag():
max_pool_2d(x, ds=(2, 2), ignore_border=True),
mode=mode_with_gpu.including("cudnn"))
except (AssertionError, RuntimeError):
assert not dnn.dnn_available()
assert not dnn.dnn_available(test_ctx_name)
raised = True
finally:
theano.config.on_opt_error = old
......@@ -327,7 +327,7 @@ def test_dnn_tag():
logging.getLogger('theano').addHandler(theano.logging_default_handler)
if not raised:
assert dnn.dnn_available()
assert dnn.dnn_available(test_ctx_name)
assert any([isinstance(n.op, dnn.GpuDnnPool)
for n in f.maker.fgraph.toposort()])
......@@ -338,7 +338,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
self.mode = mode_with_gpu
def test_softmax(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
t = T.ftensor4('t')
rand_tensor = numpy.asarray(
......@@ -368,7 +368,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
def test_conv(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
......@@ -406,7 +406,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
def test_conv_gradw(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
......@@ -455,7 +455,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
def test_conv_gradi(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
kerns = T.ftensor4('kerns')
......@@ -499,7 +499,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
def test_pool(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_val = numpy.asarray(
......@@ -524,7 +524,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
)
def test_pool_grad(self):
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4('img')
img_grad = T.ftensor4('img_grad')
......@@ -568,7 +568,7 @@ class TestDnnInferShapes(utt.InferShapeTester):
# this has been a problem in the past
def test_dnn_conv_border_mode():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
......@@ -580,7 +580,7 @@ def test_dnn_conv_border_mode():
def test_dnn_conv_alpha_output_merge():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
img = T.ftensor4()
kern = T.ftensor4()
......@@ -678,7 +678,7 @@ def test_dnn_conv_grad():
def test_version():
if not dnn.dnn_available():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
assert isinstance(dnn.version(), int)
......
......@@ -4,19 +4,19 @@ import theano
from theano import scalar, gof
from theano.tests.unittest_tools import SkipTest, assert_allclose
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce, T_reduce_dtype)
from theano.tensor.tests import test_elemwise
from .test_basic_ops import mode_with_gpu, rand_gpuarray
from .config import mode_with_gpu, test_ctx_name
from .test_basic_ops import rand_gpuarray
from ..elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduceCuda, GpuCAReduceCPY)
from ..type import GpuArrayType
from ..type import GpuArrayType, get_context
from pygpu import ndgpuarray as gpuarray
# This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_Broadcast):
class test_gpu_Broadcast(test_elemwise.test_Broadcast):
op = GpuElemwise
type = GpuArrayType
cop = GpuElemwise
......@@ -25,8 +25,7 @@ class test_gpu_Broadcast(test_Broadcast):
linkers = [gof.PerformLinker, gof.CLinker]
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
self.linkers = [gof.PerformLinker]
def rand_val(self, shp):
......@@ -36,14 +35,12 @@ class test_gpu_Broadcast(test_Broadcast):
return rand_gpuarray(*shp, **dict(cls=gpuarray))
def test_c(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c()
def test_c_inplace(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
super(test_gpu_Broadcast, self).test_c_inplace()
......@@ -51,8 +48,7 @@ class test_gpu_Broadcast(test_Broadcast):
def test_elemwise_pow():
# Test that GpuElemwise(pow) can compile with any combination of integer
# or float input dtype.
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
dtypes = ["uint8", "uint16", "uint32", "uint64",
......@@ -77,11 +73,11 @@ def test_elemwise_pow():
assert_allclose(out, expected_out)
class test_GpuDimShuffle(test_DimShuffle):
class test_GpuDimShuffle(test_elemwise.test_DimShuffle):
op = GpuDimShuffle
class test_GpuCAReduceCPY(test_CAReduce):
class test_GpuCAReduceCPY(test_elemwise.test_CAReduce):
dtypes = ["float32"]
bin_dtypes = ["uint8", "int8"]
op = GpuCAReduceCPY
......@@ -120,7 +116,7 @@ class test_GpuCAReduceCPY(test_CAReduce):
def test_infer_shape(self):
for dtype in self.dtypes:
test_CAReduce.test_infer_shape(self, dtype)
super(test_GpuCAReduceCPY, self).test_infer_shape(dtype)
class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
......@@ -133,15 +129,15 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((5, 6), (1, )),
((5, 6), (-1, )),
((5, 6), (-2, )),
#((5, 6), ()), #reduce on no axis(copy) isn't implemented
#((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
# ((5, 6), ()), #reduce on no axis(copy) isn't implemented
# ((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
# ((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
((5, 0), None),
((5, 0), (0, )),
((5, 0), (1, )),
#((5, 0), ()), reduce on no axis isn't implemented
#((), None), reduce on no axis isn't implemented
#((), ()) reduce on no axis isn't implemented
# ((5, 0), ()), reduce on no axis isn't implemented
# ((), None), reduce on no axis isn't implemented
# ((), ()) reduce on no axis isn't implemented
# Test all GPU cases implemented
((1, 0), (1,)),
......@@ -176,7 +172,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
# ((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
......@@ -189,17 +185,17 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
# test pattern implemented by reshape
# Skip them as this test the op directly, not the optimization with reshape
# ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
# ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
# ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
# ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
# ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
# ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
]
op = GpuCAReduceCuda
reds = [scalar.add, scalar.mul,
scalar.maximum, scalar.minimum]
pre_scalar_op = scalar.sqr
pre_scalar_op = None
def test_perform(self):
return
......@@ -209,12 +205,11 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
def setUp(self):
super(test_GpuCAReduceCuda, self).setUp()
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
class T_gpureduce_dtype(T_reduce_dtype):
class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce')
op = GpuCAReduceCuda
# Currently we don't support reduction on 0 axis
......@@ -225,8 +220,7 @@ class T_gpureduce_dtype(T_reduce_dtype):
'float32', 'float64']
def setUp(self):
dev = theano.sandbox.gpuarray.init_dev.device
if not dev.startswith('cuda'):
if get_context(test_ctx_name).kind != 'cuda':
raise SkipTest("Cuda specific tests")
......
from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed.
from .test_basic_ops import mode_with_gpu
from .config import mode_with_gpu
from ..neighbours import GpuImages2Neibs
......
......@@ -6,7 +6,7 @@ from theano import function
from theano.tests import unittest_tools as utt
from theano.tensor import vector, matrix, dot
from .test_basic_ops import mode_with_gpu
from .config import mode_with_gpu
from ..nerv import Gemm16, nerv
......
......@@ -7,9 +7,7 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .config import mode_with_gpu, mode_without_gpu
from ..nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
......
......@@ -4,17 +4,16 @@ import theano
from theano import tensor
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests import test_basic
import theano.sandbox.gpuarray
from .. import basic_ops
from ..type import GpuArrayType, gpuarray_shared_constructor
from ..basic_ops import (GpuAlloc, GpuReshape, gpu_alloc,
gpu_from_host, host_from_gpu)
from ..type import GpuArrayType, gpuarray_shared_constructor, get_context
from ..basic_ops import GpuAlloc, GpuReshape, GpuFromHost, host_from_gpu
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from .test_basic_ops import rand_gpuarray, mode_with_gpu, mode_without_gpu
from .config import mode_with_gpu, test_ctx_name
def test_local_assert():
......@@ -97,7 +96,7 @@ def test_flatten():
def test_reduce():
dev = theano.sandbox.gpuarray.init_dev.device
kind = get_context(test_ctx_name).kind
for method, param in [('sum', dict(acc_dtype='float32')),
('prod', dict(acc_dtype='float32')),
......@@ -113,7 +112,7 @@ def test_reduce():
topo = f.maker.fgraph.toposort()
ops = [type(node.op) for node in topo]
if dev.startswith('opencl') and method in ["max", "min"]:
if kind == 'opencl' and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops)
else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops
......@@ -126,7 +125,7 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32')
# Test with 0
a = gpu_alloc(z, i)
a = GpuAlloc(test_ctx_name)(z, i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
......@@ -134,7 +133,7 @@ def test_local_gpualloc_memset_0():
assert (numpy.asarray(f(6)) == 0).all()
# Test with 1
a = gpu_alloc(o, i)
a = GpuAlloc(test_ctx_name)(o, i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
......@@ -143,7 +142,7 @@ def test_local_gpualloc_memset_0():
assert (numpy.asarray(f(6)) == 1).all()
# Test with 1, 1
a = gpu_alloc(ones, i)
a = GpuAlloc(test_ctx_name)(ones, i)
f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
......@@ -180,7 +179,7 @@ def test_print_op():
f = theano.function([b], theano.printing.Print()(b) * 2,
mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert topo[0].op == gpu_from_host
assert isinstance(topo[0].op, GpuFromHost)
assert isinstance(topo[1].op, theano.printing.Print)
assert isinstance(topo[2].op, GpuElemwise)
assert topo[3].op == host_from_gpu
......@@ -208,7 +207,7 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce():
x = theano.tensor.matrix()
o = (x*x).sum()
o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
......@@ -234,7 +233,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu)
f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo])
......@@ -242,7 +241,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu)
f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, GpuSubtensor) for node in topo])
......@@ -250,7 +249,7 @@ def test_local_gpu_subtensor():
# Test shared forced on CPU end we do computation on the output of
# the subtensor.
t = tensor._shared(numpy.zeros(20, "float32"))
f = theano.function([], t[3:4]+1, mode=mode_with_gpu)
f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
......@@ -319,7 +318,7 @@ def test_local_gpu_elemwise():
utt.assert_allclose(out[1], a_v * c_v)
# Test non-contiguous input
c = cuda.shared_constructor(numpy.asarray(c_v, dtype='float32'))
c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32'))
f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
mode=mode_with_gpu)
out = f(a_v, b_v)
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论