提交 44f9d0f7 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1001 from abergeron/compyte

Support for a new type based on compyte in theano
......@@ -91,6 +91,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
import numpy
......
......@@ -2,9 +2,8 @@ import os
import logging
import subprocess
from theano.configparser import (
AddConfigVar, BoolParam, ConfigParam, EnumStr, IntParam,
TheanoConfigParser)
from theano.configparser import (AddConfigVar, BoolParam, ConfigParam, EnumStr,
IntParam, StrParam, TheanoConfigParser)
from theano.misc.cpucount import cpuCount
from theano.misc.windows import call_subprocess_Popen
......@@ -44,20 +43,42 @@ AddConfigVar('int_division',
# gpu means let the driver select the gpu. Needed in case of gpu in
# exclusive mode.
# gpuX means use the gpu number X.
class DeviceParam(ConfigParam):
    """Configuration parameter holding a device name.

    A value is accepted when it starts with one of ``cpu``, ``gpu``,
    ``opencl`` or ``cuda``; anything else raises ``ValueError``.
    """

    def __init__(self, default, *options, **kwargs):
        """Store `default` and install the prefix-checking filter.

        `options` is accepted but not used.  The only keyword read is
        ``allow_override`` (defaults to True).
        """
        self.default = default

        def _check_device(val):
            # Guard clause: reject anything without a known prefix.
            if not val.startswith(('cpu', 'gpu', 'opencl', 'cuda')):
                raise ValueError(('Invalid value ("%s") for configuration '
                                  'variable "%s". Valid options start with '
                                  'one of "cpu", "gpu", "opencl", "cuda"'
                                  % (val, self.fullname)))
            return val

        allow_override = kwargs.get("allow_override", True)
        super(DeviceParam, self).__init__(default, _check_device,
                                          allow_override)

    def __str__(self):
        """Return the parameter name followed by the accepted prefixes."""
        return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
AddConfigVar('device',
("Default device for computations. If gpu*, change the default to try "
"to move computation to it and to put shared variable of float32 "
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."),
EnumStr('cpu', 'gpu',
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
DeviceParam('cpu', allow_override=False),
in_c_key=False,
)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False)
AddConfigVar('init_gpu_device',
("Initialize the gpu device to use, works only if device=cpu. "
"Unlike 'device', setting this option will NOT move computations, "
......
import logging
import theano
from theano.configparser import config
from theano.compile import optdb
# Logger shared by this package; its threshold is set to WARNING.
_logger_name = 'theano.sandbox.gpuarray'
_logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.WARNING)
# Short aliases for the logger methods used in this module.
error = _logger.error
info = _logger.info
# Flipped to True by init_dev() once a default pygpu context is installed.
pygpu_activated = False
try:
import pygpu
import pygpu.gpuarray
except ImportError:
pygpu = None
# This is for documentation not to depend on the availability of pygpu
from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
import opt
def init_dev(dev):
    """Initialise pygpu on `dev` and make the result the default context.

    Sets the module-level flag ``pygpu_activated`` to True on success.
    """
    global pygpu_activated
    pygpu.set_default_context(pygpu.init(dev))
    pygpu_activated = True
if pygpu:
    try:
        if config.device.startswith(('cuda', 'opencl')):
            # device=cuda*/opencl*: initialise the device, register the
            # gpuarray shared constructor and tag the optimizations.
            init_dev(config.device)
            import theano.compile
            theano.compile.shared_constructor(gpuarray_shared_constructor)
            optdb.add_tags('gpuarray_opt', 'fast_run', 'inplace')
        elif config.gpuarray.init_device != '':
            # Only initialise the device; nothing else is registered here.
            init_dev(config.gpuarray.init_device)
    except Exception:
        error("Could not initialize pygpu, support disabled", exc_info=True)
elif (config.gpuarray.init_device != '' or
      config.device.startswith('opencl') or
      config.device.startswith('cuda')):
    # pygpu is None here (its import failed) but the configuration asks
    # for a gpuarray device.
    error("pygpu was configured but could not be imported", exc_info=True)
import os
import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar, config
from theano.scalar import Scalar
from theano.gof.python25 import all, any
try:
import pygpu
from pygpu import gpuarray, elemwise
except ImportError:
pass
from type import GpuArrayType
def as_gpuarray_variable(x):
    """Return `x` as a gpuarray variable.

    Objects exposing ``_as_GpuArrayVariable`` convert themselves; anything
    else is turned into a tensor variable and passed to ``gpu_from_host``.
    """
    if not hasattr(x, '_as_GpuArrayVariable'):
        # TODO we need to have the cuda -> gpu path taken care of.
        return gpu_from_host(tensor.as_tensor_variable(x))
    return x._as_GpuArrayVariable()
def as_gpuarray(x):
    """Wrap `x` with ``gpuarray.array``, passing ``copy=False``."""
    result = gpuarray.array(x, copy=False)
    return result
class HostFromGpu(Op):
    """Op producing a host ``TensorType`` output from a ``GpuArrayType`` input.

    All instances compare (and hash) equal.
    """

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'HostFromGpu(gpuarray)'

    def make_node(self, x):
        """Build the Apply node; raises TypeError if `x` is not a GpuArrayType variable."""
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        host_type = tensor.TensorType(dtype=x.dtype,
                                      broadcastable=x.broadcastable,)
        return Apply(self, [x], [host_type()])

    def perform(self, node, inp, out):
        """Python implementation: store ``numpy.asarray`` of the input."""
        (gpu_value,) = inp
        (storage,) = out
        storage[0] = numpy.asarray(gpu_value)

    def c_code(self, node, name, inputs, outputs, sub):
        """Return the C implementation with names substituted in."""
        substitutions = {'name': name, 'fail': sub['fail'],
                         'inp': inputs[0], 'out': outputs[0]}
        return """
GpuArray %(name)s_ga_s;
GpuArray *%(name)s_ga = NULL;
int %(name)serr;
PyArray_Descr *%(name)s_dtype;
if (!GpuArray_ISONESEGMENT(&%(inp)s->ga)) {
if (GpuArray_copy(&%(name)s_ga_s, &%(inp)s->ga, GA_C_ORDER) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't make contiguous copy");
%(fail)s;
}
%(name)s_ga = &%(name)s_ga_s;
} else {
%(name)s_ga = &%(inp)s->ga;
}
%(name)s_dtype = typecode_to_dtype(%(inp)s->ga.typecode);
Py_XDECREF(%(out)s);
// PyArray_Empty below steals a reference to the dtype we pass it
// so we need an extra one to spare.
Py_INCREF(%(name)s_dtype);
%(out)s = (PyArrayObject *)PyArray_Empty(%(inp)s->ga.nd,
(npy_intp *)%(inp)s->ga.dimensions,
%(name)s_dtype,
(%(inp)s->ga.flags & GA_F_CONTIGUOUS) &&
!(%(inp)s->ga.flags & GA_C_CONTIGUOUS));
if (%(out)s == NULL) {
if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
%(fail)s
}
%(name)serr = GpuArray_read(PyArray_DATA(%(out)s),
PyArray_NBYTES(%(out)s),
%(name)s_ga);
if (%(name)s_ga == &%(name)s_ga_s) GpuArray_clear(%(name)s_ga);
if (%(name)serr != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not read device data.");
%(fail)s
}
""" % substitutions

    def c_code_cache_version(self):
        return (1,)

    def grad(self, inputs, grads):
        """The gradient is the output gradient moved through ``gpu_from_host``."""
        (out_grad,) = grads
        return [gpu_from_host(out_grad)]

    def R_op(self, inputs, eval_points):
        (point,) = eval_points
        # NOTE(review): this tests the eval point itself against
        # TensorType rather than its `.type` -- confirm that is intended.
        if not isinstance(point, tensor.TensorType):
            return [point]
        return [gpu_from_host(point)]

    def infer_shape(self, node, xshp):
        """The output shape is the input shape."""
        return xshp

host_from_gpu = HostFromGpu()
class GpuFromHost(Op):
    """Op producing a ``GpuArrayType`` output from a host ``TensorType`` input.

    All instances compare (and hash) equal.
    """

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'GpuFromHost(gpuarray)'

    def make_node(self, x):
        """Build the Apply node; raises TypeError if `x` is not a TensorType variable."""
        if not isinstance(x.type, tensor.TensorType):
            raise TypeError(x)
        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
                                              dtype=x.dtype)()])

    def perform(self, node, inp, out):
        """Python implementation: wrap the input with ``gpuarray.array``."""
        x, = inp
        z, = out
        z[0] = gpuarray.array(x)

    def grad(self, inputs, grads):
        """The gradient is the output gradient moved through ``host_from_gpu``."""
        gz, = grads
        return [host_from_gpu(as_gpuarray_variable(gz))]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
        # Fixed: `isintance` was a NameError, and the fallback branch
        # returned a bare variable instead of a one-element list.
        # NOTE(review): this tests the eval point itself rather than its
        # `.type`, mirroring HostFromGpu.R_op -- confirm that is intended.
        if isinstance(ev, GpuArrayType):
            return [host_from_gpu(ev)]
        else:
            return [ev]

    def infer_shape(self, node, xshp):
        """The output shape is the input shape."""
        return xshp

    def c_code(self, node, name, inputs, outputs, sub):
        """Return the C implementation with names substituted in."""
        return """
PyArrayObject *%(name)s_tmp;
int %(name)serr;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL) {
// PyArray_GETCONTIGUOUS sets an error message if it fails
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (%(out)s == NULL) {
Py_DECREF(%(name)s_tmp);
// new_GpuArray calls __new__ which will set an error message
// if it returns NULL.
%(fail)s
}
%(name)serr = GpuArray_empty(&%(out)s->ga,
GpuArray_default_context()->ops,
GpuArray_default_context()->ctx,
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(inp)s),
(size_t *)PyArray_DIMS(%(inp)s),
GA_C_ORDER);
if (%(name)serr != GA_NO_ERROR) {
Py_DECREF(%(name)s_tmp);
Py_DECREF(%(out)s);
%(out)s = NULL;
PyErr_SetString(PyExc_MemoryError, "Can't allocate device memory for result.");
%(fail)s
}
%(name)serr = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp));
Py_DECREF(%(name)s_tmp);
if (%(name)serr != GA_NO_ERROR) {
Py_DECREF(%(out)s);
PyErr_SetString(PyExc_RuntimeError, "Could not copy array data to device");
%(fail)s
}
""" % {'name': name, 'inp': inputs[0],
       'out': outputs[0], 'fail': sub['fail']}

    def c_code_cache_version(self):
        return (1,)

gpu_from_host = GpuFromHost()
class GpuFromCuda(Op):
    """Op producing a ``GpuArrayType`` output from a ``CudaNdarrayType`` input.

    ``view_map`` declares output 0 as a view of input 0.  All instances
    compare (and hash) equal.
    """
    view_map = {0: [0]}

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'GpuFromCuda'

    def make_node(self, x):
        """Build the Apply node; raises TypeError if `x` is not a CudaNdarrayType variable."""
        from theano.sandbox.cuda import CudaNdarrayType
        if not isinstance(x.type, CudaNdarrayType):
            raise TypeError(x)
        return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
                                              dtype=x.dtype)()])

    def perform(self, node, inp, out):
        """Python implementation: go through ``numpy.asarray`` then ``gpuarray.array``."""
        x, = inp
        z, = out
        z[0] = gpuarray.array(numpy.asarray(x))

    def grad(self, inputs, grads):
        """The gradient is the output gradient passed through ``cuda_from_gpu``."""
        gz, = grads
        return [cuda_from_gpu(gz)]

    def R_op(self, inputs, eval_points):
        ev, = eval_points
        # Fixed: `isintance` was a NameError, and the fallback branch
        # returned a bare variable instead of a one-element list.
        # NOTE(review): this tests the eval point itself rather than its
        # `.type` -- confirm that is intended.
        if isinstance(ev, GpuArrayType):
            return [cuda_from_gpu(ev)]
        else:
            return [ev]

    def infer_shape(self, node, xshp):
        """The output shape is the input shape."""
        return xshp

    def c_headers(self):
        return ['<cuda_ndarray.cuh>', '<compyte/extension.h>',
                '<compyte/types.h>', '<cuda.h>']

    def c_header_dirs(self):
        """Directory of the cuda_ndarray module, plus ``<cuda.root>/include`` when set."""
        import cuda_ndarray
        ret = [os.path.dirname(cuda_ndarray.__file__)]
        cuda_root = config.cuda.root
        if cuda_root:
            ret.append(os.path.join(cuda_root, 'include'))
        return ret

    def c_lib_dirs(self):
        """Directory of the cuda_ndarray module, plus ``<cuda.root>/lib`` when set."""
        import cuda_ndarray
        ret = [os.path.dirname(cuda_ndarray.__file__)]
        cuda_root = config.cuda.root
        if cuda_root:
            ret.append(os.path.join(cuda_root, 'lib'))
        return ret

    def c_libraries(self):
        return ['cudart', 'cublas', 'cuda']

    def c_support_code(self):
        """Declare the function pointers filled in by ``c_init_code``."""
        return """
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""

    def c_init_code(self):
        """Look up the compyte extension functions by name."""
        return ['cuda_get_ctx = (CUcontext (*)(void *))compyte_get_extension("cuda_get_ctx");',
                'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))compyte_get_extension("cuda_make_buf");']

    def c_code(self, node, name, inputs, outputs, sub):
        """Return the C implementation with names substituted in."""
        return """
int %(name)serr;
gpudata *%(name)sdata;
CUcontext %(name)scur;
size_t *%(name)sdims;
ssize_t *%(name)sstr;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
%(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
if (%(name)sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)s
}
%(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
if (%(name)sstr == NULL) {
free(%(name)sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)s
}
for (unsigned int i = 0; i < %(in)s->nd; i++) {
%(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
%(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
}
Py_XDECREF(%(out)s);
%(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (%(out)s == NULL) {
free(%(name)sdims);
free(%(name)sstr);
%(fail)s
}
%(name)sdata = cuda_make_buf(GpuArray_default_context()->ctx,
(CUdeviceptr)%(in)s->devdata,
((size_t)%(in)s->data_allocated)*4);
if (%(name)sdata == NULL) {
Py_DECREF(%(out)s);
free(%(name)sdims);
free(%(name)sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)s
}
%(name)serr = GpuArray_fromdata(&%(out)s->ga,
GpuArray_default_context()->ops,
%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
%(name)sdims, %(name)sstr, 1);
free(%(name)sdims);
free(%(name)sstr);
if (%(name)serr != GA_NO_ERROR) {
Py_DECREF(%(out)s);
PyErr_SetString(PyExc_MemoryError, "Could not allocate GpuArray structure.");
%(fail)s
}
Py_INCREF(%(in)s);
%(out)s->base = (PyObject *)%(in)s;
""" % {'name':name, 'in': inputs[0], 'out': outputs[0],
       'fail': sub['fail']}

    def c_code_cache_version(self):
        return (1,)

gpu_from_cuda = GpuFromCuda()
class CudaFromGpu(Op):
    """Op producing a ``CudaNdarrayType`` output from a float32 ``GpuArrayType`` input.

    ``view_map`` declares output 0 as a view of input 0.  All instances
    compare (and hash) equal.
    """
    view_map = {0: [0]}

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return 'CudaFromGpu'

    def make_node(self, x):
        """Build the Apply node.

        Raises TypeError if `x` is not a GpuArrayType variable or its
        dtype is not ``float32``.
        """
        from theano.sandbox.cuda import CudaNdarrayType
        if not isinstance(x.type, GpuArrayType):
            raise TypeError(x)
        if x.type.dtype != 'float32':
            raise TypeError(x)
        return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])

    def perform(self, node, inp, out):
        """Python implementation: convert to a float32 array and run it through the cuda filter."""
        from theano.sandbox.cuda import filter as cuda_filter
        x, = inp
        z, = out
        z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
                           tuple([0] * x.ndim), 0, z[0])

    def grad(self, inputs, grads):
        """The gradient is the output gradient passed through ``gpu_from_cuda``."""
        gz, = grads
        return [gpu_from_cuda(gz)]

    def R_op(self, inputs, eval_points):
        # Fixed: the import spelled the class `CudaNdArrayType`, which does
        # not match the `CudaNdarrayType` name used below and in make_node.
        from theano.sandbox.cuda import CudaNdarrayType
        ev, = eval_points
        # NOTE(review): this tests the eval point itself rather than its
        # `.type` -- confirm that is intended.
        if (isinstance(ev, CudaNdarrayType)):
            return [gpu_from_cuda(ev)]
        else:
            return [ev]

    def infer_shape(self, node, shp):
        """The output shape is the input shape."""
        return shp

    def c_headers(self):
        return ['<cuda_ndarray.cuh>', '<compyte/extension.h>', '<cuda.h>']

    def c_header_dirs(self):
        """Directory of the cuda_ndarray module, plus ``<cuda.root>/include`` when set."""
        import cuda_ndarray
        ret = [os.path.dirname(cuda_ndarray.__file__)]
        cuda_root = config.cuda.root
        if cuda_root:
            ret.append(os.path.join(cuda_root, 'include'))
        return ret

    def c_lib_dirs(self):
        """Directory of the cuda_ndarray module, plus ``<cuda.root>/lib`` when set."""
        import cuda_ndarray
        ret = [os.path.dirname(cuda_ndarray.__file__)]
        cuda_root = config.cuda.root
        if cuda_root:
            ret.append(os.path.join(cuda_root, 'lib'))
        return ret

    def c_libraries(self):
        return ['cudart', 'cublas', 'cuda']

    def c_support_code(self):
        """Declare the function pointers filled in by ``c_init_code``."""
        return """
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""

    def c_init_code(self):
        """Look up the compyte extension functions by name."""
        return ['cuda_get_ctx = (CUcontext (*)(void *ctx))compyte_get_extension("cuda_get_ctx");',
                'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_code(self, node, name, inputs, outputs, sub):
        """Return the C implementation with names substituted in."""
        return """
int %(name)serr = 0, %(name)si;
CUcontext %(name)scur;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
if (!%(out)s) {
%(fail)s
}
for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
}
%(name)serr = CudaNdarray_set_device_data(%(out)s,
(float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
(PyObject *)%(inp)s);
if (%(name)serr) {
%(fail)s
}
""" % {'name': name, 'inp': inputs[0], 'out': outputs[0],
       'fail': sub['fail']}

    def c_code_cache_version(self):
        return (1,)

cuda_from_gpu = CudaFromGpu()
class GpuAlloc(Op):
    """Op that fills a gpuarray of the requested shape with a value.

    All instances compare (and hash) equal.
    """

    def __str__(self):
        return 'GpuAlloc'

    def __hash__(self):
        return hash(type(self))

    def __eq__(self, other):
        return type(self) == type(other)

    def make_node(self, value, *shape):
        """Build the Apply node for ``alloc(value, *shape)``.

        Raises TypeError when `value` has more dimensions than there are
        shape arguments, or when a shape argument is not of integer dtype.
        An output dimension is broadcastable when its shape argument is a
        constant equal to 1.
        """
        v = as_gpuarray_variable(value)
        sh = [tensor.as_tensor_variable(s) for s in shape]
        bcast = []
        if v.ndim > len(shape):
            # Fixed: report v.ndim; the raw `value` may not have `.ndim`.
            raise TypeError(
                'GpuAlloc value has more dimensions than arguments',
                v.ndim, len(shape))
        for i, s in enumerate(sh):
            # Fixed: compare against 3-character prefixes; the previous
            # 'uint' entry could never match a 3-character slice, so
            # unsigned shapes were rejected.
            if s.type.dtype[:3] not in ('int', 'uin'):
                raise TypeError('Shape arguments must be integers', s)
            try:
                const_shp = tensor.get_scalar_constant_value(s)
            except tensor.NotScalarConstantError:
                const_shp = None
            bcast.append(numpy.all(1 == const_shp))
        otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
        return Apply(self, [v] + sh, [otype()])

    def perform(self, node, inputs, outs):
        """Fill the output storage, reusing it when its shape already matches."""
        out, = outs
        v = inputs[0]
        sh = tuple(map(int, inputs[1:]))
        if out[0] is None or out[0].shape != sh:
            out[0] = gpuarray.empty(sh, dtype=v.dtype)
        out[0][...] = v

    def infer_shape(self, node, input_shapes):
        """The output shape is given by the shape inputs of the node."""
        return [node.inputs[1:]]

    def grad(self, input, grads):
        # Fixed: iterate over the `input` parameter; the previous body
        # referred to an undefined name `inputs`.
        return [None for i in input]

    def do_constant_folding(self, node):
        """Return False when the output has no clients or feeds a graph output."""
        # Fixed: `node.ouputs` was a typo for `node.outputs`.
        if not getattr(node.outputs[0], 'clients', []):
            return False
        for client in node.outputs[0].clients:
            if client[0] == 'output':
                return False
        return True

gpu_alloc = GpuAlloc()
import numpy
from theano import Op, Apply, scalar
try:
from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel
except ImportError:
pass
from basic_ops import as_gpuarray_variable
from type import GpuArrayType
from theano.gof.utils import MethodNotDefined
def _is_scalar(v):
False
def make_argument(v, name):
    """Build the pygpu kernel argument named `name` for variable `v`.

    A ``ScalarArg`` is used when ``_is_scalar(v)`` is true, an ``ArrayArg``
    otherwise; both receive the numpy dtype of ``v.type.dtype``.
    """
    if _is_scalar(v):
        arg_class = ScalarArg
    else:
        arg_class = ArrayArg
    return arg_class(numpy.dtype(v.type.dtype), name)
def ensure_out(o, ref):
    """Return `o`, or ``ref._empty_like_me()`` when `o` is None."""
    if o is not None:
        return o
    return ref._empty_like_me()
class GpuElemwise(Op):
    """Elementwise Op on gpuarray variables, driven by a scalar op.

    Two instances are equal when their ``scalar_op`` attributes are equal.
    """
    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)

    def __init__(self, scalar_op):
        self.scalar_op = scalar_op
        self.destroy_map = {}

    def __getstate__(self):
        # Fixed: `copy` is not imported in this module, so use dict.copy().
        # `_hashval` is never set by this class, so popping it without a
        # default raised KeyError.
        d = self.__dict__.copy()
        d.pop('__epydoc_asRoutine', None)
        d.pop('_hashval', None)
        return d

    def __setstate__(self, d):
        # Fixed: the previous body called self._rehash(), which this class
        # does not define.
        self.__dict__.update(d)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.scalar_op == other.scalar_op)

    def __hash__(self):
        return hash(type(self)) ^ hash(self.scalar_op)

    def __str__(self):
        return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,)

    def make_node(self, *inputs):
        """Build the Apply node and attach the generated kernel to its tag.

        Raises TypeError on a wrong argument count or mismatched ranks,
        and SupportCodeError when the scalar op needs support code other
        than the THEANO_MACRO_MOD macro.
        """
        _inputs = [as_gpuarray_variable(i) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError("Wrong argument count", (self.nin, len(_inputs)))
        # Fixed: from here on use the converted `_inputs`; the raw `inputs`
        # need not be variables (no `.type` / `.dtype`).
        for i in _inputs[1:]:
            if i.type.ndim != _inputs[0].type.ndim:
                raise TypeError('mismatched rank amongst inputs')

        # An output dimension is broadcastable only if it is broadcastable
        # in every input.
        broadcastable = []
        for d in xrange(_inputs[0].type.ndim):
            bcast_d = True
            for i in _inputs:
                if not i.type.broadcastable[d]:
                    bcast_d = False
                    break
            broadcastable.append(bcast_d)
        assert len(broadcastable) == _inputs[0].type.ndim

        assert self.nout > 0
        inps = [make_argument(i, 'i%d' % (n,)) for n, i in
                enumerate(_inputs)]
        scal_ins = [scalar.Scalar(i.dtype) for i in _inputs]

        res = Apply(self, _inputs,
                    [GpuArrayType(o.dtype, broadcastable)()
                     for o in self.scalar_op.output_types(scal_ins)])

        outs = [make_argument(o, 'o%d' % (n,)) for n, o in
                enumerate(res.outputs)]
        scal_out = [scalar.Scalar(o.dtype) for o in res.outputs]

        # A scalar-level node used only to generate the kernel code.
        fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
                          [o() for o in scal_out])

        kcode = self.scalar_op.c_code(fake_node, 'kcode',
                                      [i.expr() for i in inps],
                                      [o.expr() for o in outs],
                                      sub=dict(fail='return;'))
        res.tag.kcode = kcode

        try:
            code = self.scalar_op.c_support_code_apply(fake_node, 'kcode')
            if code:
                raise SupportCodeError()
        except MethodNotDefined:
            pass

        support_code = ""
        try:
            support_code += self.scalar_op.c_support_code()
        except MethodNotDefined:
            pass

        # Fixed: an op without any support code is fine; previously the
        # empty string was rejected too.
        if (support_code.strip() != "" and
            support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)"):
            # Avoid the C++ complex struct
            raise SupportCodeError()

        k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code)
        res.tag.kernel = k

        return res

    def perform(self, node, inps, out):
        """Run the kernel stored on the node tag and store its outputs."""
        k = node.tag.kernel
        outs = [ensure_out(o[0], inps[0]) for o in out]

        # the dict call is there to avoid syntax error in python <= 2.5
        k(*(inps+outs), **dict(broadcast=True))

        for o, og in zip(out, outs):
            o[0] = og
class SupportCodeError(Exception):
    """Raised for scalar ops whose support code is not supported here.

    One example is the C++ complex struct.
    """
import theano, numpy
from theano import tensor
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer)
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
from elemwise import GpuElemwise, _is_scalar
# Databases holding the gpuarray optimizations: local optimizations,
# transfer-cutting optimizations, and the sequence that orders them.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
# NOTE(review): 'optimiziations' in the registered name looks like a typo;
# it is a runtime key, so it is left unchanged here.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'gpuarray')
# do not add 'fast_run' to these two as this would always enable gpuarray mode
# Registered one position before 'add_destroy_handler' (or before 49.5 when
# that entry has no recorded position).
optdb.register('gpuarray_opt', gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1,
'gpuarray')
def register_opt(*tags, **kwargs):
    """Return a decorator registering a local optimizer in ``gpu_optimizer``.

    The optimizer is registered under ``kwargs['name']`` when given (and
    truthy), otherwise under its ``__name__``, with the tags 'fast_run',
    'gpuarray' and any extra `tags`.  The decorated object is returned
    unchanged.
    """
    def f(local_opt):
        # Fixed: .get() instead of .pop(): no KeyError when other keyword
        # arguments are passed without 'name', and the decorator can be
        # applied more than once without losing the name.
        name = kwargs.get('name') or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f
register_opt()(theano.tensor.opt.local_track_shape_i)
class InputToGpuOptimizer(Optimizer):
    "Wrap graph inputs in a gpu round trip so later rewrites can start from them."

    def add_requirements(self, fgraph):
        """Attach the ReplaceValidate and DestroyHandler features."""
        fgraph.attach_feature(toolbox.ReplaceValidate())
        fgraph.attach_feature(DestroyHandler())

    def apply(self, fgraph):
        """Replace each eligible input by ``host_from_gpu(gpu_from_host(input))``."""
        for var in fgraph.inputs:
            # Already a gpuarray variable: nothing to transfer.
            if isinstance(var.type, GpuArrayType):
                continue

            clients = var.clients
            if len(clients) == 1:
                sole_client = clients[0][0]
                # Skip inputs that are directly a graph output or that are
                # already fed to gpu_from_host.
                if (sole_client == 'output' or
                    sole_client.op == gpu_from_host):
                    continue

            try:
                round_trip = host_from_gpu(gpu_from_host(var))
                fgraph.replace_validate(var, round_trip,
                                        "InputToGpuOptimizer")
            except TypeError:
                # This could fail if the inputs are not TensorTypes
                pass

gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
                    0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([])
def local_cut_gpu_host_gpu(node):
    """Remove a transfer that is immediately undone by the opposite transfer.

    Returns the innermost input in a one-element list when `node` heads a
    gpu_from_host/host_from_gpu chain (in either order), False otherwise.
    """
    chains = ((gpu_from_host, host_from_gpu),
              (host_from_gpu, gpu_from_host))
    for outer_op, inner_op in chains:
        if tensor.opt.opt.check_chain(node, outer_op, inner_op):
            return [node.inputs[0].owner.inputs[0]]
    return False

gpu_cut_copies.register('cut_gpua_host_transfers', local_cut_gpu_host_gpu,
                        'fast_run', 'inplace', 'gpuarray')
gpu_cut_copies.register('cut_gpua_constant_transfers',
                        tensor.opt.constant_folding,
                        'fast_run', 'gpuarray')
optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
def local_gpualloc(node):
    """Replace a host ``alloc`` by ``host_from_gpu(gpu_alloc(...))``.

    The replacement happens when the value comes from ``host_from_gpu``,
    when every client is ``gpu_from_host``, or when every client is a
    ``join`` whose joined inputs all come from ``host_from_gpu`` or
    ``alloc``.  Returns the replacement in a one-element list, or None
    when nothing is replaced.
    """
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            replace = True
        elif all([c != 'output' and c.op == tensor.join and
                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        # (An unused `val2 = shape_padleft(...)` local was removed.)
        new_out = host_from_gpu(gpu_alloc(val, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            # The new output may only be *more* broadcastable than the old.
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            # Fixed: the arguments were separated by '.' instead of ',',
            # which turned the call into an attribute lookup on new_out.
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]
@register_opt()
@local_optimizer([])
def local_gpu_elemwise(node):
    """Move a host Elemwise to the gpu as a GpuElemwise.

    Handles ``gpu_from_host(Elemwise(...))`` (the Elemwise output having a
    single client) and ``Elemwise(..., host_from_gpu(...), ...)``.  Returns
    the replacement in a one-element list, or False.
    """
    do_replace = False
    gpu_out = False
    # check for gpu_from_host(Elemwise)) and extract the Elemwise node
    if node.op == gpu_from_host:
        (host_i,) = node.inputs
        if (host_i.owner and
            isinstance(host_i.owner.op, tensor.Elemwise) and
            len(host_i.clients) == 1):
            node = host_i.owner
            do_replace = True
            gpu_out = True
    # check for elemwise(..., host_from_gpu, ...)
    if isinstance(node.op, tensor.Elemwise):
        if numpy.any([i.owner and
                      i.owner.op == host_from_gpu
                      for i in node.inputs]):
            do_replace = True
        # Never replace when every input is considered a scalar.
        if numpy.all([_is_scalar(i)
                      for i in node.inputs]):
            do_replace = False

    if not do_replace:
        return False

    new_op = GpuElemwise(node.op.scalar_op)
    gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
    if gpu_out:
        return [gpu_elemwise]
    return [host_from_gpu(gpu_elemwise)]
import unittest
from itertools import izip
from copy import copy, deepcopy
import numpy
import theano
import theano.tensor as T
from theano.compile import DeepCopyOp
from theano.tensor.tests.test_basic import safe_make_node
from theano.tests.unittest_tools import SkipTest
from numpy.testing.noseclasses import KnownFailureTest
import theano.sandbox.gpuarray
import theano.sandbox.cuda as cuda_ndarray
# If the cuda backend is available but pygpu has not been activated yet,
# select a cuda device (unless one is already in use) and initialise pygpu.
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
cuda_ndarray.use('gpu')
theano.sandbox.gpuarray.init_dev('cuda')
# Without an activated pygpu none of the tests below can run.
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from theano.sandbox.gpuarray.type import (GpuArrayType,
gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda,
cuda_from_gpu)
from theano.tests import unittest_tools as utt
utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
# Compilation modes used by the tests: one including and one excluding the
# 'gpuarray' optimizations. FAST_RUN is used as the base when the
# configured mode is FAST_COMPILE, the default mode otherwise.
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray'\
)
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
"""Mark a test that requires very specific conditions to work to
mask a specific exception class."""
def test_decorator(f):
def wrapper():
try:
f()
except Exception, e:
if isinstance(e, EClass):
raise KnownFailureTest(msg, e)
raise
wrapper.__name__ = f.__name__
return wrapper
return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
                 on_unused_input='raise', name=None):
    """Compile a theano function with ``accept_inplace=True``.

    `mode` defaults to ``mode_with_gpu``.
    """
    compile_mode = mode
    if compile_mode is None:
        compile_mode = mode_with_gpu
    return theano.function(inputs, outputs, mode=compile_mode,
                           allow_input_downcast=allow_input_downcast,
                           accept_inplace=True,
                           on_unused_input=on_unused_input, name=name)
def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
    """Build a shared variable with the first constructor that accepts `value`.

    The gpuarray, tensor and scalar constructors are tried in that order; a
    constructor raising TypeError is skipped.  Returns None when all of
    them raise TypeError.
    """
    from theano.tensor.sharedvar import tensor_constructor, scalar_constructor
    constructors = (gpuarray_shared_constructor, tensor_constructor,
                    scalar_constructor)
    for make_shared in constructors:
        try:
            return make_shared(value, name=name, strict=strict,
                               allow_downcast=allow_downcast, **kwargs)
        except TypeError:
            pass
def rand_gpuarray(*shape, **kwargs):
    """Return a gpuarray of the given shape holding values ``rng.rand() * 2 - 1``.

    The only keyword accepted is ``dtype`` (defaults to
    ``theano.config.floatX``); any other keyword raises TypeError.
    """
    r = rng.rand(*shape) * 2 - 1
    dtype = kwargs.pop('dtype', theano.config.floatX)
    if len(kwargs) != 0:
        # Fixed: the message was passed next to its argument instead of
        # being formatted with it.
        raise TypeError('Unexpected argument %s' % (list(kwargs)[0],))
    return gpuarray.array(r, dtype=dtype)
def makeTester(name, op, expected, good=None, bad_build=None, checks=None,
bad_runtime=None, mode=None, skip=False, eps=1e-10):
if good is None:
good = {}
if bad_build is None:
bad_build = {}
if bad_runtime is None:
bad_runtime = {}
if checks is None:
checks = {}
_op = op
_expected = expected
_good = good
_bad_build = bad_build
_bad_runtime = bad_runtime
_skip = skip
_checks = checks
class Checker(unittest.TestCase):
op = staticmethod(_op)
expected = staticmethod(_expected)
good = _good
bad_build = _bad_build
bad_runtime = _bad_runtime
skip = _skip
checks = _checks
def setUp(self):
eval(self.__class__.__module__ + '.' + self.__class__.__name__)
def test_good(self):
if skip:
raise SkipTest(skip)
for testname, inputs in good.items():
inputs = [copy(input) for input in inputs]
inputrs = [fake_shared(input) for input in inputs]
try:
node = safe_make_node(self.op, *inputrs)
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while making "
"a node with inputs %s") % (self.op, testname,
inputs)
exc.args += (err_msg,)
raise
try:
f = inplace_func([], node.outputs, mode=mode,
name='test_good')
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while trying to "
"make a Function") % (self.op, testname)
exc.args += (err_msg,)
raise
if isinstance(self.expected, dict) and \
testname in self.expected:
expecteds = self.expected[testname]
else:
expecteds = self.expected(*inputs)
if not isinstance(expecteds, (list, tuple)):
expecteds = (expecteds,)
try:
variables = f()
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while calling "
"the Function on the inputs %s") % (self.op,
testname,
inputs)
exc.args += (err_msg,)
raise
for i, (variable, expected) in \
enumerate(izip(variables, expecteds)):
if variable.dtype != expected.dtype or \
variable.shape != expected.shape or \
not GpuArrayType.values_eq_approx(variable,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
for description, check in self.checks.items():
if not check(inputs, variables):
self.fail(("Test %s::%s: Failed check: %s "
"(inputs were %s, ouputs were %s)") %
(self.op, testname, description,
inputs, variables))
def test_bad_build(self):
if skip:
raise SkipTest(skip)
for testname, inputs in self.bad_build.items():
inputs = [copy(input) for input in inputs]
inputrs = [fake_shared(input) for input in inputs]
self.assertRaises(Exception, safe_make_node, self.op, *inputrs)
def test_bad_runtime(self):
if skip:
raise SkipTest(skip)
for testname, inputs in self.bad_runtime.items():
inputrs = [fake_shared(input) for input in inputs]
try:
node = safe_make_node(self.op, *inputrs)
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while trying to "
"make a node with inputs %s") % (self.op,
testname,
inputs)
exc.args += (err_msg,)
raise
try:
f = inplace_func([], node.outputs, mode=mode,
name="test_bad_runtime")
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while trying to "
"make a Function") % (self.op, testname)
exc.args += (err_msg,)
raise
self.assertRaises(Exception, f, [])
Checker.__name__ = name
return Checker
def test_transfer_cpu_gpu():
    """gpu_from_host and host_from_gpu must reproduce a float32 matrix."""
    host_var = T.fmatrix('a')
    gpu_var = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    host_val = numpy.asarray(rng.rand(5, 4), dtype='float32')
    gpu_val = gpuarray.array(host_val)

    to_gpu = theano.function([host_var], gpu_from_host(host_var))
    assert GpuArrayType.values_eq(to_gpu(host_val), gpu_val)

    to_host = theano.function([gpu_var], host_from_gpu(gpu_var))
    assert numpy.all(to_host(gpu_val) == host_val)
def test_transfer_strided():
    """Same round trip as test_transfer_cpu_gpu, on column-strided values."""
    # This is just to ensure that it works in theano
    # compyte has a much more comprehensive suite of tests to ensure
    # correctness
    host_var = T.fmatrix('a')
    gpu_var = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')

    full_host = numpy.asarray(rng.rand(5, 8), dtype='float32')
    full_gpu = gpuarray.array(full_host)

    # Keep every other column of both arrays.
    host_val = full_host[:,::2]
    gpu_val = full_gpu[:,::2]

    to_gpu = theano.function([host_var], gpu_from_host(host_var))
    assert GpuArrayType.values_eq(to_gpu(host_val), gpu_val)

    to_host = theano.function([gpu_var], host_from_gpu(gpu_var))
    assert numpy.all(to_host(gpu_val) == host_val)
@may_fail("Op fails if both contexts are not the same and it's rare "
          "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
    """gpu_from_cuda and cuda_from_gpu on plain and strided float32 values."""
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available == False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")

    gpu_var = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    cuda_var = cuda_ndarray.CudaNdarrayType((False, False))('c')

    host_val = theano._asarray(rng.rand(5, 4), dtype='float32')
    gpu_val = gpuarray.array(host_val)
    cuda_val = cuda_ndarray.CudaNdarray(host_val)
    # Strided views: every other column, in reverse order.
    gpu_strided = gpu_val[:,::-2]
    cuda_strided = cuda_val[:,::-2]

    to_gpu = theano.function([cuda_var], gpu_from_cuda(cuda_var))
    assert GpuArrayType.values_eq_approx(to_gpu(cuda_val), gpu_val)
    assert GpuArrayType.values_eq_approx(to_gpu(cuda_strided), gpu_strided)

    to_cuda = theano.function([gpu_var], cuda_from_gpu(gpu_var))
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(to_cuda(gpu_val),
                                                         cuda_val)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(
        to_cuda(gpu_strided), cuda_strided)
def gpu_alloc_expected(x, *shp):
    """Reference result for gpu_alloc: an array of shape `shp` assigned from `x`."""
    filled = gpuarray.empty(shp, dtype=x.dtype)
    filled[:] = x
    return filled
# Test case class for gpu_alloc, generated by makeTester. The `good` cases
# are named correct<value rank><output rank>; `bad_runtime` holds a case
# whose value length (7) differs from the last requested dimension (5).
GpuAllocTester = makeTester(
name="GpuAllocTester",
op=gpu_alloc,
expected=gpu_alloc_expected,
good=dict(
correct01=(rand_gpuarray(), numpy.int32(7)),
correct01_bcast=(rand_gpuarray(1), numpy.int32(7)),
correct02=(rand_gpuarray(), numpy.int32(4), numpy.int32(7)),
correct12=(rand_gpuarray(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand_gpuarray(7), numpy.int32(2), numpy.int32(4),
numpy.int32(7)),
correct23=(rand_gpuarray(4, 7), numpy.int32(2), numpy.int32(4),
numpy.int32(7))
),
bad_runtime=dict(
bad_shape12=(rand_gpuarray(7), numpy.int32(7), numpy.int32(5)),
)
)
def test_deep_copy():
    # An identity function on a GpuArrayType variable must compile to a
    # graph whose first node is a DeepCopyOp, and must return a value
    # equal to its input.
    gpu_val = rand_gpuarray(20, dtype='float32')
    var = GpuArrayType(dtype='float32', broadcastable=(False,))('g')

    identity = theano.function([var], var)

    first_node = identity.maker.fgraph.toposort()[0]
    assert isinstance(first_node.op, DeepCopyOp)

    out = identity(gpu_val)
    assert GpuArrayType.values_eq(out, gpu_val)
import numpy
import theano
from theano import Type, Variable, Constant, tensor, config, scalar
from theano.compile import SharedVariable
# Make sure this is importable even if pygpu is absent
# (it will not work though)
try:
import pygpu
from pygpu import gpuarray
from pygpu.elemwise import compare, elemwise2
except ImportError:
pass
class GpuArrayType(Type):
    """Theano type whose values are pygpu ``GpuArray`` objects.

    An instance is described by a dtype and a broadcastable pattern; the
    number of dimensions is the length of that pattern.  Two instances
    compare equal when their typecode and broadcastable pattern match.
    """

    def __init__(self, dtype, broadcastable, name=None):
        """Store the dtype, broadcastable pattern and optional name.

        :param dtype: anything whose ``str()`` is a dtype name.
        :param broadcastable: iterable of truth values, one per dimension.
        :param name: optional name for the type.
        :raises TypeError: if pygpu has no typecode for `dtype`.
        """
        self.dtype = str(dtype)
        self.broadcastable = tuple(bool(b) for b in broadcastable)
        self.ndim = len(self.broadcastable)
        self.name = name
        try:
            self.typecode = gpuarray.dtype_to_typecode(self.dtype)
        except gpuarray.GpuArrayException:
            raise TypeError("Unsupported dtype for %s: %s" %
                            (self.__class__.__name__, self.dtype))

    def filter(self, data, strict=False, allow_downcast=None):
        """Validate `data` against this type, converting when allowed.

        * ``strict``: `data` must already be a GpuArray with this type's
          typecode; nothing is converted.
        * ``allow_downcast``: `data` is converted to this type's typecode
          and padded to at least ``ndim`` dimensions.
        * otherwise: `data` is converted only if upcasting its dtype with
          this type's dtype gives this type's dtype.

        In every case the number of dimensions and the broadcastable
        pattern are then checked.

        :returns: the (possibly converted) GpuArray.
        :raises TypeError: on any mismatch.
        """
        if strict:
            if not isinstance(data, gpuarray.GpuArray):
                raise TypeError("%s expected a GpuArray object." % self,
                                data, type(data))
            if self.typecode != data.typecode:
                raise TypeError("%s expected typecode %d (dtype %s), "
                                "got %d (dtype %s)." %
                                (self, self.typecode, self.dtype,
                                 data.typecode, str(data.dtype)))
            # fallthrough to ndim check
        elif allow_downcast:
            data = gpuarray.array(data, dtype=self.typecode, copy=False,
                                  ndmin=len(self.broadcastable))
        else:
            if not hasattr(data, 'dtype'):
                # Lists and Python scalars carry no dtype; let numpy
                # infer one so the precision check below can run instead
                # of failing with an AttributeError.
                data = numpy.asarray(data)
            up_dtype = scalar.upcast(self.dtype, data.dtype)
            if up_dtype == self.dtype:
                data = gpuarray.array(data, dtype=self.typecode, copy=False)
            else:
                raise TypeError("%s cannot store a value of dtype %s "
                                "without risking loss of precision." %
                                (self, data.dtype))

        if self.ndim != data.ndim:
            raise TypeError("Wrong number of dimensions: expected %s, "
                            "got %s with shape %s." % (self.ndim, data.ndim,
                                                       data.shape), data)
        shp = data.shape
        for i, b in enumerate(self.broadcastable):
            if b and shp[i] != 1:
                raise TypeError("Non-unit value on shape on a broadcastable"
                                " dimension.", shp, self.broadcastable)
        return data

    def filter_variable(self, other):
        """Return a variable of this type equivalent to `other`.

        Objects exposing ``_as_GpuArrayVariable`` are converted through
        it, and non-Variable values are wrapped in a constant of this
        type.  A TensorType variable with matching dtype, ndim and
        broadcastable pattern is transferred with ``gpu_from_host``.

        :raises TypeError: if `other` cannot be made to match this type.
        """
        if hasattr(other, '_as_GpuArrayVariable'):
            other = other._as_GpuArrayVariable()

        if not isinstance(other, Variable):
            other = self.Constant(type=self, data=other)

        if other.type == self:
            return other

        if not isinstance(other.type, tensor.TensorType):
            raise TypeError('Incompatible type', (self, other.type))
        if (other.type.dtype != self.dtype):
            raise TypeError('Incompatible dtype', (self.dtype,
                                                   other.type.dtype))
        if other.type.ndim != self.ndim:
            raise TypeError('Incompatible number of dimensions.'
                            ' Expected %d, got %d.' % (self.ndim,
                                                       other.type.ndim))
        if other.type.broadcastable != self.broadcastable:
            # "Expected" is this type's pattern, "got" is the pattern of
            # the variable being converted.
            raise TypeError('Incompatible broadcastable dimensions.'
                            ' Expected %s, got %s.' %
                            (str(self.broadcastable),
                             str(other.type.broadcastable)))

        return theano.sandbox.gpuarray.basic_ops.gpu_from_host(other)

    @staticmethod
    def values_eq(a, b):
        """Return whether `a` and `b` have the same shape, typecode and
        element-wise equal contents."""
        if a.shape != b.shape:
            return False
        if a.typecode != b.typecode:
            return False
        return numpy.asarray(compare(a, '==', b)).all()

    @staticmethod
    def values_eq_approx(a, b):
        """Return whether `a` and `b` are approximately equal.

        Shapes and dtypes must match.  Dtypes whose name contains 'int'
        are compared exactly; other dtypes must satisfy
        ``|a - b| < 1e-8 + 1e-5 * |b|`` for every element.
        """
        if a.shape != b.shape or a.dtype != b.dtype:
            return False
        if 'int' in str(a.dtype):
            return GpuArrayType.values_eq(a, b)
        else:
            # The difference must be taken in absolute value; otherwise
            # any `a` far below `b` would pass the test.
            res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
                            op_tmpl="res[i] = (fabs(%(a)s - %(b)s) <"
                            "(1e-8 + 1e-5 * fabs(%(b)s)))")
            return numpy.asarray(res).all()

    def value_zeros(self, shape):
        """Return a zero-filled GpuArray of `shape` with this typecode."""
        return pygpu.gpuarray.zeros(shape, dtype=self.typecode)

    def make_variable(self, name=None):
        """Return a new variable of this type, optionally named."""
        return self.Variable(self, name=name)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.typecode == other.typecode and
                self.broadcastable == other.broadcastable)

    def __hash__(self):
        # Built from the same two fields __eq__ compares.
        return (hash(self.typecode) ^ hash(self.broadcastable))

    def __str__(self):
        return "GpuArray<%s>" % (self.dtype,)

    def get_shape_info(self, obj):
        """Return the shape of `obj`."""
        return obj.shape

    def get_size(self, shape_info):
        """Return the size in bytes of a value with shape `shape_info`.

        An empty (falsy) `shape_info` yields the size of a single item.
        """
        if shape_info:
            return numpy.prod(shape_info) * numpy.dtype(self.dtype).itemsize
        else:
            return numpy.dtype(self.dtype).itemsize

    def c_declare(self, name, sub):
        """C declaration of the variable holding the value."""
        return "GpuArrayObject *%s;" % (name,)

    def c_init(self, name, sub):
        """C code setting the variable to NULL."""
        return "%s = NULL;" % (name,)

    def c_extract(self, name, sub):
        """C code loading ``py_<name>`` into ``<name>``.

        The generated code rejects None and objects that are not GpuArray
        instances with a ValueError, then takes a new reference.
        """
        # TODO I don't check broadcast stuff for now.
        return """
%(name)s = NULL;
if (py_%(name)s == Py_None) {
PyErr_SetString(PyExc_ValueError, "expected a GpuArray, not None");
%(fail)s
}
/* First check if we are the base type exactly (the most common case),
then do the full subclass check if needed. */
if (py_%(name)s->ob_type != &GpuArrayType &&
!PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
%(fail)s
}
%(name)s = (GpuArrayObject *)py_%(name)s;
Py_INCREF(%(name)s);
""" % {'name': name, 'fail': sub['fail']}

    def c_cleanup(self, name, sub):
        """C code dropping the reference held in ``<name>``."""
        return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name }

    def c_sync(self, name, sub):
        """C code copying ``<name>`` back into ``py_<name>``.

        A NULL value is stored as None; otherwise ``py_<name>`` is
        repointed to the C object when the two differ.
        """
        return """
if (!%(name)s) {
Py_XDECREF(py_%(name)s);
Py_INCREF(Py_None);
py_%(name)s = Py_None;
} else if ((void *)py_%(name)s != (void *)%(name)s) {
Py_XDECREF(py_%(name)s);
py_%(name)s = (PyObject *)%(name)s;
Py_INCREF(py_%(name)s);
}
""" % {'name': name}

    def c_init_code(self):
        # We don't actually need the numpy API except in
        # HostFromGpu and GpuFromHost and those case will be covered
        # by the TensorType parameter
        return ['import_pygpu__gpuarray();']

    def c_headers(self):
        # We need arrayobject for the PyArrayDescr struct def
        # (even if we just use a pointer to it in a function def)
        return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
                '<numpy/arrayobject.h>', '<gpuarray_api.h>']

    def c_header_dirs(self):
        """Include directories: those of pygpu and numpy."""
        return [pygpu.get_include(), numpy.get_include()]

    def c_libraries(self):
        """Libraries to link against."""
        return ['compyte']

    def c_code_cache_version(self):
        return (1,)
class _operators(tensor.basic._tensor_py_operators):
    """Operator mixin shared by the GpuArray variable classes.

    Adds the conversion hooks `_as_TensorVariable` and
    `_as_GpuArrayVariable`, and exposes the type's dtype, broadcastable
    pattern and ndim directly on the variable.
    """

    def _as_TensorVariable(self):
        """Return this variable transferred to the host."""
        # Imported inside the method rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).  The
        # absolute path replaces an implicit relative import, which only
        # resolves on Python 2.
        from theano.sandbox.gpuarray.basic_ops import host_from_gpu
        return host_from_gpu(self)

    def _as_GpuArrayVariable(self):
        """Return self: the variable is already a GpuArray variable."""
        return self

    dtype = property(lambda s: s.type.dtype)
    broadcastable = property(lambda s: s.type.broadcastable)
    ndim = property(lambda s: s.type.ndim)
class GpuArrayVariable(_operators, Variable):
    """Variable class used for GpuArrayType; adds nothing to its bases."""


# Registered on the type: GpuArrayType.make_variable builds its result
# through self.Variable.
GpuArrayType.Variable = GpuArrayVariable
class GpuArraySignature(tensor.basic.TensorConstantSignature):
    """Signature used by GpuArrayConstant; same as its base class.

    Might do something better if we can run the sum on the GPU, but for
    now this will suffice.
    """
class GpuArrayConstant(_operators, Constant):
    """Constant of GpuArrayType."""

    def signature(self):
        """Return a GpuArraySignature of the type and the data converted
        to a numpy array."""
        host_data = numpy.asarray(self.data)
        return GpuArraySignature((self.type, host_data))

    def __str__(self):
        """Return the name if one is set, else a rendering of the data."""
        if self.name is None:
            return "GpuArrayConstant{%s}" % numpy.asarray(self.data)
        return self.name


# Registered on the type: GpuArrayType.filter_variable wraps raw values
# through self.Constant.
GpuArrayType.Constant = GpuArrayConstant
class GpuArraySharedVariable(_operators, SharedVariable):
    """Shared variable whose value is stored as a pygpu array."""

    def get_value(self, borrow=False, return_internal_type=False):
        """Return the stored value.

        With ``return_internal_type`` the stored object itself is
        returned when ``borrow`` is true, and a copy of it otherwise.
        Without it the value is converted with ``numpy.asarray`` and
        ``borrow`` is not consulted.
        """
        stored = self.container.value
        if not return_internal_type:
            return numpy.asarray(stored)
        return stored if borrow else stored.copy()

    def set_value(self, value, borrow=False):
        """Replace the stored value with `value` converted to a pygpu
        array; a copy is requested unless ``borrow`` is true."""
        new_value = pygpu.gpuarray.array(value, copy=(not borrow))
        self.container.value = new_value

    def __getitem__(self, *args):
        # Explicitly delegate indexing to the _operators implementation.
        return _operators.__getitem__(self, *args)


GpuArrayType.SharedVariable = GpuArraySharedVariable
def gpuarray_shared_constructor(value, name=None, strict=False,
                                allow_downcast=None, borrow=False,
                                broadcastable=None):
    """SharedVariable constructor for GpuArrayType.

    :param value: a numpy ndarray or a pygpu GpuArray.
    :param name: name given to the shared variable.
    :param strict: forwarded to GpuArraySharedVariable.
    :param allow_downcast: accepted but not used by this function.
    :param borrow: when true, no copy of `value` is requested.
    :param broadcastable: broadcastable pattern; defaults to all False,
        one entry per dimension of `value`.
    :raises TypeError: if `value` is neither an ndarray nor a GpuArray.
    """
    accepted = (numpy.ndarray, pygpu.gpuarray.GpuArray)
    if not isinstance(value, accepted):
        raise TypeError('ndarray or GpuArray required')

    if broadcastable is None:
        broadcastable = (False,) * value.ndim

    var_type = GpuArrayType(value.dtype, broadcastable)
    deviceval = pygpu.gpuarray.array(value, copy=(not borrow))
    return GpuArraySharedVariable(type=var_type, value=deviceval, name=name,
                                  strict=strict)
# C implementation of the view op for GpuArrayType: the previous output
# reference is released, the output is pointed at the input object, and a
# new reference is taken on it.  No data is copied.
theano.compile.register_view_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_XINCREF(%(oname)s);
""", version=(0,))
# C implementation of the deep copy op for GpuArrayType: the previous
# output reference is released, a new GpuArray is created in the default
# context, and the input is copied into it with GpuArray_copy.  A failed
# copy sets a RuntimeError and runs the fail code.
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s);
%(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (!%(oname)s) { %(fail)s }
int err;
err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error during copy");
%(fail)s
}
""", version=(1,))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论