Commit 8bd900f8, authored by Frédéric Bastien

Merge pull request #1582 from abergeron/compyte2-rb

Compyte2 rebase
...@@ -932,7 +932,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -932,7 +932,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1 adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
mode = mode_with_gpu mode = mode_with_gpu
dtype = 'float32' dtype = 'float32'
ignore_topo = (B.HostFromGpu, B.GpuFromHost) ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
fast_compile = False fast_compile = False
ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor, ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1) cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)
......
import logging import logging
import theano import theano
from theano.configparser import config from theano.configparser import config, AddConfigVar, BoolParam
from theano.compile import optdb from theano.compile import optdb
_logger_name = 'theano.sandbox.gpuarray' _logger_name = 'theano.sandbox.gpuarray'
...@@ -18,6 +18,13 @@ try: ...@@ -18,6 +18,13 @@ try:
except ImportError: except ImportError:
pygpu = None pygpu = None
# Register the gpuarray.sync config flag.  When enabled, every op waits for
# its GPU work to finish before returning: execution is slower, but the
# per-op times reported by the profiler become accurate.  in_c_key=True
# because the generated C code differs depending on this flag.
AddConfigVar('gpuarray.sync',
             """If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
             BoolParam(False),
             in_c_key=True)
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
......
...@@ -6,8 +6,10 @@ import theano ...@@ -6,8 +6,10 @@ import theano
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.gof.utils import MethodNotDefined
try: try:
import pygpu import pygpu
...@@ -17,6 +19,7 @@ except ImportError: ...@@ -17,6 +19,7 @@ except ImportError:
from type import GpuArrayType from type import GpuArrayType
def as_gpuarray_variable(x): def as_gpuarray_variable(x):
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
return x._as_GpuArrayVariable() return x._as_GpuArrayVariable()
...@@ -29,6 +32,32 @@ def as_gpuarray(x): ...@@ -29,6 +32,32 @@ def as_gpuarray(x):
return gpuarray.array(x, copy=False) return gpuarray.array(x, copy=False)
class HideC(object):
    """Mixin that hides an ancestor Op's C implementation.

    Place this mixin before the wrapped Op class in the bases: every
    C-generation hook then raises MethodNotDefined, so Theano falls back
    to the Python ``perform`` even when an ancestor defines ``c_code``.
    """
    def __hide(*args):
        # Shared stand-in bound to every c_* entry point below.
        raise MethodNotDefined()
    c_code = __hide
    c_code_cleanup = __hide
    c_headers = __hide
    c_header_dirs = __hide
    c_libraries = __hide
    c_lib_dirs = __hide
    c_support_code = __hide
    c_support_code_apply = __hide
    c_compile_args = __hide
    c_no_compile_args = __hide
    c_init_code = __hide
    def c_code_cache_version(self):
        # No C code is ever produced, so there is nothing to version.
        return ()
    def c_code_cache_version_apply(self, node):
        return self.c_code_cache_version()
class HostFromGpu(Op): class HostFromGpu(Op):
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -66,7 +95,7 @@ class HostFromGpu(Op): ...@@ -66,7 +95,7 @@ class HostFromGpu(Op):
} else { } else {
%(name)s_ga = &%(inp)s->ga; %(name)s_ga = &%(inp)s->ga;
} }
%(name)s_dtype = typecode_to_dtype(%(inp)s->ga.typecode); %(name)s_dtype = typecode_to_dtype(%(name)s_ga->typecode);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
// PyArray_Empty below steals a reference to the dtype we pass it // PyArray_Empty below steals a reference to the dtype we pass it
// so we need an extra one to spare. // so we need an extra one to spare.
...@@ -89,7 +118,7 @@ class HostFromGpu(Op): ...@@ -89,7 +118,7 @@ class HostFromGpu(Op):
%(fail)s %(fail)s
} }
""" % {'name': name, 'fail': sub['fail'], 'inp': inputs[0], """ % {'name': name, 'fail': sub['fail'], 'inp': inputs[0],
'out': outputs[0]} 'out': outputs[0]}
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
...@@ -97,7 +126,7 @@ class HostFromGpu(Op): ...@@ -97,7 +126,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [gpu_from_host(gz)] return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
if isinstance(ev, tensor.TensorType): if isinstance(ev, tensor.TensorType):
...@@ -150,48 +179,22 @@ class GpuFromHost(Op): ...@@ -150,48 +179,22 @@ class GpuFromHost(Op):
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
return """ return """
PyArrayObject *%(name)s_tmp;
int %(name)serr;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL) {
// PyArray_GETCONTIGUOUS sets an error message if it fails
%(fail)s
}
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context()); %(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
if (%(out)s == NULL) { get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
Py_DECREF(%(name)s_tmp);
// new_GpuArray calls __new__ which will set an error message
// if it returns NULL.
%(fail)s
}
%(name)serr = GpuArray_empty(&%(out)s->ga,
GpuArray_default_context()->ops,
GpuArray_default_context()->ctx,
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(inp)s), PyArray_NDIM(%(inp)s),
(size_t *)PyArray_DIMS(%(inp)s), (size_t *)PyArray_DIMS(%(inp)s),
GA_C_ORDER); (ssize_t *)PyArray_STRIDES(%(inp)s),
if (%(name)serr != GA_NO_ERROR) { pygpu_default_context(),
Py_DECREF(%(name)s_tmp); Py_None);
Py_DECREF(%(out)s); if (%(out)s == NULL) {
%(out)s = NULL;
PyErr_SetString(PyExc_MemoryError, "Can't allocate device memory for result.");
%(fail)s
}
%(name)serr = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp));
Py_DECREF(%(name)s_tmp);
if (%(name)serr != GA_NO_ERROR) {
Py_DECREF(%(out)s);
PyErr_SetString(PyExc_RuntimeError, "Could not copy array data to device");
%(fail)s %(fail)s
} }
""" % {'name': name, 'inp': inputs[0], """ % {'name': name, 'inp': inputs[0],
'out': outputs[0], 'fail': sub['fail']} 'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (4,)
gpu_from_host = GpuFromHost() gpu_from_host = GpuFromHost()
...@@ -276,7 +279,7 @@ class GpuFromCuda(Op): ...@@ -276,7 +279,7 @@ class GpuFromCuda(Op):
ssize_t *%(name)sstr; ssize_t *%(name)sstr;
cuCtxGetCurrent(&%(name)scur); cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) { if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context."); PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s %(fail)s
} }
...@@ -297,15 +300,7 @@ class GpuFromCuda(Op): ...@@ -297,15 +300,7 @@ class GpuFromCuda(Op):
%(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4; %(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
} }
Py_XDECREF(%(out)s); %(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
%(out)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
if (%(out)s == NULL) {
free(%(name)sdims);
free(%(name)sstr);
%(fail)s
}
%(name)sdata = cuda_make_buf(GpuArray_default_context()->ctx,
(CUdeviceptr)%(in)s->devdata, (CUdeviceptr)%(in)s->devdata,
((size_t)%(in)s->data_allocated)*4); ((size_t)%(in)s->data_allocated)*4);
if (%(name)sdata == NULL) { if (%(name)sdata == NULL) {
...@@ -315,24 +310,23 @@ class GpuFromCuda(Op): ...@@ -315,24 +310,23 @@ class GpuFromCuda(Op):
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure."); PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)s %(fail)s
} }
%(name)serr = GpuArray_fromdata(&%(out)s->ga, Py_XDECREF(%(out)s);
GpuArray_default_context()->ops, %(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
%(name)sdata, 0, GA_FLOAT, %(in)s->nd, %(name)sdims, %(name)sstr,
%(name)sdims, %(name)sstr, 1); pygpu_default_context(), 1,
(PyObject *)%(in)s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(%(name)sdata);
free(%(name)sdims); free(%(name)sdims);
free(%(name)sstr); free(%(name)sstr);
if (%(name)serr != GA_NO_ERROR) { if (%(out)s == NULL) {
Py_DECREF(%(out)s);
PyErr_SetString(PyExc_MemoryError, "Could not allocate GpuArray structure.");
%(fail)s %(fail)s
} }
Py_INCREF(%(in)s); """ % {'name': name, 'in': inputs[0], 'out': outputs[0],
%(out)s->base = (PyObject *)%(in)s;
""" % {'name':name, 'in': inputs[0], 'out': outputs[0],
'fail': sub['fail']} 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (5,)
gpu_from_cuda = GpuFromCuda() gpu_from_cuda = GpuFromCuda()
...@@ -417,11 +411,15 @@ class CudaFromGpu(Op): ...@@ -417,11 +411,15 @@ class CudaFromGpu(Op):
CUcontext %(name)scur; CUcontext %(name)scur;
cuCtxGetCurrent(&%(name)scur); cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(GpuArray_default_context()->ctx)) { if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context."); PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s %(fail)s
} }
if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)s
}
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd); %(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
if (!%(out)s) { if (!%(out)s) {
...@@ -441,61 +439,123 @@ class CudaFromGpu(Op): ...@@ -441,61 +439,123 @@ class CudaFromGpu(Op):
'fail': sub['fail']} 'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (3,)
cuda_from_gpu = CudaFromGpu() cuda_from_gpu = CudaFromGpu()
class GpuAlloc(Op): class GpuAlloc(HideC, Alloc):
def __str__(self): def __str__(self):
return 'GpuAlloc' return 'GpuAlloc'
def __hash__(self):
return hash(type(self))
def __eq__(self, other):
return type(self) == type(other)
def make_node(self, value, *shape): def make_node(self, value, *shape):
v = as_gpuarray_variable(value) res = Alloc.make_node(self, value, *shape)
sh = [tensor.as_tensor_variable(s) for s in shape] value = as_gpuarray_variable(value)
bcast = [] otype = GpuArrayType(dtype=res.outputs[0].dtype,
if v.ndim > len(shape): broadcastable=res.outputs[0].broadcastable)
raise TypeError( return Apply(self, [value] + res.inputs[1:], [otype()])
'GpuAlloc value has more dimensions than arguments',
value.ndim, len(shape))
for i, s in enumerate(sh):
if s.type.dtype[:3] not in ('int', 'uint'):
raise TypeError('Shape arguments must be integers', s)
try:
const_shp = tensor.get_scalar_constant_value(s)
except tensor.NotScalarConstantError:
const_shp = None
bcast.append(numpy.all(1 == const_shp))
otype = GpuArrayType(dtype=v.dtype, broadcastable=bcast)
return Apply(self, [v] + sh, [otype()])
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
sh = tuple(map(int, inputs[1:])) sh = tuple(map(int, inputs[1:]))
if out[0] is None or out[0].shape != sh: if out[0] is None or out[0].shape != sh:
out[0] = gpuarray.empty(sh, dtype=v.dtype) if v.size == 1 and numpy.asarray(v)[0].item() == 0:
out[0][...] = v out[0] = gpuarray.zeros(sh, dtype=v.dtype)
else:
out[0] = gpuarray.empty(sh, dtype=v.dtype)
out[0][...] = v
else:
out[0][...] = v
if config.gpuarray.sync:
out[0].sync()
def c_code(self, node, name, inp, out, sub):
vv = inp[0]
ndim = len(inp[1:])
zz, = out
code = """
int i;
size_t %(name)s_shape[%(ndim)s];
""" % dict(name=name, ndim=ndim)
for i, shp_i in enumerate(inp[1:]):
code += """
%(name)s_shape[%(i)s] = ((dtype_%(shp_i)s *)PyArray_DATA(%(shp_i)s))[0];
""" % dict(name=name, i=i, shp_i=shp_i)
code += """
int need_new_out = (NULL == %(zz)s || %(zz)s->ga.nd != %(ndim)s);
if (!need_new_out)
for (i = 0; i < %(ndim)s; i++)
need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
if (need_new_out) {
Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(zz)s) {
%(fail)s
}
}
def infer_shape(self, node, input_shapes): if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) {
return [node.inputs[1:]] PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
}
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail'])
def grad(self, input, grads): if config.gpuarray.sync:
return [None for i in inputs] code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
def do_constant_folding(self, node): return code
if not getattr(node.ouputs[0], 'clients', []):
return False def c_code_cache_version(self):
for client in node.outputs[0].clients: return (1,)
if client[0] == 'output':
return False
return True
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuReshape(HideC, tensor.Reshape):
    """
    Implement Reshape on the gpu.
    """
    # __hash__, __eq__, __str__ come from tensor.Reshape

    def make_node(self, x, shp):
        """Build the Apply node; the CPU Reshape on a host copy is used
        only to infer the output dtype and broadcastable pattern."""
        x = as_gpuarray_variable(x)
        res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
        otype = GpuArrayType(dtype=res.dtype,
                             broadcastable=res.broadcastable)
        return Apply(self, [x, shp], [otype()])

    def perform(self, node, inp, out_):
        """Reshape ``x`` on the device, raising NumPy-compatible errors.

        Raises ValueError if the shape tuple has the wrong length, uses
        more than one -1, or is not compatible with ``x.size``.
        """
        x, shp = inp
        out, = out_
        if len(shp) != self.ndim:
            raise ValueError('shape argument to GpuReshape.perform'
                             ' has incorrect length %i'
                             ', should be %i' % (len(shp), self.ndim), shp)
        # Fixed: the old code assigned ``s = shp.prod()`` and never used
        # it, then recomputed the product in the test below.
        if shp.prod() != x.size:
            # We need to do check here to raise the same error as NumPy.
            # We should make pygpu do the same.
            ss = 1
            nb_m1 = 0
            for i in shp:
                if i == -1:
                    nb_m1 += 1
                else:
                    ss *= i
            if nb_m1 > 1:
                raise ValueError("Only one -1 is accepted in the new shape")
            elif nb_m1 == 1:
                if (x.size % ss) != 0:
                    raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
            else:
                raise ValueError("total size of new array must be unchanged")
        out[0] = x.reshape(tuple(shp))
from theano import Op, Apply, config
from theano.tensor.blas import Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try:
import pygpu
from pygpu import blas
except ImportError, e:
# To make sure theano is importable
pass
class BlasOp(HideC):
    """Common C-level setup for the gpuarray BLAS ops.

    Supplies the header, include path and module-initialization snippet
    needed to call pygpu's BLAS C API from generated code.
    """
    def c_headers(self):
        return ['<blas_api.h>']

    def c_header_dirs(self):
        # blas_api.h ships with pygpu.
        return [pygpu.get_include()]

    def c_init_code(self):
        # Imports pygpu's BLAS C-API table at module load time.
        return ['import_pygpu__blas();']
class GpuGemv(BlasOp, Gemv):
    """GEMV on the GPU: out = alpha * dot(A, x) + beta * y.

    Validation, __eq__/__hash__ and the ``inplace`` flag come from the
    CPU Gemv; only storage and computation move to the GPU via pygpu.
    """
    def make_node(self, y, alpha, A, x, beta):
        # Run the CPU op's checks for their side effects, then rebuild
        # the Apply with GPU-typed variables.
        res = Gemv.make_node(self, y, alpha, A, x, beta)
        A = as_gpuarray_variable(A)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

    def perform(self, node, inputs, out_storage):
        y, alpha, A, x, beta = inputs
        out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False,
                                      overwrite_y=self.inplace)

    def c_code(self, node, name, inp, out, sub):
        # Named 'subs' rather than 'vars' to avoid shadowing the builtin.
        subs = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        if self.inplace:
            # Destructive version: the output aliases y.
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = %(y)s;
        Py_INCREF(%(out)s);
        """ % subs
        else:
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % subs
        code += """
        if (pygpu_blas_rgemv(cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(x)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % subs
        if config.gpuarray.sync:
            # Bug fix: this fragment was previously appended without
            # %-substitution, leaving a literal "%(out)s" in the
            # generated C source (compile error when sync is enabled).
            code += """
        GpuArray_sync(&%(out)s->ga);
        """ % subs
        return code

    def c_code_cache_version(self):
        # Bumped from (0,): the generated C changed (sync fragment is
        # now substituted).
        return (1,)

gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm):
    """GEMM on the GPU: out = alpha * dot(A, B) + beta * C.

    Validation, __eq__/__hash__ and the ``inplace`` flag come from the
    CPU Gemm; only storage and computation move to the GPU via pygpu.
    """
    def make_node(self, C, alpha, A, B, beta):
        # Run the CPU op's checks for their side effects, then rebuild
        # the Apply with GPU-typed variables.
        res = Gemm.make_node(self, C, alpha, A, B, beta)
        A = as_gpuarray_variable(A)
        B = as_gpuarray_variable(B)
        C = as_gpuarray_variable(C)
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs):
        C, alpha, A, B, beta = inputs
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
                                  overwrite_c=self.inplace)

    def c_code(self, node, name, inp, out, sub):
        # Named 'subs' rather than 'vars' to avoid shadowing the builtin.
        subs = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        if self.inplace:
            # Destructive version: the output aliases C.
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = %(C)s;
        Py_INCREF(%(out)s);
        """ % subs
        else:
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % subs
        code += """
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % subs
        if config.gpuarray.sync:
            # Bug fix: this fragment was previously appended without
            # %-substitution, leaving a literal "%(out)s" in the
            # generated C source (compile error when sync is enabled).
            code += """
        GpuArray_sync(&%(out)s->ga);
        """ % subs
        return code

    def c_code_cache_version(self):
        # Bumped from (0,): the generated C changed (sync fragment is
        # now substituted).
        return (1,)

gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out
@local_optimizer([gpugemv_no_inplace])
def local_inplace_gpuagemv(node):
    """Swap a non-inplace GpuGemv node for its destructive counterpart."""
    if node.op != gpugemv_no_inplace:
        return
    return [gpugemv_inplace(*node.inputs)]
@local_optimizer([gpugemm_no_inplace])
def local_inplace_gpuagemm(node):
    """Swap a non-inplace GpuGemm node for its destructive counterpart."""
    if node.op != gpugemm_no_inplace:
        return
    return [gpugemm_inplace(*node.inputs)]
# Bundle the two inplace substitutions into a single input->output pass
# and register it at position 70.0 in the main optimizer database, active
# only under the fast_run/inplace/gpuarray tags.
gpuablas_opt_inplace = in2out(LocalOptGroup(
    local_inplace_gpuagemv, local_inplace_gpuagemm),
    name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
import copy
from itertools import izip
import numpy import numpy
from theano import Op, Apply, scalar from theano import Op, Apply, scalar, config
from theano.tensor.elemwise import Elemwise, DimShuffle, CAReduceDtype
try: try:
import pygpu
from pygpu.tools import ScalarArg, ArrayArg from pygpu.tools import ScalarArg, ArrayArg
from pygpu.elemwise import ElemwiseKernel from pygpu.elemwise import ElemwiseKernel
from pygpu.reduction import ReductionKernel
except ImportError: except ImportError:
pass pass
from basic_ops import as_gpuarray_variable from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
def _is_scalar(v): def _is_scalar(v):
False False
def make_argument(v, name): def make_argument(v, name):
if _is_scalar(v): if _is_scalar(v):
return ScalarArg(numpy.dtype(v.type.dtype), name) return ScalarArg(numpy.dtype(v.type.dtype), name)
else: else:
return ArrayArg(numpy.dtype(v.type.dtype), name) return ArrayArg(numpy.dtype(v.type.dtype), name)
def ensure_out(o, ref):
if o is None:
return ref._empty_like_me()
else:
return o
class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op): def ensure_allocated(storage, shape, dtype):
self.scalar_op = scalar_op odat = storage[0]
self.destroy_map = {} if odat is not None:
if odat.shape != shape:
# It is unsafe to try to resize odat,
# we have to allocate output storage.
odat = None
if odat is None:
odat = pygpu.empty(shape, dtype=dtype)
storage[0] = odat
return odat
def __getstate__(self):
d = copy.copy(self.__dict__)
d.pop('__epydoc_asRoutine', None)
d.pop('_hashval')
return d
def __setstate__(self, d): def as_C_string_const(s):
self.__dict__.update(d) return '\n'.join('"%s\\n"' % (l.replace('"', '\\"'))
self._rehash() for l in s.split('\n'))
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op)
def __hash__(self): class GpuElemwise(HideC, Elemwise):
return hash(type(self)) ^ hash(self.scalar_op) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __str__(self): def __str__(self):
return "GpuElemwise{%s}(gpuarray)" % (self.scalar_op,) if self.name is not None:
return self.name
items = str(sorted(self.inplace_pattern.items()))
return "GpuElemwise{%s}%s<gpuarray>" % (self.scalar_op, items)
def make_node(self, *inputs): def make_node(self, *inputs):
_inputs = [as_gpuarray_variable(i) for i in inputs] res = Elemwise.make_node(self, *inputs)
if self.nin > 0 and len(_inputs) != self.nin: outputs = [GpuArrayType(broadcastable=o.type.broadcastable,
raise TypeError("Wrong argument count", (self.nin, len(_inputs))) dtype=o.type.dtype)() for o in res.outputs]
for i in _inputs[1:]: inputs = [as_gpuarray_variable(i) for i in inputs]
if i.type.ndim != inputs[0].type.ndim: res = Apply(self, inputs, outputs)
raise TypeError('mismatched rank amongst inputs') # Try to generate the kernel to catch SupportCodeErrors
k = self.generate_kernel(res, 'test')
broadcastable = [] return res
for d in xrange(_inputs[0].type.ndim):
bcast_d = True def generate_kernel(self, node, nodename):
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
assert self.nout > 0
inps = [make_argument(i, 'i%d' % (n,)) for n, i in inps = [make_argument(i, 'i%d' % (n,)) for n, i in
enumerate(inputs)] enumerate(node.inputs)]
scal_ins = [scalar.Scalar(i.dtype) for i in inputs] scal_ins = [scalar.Scalar(i.dtype) for i in node.inputs]
res = Apply(self, _inputs,
[GpuArrayType(o.dtype, broadcastable)()
for o in self.scalar_op.output_types(scal_ins)])
outs = [make_argument(o, 'o%d' % (n,)) for n, o in outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(res.outputs)] enumerate(node.outputs) if not n in self.inplace_pattern]
scal_out = [scalar.Scalar(o.dtype) for o in res.outputs] scal_out = [scalar.Scalar(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
[o() for o in scal_out]) [o() for o in scal_out])
kcode = self.scalar_op.c_code(fake_node, 'kcode',
[i.expr() for i in inps],
[o.expr() for o in outs],
sub=dict(fail='return;'))
res.tag.kcode = kcode
try: try:
code = self.scalar_op.c_support_code_apply(fake_node, 'kcode') code = self.scalar_op.c_support_code_apply(fake_node, nodename)
if code: if code:
raise SupportCodeError() raise SupportCodeError(code)
except MethodNotDefined: except MethodNotDefined:
pass pass
support_code = "" support_code = ""
try: try:
support_code += self.scalar_op.c_support_code() support_code = self.scalar_op.c_support_code()
except MethodNotDefined: except MethodNotDefined:
pass pass
if support_code != "#define THEANO_MACRO_MOD(x,y) (x % y)": if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
# Avoid the C++ complex struct support_code.strip() != ""):
raise SupportCodeError() # The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
k = ElemwiseKernel(None, inps+outs, kcode, preamble=support_code) scal_out = []
res.tag.kernel = k oi = 0
for n in range(len(fake_node.outputs)):
if n in self.inplace_pattern:
scal_out.append(inps[self.inplace_pattern[n]].name+'[i]')
else:
scal_out.append(outs[oi].name+'[i]')
oi += 1
return res kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
[i.name+'[i]' for i in inps],
scal_out,
dict(fail='return;'))
# Translate types for scalar composite ops (except complex).
support_code += """
#define npy_float64 ga_double
#define npy_float32 ga_float
#define npy_uint8 ga_ubyte
#define npy_int8 ga_byte
#define npy_uint16 ga_ushort
#define npy_int16 ga_short
#define npy_uint32 ga_uint
#define npy_int32 ga_int
#define npy_uint64 ga_ulong
#define npy_int64 ga_long
"""
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_support_code_apply(self, node, nodename):
# This is useless by itself, but will serve an eventual c_code
# implementation
k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim
res = []
for i in range(1, nd):
var = "static const char %s_%s[] = " % (nodename, str(i))
res.append(var + as_C_string_const(k.render_basic(i)) + ';')
res.append("static const gpukernel *%s_%s_k = NULL;" % (nodename,
str(i)))
var = "static const char %s_c[] = " % (nodename,)
res.append(var + as_C_string_const(k.contig_src) + ';')
res.append("static const gpukernel *%s_c_k = NULL;" % (nodename,))
return '\n'.join(res)
def perform(self, node, inputs, output_storage):
# Try to reuse the kernel from a previous call to hopefully
# avoid recompiling
if not hasattr(node, '_cache_elemwise_k'):
node._cache_elemwise_k = self.generate_kernel(node, "kcode")
out_shape = []
for values in izip(*[input.shape for input in inputs]):
if any(v == 0 for v in values):
# All non-broadcasted dimensions should be zero
assert max(values) <= 1
out_shape.append(0)
else:
out_shape.append(max(values))
out_shape = tuple(out_shape)
def perform(self, node, inps, out): args = copy.copy(inputs)
k = node.tag.kernel for n, (stor, out) in enumerate(izip(output_storage, node.outputs)):
outs = [ensure_out(o[0], inps[0]) for o in out] if n in self.inplace_pattern:
stor[0] = inputs[self.inplace_pattern[n]]
else:
args.append(ensure_allocated(stor, out_shape, out.type.dtype))
# the dict call is there to avoid syntax error in python <= 2.5 # the dict call is there to avoid a syntax error in python < 2.6
k(*(inps+outs), **dict(broadcast=True)) node._cache_elemwise_k(*args, **dict(broadcast=True))
if config.gpuarray.sync:
output_storage[0][0].sync()
for o, og in zip(out, outs):
o[0] = og
class SupportCodeError(Exception): class SupportCodeError(Exception):
""" """
We do not support certain things (such as the C++ complex struct) We do not support certain things (such as the C++ complex struct)
""" """
class GpuDimShuffle(HideC, DimShuffle):
    """DimShuffle (transpose / drop / add broadcastable dims) for GpuArrays."""

    def make_node(self, input):
        # Use the CPU DimShuffle only to compute the output dtype and
        # broadcastable pattern, then build a GPU-typed Apply.
        res = DimShuffle.make_node(self, input)
        otype = GpuArrayType(dtype=res.outputs[0].type.dtype,
                             broadcastable=res.outputs[0].type.broadcastable)
        input = as_gpuarray_variable(input)
        return Apply(self, [input], [otype()])

    def __str__(self):
        if self.inplace:
            s = "InplaceGpuDimShuffle{%s}"
        else:
            s = "GpuDimShuffle{%s}"
        return s % (','.join(str(x) for x in self.new_order))

    def perform(self, node, inp, out):
        input, = inp
        storage, = out
        res = input
        # Put kept dims first (in new order), dropped dims last; then cut
        # the dropped ones off and insert length-1 axes where requested.
        res = res.transpose(self.shuffle+self.drop)
        shape = list(res.shape[:len(self.shuffle)])
        for augm in self.augment:
            shape.insert(augm, 1)
        res = res.reshape(shape)
        if not self.inplace:
            # Non-inplace: output must not share the input's buffer.
            res = res.copy()
        storage[0] = res

    def c_support_code_apply(self, node, name):
        """Emit a per-node helper C function that does the shuffle."""
        def copy_shape(nd_out):
            # Build the C statements that fill sh[] with the output shape:
            # 1 for augmented dims, the transposed dim size otherwise.
            stmts = []
            e = 0
            for d in range(nd_out):
                if d in self.augment:
                    stmts.append("sh[%s] = 1;" % (d,))
                else:
                    stmts.append("sh[%s] = tmp->ga.dimensions[%s];" % (d, e))
                    e += 1
            return '\n '.join(stmts)
        return """
static const unsigned int %(name)s_ax[] = {%(shuffle)s};
static PyGpuArrayObject *%(name)s_f(PyGpuArrayObject *a) {
PyGpuArrayObject *res, *tmp;
size_t sh[%(nd_out)s];
tmp = pygpu_transpose(a, %(name)s_ax);
if (!tmp) return NULL;
%(copy_shape)s
res = pygpu_reshape(tmp, %(nd_out)s, sh, GA_ANY_ORDER, 1, -1);
Py_DECREF(tmp);
return res;
}
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)),
           name=name, nd_out=len(self.new_order),
           copy_shape=copy_shape(len(self.new_order)))

    def c_code(self, node, name, inputs, outputs, sub):
        d = dict(name=name, fail=sub['fail'], inp=inputs[0], out=outputs[0],
                 nd=len(self.input_broadcastable))
        process = """
PyGpuArrayObject *tmp = NULL;
if (%(inp)s->ga.nd != %(nd)s) {
PyErr_SetString(PyExc_TypeError, "input nd");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = %(name)s_f(%(inp)s);
if (%(out)s == NULL) {%(fail)s}
""" % d
        if not self.inplace:
            # Non-inplace: replace the view with a fresh copy.
            process += """
tmp = pygpu_copy(%(out)s, GA_ANY_ORDER);
Py_DECREF(%(out)s);
if (!tmp) {
%(out)s = NULL;
%(fail)s
}
%(out)s = tmp;
""" % d
        return process

    def c_code_cache_version(self):
        return (3,)
class GpuCAReduce(HideC, CAReduceDtype):
    """Reduction over given axes of a GpuArray (currently add/mul only)."""

    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
        # The reduction kernel is seeded with scalar_op.identity, so an
        # identity element is mandatory.
        if not hasattr(scalar_op, 'identity'):
            raise ValueError("No identity on scalar op")
        CAReduceDtype.__init__(self, scalar_op, axis=axis, dtype=dtype,
                               acc_dtype=acc_dtype)

    def __str__(self):
        ax = ''
        if self.axis is not None:
            ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
        return "GpuReduce{%s}%s" % (self.scalar_op, ax)

    def make_node(self, input):
        # Let the CPU CAReduceDtype resolve axis/dtype, then build a
        # GPU-typed Apply on the (possibly new) op it produced.
        res = CAReduceDtype.make_node(self, input)
        input = as_gpuarray_variable(input)
        otype = GpuArrayType(dtype=res.outputs[0].dtype,
                             broadcastable=res.outputs[0].broadcastable)
        if res.op.axis is not None:
            redux = []
            for i in range(len(input.type.broadcastable)):
                redux.append(i in res.op.axis)
            # since redux is just another way to describe what is in axis
            # it doesn't need to be compared in __eq__ or __hash__
            res.op.redux = redux
        return Apply(res.op, [input], [otype()])

    def make_thunk(self, node, storage_map, compute_map, no_recycling):
        # Pre-generate the reduction kernel once per node (cached on the
        # node itself) so perform() does not recompile on every call.
        if self.axis is None:
            redux = [True] * node.inputs[0].ndim
        else:
            redux = self.redux
        acc_dtype = getattr(self, 'acc_dtype', None)
        if acc_dtype is None:
            acc_dtype = node.outputs[0].type.dtype
        if any(redux):
            node._cache_reduction_k = self.generate_kernel(node, acc_dtype,
                                                           redux)
        return super(GpuCAReduce, self).make_thunk(node, storage_map,
                                                   compute_map, no_recycling)

    def generate_kernel(self, node, odtype, redux):
        """Build a pygpu ReductionKernel for this node's scalar op."""
        if isinstance(self.scalar_op, scalar.basic.Add):
            reduce_expr = "a + b"
        elif isinstance(self.scalar_op, scalar.basic.Mul):
            reduce_expr = "a * b"
        else:
            # Only sum and product are implemented so far.
            raise NotImplementedError()
        return ReductionKernel(pygpu.get_default_context(), odtype,
                               self.scalar_op.identity, reduce_expr, redux,
                               arguments=[make_argument(node.inputs[0], 'a')],
                               init_nd=node.inputs[0].ndim
                               )

    def perform(self, node, inp, out):
        input, = inp
        output, = out
        if self.axis is None:
            redux = [True] * input.ndim
        else:
            redux = self.redux
        if any(redux):
            # Run the cached kernel, then cast to the declared output
            # dtype (no copy when the dtype already matches).
            output[0] = node._cache_reduction_k(input).astype(copy=False,
                dtype=node.outputs[0].type.dtype)
        else:
            # Nothing to reduce: just copy/cast the input.
            output[0] = pygpu.gpuarray.array(input, copy=True,
                dtype=node.outputs[0].type.dtype)
import theano, numpy import copy
from theano import tensor import theano
import numpy
from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox, DestroyHandler,
...@@ -8,8 +10,12 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, ...@@ -8,8 +10,12 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
from elemwise import GpuElemwise, _is_scalar gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -26,6 +32,7 @@ optdb.register('gpuarray_opt', gpu_seqopt, ...@@ -26,6 +32,7 @@ optdb.register('gpuarray_opt', gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1, optdb.__position__.get('add_destroy_handler', 49.5) - 1,
'gpuarray') 'gpuarray')
def register_opt(*tags, **kwargs): def register_opt(*tags, **kwargs):
def f(local_opt): def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__ name = (kwargs and kwargs.pop('name')) or local_opt.__name__
...@@ -35,6 +42,36 @@ def register_opt(*tags, **kwargs): ...@@ -35,6 +42,36 @@ def register_opt(*tags, **kwargs):
register_opt()(theano.tensor.opt.local_track_shape_i) register_opt()(theano.tensor.opt.local_track_shape_i)
def op_lifter(OP):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Decorator factory: ``maker`` receives the matched Apply node and
    returns either a GPU Op (applied to the node's original inputs) or
    a GPU variable; the result is wrapped in ``host_from_gpu`` so the
    graph type stays consistent for downstream clients.
    """
    def f(maker):
        def local_opt(node):
            if type(node.op) is OP:
                # This does not support nodes that have more than one output.
                assert len(node.outputs) == 1
                # either one of our inputs is on the gpu or
                # all of our client are on the gpu
                if (any([i.owner and i.owner.op == host_from_gpu
                         for i in node.inputs]) or
                    all([c != 'output' and c.op == gpu_from_host
                         for c, idx in node.outputs[0].clients])):
                    new_op = maker(node)
                    # This is needed as sometimes new_op inherit from OP.
                    if new_op and new_op != node.op:
                        if isinstance(new_op, theano.Op):
                            return [host_from_gpu(new_op(*node.inputs))]
                        else:  # suppose it is a variable on the GPU
                            return [host_from_gpu(new_op)]
            # False tells the optimizer no replacement was made.
            return False
        # Keep the maker's name so optimizer listings stay readable.
        local_opt.__name__ = maker.__name__
        return local_optimizer([OP])(local_opt)
    return f
class InputToGpuOptimizer(Optimizer): class InputToGpuOptimizer(Optimizer):
"Transfer the input to the gpu to start the rolling wave." "Transfer the input to the gpu to start the rolling wave."
...@@ -63,6 +100,7 @@ class InputToGpuOptimizer(Optimizer): ...@@ -63,6 +100,7 @@ class InputToGpuOptimizer(Optimizer):
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(), gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge') 0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([]) @local_optimizer([])
def local_cut_gpu_host_gpu(node): def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu): if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
...@@ -78,67 +116,117 @@ gpu_cut_copies.register('cut_gpua_constant_transfers', ...@@ -78,67 +116,117 @@ gpu_cut_copies.register('cut_gpua_constant_transfers',
optdb['canonicalize'].register('local_cut_gpua_host_gpua', optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, 'fast_run', 'gpuarray') local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt() @register_opt()
@local_optimizer([tensor.Alloc]) @op_lifter(tensor.Alloc)
def local_gpualloc(node): def local_gpualloc(node):
replace = False return gpu_alloc
if node.op == tensor.alloc:
if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
replace = True
elif all([c != 'output' and c.op == gpu_from_host
for c, idx in node.outputs[0].clients]):
replace = True
elif all([c != 'output' and c.op == tensor.join and
all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]])
for c, idx in node.outputs[0].clients]):
replace = True
if replace:
val = node.inputs[0]
shp = node.inputs[1:]
old_out = node.outputs[0]
val2 = tensor.shape_padleft(val, len(shp) - val.ndim)
new_out = host_from_gpu(gpu_alloc(val, *shp))
if new_out.type != old_out.type:
assert new_out.type.ndim == old_out.type.ndim
assert new_out.type.dtype == old_out.type.dtype
for b_old, b_new in zip(old_out.type.broadcastable,
new_out.type.broadcastable):
assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out. old_out.broadcastable)
return [new_out]
@register_opt() @register_opt()
@local_optimizer([]) @op_lifter(tensor.Reshape)
def local_gpureshape(node):
    """op_lifter callback: build the GPU version of a Reshape node.

    Returns a GpuReshape with the same ndim, carrying over the op's
    name prefixed with 'Gpu' when one is set.
    """
    op = node.op
    name = op.name
    if name:
        name = 'Gpu' + name
    # BUG FIX: the original passed op.name here, silently discarding the
    # 'Gpu'-prefixed name computed above.
    res = GpuReshape(op.ndim, name)
    return res
@register_opt()
@op_lifter(tensor.Flatten)
def local_gpuflatten(node):
    """op_lifter callback: lift a full Flatten (outdim == 1) to GpuReshape.

    Partial flattens (outdim != 1) are left on the CPU.
    """
    if node.op.outdim != 1:
        return None
    reshape_op = GpuReshape(node.op.outdim, None)
    # Reshape to (-1,), i.e. a flat vector.
    return reshape_op(node.inputs[0], theano.tensor.constant([-1]))
@register_opt()
@op_lifter(tensor.Elemwise)
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
do_replace = False op = node.op
gpu_out = False name = op.name
# check for gpu_from_host(Elemwise)) and extract the Elemwise node if name:
if node.op == gpu_from_host: name = 'Gpu'+name
host_i, = node.inputs res = GpuElemwise(op.scalar_op, name=name,
if (host_i.owner and inplace_pattern=copy.copy(op.inplace_pattern),
isinstance(host_i.owner.op, tensor.Elemwise) and nfunc_spec=op.nfunc_spec)
len(host_i.clients) == 1): return res
node = host_i.owner
do_replace = True
gpu_out = True def max_inputs_to_GpuElemwise(node):
# check for elemwise(..., host_from_gpu, ...) ptr_size = 8
if isinstance(node.op, tensor.Elemwise): int_size = 4
if numpy.any([i.owner and
i.owner.op == host_from_gpu # we take the limit from CUDA for now
for i in node.inputs]): argument_limit = 232
do_replace = True ndim = node.inputs[0].type.ndim
if numpy.all([_is_scalar(i) # number of elements and shape
for i in node.inputs]): size_param_mandatory = (int_size * (ndim + 1)) + \
do_replace = False (ptr_size + int_size * ndim) * len(node.outputs)
if do_replace: nb_bytes_avail = argument_limit - size_param_mandatory
new_op = GpuElemwise(node.op.scalar_op) nb_bytes_per_input = ptr_size + ndim * int_size
gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs)) max_nb_inputs = nb_bytes_avail // nb_bytes_per_input
if gpu_out:
return [gpu_elemwise] return max_nb_inputs
else:
return [host_from_gpu(gpu_elemwise)] gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
else: GpuElemwise,
return False max_inputs_to_GpuElemwise)
optdb.register('gpua_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')
inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise)
optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
@register_opt()
@op_lifter(tensor.DimShuffle)
def local_gpua_dimshuffle(node):
    """op_lifter callback: replace a DimShuffle with its GPU counterpart."""
    op = node.op
    return GpuDimShuffle(op.input_broadcastable, op.new_order)
@register_opt()
@op_lifter(tensor.SpecifyShape)
def local_gpua_specifyShape(node):
    # Return the CPU op itself; presumably specify_shape works directly
    # on GPU variables so no dedicated GPU op is needed -- confirm.
    return tensor.specify_shape
@register_opt()
@op_lifter(tensor.Subtensor)
def local_gpua_subtensor(node):
    """op_lifter callback: lift Subtensor to GpuSubtensor, same indices."""
    indices = node.op.idx_list
    return GpuSubtensor(indices)
@register_opt()
@op_lifter(tensor.CAReduce)
def local_gpua_careduce(node):
    """op_lifter callback: lift add/mul CAReduce ops to GpuCAReduce.

    Only sum and product reductions are supported; any other scalar op
    implicitly returns None, leaving the reduction on the CPU.
    """
    # Idiom: one isinstance with a tuple instead of two chained calls.
    if isinstance(node.op.scalar_op, (scalar.basic.Add, scalar.basic.Mul)):
        return GpuCAReduce(node.op.scalar_op, axis=node.op.axis,
                           dtype=getattr(node.op, 'dtype', None),
                           acc_dtype=getattr(node.op, 'acc_dtype', None))
@register_opt()
@op_lifter(tensor.blas.Gemv)
def local_gpua_gemv(node):
    """op_lifter callback: lift Gemv to GpuGemv, keeping the inplace flag."""
    inplace = node.op.inplace
    return GpuGemv(inplace=inplace)
@register_opt()
@op_lifter(tensor.blas_c.CGemv)
def local_gpua_gemv2(node):
    """op_lifter callback: lift the C-accelerated CGemv to GpuGemv too."""
    inplace = node.op.inplace
    return GpuGemv(inplace=inplace)
@register_opt()
@op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node):
    """op_lifter callback: lift Gemm to GpuGemm, keeping the inplace flag."""
    inplace = node.op.inplace
    return GpuGemm(inplace=inplace)
import StringIO
import numpy
import theano
from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
class GpuSubtensor(HideC, Subtensor):
    """GPU version of Subtensor: same indexing semantics, GPU output.

    Reuses tensor.Subtensor's make_node for index validation, then swaps
    the output type for a GpuArrayType.  The C implementation normalizes
    python-style slices and delegates the view creation to pygpu_index().
    """

    def make_node(self, x, *inputs):
        # Let the CPU Subtensor validate/canonicalize the index inputs,
        # then rebuild the Apply with a GPU output type.
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable)
        x = as_gpuarray_variable(x)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        out, = out_
        x = inputs[0]

        # Fast path: reuse the index tuple cached on a previous call.
        # NOTE(review): perform_cache_cdata is presumably initialized by
        # the parent Subtensor class -- confirm.
        if self.perform_cache_cdata is not None:
            out[0] = x.__getitem__(self.perform_cache_cdata)
            return

        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            cdata = cdata[0]
        # Cache only when the index is fully constant (x is the sole input).
        if len(inputs) == 1:
            self.perform_cache_cdata = cdata

        out[0] = x.__getitem__(cdata)

    def c_support_code(self):
        # C helper that normalizes a python-style slice (negative bounds,
        # missing start/stop/step flagged by *_n) for a dim of length len.
        return """
        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
                               int start_n, int stop_n, int step_n,
                               size_t len) {
            if (step_n) *step = 1;
            if (*step == 0) {
                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
                return -1;
            }
            if (start_n) *start = (*step < 0) ? len-1 : 0;
            else {
                if (*start < 0) *start += len;
                if (*start < 0) *start = (*step < 0) ? -1 : 0;
                if (*start >= len) *start = (*step < 0) ? len-1 : len;
            }
            if (stop_n) *stop = (*step < 0) ? -1 : len;
            else {
                if (*stop < 0) *stop += len;
                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
                if (*stop >= len) *stop = (*step < 0) ? len-1 : len;
            }
            if (*stop < *start && *step > 0)
                *stop = *start;
            return 0;
        }
        """

    def c_code(self, node, name, inputs, outputs, sub):
        inp_ndim = node.inputs[0].ndim
        inp = inputs[0]
        indices = inputs[1:]

        # pad out the index list to the same dimension as the input
        idx_list = self.idx_list + \
            ((slice(None),) * (inp_ndim - len(self.idx_list)))

        # This case fails when we use pygpu_index(), so here is some
        # special code
        if len(idx_list) == 0:
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) { %(fail)s }
        """ % dict(out=outputs[0], inp=inp, fail=sub['fail'])

        sio = StringIO.StringIO()
        print >> sio, """
        ssize_t starts[%(sz)s];
        ssize_t stops[%(sz)s];
        ssize_t steps[%(sz)s];
        ssize_t cur;
        int err;

        if (%(inp)s->ga.nd != %(sz)s) {
            PyErr_SetString(PyExc_IndexError, "invalid index");
            %(fail)s
        }
        """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail'])

        def fix_idx(idx):
            # Translate one slice component into (C expression, missing?).
            if idx is None:
                return "0", 1
            elif isinstance(idx, (numpy.integer, int)):
                return str(idx), 0
            elif isinstance(idx, gof.Type):
                # Runtime index: consume the next C input variable name.
                return indices.pop(0), 0
            else:
                assert 0, idx

        for i, idx in enumerate(idx_list):
            if isinstance(idx, slice):
                start, start_n = fix_idx(idx.start)
                stop, stop_n = fix_idx(idx.stop)
                step, step_n = fix_idx(idx.step)
                print >>sio, """
                starts[%(i)s] = %(start)s;
                stops[%(i)s] = %(stop)s;
                steps[%(i)s] = %(step)s;
                if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
                                %(start_n)s, %(stop_n)s, %(step_n)s,
                                %(inp)s->ga.dimensions[%(i)s]) == -1) {
                    %(fail)s
                }
                """ % dict(i=i, start=start, stop=stop, step=step,
                           start_n=start_n, stop_n=stop_n, step_n=step_n,
                           fail=sub['fail'], inp=inp)
            else:
                # Scalar index: step 0 marks a dropped dimension.
                if isinstance(idx, gof.Type):
                    start = indices.pop(0)
                elif isinstance(idx, (numpy.integer, int)):
                    start = idx
                else:
                    assert 0, idx
                print >>sio, """
                cur = %(start)s;
                if (cur < 0)
                    cur += %(inp)s->ga.dimensions[%(i)s];
                starts[%(i)s] = cur;
                steps[%(i)s] = 0;
                """ % dict(i=i, start=start, fail=sub['fail'], inp=inp)

        print >>sio, """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
        if (!%(out)s) { %(fail)s }
        """ % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0])

        return sio.getvalue()

    def c_code_cache_version(self):
        return (5,)
...@@ -5,8 +5,9 @@ from copy import copy, deepcopy ...@@ -5,8 +5,9 @@ from copy import copy, deepcopy
import numpy import numpy
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.compile import DeepCopyOp from theano.tensor import TensorType
from theano.tensor.tests.test_basic import safe_make_node from theano.tensor.basic import alloc
from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from numpy.testing.noseclasses import KnownFailureTest from numpy.testing.noseclasses import KnownFailureTest
...@@ -33,7 +34,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType, ...@@ -33,7 +34,8 @@ from theano.sandbox.gpuarray.type import (GpuArrayType,
gpuarray_shared_constructor) gpuarray_shared_constructor)
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, gpu_from_cuda, gpu_alloc, gpu_from_cuda,
cuda_from_gpu) cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -42,11 +44,10 @@ rng = numpy.random.RandomState(seed=utt.fetch_seed()) ...@@ -42,11 +44,10 @@ rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray'\ mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
)
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
...@@ -88,139 +89,110 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs): ...@@ -88,139 +89,110 @@ def fake_shared(value, name=None, strict=False, allow_downcast=None, **kwargs):
def rand_gpuarray(*shape, **kwargs): def rand_gpuarray(*shape, **kwargs):
r = rng.rand(*shape) * 2 - 1 r = rng.rand(*shape) * 2 - 1
dtype = kwargs.pop('dtype', theano.config.floatX) dtype = kwargs.pop('dtype', theano.config.floatX)
cls = kwargs.pop('cls', None)
if len(kwargs) != 0: if len(kwargs) != 0:
raise TypeError('Unexpected argument %s', kwargs.keys()[0]) raise TypeError('Unexpected argument %s', kwargs.keys()[0])
return gpuarray.array(r, dtype=dtype) return gpuarray.array(r, dtype=dtype, cls=cls)
def makeTester(name, op, expected, good=None, bad_build=None, checks=None, def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
bad_runtime=None, mode=None, skip=False, eps=1e-10): mode_nogpu=mode_without_gpu, skip=False, eps=1e-10):
if good is None:
good = {}
if bad_build is None:
bad_build = {}
if bad_runtime is None:
bad_runtime = {}
if checks is None: if checks is None:
checks = {} checks = {}
_op = op _op = op
_expected = expected _gpu_op = gpu_op
_good = good _cases = cases
_bad_build = bad_build
_bad_runtime = bad_runtime
_skip = skip _skip = skip
_checks = checks _checks = checks
class Checker(unittest.TestCase): class Checker(unittest.TestCase, utt.TestOptimizationMixin):
op = staticmethod(_op) op = staticmethod(_op)
expected = staticmethod(_expected) gpu_op = staticmethod(_gpu_op)
good = _good cases = _cases
bad_build = _bad_build
bad_runtime = _bad_runtime
skip = _skip skip = _skip
checks = _checks checks = _checks
def setUp(self): def setUp(self):
eval(self.__class__.__module__ + '.' + self.__class__.__name__) eval(self.__class__.__module__ + '.' + self.__class__.__name__)
def test_good(self): def test_all(self):
if skip: if skip:
raise SkipTest(skip) raise SkipTest(skip)
for testname, inputs in good.items(): for testname, inputs in cases.items():
inputs = [copy(input) for input in inputs] self.run_case(testname, inputs)
inputrs = [fake_shared(input) for input in inputs]
try: def run_case(self, testname, inputs):
node = safe_make_node(self.op, *inputrs) inputs_ref = [theano.shared(inp) for inp in inputs]
except Exception, exc: inputs_tst = [theano.shared(inp) for inp in inputs]
err_msg = ("Test %s::%s: Error occured while making "
"a node with inputs %s") % (self.op, testname,
inputs)
exc.args += (err_msg,)
raise
try: try:
f = inplace_func([], node.outputs, mode=mode, node_ref = safe_make_node(self.op, *inputs_ref)
name='test_good') node_tst = safe_make_node(self.op, *inputs_tst)
except Exception, exc: except Exception, exc:
err_msg = ("Test %s::%s: Error occured while trying to " err_msg = ("Test %s::%s: Error occured while making "
"make a Function") % (self.op, testname) "a node with inputs %s") % (self.gpu_op, testname,
exc.args += (err_msg,) inputs)
raise exc.args += (err_msg,)
raise
if isinstance(self.expected, dict) and \ try:
testname in self.expected: f_ref = inplace_func([], node_ref.outputs, mode=mode_nogpu)
expecteds = self.expected[testname] f_tst = inplace_func([], node_tst.outputs, mode=mode_gpu)
else: except Exception, exc:
expecteds = self.expected(*inputs) err_msg = ("Test %s::%s: Error occured while trying to "
"make a Function") % (self.gpu_op, testname)
if not isinstance(expecteds, (list, tuple)): exc.args += (err_msg,)
expecteds = (expecteds,) raise
try:
variables = f()
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while calling "
"the Function on the inputs %s") % (self.op,
testname,
inputs)
exc.args += (err_msg,)
raise
for i, (variable, expected) in \ self.assertFunctionContains1(f_tst, self.gpu_op)
enumerate(izip(variables, expecteds)):
if variable.dtype != expected.dtype or \
variable.shape != expected.shape or \
not GpuArrayType.values_eq_approx(variable,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
for description, check in self.checks.items():
if not check(inputs, variables):
self.fail(("Test %s::%s: Failed check: %s "
"(inputs were %s, ouputs were %s)") %
(self.op, testname, description,
inputs, variables))
def test_bad_build(self):
if skip:
raise SkipTest(skip)
for testname, inputs in self.bad_build.items():
inputs = [copy(input) for input in inputs]
inputrs = [fake_shared(input) for input in inputs]
self.assertRaises(Exception, safe_make_node, self.op, *inputrs)
def test_bad_runtime(self): ref_e = None
if skip: try:
raise SkipTest(skip) expecteds = f_ref()
for testname, inputs in self.bad_runtime.items(): except Exception, exc:
inputrs = [fake_shared(input) for input in inputs] ref_e = exc
try:
node = safe_make_node(self.op, *inputrs)
except Exception, exc:
err_msg = ("Test %s::%s: Error occured while trying to "
"make a node with inputs %s") % (self.op,
testname,
inputs)
exc.args += (err_msg,)
raise
try: try:
f = inplace_func([], node.outputs, mode=mode, variables = f_tst()
name="test_bad_runtime") except Exception, exc:
except Exception, exc: if ref_e is None:
err_msg = ("Test %s::%s: Error occured while trying to " err_msg = ("Test %s::%s: exception when calling the "
"make a Function") % (self.op, testname) "Function") % (self.gpu_op, testname)
exc.args += (err_msg,) exc.args += (err_msg,)
raise raise
else:
self.assertRaises(Exception, f, []) # if we raised an exception of the same type we're good.
if isinstance(exc, type(ref_e)):
return
else:
err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference "
"call (got: %s, expected %s)") % \
(self.gpu_op, testname, type(exc),
type(ref_e))
exc.args += (err_msg,)
raise
for i, (variable, expected) in \
enumerate(izip(variables, expecteds)):
if variable.dtype != expected.dtype or \
variable.shape != expected.shape or \
not TensorType.values_eq_approx(variable,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
for description, check in self.checks.items():
if not check(inputs, variables):
self.fail(("Test %s::%s: Failed check: %s "
"(inputs were %s, ouputs were %s)") %
(self.op, testname, description,
inputs, variables))
Checker.__name__ = name Checker.__name__ = name
return Checker return Checker
...@@ -300,31 +272,37 @@ def gpu_alloc_expected(x, *shp): ...@@ -300,31 +272,37 @@ def gpu_alloc_expected(x, *shp):
GpuAllocTester = makeTester( GpuAllocTester = makeTester(
name="GpuAllocTester", name="GpuAllocTester",
op=gpu_alloc, op=alloc,
expected=gpu_alloc_expected, gpu_op=gpu_alloc,
good=dict( cases=dict(
correct01=(rand_gpuarray(), numpy.int32(7)), correct01=(rand(), numpy.int32(7)),
correct01_bcast=(rand_gpuarray(1), numpy.int32(7)), # just gives a DeepCopyOp with possibly wrong results on the CPU
correct02=(rand_gpuarray(), numpy.int32(4), numpy.int32(7)), # correct01_bcast=(rand(1), numpy.int32(7)),
correct12=(rand_gpuarray(7), numpy.int32(4), numpy.int32(7)), correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct13=(rand_gpuarray(7), numpy.int32(2), numpy.int32(4), correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4),
numpy.int32(7)),
correct23=(rand(4, 7), numpy.int32(2), numpy.int32(4),
numpy.int32(7)), numpy.int32(7)),
correct23=(rand_gpuarray(4, 7), numpy.int32(2), numpy.int32(4), bad_shape12=(rand(7), numpy.int32(7), numpy.int32(5)),
numpy.int32(7))
),
bad_runtime=dict(
bad_shape12=(rand_gpuarray(7), numpy.int32(7), numpy.int32(5)),
) )
) )
def test_deep_copy():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
f = theano.function([g], g)
assert isinstance(f.maker.fgraph.toposort()[0].op, DeepCopyOp)
res = f(a)
assert GpuArrayType.values_eq(res, a) class G_reshape(T_reshape):
def shortDescription(self):
    # Return None so nose shows the test name rather than a docstring.
    return None
def __init__(self, name):
    # Run the CPU reshape test suite (T_reshape) against the GPU op,
    # ignoring transfer/bookkeeping ops when inspecting the graph.
    T_reshape.__init__(self, name,
                       shared=gpuarray_shared_constructor,
                       op=GpuReshape,
                       mode=mode_with_gpu,
                       # avoid errors with limited devices
                       # dtype='float32',
                       ignore_topo=(HostFromGpu, GpuFromHost,
                                    theano.compile.DeepCopyOp,
                                    theano.sandbox.gpuarray.elemwise.GpuElemwise,
                                    theano.tensor.opt.Shape_i,
                                    theano.tensor.opt.MakeVector))
    assert self.op == GpuReshape
from unittest import TestCase
from theano.tensor.blas import gemv_inplace, gemm_inplace
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace)
# Compare the in-place CPU gemv against its GPU counterpart on a set of
# shapes, including a strided case; zero-size cases are disabled below.
GpuGemvTester = makeTester('GpuGemvTester',
                           op=gemv_inplace, gpu_op=gpugemv_inplace,
                           cases=dict(
                               dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
                               dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
                               # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
                               # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
                               # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
                               test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
                           )
                           )

# Sweep alpha/beta sign combinations for gemm: C = alpha*A.B + beta*C.
GpuGemmTester = makeTester('GpuGemmTester',
                           op=gemm_inplace, gpu_op=gpugemm_inplace,
                           cases=dict(
                               test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
                               test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
                               test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
                               test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
                               test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
                               test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
                               test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
                               test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0],
                               test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0],
                           )
                           )
import unittest
from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce)
from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduce)
from theano.sandbox.gpuarray.type import GpuArrayType
from pygpu.array import gpuarray
# This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_Broadcast):
    # Runs the CPU Broadcast (Elemwise) test suite with the GPU op/type.
    op = GpuElemwise
    type = GpuArrayType

    def rand_val(self, shp):
        # Python 2 forbids `f(*args, kw=...)`, hence the **dict(...) form.
        return rand_gpuarray(*shp, **dict(cls=gpuarray))

    # no c_code() yet
    #cop = GpuElemwise
    #ctype = GpuArrayType
    #def rand_cval(self, shp):
    #    return rand_gpuarray(*shp, **dict(cls=gpuarray))
class test_GpuDimShuffle(test_DimShuffle):
    # Reuse the CPU DimShuffle test suite with the GPU op.
    op = GpuDimShuffle
class test_GpuCAReduce(test_CAReduce):
    # Run the CPU CAReduce test suite against GpuCAReduce.  Only the
    # perform (python) path is exercised; the C path is skipped below.
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduce
    reds = [scalar.add, scalar.mul]

    def test_perform(self):
        for dt in self.dtypes + self.bin_dtypes:
            for red_op in self.reds:
                self.with_linker(gof.PerformLinker(), red_op, dtype=dt)

    def test_perform_nan(self):
        # nan handling is only checked for the float dtypes.
        for dt in self.dtypes:
            for red_op in self.reds:
                self.with_linker(gof.PerformLinker(), red_op, dtype=dt,
                                 test_nan=True)

    def test_c(self):
        raise SkipTest("no C code")

    def test_c_nan(self):
        raise SkipTest("no C code")
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape
import theano.sandbox.gpuarray
# Skip the whole module when the pygpu backend is unavailable.
# NOTE(review): SkipTest is not imported in this chunk's visible imports --
# confirm an earlier import provides it, otherwise this raises NameError.
if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")

import theano.sandbox.cuda as cuda_ndarray
# If the old cuda backend is available but pygpu was never activated,
# piggy-back on cuda's device selection to initialize pygpu.
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
        cuda_ndarray.use('gpu')
    theano.sandbox.gpuarray.init_dev('cuda')

if not theano.sandbox.gpuarray.pygpu_activated:
    raise SkipTest("pygpu disabled")

# Compile modes that include the gpuarray optimizations but exclude the
# legacy 'gpu' (cuda) ones, plus a gpuarray-free reference mode.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_flatten():
    """Flattening on the GPU matches numpy and compiles to GpuReshape."""
    x = theano.tensor.fmatrix()
    fn = theano.function([x], x.flatten(), mode=mode_with_gpu)
    data = numpy.random.rand(10, 11).astype("float32")
    out = fn(data)
    expected = data.flatten()
    utt.assert_allclose(out, expected)
    assert out.shape == expected.shape
    # The compiled graph must contain the GPU reshape op.
    assert any(type(node.op) is GpuReshape
               for node in fn.maker.fgraph.toposort())
from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
from theano.compile import DeepCopyOp
from theano import tensor
class G_subtensor(T_subtensor):
    # Run the CPU Subtensor test suite with the GPU op, GPU shared
    # constructor and GPU compile mode.

    def shortDescription(self):
        return None

    def __init__(self, name):
        T_subtensor.__init__(
            self, name,
            shared=gpuarray_shared_constructor,
            sub=GpuSubtensor,
            mode=mode_with_gpu,
            # avoid errors with limited devices
            dtype='float32',
            ignore_topo=(HostFromGpu, GpuFromHost, DeepCopyOp))
        assert self.sub == GpuSubtensor
import operator
import theano
from theano.compile import DeepCopyOp
from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
def test_deep_copy():
    """An identity function on a GPU variable compiles to DeepCopyOp and
    preserves the values."""
    val = rand_gpuarray(20, dtype='float32')
    var = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
    fn = theano.function([var], var)
    topo = fn.maker.fgraph.toposort()
    assert isinstance(topo[0].op, DeepCopyOp)
    out = fn(val)
    assert GpuArrayType.values_eq(out, val)
import numpy import numpy
import theano import theano
from theano.tensor.var import _tensor_py_operators
from theano import Type, Variable, Constant, tensor, config, scalar from theano import Type, Variable, Constant, tensor, config, scalar
from theano.compile import SharedVariable from theano.compile import SharedVariable
...@@ -26,7 +27,10 @@ class GpuArrayType(Type): ...@@ -26,7 +27,10 @@ class GpuArrayType(Type):
except gpuarray.GpuArrayException: except gpuarray.GpuArrayException:
raise TypeError("Unsupported dtype for %s: %s" % raise TypeError("Unsupported dtype for %s: %s" %
(self.__class__.__name__, self.dtype)) (self.__class__.__name__, self.dtype))
def __str__(self):
return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
def filter(self, data, strict=False, allow_downcast=None): def filter(self, data, strict=False, allow_downcast=None):
if strict: if strict:
if not isinstance(data, gpuarray.GpuArray): if not isinstance(data, gpuarray.GpuArray):
...@@ -103,8 +107,8 @@ class GpuArrayType(Type): ...@@ -103,8 +107,8 @@ class GpuArrayType(Type):
return GpuArrayType.values_eq(a, b) return GpuArrayType.values_eq(a, b)
else: else:
res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'), res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
op_tmpl="res[i] = ((%(a)s - %(b)s) <" \ op_tmpl="res[i] = ((%(a)s - %(b)s) <"
"(1e-8 + 1e-5 * fabs(%(b)s)))") "(1e-8 + 1e-5 * fabs(%(b)s)))")
return numpy.asarray(res).all() return numpy.asarray(res).all()
def value_zeros(self, shape): def value_zeros(self, shape):
...@@ -134,7 +138,7 @@ class GpuArrayType(Type): ...@@ -134,7 +138,7 @@ class GpuArrayType(Type):
return numpy.dtype(self.dtype).itemsize return numpy.dtype(self.dtype).itemsize
def c_declare(self, name, sub): def c_declare(self, name, sub):
return "GpuArrayObject *%s;" % (name,) return "PyGpuArrayObject *%s;" % (name,)
def c_init(self, name, sub): def c_init(self, name, sub):
return "%s = NULL;" % (name,) return "%s = NULL;" % (name,)
...@@ -149,17 +153,17 @@ class GpuArrayType(Type): ...@@ -149,17 +153,17 @@ class GpuArrayType(Type):
} }
/* First check if we are the base type exactly (the most common case), /* First check if we are the base type exactly (the most common case),
then do the full subclass check if needed. */ then do the full subclass check if needed. */
if (py_%(name)s->ob_type != &GpuArrayType && if (py_%(name)s->ob_type != &PyGpuArrayType &&
!PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) { !PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
PyErr_SetString(PyExc_ValueError, "expected a GpuArray"); PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
%(fail)s %(fail)s
} }
%(name)s = (GpuArrayObject *)py_%(name)s; %(name)s = (PyGpuArrayObject *)py_%(name)s;
Py_INCREF(%(name)s); Py_INCREF(%(name)s);
""" % {'name': name, 'fail': sub['fail']} """ % {'name': name, 'fail': sub['fail']}
def c_cleanup(self, name, sub): def c_cleanup(self, name, sub):
return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name } return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name}
def c_sync(self, name, sub): def c_sync(self, name, sub):
return """ return """
...@@ -184,7 +188,8 @@ class GpuArrayType(Type): ...@@ -184,7 +188,8 @@ class GpuArrayType(Type):
# We need arrayobject for the PyArrayDescr struct def # We need arrayobject for the PyArrayDescr struct def
# (even if we just use a pointer to it in a function def) # (even if we just use a pointer to it in a function def)
return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>', return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
'<numpy/arrayobject.h>', '<gpuarray_api.h>'] '<compyte/buffer_blas.h>', '<numpy/arrayobject.h>',
'<gpuarray_api.h>']
def c_header_dirs(self): def c_header_dirs(self):
return [pygpu.get_include(), numpy.get_include()] return [pygpu.get_include(), numpy.get_include()]
...@@ -193,10 +198,13 @@ class GpuArrayType(Type): ...@@ -193,10 +198,13 @@ class GpuArrayType(Type):
return ['compyte'] return ['compyte']
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) ver = pygpu.gpuarray.api_version()
# we only use the major version since the minor revision are
# API-compatible.
return (1, ver[0])
class _operators(tensor.basic._tensor_py_operators): class _operators(_tensor_py_operators):
def _as_TensorVariable(self): def _as_TensorVariable(self):
from basic_ops import host_from_gpu from basic_ops import host_from_gpu
return host_from_gpu(self) return host_from_gpu(self)
...@@ -204,10 +212,6 @@ class _operators(tensor.basic._tensor_py_operators): ...@@ -204,10 +212,6 @@ class _operators(tensor.basic._tensor_py_operators):
def _as_GpuArrayVariable(self): def _as_GpuArrayVariable(self):
return self return self
dtype = property(lambda s: s.type.dtype)
broadcastable = property(lambda s: s.type.broadcastable)
ndim = property(lambda s: s.type.ndim)
class GpuArrayVariable(_operators, Variable): class GpuArrayVariable(_operators, Variable):
pass pass
...@@ -276,12 +280,6 @@ theano.compile.register_view_op_c_code(GpuArrayType, """ ...@@ -276,12 +280,6 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """ theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context()); %(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
if (!%(oname)s) { %(fail)s } if (!%(oname)s) { %(fail)s }
int err; """, version=(5,))
err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error during copy");
%(fail)s
}
""", version=(1,))
...@@ -3018,7 +3018,7 @@ class Composite(ScalarOp): ...@@ -3018,7 +3018,7 @@ class Composite(ScalarOp):
rval = [] rval = []
for subnode in self.fgraph.toposort(): for subnode in self.fgraph.toposort():
try: try:
rval.append(subnode.op.c_support_code()) rval.append(subnode.op.c_support_code().strip())
except gof.utils.MethodNotDefined: except gof.utils.MethodNotDefined:
pass pass
# remove duplicate code blocks # remove duplicate code blocks
......
...@@ -143,10 +143,6 @@ class DimShuffle(Op): ...@@ -143,10 +143,6 @@ class DimShuffle(Op):
# list of dimensions of the input to drop # list of dimensions of the input to drop
self.drop = [] self.drop = []
# this maps i before dropping dimensions to j after dropping dimensions
# so self.shuffle can be set properly later on
i2j = {}
j = 0
for i, b in enumerate(input_broadcastable): for i, b in enumerate(input_broadcastable):
if i not in new_order: if i not in new_order:
# we want to drop this dimension because it's not a value in # we want to drop this dimension because it's not a value in
...@@ -158,14 +154,9 @@ class DimShuffle(Op): ...@@ -158,14 +154,9 @@ class DimShuffle(Op):
raise ValueError( raise ValueError(
"You cannot drop a non-broadcastable dimension.", "You cannot drop a non-broadcastable dimension.",
(input_broadcastable, new_order)) (input_broadcastable, new_order))
else:
i2j[i] = j
j += 1
# transposition of non-broadcastable dimensions # this is the list of the original dimensions that we keep
# This is how the dimensions will be permuted, without accounting for self.shuffle = [x for x in new_order if x != 'x']
# the extra 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not # list of dimensions of the output that are broadcastable and were not
# in the original input # in the original input
...@@ -237,16 +228,12 @@ class DimShuffle(Op): ...@@ -237,16 +228,12 @@ class DimShuffle(Op):
res = input res = input
if type(res) != numpy.ndarray and type(res) != numpy.memmap: if type(res) != numpy.ndarray and type(res) != numpy.memmap:
raise TypeError(res) raise TypeError(res)
shape = list(res.shape)
for drop in reversed(self.drop):
shape.pop(drop)
res = res.reshape(shape)
# transpose # transpose
res = res.transpose(self.shuffle) res = res.transpose(self.shuffle+self.drop)
# augment # augment
shape = list(res.shape) shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment: for augm in self.augment:
shape.insert(augm, 1) shape.insert(augm, 1)
res = res.reshape(shape) res = res.reshape(shape)
...@@ -259,9 +246,6 @@ class DimShuffle(Op): ...@@ -259,9 +246,6 @@ class DimShuffle(Op):
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
ishp, = shapes ishp, = shapes
ishp = list(ishp)
for drop in reversed(self.drop):
del ishp[drop]
# transpose # transpose
rval = [ishp[i] for i in self.shuffle] rval = [ishp[i] for i in self.shuffle]
......
...@@ -410,9 +410,9 @@ def local_dimshuffle_lift(node): ...@@ -410,9 +410,9 @@ def local_dimshuffle_lift(node):
inode = input.owner inode = input.owner
if inode and isinstance(inode.op, Elemwise) and (len(input.clients) == 1): if inode and isinstance(inode.op, Elemwise) and (len(input.clients) == 1):
# Don't use make_node to have tag.test_value set. # Don't use make_node to have tag.test_value set.
ret = inode.op(*[DimShuffle(inp.type.broadcastable, ret = inode.op(*[op.__class__(inp.type.broadcastable,
op.new_order, op.new_order,
op.inplace)(inp) for inp in op.inplace)(inp) for inp in
inode.inputs], **dict(return_list=True)) inode.inputs], **dict(return_list=True))
return ret return ret
if inode and isinstance(inode.op, DimShuffle): if inode and isinstance(inode.op, DimShuffle):
...@@ -424,8 +424,8 @@ def local_dimshuffle_lift(node): ...@@ -424,8 +424,8 @@ def local_dimshuffle_lift(node):
iinput.type.ndim): iinput.type.ndim):
return [iinput] return [iinput]
else: else:
ret = DimShuffle(iinput.type.broadcastable, new_order, ret = op.__class__(iinput.type.broadcastable, new_order,
inplace)(iinput, **dict(return_list=True)) inplace)(iinput, **dict(return_list=True))
return ret return ret
...@@ -460,7 +460,7 @@ def dimshuffle_as_view(node): ...@@ -460,7 +460,7 @@ def dimshuffle_as_view(node):
op = node.op op = node.op
if not isinstance(op, DimShuffle) or op.inplace: if not isinstance(op, DimShuffle) or op.inplace:
return False return False
new_op = DimShuffle(op.input_broadcastable, op.new_order, inplace=True) new_op = op.__class__(op.input_broadcastable, op.new_order, inplace=True)
return [new_op(*node.inputs)] return [new_op(*node.inputs)]
#Step 60 is the inplace optimization stage. #Step 60 is the inplace optimization stage.
...@@ -4609,7 +4609,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024): ...@@ -4609,7 +4609,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
# worthwhile if the summation axis doesn't line up with a # worthwhile if the summation axis doesn't line up with a
# contiguous dimension) # contiguous dimension)
if not isinstance(node.op, OP): if type(node.op) is not OP:
return False return False
inputs = [] # inputs of the new Elemwise op. inputs = [] # inputs of the new Elemwise op.
s_inputs = [] # inputs of the new scalar op used by the Composite. s_inputs = [] # inputs of the new scalar op used by the Composite.
......
...@@ -44,7 +44,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as, ...@@ -44,7 +44,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
dtensor3, SpecifyShape, Mean, dtensor3, SpecifyShape, Mean,
itensor3, Tile, switch, Diagonal, Diag, itensor3, Tile, switch, Diagonal, Diag,
nonzero, flatnonzero, nonzero_values, nonzero, flatnonzero, nonzero_values,
stacklists) stacklists, DimShuffle)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
...@@ -4204,9 +4204,30 @@ class T_op_cache(unittest.TestCase): ...@@ -4204,9 +4204,30 @@ class T_op_cache(unittest.TestCase):
self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a))) self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a)))
class T_reshape(unittest.TestCase): class T_reshape(utt.InferShapeTester, utt.TestOptimizationMixin):
def setUp(self): def __init__(self, name, shared=tensor._shared, op=Reshape, mode=None,
utt.seed_rng() ignore_topo=(DeepCopyOp, opt.MakeVector,
opt.Shape_i, DimShuffle, theano.tensor.Elemwise)):
self.shared = shared
self.op = op
#The tag canonicalize is needed for the shape test in FAST_COMPILE
self.mode = mode
self.ignore_topo = ignore_topo
return super(T_reshape, self).__init__(name)
def function(self, inputs, outputs):
f = function(inputs, outputs, mode=self.mode)
if self.mode is not None or theano.config.mode != "FAST_COMPILE":
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op,
self.ignore_topo)]
assert len(topo_) == 1, topo_
return f
def eval_output_and_check(self, t):
f = self.function([], t)
tval = f()
return tval
def test_reshape(self): def test_reshape(self):
a = dvector() a = dvector()
...@@ -4215,7 +4236,7 @@ class T_reshape(unittest.TestCase): ...@@ -4215,7 +4236,7 @@ class T_reshape(unittest.TestCase):
#basic to 1 dim(without list) #basic to 1 dim(without list)
c = reshape(b, as_tensor_variable(6), ndim=1) c = reshape(b, as_tensor_variable(6), ndim=1)
f = inplace_func([b], c) f = self.function([b], c)
b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]]) b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]])
c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5]) c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5])
...@@ -4231,7 +4252,7 @@ class T_reshape(unittest.TestCase): ...@@ -4231,7 +4252,7 @@ class T_reshape(unittest.TestCase):
#basic to 1 dim(with list) #basic to 1 dim(with list)
c = reshape(b, (as_tensor_variable(6),), ndim=1) c = reshape(b, (as_tensor_variable(6),), ndim=1)
f = inplace_func([b], c) f = self.function([b], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) == assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
numpy.asarray([0, 1, 2, 3, 4, 5])) numpy.asarray([0, 1, 2, 3, 4, 5]))
#print f.maker.fgraph.toposort() #print f.maker.fgraph.toposort()
...@@ -4239,14 +4260,14 @@ class T_reshape(unittest.TestCase): ...@@ -4239,14 +4260,14 @@ class T_reshape(unittest.TestCase):
#basic to shape object of same ndim #basic to shape object of same ndim
c = reshape(b, d.shape) c = reshape(b, d.shape)
f = inplace_func([b, d], c) f = self.function([b, d], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]), assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]),
[[0, 1], [2, 3], [4, 5]]) == [[0, 1], [2, 3], [4, 5]]) ==
numpy.asarray([[0, 1], [2, 3], [4, 5]])) numpy.asarray([[0, 1], [2, 3], [4, 5]]))
#basic to 2 dims #basic to 2 dims
c = reshape(a, [2, 3]) c = reshape(a, [2, 3])
f = inplace_func([a], c) f = self.function([a], c)
assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) == assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) ==
numpy.asarray([[0, 1, 2], [3, 4, 5]])) numpy.asarray([[0, 1, 2], [3, 4, 5]]))
...@@ -4255,7 +4276,7 @@ class T_reshape(unittest.TestCase): ...@@ -4255,7 +4276,7 @@ class T_reshape(unittest.TestCase):
a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5]) a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5])
b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]]) b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]])
f_sub = inplace_func([a, b], c - b) f_sub = self.function([a, b], c - b)
assert numpy.all(f_sub(a_val, b_val) == 0.0) assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(a_val == a_val_copy) assert numpy.all(a_val == a_val_copy)
...@@ -4264,35 +4285,33 @@ class T_reshape(unittest.TestCase): ...@@ -4264,35 +4285,33 @@ class T_reshape(unittest.TestCase):
a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64') a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64') b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
f_sub = inplace_func([a, b], c - b) f_sub = self.function([a, b], c - b)
assert numpy.all(f_sub(a_val, b_val) == 0.0) assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(a_val == a_val_copy) assert numpy.all(a_val == a_val_copy)
# verify gradient # verify gradient
def just_vals(v): def just_vals(v):
return Reshape(2)(v, theano._asarray([2, 3], dtype='int32')) return Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
utt.verify_grad(just_vals, [a_val]) utt.verify_grad(just_vals, [a_val], mode=self.mode)
#test infer_shape #test infer_shape
f_sub = function([a, b], (c - b).shape) self._compile_and_check([a], [c], (a_val,), self.op)
if config.mode == "FAST_COMPILE":
assert len(f_sub.maker.fgraph.toposort()) == 3
else:
topo = f_sub.maker.fgraph.toposort()
assert len(topo) == 1
topo[0].op == theano.compile.function_module.deep_copy_op
#assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5]]))==[2,3])#work in FAST_RUN, but fail on other!
#assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5],[6,7]]))==[2,3])#work in FAST_RUN, but fail on other!
# test broadcast flag for constant value of 1 # test broadcast flag for constant value of 1
c = reshape(b, (b.shape[0], b.shape[1], 1)) c = reshape(b, (b.shape[0], b.shape[1], 1))
f = inplace_func([b], c) f = self.function([b], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) == assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]])) numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]]))
assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable == assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable ==
(False, False, True)) (False, False, True))
assert numpy.all(f_sub(a_val, b_val) == [2, 3]) def test_m1(self):
t = tensor3()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
val = rng.uniform(size=(3, 4, 5)).astype(config.floatX)
for out in [t.reshape([-1]), t.reshape([-1, 5]),
t.reshape([5, -1]), t.reshape([5, -1, 3])]:
self._compile_and_check([t], [out], [val], self.op)
def test_reshape_long_in_shape(self): def test_reshape_long_in_shape(self):
v = dvector('v') v = dvector('v')
...@@ -4311,14 +4330,14 @@ class T_reshape(unittest.TestCase): ...@@ -4311,14 +4330,14 @@ class T_reshape(unittest.TestCase):
r = a.reshape(shapes, ndim=1) r = a.reshape(shapes, ndim=1)
z = zeros_like(r) z = zeros_like(r)
f = function([a, shapes], z.shape) f = self.function([a, shapes], z.shape)
self.assertRaises(ValueError, f, a_val, [13]) self.assertRaises(ValueError, f, a_val, [13])
#Test reshape to 2 dim #Test reshape to 2 dim
r = a.reshape(shapes, ndim=2) r = a.reshape(shapes, ndim=2)
z = zeros_like(r) z = zeros_like(r)
f = function([a, shapes], z.shape) f = self.function([a, shapes], z.shape)
self.assertRaises(ValueError, f, a_val, [-1, 5]) self.assertRaises(ValueError, f, a_val, [-1, 5])
self.assertRaises(ValueError, f, a_val, [7, -1]) self.assertRaises(ValueError, f, a_val, [7, -1])
......
...@@ -11,7 +11,7 @@ from theano.gof.python25 import all, any ...@@ -11,7 +11,7 @@ from theano.gof.python25 import all, any
from theano import gof, scalar, config from theano import gof, scalar, config
from theano import tensor from theano import tensor
from theano.tensor import TensorType from theano.tensor import TensorType, as_tensor_variable
from theano.compile.mode import get_default_mode from theano.compile.mode import get_default_mode
from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle, from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
Prod, ProdWithoutZeros) Prod, ProdWithoutZeros)
...@@ -24,6 +24,7 @@ def FunctionGraph(i, o): ...@@ -24,6 +24,7 @@ def FunctionGraph(i, o):
class test_DimShuffle(unittest_tools.InferShapeTester): class test_DimShuffle(unittest_tools.InferShapeTester):
op = DimShuffle
def with_linker(self, linker): def with_linker(self, linker):
for xsh, shuffle, zsh in [((2, 3), (1, 'x', 0), (3, 1, 2)), for xsh, shuffle, zsh in [((2, 3), (1, 'x', 0), (3, 1, 2)),
...@@ -38,12 +39,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester): ...@@ -38,12 +39,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
((1,), ('x', 'x'), (1, 1))]: ((1,), ('x', 'x'), (1, 1))]:
ib = [(entry == 1) for entry in xsh] ib = [(entry == 1) for entry in xsh]
x = TensorType('float64', ib)('x') x = TensorType('float64', ib)('x')
e = DimShuffle(ib, shuffle)(x) e = self.op(ib, shuffle)(x)
f = copy(linker).accept(FunctionGraph([x], [e])).make_function() f = copy(linker).accept(FunctionGraph([x], [e])).make_function()
assert f(numpy.ones(xsh)).shape == zsh assert f(numpy.ones(xsh)).shape == zsh
#test that DimShuffle.infer_shape work correctly #test that DimShuffle.infer_shape work correctly
x = TensorType('float64', ib)('x') x = TensorType('float64', ib)('x')
e = DimShuffle(ib, shuffle)(x) e = self.op(ib, shuffle)(x)
f = copy(linker).accept(FunctionGraph([x], [e. f = copy(linker).accept(FunctionGraph([x], [e.
shape])).make_function() shape])).make_function()
assert all(f(numpy.ones(xsh))) == all(zsh) assert all(f(numpy.ones(xsh))) == all(zsh)
...@@ -51,12 +52,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester): ...@@ -51,12 +52,12 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
# Test when we drop a axis that is not broadcastable # Test when we drop a axis that is not broadcastable
ib = [False, True, False] ib = [False, True, False]
x = TensorType('float64', ib)('x') x = TensorType('float64', ib)('x')
self.assertRaises(ValueError, DimShuffle, ib, shuffle) self.assertRaises(ValueError, self.op, ib, shuffle)
# Test when we drop a axis that don't have shape 1 # Test when we drop a axis that don't have shape 1
ib = [True, True, False] ib = [True, True, False]
x = TensorType('float64', ib)('x') x = TensorType('float64', ib)('x')
e = DimShuffle(ib, (1, 2))(x) e = self.op(ib, (1, 2))(x)
f = copy(linker).accept(FunctionGraph([x], [e.shape])).make_function() f = copy(linker).accept(FunctionGraph([x], [e.shape])).make_function()
self.assertRaises(TypeError, f, numpy.ones((2, 1, 4))) self.assertRaises(TypeError, f, numpy.ones((2, 1, 4)))
...@@ -89,8 +90,8 @@ class test_DimShuffle(unittest_tools.InferShapeTester): ...@@ -89,8 +90,8 @@ class test_DimShuffle(unittest_tools.InferShapeTester):
adtens = TensorType('float64', ib)('x') adtens = TensorType('float64', ib)('x')
adtens_val = numpy.ones(xsh) adtens_val = numpy.ones(xsh)
self._compile_and_check([adtens], self._compile_and_check([adtens],
[DimShuffle(ib, shuffle)(adtens)], [self.op(ib, shuffle)(adtens)],
[adtens_val], DimShuffle, [adtens_val], self.op,
warn=False) warn=False)
def test_too_big_rank(self): def test_too_big_rank(self):
...@@ -137,10 +138,23 @@ class test_reduce_axes(unittest.TestCase): ...@@ -137,10 +138,23 @@ class test_reduce_axes(unittest.TestCase):
m = x.var(a) m = x.var(a)
class test_Broadcast(unittest.TestCase): class test_Broadcast(unittest.TestCase):
# this is to allow other types to reuse this class to test their ops
type = TensorType
op = Elemwise
ctype = TensorType
cop = Elemwise
def rand_val(self, shp):
return numpy.asarray(numpy.random.rand(*shp))
def rand_cval(self, shp):
return numpy.asarray(numpy.random.rand(*shp))
def setUp(self): def setUp(self):
unittest_tools.seed_rng() unittest_tools.seed_rng()
def with_linker(self, linker): def with_linker(self, linker, op, type, rand_val):
for xsh, ysh in [((3, 5), (3, 5)), for xsh, ysh in [((3, 5), (3, 5)),
((3, 5), (1, 5)), ((3, 5), (1, 5)),
((3, 5), (3, 1)), ((3, 5), (3, 1)),
...@@ -150,12 +164,12 @@ class test_Broadcast(unittest.TestCase): ...@@ -150,12 +164,12 @@ class test_Broadcast(unittest.TestCase):
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
((), ())]: ((), ())]:
x = TensorType('float64', [(entry == 1) for entry in xsh])('x') x = type('float64', [(entry == 1) for entry in xsh])('x')
y = TensorType('float64', [(entry == 1) for entry in ysh])('y') y = type('float64', [(entry == 1) for entry in ysh])('y')
e = Elemwise(scalar.add)(x, y) e = op(scalar.add)(x, y)
f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function() f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
xv = numpy.asarray(numpy.random.rand(*xsh)) xv = rand_val(xsh)
yv = numpy.asarray(numpy.random.rand(*ysh)) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
self.assertTrue((f(xv, yv) == zv).all()) self.assertTrue((f(xv, yv) == zv).all())
...@@ -163,14 +177,14 @@ class test_Broadcast(unittest.TestCase): ...@@ -163,14 +177,14 @@ class test_Broadcast(unittest.TestCase):
#test Elemwise.infer_shape #test Elemwise.infer_shape
#the Shape op don't implement c_code! #the Shape op don't implement c_code!
if isinstance(linker, gof.PerformLinker): if isinstance(linker, gof.PerformLinker):
x = TensorType('float64', [(entry == 1) for entry in xsh])('x') x = type('float64', [(entry == 1) for entry in xsh])('x')
y = TensorType('float64', [(entry == 1) for entry in ysh])('y') y = type('float64', [(entry == 1) for entry in ysh])('y')
e = Elemwise(scalar.add)(x, y) e = op(scalar.add)(x, y)
f = copy(linker).accept(FunctionGraph([x, f = copy(linker).accept(FunctionGraph([x,
y], [e.shape])).make_function() y], [e.shape])).make_function()
assert tuple(f(xv, yv)) == tuple(zv.shape) assert tuple(f(xv, yv)) == tuple(zv.shape)
def with_linker_inplace(self, linker): def with_linker_inplace(self, linker, op, type, rand_val):
for xsh, ysh in [((5, 5), (5, 5)), for xsh, ysh in [((5, 5), (5, 5)),
((5, 5), (1, 5)), ((5, 5), (1, 5)),
((5, 5), (5, 1)), ((5, 5), (5, 1)),
...@@ -179,12 +193,12 @@ class test_Broadcast(unittest.TestCase): ...@@ -179,12 +193,12 @@ class test_Broadcast(unittest.TestCase):
((2, 3, 4, 5), (1, 3, 1, 5)), ((2, 3, 4, 5), (1, 3, 1, 5)),
((2, 3, 4, 5), (1, 1, 1, 1)), ((2, 3, 4, 5), (1, 1, 1, 1)),
((), ())]: ((), ())]:
x = TensorType('float64', [(entry == 1) for entry in xsh])('x') x = type('float64', [(entry == 1) for entry in xsh])('x')
y = TensorType('float64', [(entry == 1) for entry in ysh])('y') y = type('float64', [(entry == 1) for entry in ysh])('y')
e = Elemwise(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y) e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function() f = copy(linker).accept(FunctionGraph([x, y], [e])).make_function()
xv = numpy.asarray(numpy.random.rand(*xsh)) xv = rand_val(xsh)
yv = numpy.asarray(numpy.random.rand(*ysh)) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
f(xv, yv) f(xv, yv)
...@@ -193,13 +207,13 @@ class test_Broadcast(unittest.TestCase): ...@@ -193,13 +207,13 @@ class test_Broadcast(unittest.TestCase):
#test Elemwise.infer_shape #test Elemwise.infer_shape
#the Shape op don't implement c_code! #the Shape op don't implement c_code!
if isinstance(linker, gof.PerformLinker): if isinstance(linker, gof.PerformLinker):
x = TensorType('float64', [(entry == 1) for entry in xsh])('x') x = type('float64', [(entry == 1) for entry in xsh])('x')
y = TensorType('float64', [(entry == 1) for entry in ysh])('y') y = type('float64', [(entry == 1) for entry in ysh])('y')
e = Elemwise(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y) e = op(scalar.Add(scalar.transfer_type(0)), {0: 0})(x, y)
f = copy(linker).accept(FunctionGraph([x, f = copy(linker).accept(FunctionGraph([x,
y], [e.shape])).make_function() y], [e.shape])).make_function()
xv = numpy.asarray(numpy.random.rand(*xsh)) xv = rand_val(xsh)
yv = numpy.asarray(numpy.random.rand(*ysh)) yv = rand_val(ysh)
zv = xv + yv zv = xv + yv
f(xv, yv) f(xv, yv)
...@@ -207,30 +221,33 @@ class test_Broadcast(unittest.TestCase): ...@@ -207,30 +221,33 @@ class test_Broadcast(unittest.TestCase):
assert xv.shape == zv.shape assert xv.shape == zv.shape
def test_perform(self): def test_perform(self):
self.with_linker(gof.PerformLinker()) self.with_linker(gof.PerformLinker(), self.op, self.type,
self.rand_val)
def test_c(self): def test_c(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
self.with_linker(gof.CLinker()) self.with_linker(gof.CLinker(), self.cop, self.ctype, self.rand_cval)
def test_perform_inplace(self): def test_perform_inplace(self):
self.with_linker_inplace(gof.PerformLinker()) self.with_linker_inplace(gof.PerformLinker(), self.op, self.type,
self.rand_val)
def test_c_inplace(self): def test_c_inplace(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
self.with_linker_inplace(gof.CLinker()) self.with_linker_inplace(gof.CLinker(), self.cop, self.ctype,
self.rand_cval)
def test_fill(self): def test_fill(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = TensorType('float64', [0, 0])('x') x = self.ctype('float64', [0, 0])('x')
y = TensorType('float64', [1, 1])('y') y = self.ctype('float64', [1, 1])('y')
e = Elemwise(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y) e = self.cop(scalar.Second(scalar.transfer_type(0)), {0: 0})(x, y)
f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function() f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
xv = numpy.ones((5, 5)) xv = self.rand_cval((5, 5))
yv = numpy.random.rand(1, 1) yv = self.rand_cval((1, 1))
f(xv, yv) f(xv, yv)
assert (xv == yv).all() assert (xv == yv).all()
...@@ -245,27 +262,28 @@ class test_Broadcast(unittest.TestCase): ...@@ -245,27 +262,28 @@ class test_Broadcast(unittest.TestCase):
def test_weird_strides(self): def test_weird_strides(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = TensorType('float64', [0, 0, 0, 0, 0])('x') x = self.ctype('float64', [0, 0, 0, 0, 0])('x')
y = TensorType('float64', [0, 0, 0, 0, 0])('y') y = self.ctype('float64', [0, 0, 0, 0, 0])('y')
e = Elemwise(scalar.add)(x, y) e = self.cop(scalar.add)(x, y)
f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function() f = gof.CLinker().accept(FunctionGraph([x, y], [e])).make_function()
xv = numpy.random.rand(2, 2, 2, 2, 2) xv = self.rand_cval((2, 2, 2, 2, 2))
yv = numpy.random.rand(2, 2, 2, 2, 2).transpose(4, 0, 3, 1, 2) yv = self.rand_cval((2, 2, 2, 2, 2)).transpose(4, 0, 3, 1, 2)
zv = xv + yv zv = xv + yv
assert (f(xv, yv) == zv).all() assert (f(xv, yv) == zv).all()
def test_same_inputs(self): def test_same_inputs(self):
if not theano.config.cxx: if not theano.config.cxx:
raise SkipTest("G++ not available, so we need to skip this test.") raise SkipTest("G++ not available, so we need to skip this test.")
x = TensorType('float64', [0, 0])('x') x = self.ctype('float64', [0, 0])('x')
e = Elemwise(scalar.add)(x, x) e = self.cop(scalar.add)(x, x)
f = gof.CLinker().accept(FunctionGraph([x], [e])).make_function() f = gof.CLinker().accept(FunctionGraph([x], [e])).make_function()
xv = numpy.random.rand(2, 2) xv = self.rand_cval((2, 2))
zv = xv + xv zv = xv + xv
assert (f(xv) == zv).all() assert (f(xv) == zv).all()
class test_CAReduce(unittest_tools.InferShapeTester): class test_CAReduce(unittest_tools.InferShapeTester):
op = CAReduce
def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX", def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
test_nan=False, tensor_op=None): test_nan=False, tensor_op=None):
...@@ -288,9 +306,9 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -288,9 +306,9 @@ class test_CAReduce(unittest_tools.InferShapeTester):
dtype = theano.config.floatX dtype = theano.config.floatX
x = TensorType(dtype, [(entry == 1) for entry in xsh])('x') x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
if tensor_op is None: if tensor_op is None:
e = CAReduce(scalar_op, axis=tosum)(x) e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
else: else:
e = tensor_op(x, axis=tosum) e = as_tensor_variable(tensor_op(x, axis=tosum))
if tosum is None: if tosum is None:
tosum = range(len(xsh)) tosum = range(len(xsh))
...@@ -395,7 +413,7 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -395,7 +413,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
if isinstance(linker, gof.PerformLinker): if isinstance(linker, gof.PerformLinker):
x = TensorType(dtype, [(entry == 1) for entry in xsh])('x') x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
if tensor_op is None: if tensor_op is None:
e = CAReduce(scalar_op, axis=tosum)(x) e = self.op(scalar_op, axis=tosum)(x)
else: else:
e = tensor_op(x, axis=tosum) e = tensor_op(x, axis=tosum)
if tosum is None: if tosum is None:
...@@ -491,8 +509,8 @@ class test_CAReduce(unittest_tools.InferShapeTester): ...@@ -491,8 +509,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
tosum = range(len(xsh)) tosum = range(len(xsh))
xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype) xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
self._compile_and_check([x], self._compile_and_check([x],
[CAReduce(scalar.add, axis=tosum)(x)], [self.op(scalar.add, axis=tosum)(x)],
[xv], CAReduce, ["local_cut_useless_reduce"]) [xv], self.op, ["local_cut_useless_reduce"])
class test_Prod(unittest.TestCase): class test_Prod(unittest.TestCase):
......
...@@ -122,10 +122,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -122,10 +122,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
try: try:
try: try:
self.eval_output_and_check(t) self.eval_output_and_check(t)
assert 0 except IndexError, e:
except Exception, e: return
if 'out of bounds' not in exc_message(e): self.fail()
raise
finally: finally:
_logger.setLevel(oldlevel) _logger.setLevel(oldlevel)
...@@ -161,7 +160,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -161,7 +160,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def test1_0_dims(self): def test1_0_dims(self):
n = self.shared(numpy.ones((), dtype=self.dtype)) n = self.shared(numpy.ones((), dtype=self.dtype))
t = theano.tensor.Subtensor([])(n) t = self.sub([])(n)
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
mode = self.mode mode = self.mode
self.mode = mode.excluding("local_useless_subtensor") self.mode = mode.excluding("local_useless_subtensor")
...@@ -188,7 +187,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -188,7 +187,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(tval == 5.0) self.assertTrue(tval == 5.0)
def test1_ok_range_infinite(self): def test1_ok_range_infinite(self):
#Subtensor.debug = True
n = self.shared(numpy.arange(3, dtype=self.dtype)) n = self.shared(numpy.arange(3, dtype=self.dtype))
t = n[1:] t = n[1:]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
......
...@@ -543,8 +543,8 @@ class _tensor_py_operators: ...@@ -543,8 +543,8 @@ class _tensor_py_operators:
def get_scalar_constant_value(self): def get_scalar_constant_value(self):
return theano.tensor.basic.get_scalar_constant_value(self) return theano.tensor.basic.get_scalar_constant_value(self)
def zeros_like(self, dtype=None): def zeros_like(model, dtype=None):
return theano.tensor.basic.zeros_like(self, dtype=dtype) return theano.tensor.basic.zeros_like(model, dtype=dtype)
class TensorVariable(_tensor_py_operators, Variable): class TensorVariable(_tensor_py_operators, Variable):
......
...@@ -182,7 +182,10 @@ class InferShapeTester(unittest.TestCase): ...@@ -182,7 +182,10 @@ class InferShapeTester(unittest.TestCase):
def setUp(self): def setUp(self):
seed_rng() seed_rng()
# Take into account any mode that may be defined in a child class # Take into account any mode that may be defined in a child class
mode = getattr(self, 'mode', theano.compile.get_default_mode()) # and it can be None
mode = getattr(self, 'mode', None)
if mode is None:
mode = theano.compile.get_default_mode()
# This mode seems to be the minimal one including the shape_i # This mode seems to be the minimal one including the shape_i
# optimizations, if we don't want to enumerate them explicitly. # optimizations, if we don't want to enumerate them explicitly.
self.mode = mode.including("canonicalize") self.mode = mode.including("canonicalize")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论