提交 cfd5c827 authored 作者: abergeron's avatar abergeron

Merge pull request #1626 from nouiz/gpu

Assorted gpuarray work.
from theano.compile.ops import ( from theano.compile.ops import (
DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code, DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code,
Shape_i, register_shape_i_c_code,
ViewOp, view_op, register_view_op_c_code) ViewOp, view_op, register_view_op_c_code)
from theano.compile.function_module import * from theano.compile.function_module import *
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import copy import copy
import warnings import warnings
#import theano import theano
from theano import gof from theano import gof
...@@ -155,7 +155,7 @@ class DeepCopyOp(gof.Op): ...@@ -155,7 +155,7 @@ class DeepCopyOp(gof.Op):
# Else, we will return a list of (type name, version) pairs. # Else, we will return a list of (type name, version) pairs.
for t, (c, v) in sorted(self.c_code_and_version.items(), key=lambda pair: str(pair[0])): for t, (c, v) in sorted(self.c_code_and_version.items(), key=lambda pair: str(pair[0])):
if not v: if not v:
warnings.warn("Type %s has C code for OutputGuard, but it has " warnings.warn("Type %s has C code for DeepCopyOp, but it has "
"no version. You should add a 'version' keyword arg " "no version. You should add a 'version' keyword arg "
"when calling register_OutputGuard_c_code." % t, "when calling register_OutputGuard_c_code." % t,
stacklevel=2) stacklevel=2)
...@@ -180,6 +180,99 @@ class DeepCopyOp(gof.Op): ...@@ -180,6 +180,99 @@ class DeepCopyOp(gof.Op):
deep_copy_op = DeepCopyOp() deep_copy_op = DeepCopyOp()
class Shape_i(gof.Op):
    """
    L{Op} to return the shape of a matrix.

    The Op takes a single variable input and returns, as an int64 scalar,
    the size of dimension ``i`` of that variable's value.

    @note: Non-differentiable.
    """
    # Mapping from Type to C code (and version) to use.
    # In the C code, the name of the input variable is %(iname)s,
    # the output variable is %(oname)s.
    c_code_and_version = {}

    def __init__(self, i):
        # i: index of the dimension whose size this Op extracts.
        self.i = i

    def __hash__(self):
        return hash(type(self)) ^ self.i

    def __eq__(self, other):
        return type(self) == type(other) and self.i == other.i

    def __str__(self):
        return '%s{%i}' % (self.__class__.__name__, self.i)

    def make_node(self, x):
        # x could be one of a number of types
        # the only thing we require is that the variable have a .ndim,
        # and that the value have a .shape
        if not isinstance(x, theano.Variable):
            raise TypeError('x must be Variable with ndim attribute', x)
        if x.ndim <= self.i:
            raise TypeError('x has too few dimensions for Shape_i',
                            (x, self.i))
        return theano.Apply(self, [x], [theano.tensor.lscalar()])

    def perform(self, node, inp, out_):
        x, = inp
        out, = out_
        if out[0] is None:
            out[0] = theano._asarray(x.shape[self.i], dtype='int64')
        else:
            # Reuse the previously allocated 0-d int64 output array.
            out[0][...] = x.shape[self.i]

    def c_code_cache_version(self):
        version = []
        # If any of the c code is unversionned, we have to return ()
        # Else, we will return a list of (type name, version) pairs.
        for t, (c, v) in sorted(self.c_code_and_version.items(),
                                key=lambda pair: str(pair[0])):
            if not v:
                # BUGFIX: the message previously pointed users at
                # register_OutputGuard_c_code (copy-paste from another Op);
                # the correct registration function is
                # register_shape_i_c_code.
                warnings.warn("Type %s has C code for Shape_i, but it has "
                              "no version. You should add a 'version' keyword arg "
                              "when calling register_shape_i_c_code." % t,
                              stacklevel=2)
                return ()
            version.append((str(t), v))
        return tuple(version)

    def c_code(self, node, name, inames, onames, sub):
        iname, = inames
        oname, = onames
        fail = sub['fail']
        i = self.i
        # Dispatch on the class of the input's Type: each registered Type
        # supplies its own C snippet via register_shape_i_c_code.
        itype = node.inputs[0].type.__class__
        if itype in self.c_code_and_version:
            code, version = self.c_code_and_version[itype]
            return code % locals()

        # Else, no C code: fall back to the (python-only) default.
        return super(Shape_i, self).c_code(node, name, inames, onames, sub)

    def infer_shape(self, node, input_shapes):
        # Output is a 0-d scalar, so its shape is the empty tuple.
        return [()]

    def grad(self, inp, grads):
        # Non-differentiable: shapes are integer-valued.
        return [None]
def register_shape_i_c_code(typ, code, version=()):
    """ Tell Shape_i how to generate C code for a Theano Type

    :param typ: A Theano type. It must be the Theano class itself and not an
                instance of the class.

    :param code: C code that returns, as an npy_int64, the size of
                 dimension %(i)s of a variable of type 'typ'.
                 Use %(iname)s and %(oname)s for the input and output C
                 variable names respectively.

    :param version: A number indicating the version of the code, for cache.
    """
    Shape_i.c_code_and_version[typ] = (code, version)
# List of Theano Types that one can add an extra dimension and for which # List of Theano Types that one can add an extra dimension and for which
# Scan can deal with. # Scan can deal with.
expandable_types = () expandable_types = ()
...@@ -438,6 +438,13 @@ theano.compile.register_view_op_c_code( ...@@ -438,6 +438,13 @@ theano.compile.register_view_op_c_code(
""", """,
version=1) version=1)
theano.compile.register_shape_i_c_code(CudaNdarrayType, """
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
CudaNdarray_HOST_DIMS(%(iname)s)[%(i)s];
""", version=(0,))
# Register CudaNdarrayType to the DeepCopyOp list of types with c code. # Register CudaNdarrayType to the DeepCopyOp list of types with c code.
theano.compile.register_deep_copy_op_c_code( theano.compile.register_deep_copy_op_c_code(
CudaNdarrayType, CudaNdarrayType,
......
...@@ -532,8 +532,26 @@ cuda_from_gpu = CudaFromGpu() ...@@ -532,8 +532,26 @@ cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
def __init__(self, memset_0=False):
    """memset_0 is only an optimization flag.  If True, the fill value
    is known to always be 0, so the C code calls memset, which is
    faster than a generic element copy.
    """
    self.memset_0 = memset_0
def __eq__(self, other):
    # Two GpuAlloc Ops are interchangeable only if they agree on the
    # memset_0 optimization flag (it changes the generated C code).
    return type(self) == type(other) and self.memset_0 == other.memset_0
def __hash__(self):
    # Keep consistent with __eq__: hash includes the memset_0 flag.
    return hash(type(self)) ^ hash(self.memset_0)
def __str__(self): def __str__(self):
return 'GpuAlloc' #Hide the memset parameter when not used to prevent confusion.
if self.memset_0:
s = "%s{memset_0=%s}" % (self.__class__.__name__, self.memset_0)
else:
s = self.__class__.__name__
return s
def make_node(self, value, *shape): def make_node(self, value, *shape):
res = Alloc.make_node(self, value, *shape) res = Alloc.make_node(self, value, *shape)
...@@ -542,6 +560,9 @@ class GpuAlloc(HideC, Alloc): ...@@ -542,6 +560,9 @@ class GpuAlloc(HideC, Alloc):
broadcastable=res.outputs[0].broadcastable) broadcastable=res.outputs[0].broadcastable)
return Apply(self, [value] + res.inputs[1:], [otype()]) return Apply(self, [value] + res.inputs[1:], [otype()])
def c_headers(self):
    # numpy_compat.h provides the numpy-compatibility macros used by
    # the generated C code.
    return ['<compyte/numpy_compat.h>']
def perform(self, node, inputs, outs): def perform(self, node, inputs, outs):
out, = outs out, = outs
v = inputs[0] v = inputs[0]
...@@ -562,6 +583,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -562,6 +583,7 @@ class GpuAlloc(HideC, Alloc):
ndim = len(inp[1:]) ndim = len(inp[1:])
zz, = out zz, = out
memset_0 = int(self.memset_0)
code = """ code = """
int i; int i;
size_t %(name)s_shape[%(ndim)s]; size_t %(name)s_shape[%(ndim)s];
...@@ -579,21 +601,45 @@ class GpuAlloc(HideC, Alloc): ...@@ -579,21 +601,45 @@ class GpuAlloc(HideC, Alloc):
for (i = 0; i < %(ndim)s; i++) for (i = 0; i < %(ndim)s; i++)
need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i]; need_new_out |= %(zz)s->ga.dimensions[i] != %(name)s_shape[i];
if (need_new_out) { if (need_new_out && (%(memset_0)s)) {
//pygpu_zeros can be faster then empty followed by memset.
Py_XDECREF(%(zz)s); Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape, %(zz)s = pygpu_zeros(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER, %(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(zz)s) { if (!%(zz)s) {
%(fail)s %(fail)s
} }
} else {
if (need_new_out) {
Py_XDECREF(%(zz)s);
%(zz)s = pygpu_empty(%(ndim)s, %(name)s_shape,
%(vv)s->ga.typecode, GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(zz)s) {
%(fail)s
}
}
if (%(memset_0)s && GpuArray_ISONESEGMENT(&%(zz)s->ga))
{
int err = GpuArray_memset(&%(zz)s->ga, 0);
if (err != GA_NO_ERROR)
{
PyErr_Format(PyExc_MemoryError,
"GpuAlloc: Error memsetting %%d"
" element of device memory to 0.",
PyGpuArray_SIZE(%(zz)s));
%(fail)s;
}
}
else if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) !=
GA_NO_ERROR) {
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
}
} }
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv,
if (GpuArray_setarray(&%(zz)s->ga, &%(vv)s->ga) != GA_NO_ERROR) { fail=sub['fail'], memset_0=memset_0)
PyErr_SetString(PyExc_ValueError, "setarray failed");
%(fail)s
}
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, fail=sub['fail'])
if config.gpuarray.sync: if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz) code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
...@@ -601,7 +647,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -601,7 +647,7 @@ class GpuAlloc(HideC, Alloc):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
......
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.tensor.blas import Gemv, Gemm from theano.tensor.blas import Dot22, Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable) from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try: try:
...@@ -28,12 +28,16 @@ class GpuGemv(BlasOp, Gemv): ...@@ -28,12 +28,16 @@ class GpuGemv(BlasOp, Gemv):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y) y = as_gpuarray_variable(y)
assert A.dtype == x.dtype == y.dtype == alpha.dtype == beta.dtype
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False, inplace = self.inplace
overwrite_y=self.inplace) if inplace and y.strides[0] < 0:
inplace = False
out_storage[0][0] = blas.gemv(alpha, A, x, beta, y,
overwrite_y=inplace)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3], vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
...@@ -64,7 +68,7 @@ class GpuGemv(BlasOp, Gemv): ...@@ -64,7 +68,7 @@ class GpuGemv(BlasOp, Gemv):
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" """ % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -80,12 +84,16 @@ class GpuGemm(BlasOp, Gemm): ...@@ -80,12 +84,16 @@ class GpuGemm(BlasOp, Gemm):
A = as_gpuarray_variable(A) A = as_gpuarray_variable(A)
B = as_gpuarray_variable(B) B = as_gpuarray_variable(B)
C = as_gpuarray_variable(C) C = as_gpuarray_variable(C)
assert A.dtype == B.dtype == C.dtype == alpha.dtype == beta.dtype
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
C, alpha, A, B, beta = inputs C, alpha, A, B, beta = inputs
inplace = self.inplace
if inplace and not C.flags.forc:
inplace = False
outputs[0][0] = blas.gemm(alpha, A, B, beta, C, outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
overwrite_c=self.inplace) overwrite_c=inplace)
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
...@@ -116,7 +124,7 @@ class GpuGemm(BlasOp, Gemm): ...@@ -116,7 +124,7 @@ class GpuGemm(BlasOp, Gemm):
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" """ % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -126,6 +134,67 @@ class GpuGemm(BlasOp, Gemm): ...@@ -126,6 +134,67 @@ class GpuGemm(BlasOp, Gemm):
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
class GpuDot22(BlasOp, Dot22):
    """GPU version of Dot22: the matrix-matrix product of two 2-d
    variables, with no scaling and no accumulation (alpha=1, beta=0).
    """

    def make_node(self, x, y):
        # Reuse the CPU Op's make_node for its input validation, then
        # rebuild the Apply with GPU variables so the output is a GPU type.
        res = Dot22.make_node(self, x, y)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        assert x.dtype == y.dtype
        return Apply(self, [x, y], [x.type()])

    def perform(self, node, inputs, outputs):
        x, y = inputs

        # dot22 is gemm with alpha=1, beta=0 into a fresh output buffer,
        # so the gemm may overwrite it.
        out = pygpu.empty((x.shape[0], y.shape[1]), dtype=x.dtype)
        outputs[0][0] = blas.gemm(1., x, y, 0., out,
                                  overwrite_c=True)

    def c_code(self, node, name, inputs, outputs, sub):
        dtype = node.inputs[0].dtype
        typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
        vars = dict(A=inputs[0], B=inputs[1], dtype=dtype, out=outputs[0],
                    typecode=typecode,
                    fail=sub['fail'], name=name)
        # Allocate a fresh C-ordered output of shape (A rows, B cols),
        # then run a no-transpose gemm with alpha=1, beta=0 into it.
        code = """
        double one = 1.;
        double zero = 0.;

        size_t dims[] = {0, 0};
        dims[0] = PyGpuArray_DIMS(%(A)s)[0];
        dims[1] = PyGpuArray_DIMS(%(B)s)[1];

        %(out)s = pygpu_empty(2, dims,
                              %(typecode)s,
                              GA_C_ORDER,
                              pygpu_default_context(), Py_None);
        if (!%(out)s) {
            %(fail)s
        }
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             one,
                             %(A)s, %(B)s,
                             zero,
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % vars
        if config.gpuarray.sync:
            # Optionally wait for the kernel, for easier debugging/timing.
            code += """
            GpuArray_sync(&%(out)s->ga);
            """ % vars
        return code

    def c_code_cache_version(self):
        return (0,)

    def c_headers(self):
        ret = super(GpuDot22, self).c_headers()
        return ret + ['<compyte/numpy_compat.h>']


gpu_dot22 = GpuDot22()
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out from theano.tensor.opt import in2out
......
...@@ -3,7 +3,8 @@ import theano ...@@ -3,7 +3,8 @@ import theano
import numpy import numpy
from theano import tensor, scalar from theano import tensor, scalar
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB,
SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler, Optimizer, toolbox, DestroyHandler,
InconsistencyError, EquilibriumOptimizer) InconsistencyError, EquilibriumOptimizer)
...@@ -12,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType ...@@ -12,12 +13,15 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
gpu_from_host, gpu_from_host,
gpu_alloc, GpuReshape, gpu_alloc,
GpuAlloc,
GpuReshape,
GpuEye) GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
...@@ -52,7 +56,7 @@ def op_lifter(OP): ...@@ -52,7 +56,7 @@ def op_lifter(OP):
""" """
def f(maker): def f(maker):
def local_opt(node): def local_opt(node):
if type(node.op) is OP: if type(node.op) in OP:
# This does not support nodes that have more than one output. # This does not support nodes that have more than one output.
assert len(node.outputs) == 1 assert len(node.outputs) == 1
# either one of our inputs is on the gpu or # either one of our inputs is on the gpu or
...@@ -70,7 +74,7 @@ def op_lifter(OP): ...@@ -70,7 +74,7 @@ def op_lifter(OP):
return [host_from_gpu(new_op)] return [host_from_gpu(new_op)]
return False return False
local_opt.__name__ = maker.__name__ local_opt.__name__ = maker.__name__
return local_optimizer([OP])(local_opt) return local_optimizer(OP)(local_opt)
return f return f
...@@ -120,13 +124,25 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua', ...@@ -120,13 +124,25 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
@register_opt() @register_opt()
@op_lifter(tensor.Alloc) @op_lifter([tensor.Alloc])
def local_gpualloc(node): def local_gpualloc(node):
return gpu_alloc return gpu_alloc
@register_opt() @register_opt()
@op_lifter(tensor.Reshape) @local_optimizer([GpuAlloc])
def local_gpualloc_memset_0(node):
    """Swap a GpuAlloc for its memset_0 variant when the fill value is
    a constant scalar zero (memset on device is faster than a copy).
    """
    op = node.op
    if not isinstance(op, GpuAlloc) or op.memset_0:
        return
    fill_value = node.inputs[0]
    if not isinstance(fill_value, GpuArrayConstant):
        return
    if fill_value.data.size != 1:
        return
    if not (numpy.asarray(fill_value.data) == 0).all():
        return
    return [GpuAlloc(memset_0=True)(*node.inputs)]
@register_opt()
@op_lifter([tensor.Reshape])
def local_gpureshape(node): def local_gpureshape(node):
op = node.op op = node.op
name = op.name name = op.name
...@@ -137,7 +153,7 @@ def local_gpureshape(node): ...@@ -137,7 +153,7 @@ def local_gpureshape(node):
@register_opt() @register_opt()
@op_lifter(tensor.Flatten) @op_lifter([tensor.Flatten])
def local_gpuflatten(node): def local_gpuflatten(node):
op = node.op op = node.op
shp =[] shp =[]
...@@ -150,10 +166,12 @@ def local_gpuflatten(node): ...@@ -150,10 +166,12 @@ def local_gpuflatten(node):
@register_opt() @register_opt()
@op_lifter(tensor.Elemwise) @op_lifter([tensor.Elemwise])
def local_gpu_elemwise(node): def local_gpu_elemwise(node):
op = node.op op = node.op
name = op.name name = op.name
if node.outputs[0].ndim == 0:
return
if name: if name:
name = 'Gpu'+name name = 'Gpu'+name
res = GpuElemwise(op.scalar_op, name=name, res = GpuElemwise(op.scalar_op, name=name,
...@@ -193,26 +211,26 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75, ...@@ -193,26 +211,26 @@ optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
@register_opt() @register_opt()
@op_lifter(tensor.DimShuffle) @op_lifter([tensor.DimShuffle])
def local_gpua_dimshuffle(node): def local_gpua_dimshuffle(node):
return GpuDimShuffle(node.op.input_broadcastable, return GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order) node.op.new_order)
@register_opt() @register_opt()
@op_lifter(tensor.SpecifyShape) @op_lifter([tensor.SpecifyShape])
def local_gpua_specifyShape(node): def local_gpua_specifyShape(node):
return tensor.specify_shape return tensor.specify_shape
@register_opt() @register_opt()
@op_lifter(tensor.Subtensor) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt() @register_opt()
@op_lifter(tensor.CAReduce) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
if (isinstance(node.op.scalar_op, scalar.basic.Add) or if (isinstance(node.op.scalar_op, scalar.basic.Add) or
isinstance(node.op.scalar_op, scalar.basic.Mul)): isinstance(node.op.scalar_op, scalar.basic.Mul)):
...@@ -220,23 +238,32 @@ def local_gpua_careduce(node): ...@@ -220,23 +238,32 @@ def local_gpua_careduce(node):
dtype=getattr(node.op, 'dtype', None), dtype=getattr(node.op, 'dtype', None),
acc_dtype=getattr(node.op, 'acc_dtype', None)) acc_dtype=getattr(node.op, 'acc_dtype', None))
@register_opt() @register_opt()
@op_lifter(tensor.blas.Gemv) @op_lifter([tensor.blas.Gemv])
def local_gpua_gemv(node): def local_gpua_gemv(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.blas_c.CGemv) @op_lifter([tensor.blas_c.CGemv])
def local_gpua_gemv2(node): def local_gpua_gemv2(node):
return GpuGemv(inplace=node.op.inplace) return GpuGemv(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.blas.Gemm) @op_lifter([tensor.blas.Gemm])
def local_gpua_gemm(node): def local_gpua_gemm(node):
return GpuGemm(inplace=node.op.inplace) return GpuGemm(inplace=node.op.inplace)
@register_opt() @register_opt()
@op_lifter(tensor.basic.Eye) @op_lifter([tensor.blas.Dot22])
def local_gpua_dot22(node):
return gpu_dot22
@register_opt()
@op_lifter([tensor.basic.Eye])
def local_gpua_eye(node): def local_gpua_eye(node):
return GpuEye(dtype=node.op.dtype) return GpuEye(dtype=node.op.dtype)
...@@ -336,3 +336,39 @@ def test_gpueye(): ...@@ -336,3 +336,39 @@ def test_gpueye():
# M != N, k = 0 # M != N, k = 0
yield check, dtype, 3, 5 yield check, dtype, 3, 5
yield check, dtype, 5, 3 yield check, dtype, 5, 3
def test_hostfromgpu_shape_i():
    """
    Test that the shape is lifted over hostfromgpu
    """

    m = mode_with_gpu.including('local_dot_to_dot22',
                                'local_dot22_to_dot22scalar', 'specialize')
    a = T.fmatrix('a')
    ca = theano.sandbox.gpuarray.type.GpuArrayType('float32', (False, False))()

    av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
    cv = gpuarray.asarray(numpy.random.rand(5, 4),
                          dtype='float32')

    gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
    host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu

    # host -> gpu direction: the transfer Op itself must be present...
    f = theano.function([a], gpu_from_host(a), mode=m)
    assert gpu_from_host in [x.op
                             for x in f.maker.fgraph.toposort()]
    # ...but taking the shape must be lifted to host-side Shape_i Ops
    # on the original input (no GPU transfer left in the graph).
    f = theano.function([a], gpu_from_host(a).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, T.opt.Shape_i)
    assert isinstance(topo[1].op, T.opt.Shape_i)
    assert isinstance(topo[2].op, T.opt.MakeVector)
    assert tuple(f(av)) == (5, 4)

    # gpu -> host direction: same expectation, with the registered
    # theano.compile.Shape_i working directly on the GPU variable.
    f = theano.function([ca], host_from_gpu(ca), mode=m)
    assert host_from_gpu in [x.op
                             for x in f.maker.fgraph.toposort()]
    f = theano.function([ca], host_from_gpu(ca).shape, mode=m)
    topo = f.maker.fgraph.toposort()
    assert isinstance(topo[0].op, theano.compile.Shape_i)
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, theano.tensor.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
from unittest import TestCase from unittest import TestCase
from theano.tensor.blas import gemv_inplace, gemm_inplace import theano
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray.blas import (gpugemv_inplace, from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace) gpugemm_inplace, gpu_dot22)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace, op=gemv_inplace, gpu_op=gpugemv_inplace,
...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester', ...@@ -29,7 +31,28 @@ GpuGemmTester = makeTester('GpuGemmTester',
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6], test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0], test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0], test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0], test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0], test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
) # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
# Tester comparing the CPU _dot22 Op against its GPU counterpart on
# representative shapes, including size-1 dimensions.
GpuDot22Tester = makeTester(
    # BUGFIX: the tester name was copy-pasted as 'GpuGemmTester', which
    # collided with the gemm tester's name; use the correct name.
    'GpuDot22Tester',
    op=_dot22, gpu_op=gpu_dot22,
    cases=dict(
        test1=[rand(3, 4), rand(4, 5)],
        test2=[rand(1, 4), rand(4, 5)],
        test3=[rand(3, 1), rand(1, 5)],
        test4=[rand(3, 4), rand(4, 1)],
        # Empty-dimension cases are currently disabled:
        # test5=[rand(0, 4), rand(4, 5)],
        # test6=[rand(3, 0), rand(0, 5)],
        # test7=[rand(3, 4), rand(4, 0)],
        # test8=[rand(0, 4), rand(4, 0)],
        # test9=[rand(0, 0), rand(0, 0)],
    )
)
) )
...@@ -2,7 +2,8 @@ import numpy ...@@ -2,7 +2,8 @@ import numpy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
from theano.sandbox.gpuarray.elemwise import GpuCAReduce
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
...@@ -29,7 +30,7 @@ else: ...@@ -29,7 +30,7 @@ else:
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
val = numpy.random.rand(10,11).astype("float32") val = numpy.random.rand(10, 11).astype("float32")
res = f(val) res = f(val)
utt.assert_allclose(res, val.flatten()) utt.assert_allclose(res, val.flatten())
assert res.shape == val.flatten().shape assert res.shape == val.flatten().shape
...@@ -58,3 +59,48 @@ def test_flatten(): ...@@ -58,3 +59,48 @@ def test_flatten():
assert res.shape == val.reshape(10, -1).shape assert res.shape == val.reshape(10, -1).shape
assert GpuReshape in [type(node.op) assert GpuReshape in [type(node.op)
for node in f.maker.fgraph.toposort()] for node in f.maker.fgraph.toposort()]
def test_sum_prod():
    """Check that a full matrix reduction is moved to a GpuCAReduce node
    and computes the same value as numpy.
    """
    for method in ['sum']:
        matrix = theano.tensor.fmatrix()
        fn = theano.function([matrix], getattr(matrix, method)(),
                             mode=mode_with_gpu)
        data = numpy.random.rand(10, 11).astype("float32")
        result = fn(data)
        utt.assert_allclose(result, data.sum())
        assert result.shape == ()
        node_types = [type(node.op) for node in fn.maker.fgraph.toposort()]
        assert GpuCAReduce in node_types
def test_local_gpualloc_memset_0():
    """The memset_0 rewrite must fire only for a constant scalar-zero
    fill value, never for ones or for non-scalar values.
    """
    i = theano.tensor.iscalar()
    zero = numpy.zeros((1,), dtype='float32')
    one = numpy.ones((1,), dtype='float32')
    two_ones = numpy.ones((2,), dtype='float32')

    def check(value, expect_memset, length, expected):
        # Compile alloc(value, i), then inspect the single GpuAlloc node.
        f = theano.function([i], gpu_alloc(value, i), mode=mode_with_gpu)
        topo = f.maker.fgraph.toposort()
        assert len(topo) == 1
        assert isinstance(topo[0].op, GpuAlloc)
        assert bool(topo[0].op.memset_0) == expect_memset
        assert (numpy.asarray(f(length)) == expected).all()

    # Scalar 0: rewritten to the memset variant.
    check(zero, True, 6, 0)
    # Scalar 1: left untouched.
    check(one, False, 6, 1)
    # Non-scalar ones: left untouched.
    check(two_ones, False, 2, 1)
...@@ -278,6 +278,13 @@ theano.compile.register_view_op_c_code(GpuArrayType, """ ...@@ -278,6 +278,13 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""", version=(0,)) """, version=(0,))
theano.compile.register_shape_i_c_code(GpuArrayType, """
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0] =
%(iname)s->ga.dimensions[%(i)s];
""", version=(0,))
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """ theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER); %(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
......
...@@ -1589,7 +1589,7 @@ class Dot22(GemmRelated): ...@@ -1589,7 +1589,7 @@ class Dot22(GemmRelated):
raise raise
def __str__(self): def __str__(self):
return "_dot22" return self.__class__.__name__
setup_z_Nz_Sz = """ setup_z_Nz_Sz = """
if ((NULL == %(_zout)s) if ((NULL == %(_zout)s)
...@@ -1862,7 +1862,7 @@ class Dot22Scalar(GemmRelated): ...@@ -1862,7 +1862,7 @@ class Dot22Scalar(GemmRelated):
raise raise
def __str__(self): def __str__(self):
return "_dot22scalar" return self.__class__.__name__
setup_z_Nz_Sz = Dot22.setup_z_Nz_Sz setup_z_Nz_Sz = Dot22.setup_z_Nz_Sz
......
...@@ -20,8 +20,8 @@ def make_declare(loop_orders, dtypes, sub): ...@@ -20,8 +20,8 @@ def make_declare(loop_orders, dtypes, sub):
# the stride in that dimension, # the stride in that dimension,
# and the jump from an iteration to the next # and the jump from an iteration to the next
decl += """ decl += """
int %(var)s_n%(value)i; npy_intp %(var)s_n%(value)i;
int %(var)s_stride%(value)i; ssize_t %(var)s_stride%(value)i;
int %(var)s_jump%(value)i_%(j)i; int %(var)s_jump%(value)i_%(j)i;
""" % locals() """ % locals()
else: else:
......
...@@ -29,6 +29,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice, ...@@ -29,6 +29,7 @@ from theano.tensor.subtensor import (get_idx_list, get_canonical_form_slice,
from theano import scalar from theano import scalar
from theano.tensor import basic as T from theano.tensor import basic as T
from theano import compile # to register the optimizer built by this file from theano import compile # to register the optimizer built by this file
from theano.compile.ops import Shape_i
from theano.gof.python25 import any, all from theano.gof.python25 import any, all
from theano.gof.opt import (Optimizer, pre_constant_merge, from theano.gof.opt import (Optimizer, pre_constant_merge,
...@@ -637,78 +638,6 @@ T.pprint.assign(lambda pstate, r: r.owner and isinstance( ...@@ -637,78 +638,6 @@ T.pprint.assign(lambda pstate, r: r.owner and isinstance(
r.owner.op, MakeVector), MakeVectorPrinter()) r.owner.op, MakeVector), MakeVectorPrinter())
class Shape_i(T.Op):
    """
    L{Op} to return the shape of a matrix.

    Returns, as an int64 scalar, the size of dimension ``i`` of the input.

    @note: Non-differentiable.
    """

    def __init__(self, i):
        # i: index of the dimension whose size is returned.
        self.i = i

    def __hash__(self):
        return hash(type(self)) ^ self.i

    def __eq__(self, other):
        return type(self) == type(other) and self.i == other.i

    def __str__(self):
        return '%s{%i}' % (self.__class__.__name__, self.i)

    def make_node(self, x):
        # x could be one of a number of types
        # the only thing we require is that the variable have a .ndim,
        # and that the value have a .shape
        if not isinstance(x, T.Variable):
            raise TypeError('x must be Variable with ndim attribute', x)
        if x.ndim <= self.i:
            raise TypeError('x has too few dimensions for Shape_i',
                            (x, self.i))
        return T.Apply(self, [x], [T.lscalar()])

    def perform(self, node, inp, out_):
        x, = inp
        out, = out_
        if out[0] is None:
            out[0] = theano._asarray(x.shape[self.i], dtype='int64')
        else:
            # Reuse the previously allocated 0-d int64 output array.
            out[0][...] = x.shape[self.i]

    def c_code_cache_version(self):
        return (0, 1)

    def c_code(self, node, name, inp, out_, sub):
        # C code is hard-coded per input type here (no registry): plain
        # TensorType and CudaNdarrayType are supported, anything else
        # falls back to the default (python) implementation.
        x, = inp
        out, = out_
        i = self.i
        if isinstance(node.inputs[0].type, T.TensorType):
            return """
            if(!%(out)s)
                %(out)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
            ((npy_int64*)PyArray_DATA(%(out)s))[0]=PyArray_DIMS(%(x)s)[%(i)s];
            """ % locals()
        elif node.inputs[0].type.__class__.__name__ == "CudaNdarrayType":
            #Don't want to import cuda stuff here.
            return """
            if(!%(out)s)
                %(out)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
            ((npy_int64*)PyArray_DATA(%(out)s))[0]=
                                CudaNdarray_HOST_DIMS(%(x)s)[%(i)s];
            """ % locals()
        else:
            #TODO: if your type is not listed here, make a damn registry of
            #      shape_i ops for various types of variables.
            #      Do not continue this madness.
            return super(Shape_i, self).c_code(node, name, (x,), (out,), sub)

    def infer_shape(self, node, input_shapes):
        # Output is a 0-d scalar, so its shape is the empty tuple.
        return [()]

    def grad(self, inp, grads):
        # Non-differentiable: shapes are integer-valued.
        return [None]
class ShapeFeature(object): class ShapeFeature(object):
"""Graph optimizer for removing all calls to shape() """Graph optimizer for removing all calls to shape()
......
...@@ -611,6 +611,16 @@ theano.compile.register_view_op_c_code( ...@@ -611,6 +611,16 @@ theano.compile.register_view_op_c_code(
""", """,
version=1) version=1)
# Register TensorType C code for ViewOp.
theano.compile.register_shape_i_c_code(
TensorType,
"""
if(!%(oname)s)
%(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
((npy_int64*)PyArray_DATA(%(oname)s))[0]=PyArray_DIMS(%(iname)s)[%(i)s];
""",
version=1)
# Register TensorType C code for DeepCopyOp # Register TensorType C code for DeepCopyOp
theano.compile.register_deep_copy_op_c_code( theano.compile.register_deep_copy_op_c_code(
TensorType, TensorType,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论