提交 aa278955 authored 作者: James Bergstra's avatar James Bergstra

test_elemwise1 passes, didnt run in debugmode

上级 56d152c8
...@@ -4,3 +4,6 @@ from .var import (CudaNdarrayVariable, ...@@ -4,3 +4,6 @@ from .var import (CudaNdarrayVariable,
CudaNdarrayConstant, CudaNdarrayConstant,
CudaNdarraySharedVariable, CudaNdarraySharedVariable,
shared_constructor) shared_constructor)
import basic_ops
import opt
...@@ -18,6 +18,8 @@ class HostFromGpu(Op): ...@@ -18,6 +18,8 @@ class HostFromGpu(Op):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self):
return '<HostFromGpu@%i>' % id(self)
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType): if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x) raise TypeError(x)
...@@ -32,6 +34,8 @@ class GpuFromHost(Op): ...@@ -32,6 +34,8 @@ class GpuFromHost(Op):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self):
return '<GpuFromHost@%i>' % id(self)
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
...@@ -41,26 +45,67 @@ class GpuFromHost(Op): ...@@ -41,26 +45,67 @@ class GpuFromHost(Op):
def grad(self, inputs, (gz,)): def grad(self, inputs, (gz,)):
return [HostFromGpu()(gz)] return [HostFromGpu()(gz)]
class GpuAdd(Op):
class GpuElemwise(Op):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern):
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
self.destroy_map = dict((o, [i]) for o, i in inplace_pattern.items())
if scalar_op.nin > 0:
self.ufunc = numpy.frompyfunc(scalar_op.impl, scalar_op.nin, scalar_op.nout)
else:
self.ufunc = None
self._rehash()
def __getstate__(self):
d = copy(self.__dict__)
d.pop('ufunc')
d.pop('__epydoc_asRoutine', None)
d.pop('_hashval')
return d
def __setstate__(self, d):
self.__dict__.update(d)
if self.scalar_op.nin > 0:
self.ufunc = numpy.frompyfunc(self.scalar_op.impl, self.scalar_op.nin, self.scalar_op.nout)
else:
self.ufunc = None
self._rehash()
def __eq__(self, other): def __eq__(self, other):
self.scalar_op = scalar.add return type(self) == type(other) and (self.scalar_op == other.scalar_op)
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, a, b):
_a = as_cuda_ndarray_variable(a)
_b = as_cuda_ndarray_variable(b)
if _a.type.broadcastable != _b.type.broadcastable:
raise NotImplementedError('different bcastable')
return Apply(self, [_a,_b], [CudaNdarrayType(broadcastable=_a.broadcastable)()])
def perform(self, node, (a,b), (z,)):
aval = numpy.asarray(a, dtype='float32')
bval = numpy.asarray(b, dtype='float32')
z[0] = type_support_filter(aval + bval, (0,)*len(zval.shape), 0)
def grad(self, inputs, (gz,)): def _rehash(self):
return [gz for i in inputs] items = self.inplace_pattern.items()
items.sort()
tuple_items = tuple([k for k,v in items] + [(tuple(v) if isinstance(v, (tuple, list)) else v) for k,v in items])
h = hash('Elemwise') ^ hash(self.scalar_op) ^ hash(tuple_items)
assert h == getattr(self,'_hashval', h)
self._hashval = h
def __hash__(self):
return self._hashval
def __str__(self):
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return "GpuElemwise{%s}%s" % (self.scalar_op, str(items))
else:
return "GpuElemwise{%s}" % (self.scalar_op)
def make_node(self, *inputs):
_inputs = [as_cuda_ndarray_variable(i) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.broadcastable != inputs[0].type.broadcastable:
raise NotImplementedError('different bcastable')
otype = CudaNdarrayType(broadcastable=_inputs[0].broadcastable)
assert self.nout > 0
return Apply(self, _inputs, [otype() for o in xrange(self.nout)])
def c_support_code(self): def c_support_code(self):
return """ return """
#define INTDIV_POW2(a, b) (a >> b) #define INTDIV_POW2(a, b) (a >> b)
...@@ -108,6 +153,7 @@ class GpuAdd(Op): ...@@ -108,6 +153,7 @@ class GpuAdd(Op):
print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)
# perform the scalar operation on the input and output references # perform the scalar operation on the input and output references
if d == 0:
print >> sio, " ", self.scalar_op.c_code(None, None, print >> sio, " ", self.scalar_op.c_code(None, None,
['ii_i%i_data[0]'%ipos for ipos, i in enumerate(node.inputs)], ['ii_i%i_data[0]'%ipos for ipos, i in enumerate(node.inputs)],
['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)], ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)],
...@@ -121,7 +167,8 @@ class GpuAdd(Op): ...@@ -121,7 +167,8 @@ class GpuAdd(Op):
#for ipos, i in enumerate(node.inputs): #for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print >> sio, "}" print >> sio, "}"
print sio.getvalue() if 0:
print sio.getvalue()
return sio.getvalue() return sio.getvalue()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -131,18 +178,44 @@ class GpuAdd(Op): ...@@ -131,18 +178,44 @@ class GpuAdd(Op):
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
d = dict() d = dict()
assert nd == 2 assert nd == 2
kernel_call_args = ("numEls, log2_dims[0], log2_dims[1]" #input_params and output_params go into the function declaration/definition
", a_str[0], a_str[1], a_data" input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
", b_str[0], b_str[1], b_data" for ipos in xrange(len(node.inputs)))
", z_str[0], z_str[1], z_data") output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
#input_args and output_args go into the recursive call.
input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
for ipos in xrange(len(node.inputs)))
output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
for ipos in xrange(len(node.outputs)))
# kernel_call_args are used to invoke the cuda kernel
kernel_call_args = ["numEls, log2_dims[0], log2_dims[1]"]
for ipos in xrange(len(node.inputs)):
strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)):
strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(nd))
kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
kernel_call_args = ",".join(kernel_call_args)
# the data_pointer_increments are inserted after each recursive call
data_ptr_inc = []
for ipos in xrange(len(node.inputs)):
data_ptr_inc.append("i%i_data += (1<< log2_dim) * i%i_str[d]" %(ipos, ipos))
for ipos in xrange(len(node.outputs)):
data_ptr_inc.append("o%i_data += (1<< log2_dim) * o%i_str[d]" %(ipos, ipos))
data_ptr_inc = ";\n".join(data_ptr_inc)
d.update(locals()) d.update(locals())
return """ return """
static void callkernel_%(nodename)s(const unsigned int numEls, const int d, static void callkernel_%(nodename)s(const unsigned int numEls, const int d,
const int * dims, int * log2_dims, const int * dims, int * log2_dims,
const float * a_data, const int * a_str, %(input_params)s,
const float * b_data, const int * b_str, %(output_params)s)
float * z_data, const int * z_str)
{ {
if (d == %(nd)s) if (d == %(nd)s)
{ {
...@@ -150,40 +223,39 @@ class GpuAdd(Op): ...@@ -150,40 +223,39 @@ class GpuAdd(Op):
//a ceil would be better here //a ceil would be better here
int n_blocks = std::min(numEls/threads_per_block + 1, (unsigned int)NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(numEls/threads_per_block + 1, (unsigned int)NUM_VECTOR_OP_BLOCKS);
kernel_%(nodename)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); kernel_%(nodename)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
std::cerr << "ADDCALL a str" << a_str[0] << " "<< a_str[1] << "\\n"; //std::cerr << "ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
std::cerr << "ADDCALL a data" << a_data << "\\n"; //std::cerr << "ADDCALL a data" << i0_data << "\\n";
std::cerr << "ADDCALL b str" << b_str[0] << " "<< b_str[1] << "\\n"; //std::cerr << "ADDCALL b str" << i1_str[0] << " "<< i1_str[1] << "\\n";
std::cerr << "ADDCALL b data" << b_data << "\\n"; //std::cerr << "ADDCALL b data" << i1_data << "\\n";
std::cerr << "ADDCALL z str" << z_str[0] << " "<< z_str[1] << "\\n"; //std::cerr << "ADDCALL z str" << o0_str[0] << " "<< o0_str[1] << "\\n";
std::cerr << "ADDCALL z data" << z_data << "\\n"; //std::cerr << "ADDCALL z data" << o0_data << "\\n";
} }
else else
{ {
std::cerr << "_ADDCALL d " << d << "\\n"; //std::cerr << "_ADDCALL d " << d << "\\n";
unsigned int dim_d = dims[d]; unsigned int dim_d = dims[d];
std::cerr << "_ADDCALL dim_d " << dim_d << "\\n"; //std::cerr << "_ADDCALL dim_d " << dim_d << "\\n";
int log2_dim = 0; int log2_dim = 0;
while(dim_d) while(dim_d)
{ {
std::cerr << "___ADDCALL d " << d << " " << dim_d << "\\n"; //std::cerr << "___ADDCALL d " << d << " " << dim_d << "\\n";
if (dim_d&1) if (dim_d&1)
{ {
log2_dims[d] = log2_dim; log2_dims[d] = log2_dim;
std::cerr << "___ADDCALL a str" << a_str[0] << " "<< a_str[1] << "\\n"; //std::cerr << "___ADDCALL a str" << i0_str[0] << " "<< i0_str[1] << "\\n";
std::cerr << "___ADDCALL a data" << a_data << "\\n"; //std::cerr << "___ADDCALL a data" << i0_data << "\\n";
std::cerr << "___ADDCALL b str" << b_str[0] << " "<< b_str[1] << "\\n"; //std::cerr << "___ADDCALL b str" << i1_str[0] << " "<< i1_str[1] << "\\n";
std::cerr << "___ADDCALL b data" << b_data << "\\n"; //std::cerr << "___ADDCALL b data" << i1_data << "\\n";
std::cerr << "___ADDCALL z str" << z_str[0] << " "<< z_str[1] << "\\n"; //std::cerr << "___ADDCALL z str" << o0_str[0] << " "<< o0_str[1] << "\\n";
std::cerr << "___ADDCALL z data" << z_data << "\\n"; //std::cerr << "___ADDCALL z data" << o0_data << "\\n";
callkernel_%(nodename)s(numEls * (1<<log2_dim), d+1, callkernel_%(nodename)s(numEls * (1<<log2_dim), d+1, dims, log2_dims,
dims, log2_dims, %(input_args)s,
a_data, a_str, %(output_args)s);
b_data, b_str,
z_data, z_str); %(data_ptr_inc)s;
a_data += (1 << log2_dim) * a_str[d]; //i0_data += (1 << log2_dim) * i0_str[d];
b_data += (1 << log2_dim) * b_str[d]; //i1_data += (1 << log2_dim) * i1_str[d];
z_data += (1 << log2_dim) * z_str[d]; //o0_data += (1 << log2_dim) * o0_str[d];
} }
log2_dim += 1; log2_dim += 1;
dim_d >>= 1; dim_d >>= 1;
...@@ -192,72 +264,122 @@ std::cerr << "___ADDCALL z data" << z_data << "\\n"; ...@@ -192,72 +264,122 @@ std::cerr << "___ADDCALL z data" << z_data << "\\n";
} }
""" %d """ %d
def c_code(self, node, nodename, (a,b), (z,), sub): def c_code(self, node, nodename, inputs, outputs, sub):
d = dict(sub) d = dict(sub)
nd = node.outputs[0].type.ndim nd = node.outputs[0].type.ndim
d.update(locals()) d.update(locals())
return """ sio = StringIO.StringIO()
std::cerr << "ADD start\\n"; nin = len(inputs)
nout = len(outputs)
fail = sub['fail']
opname = str(self.scalar_op)
print >> sio, """
std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks //standard elemwise size checks
if (cnda_%(a)s->nd != cnda_%(b)s->nd) const int * dims = NULL;
""" %locals()
for iname in inputs:
print >> sio, """
if (%(nd)s != cnda_%(iname)s->nd)
{ {
PyErr_SetString(PyExc_TypeError, "need same number of dims"); PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
return NULL; %(fail)s;
} }
""" %locals()
for iname0, iname1 in zip(inputs[1:], inputs[:-1]):
print >> sio, """
//standard elemwise dim checks //standard elemwise dim checks
unsigned int size = 1; for (int i = 0; i< %(nd)s; ++i)
for (int i = 0; i< cnda_%(a)s->nd; ++i)
{ {
if (cnda_%(a)s->dim[i] != cnda_%(b)s->dim[i]) if (cnda_%(iname0)s->dim[i] != cnda_%(iname1)s->dim[i])
{ {
PyErr_SetString(PyExc_TypeError, "need same dimensions"); PyErr_SetString(PyExc_TypeError, "need same dimensions");
return NULL; %(fail)s;
} }
size *= (unsigned int) cnda_%(a)s->dim[i];
} }
std::cerr << "ADD size " << size << "\\n"; """ %locals()
if (cnda_%(z)s){ iname0 = inputs[0]
print >> sio, """
dims = cnda_%(iname0)s->dim;
//unsigned int size = CudaNdarray_SIZE(cnda_%(iname0)s);
//std::cerr << "ADD size " << size << "\\n";
""" %locals()
for oname in outputs:
print >> sio, """
if (cnda_%(oname)s){
//TODO: check if we can maybe use existing storage //TODO: check if we can maybe use existing storage
Py_XDECREF(cnda_%(z)s); Py_XDECREF(cnda_%(oname)s);
cnda_%(z)s = NULL; cnda_%(oname)s = NULL;
std::cerr << "ADD decref z \\n";
} }
if (NULL == cnda_%(z)s) if (NULL == cnda_%(oname)s)
{ {
cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null(); cnda_%(oname)s = (CudaNdarray*)CudaNdarray_new_null();
if (!cnda_%(z)s) if (!cnda_%(oname)s)
{ {
//error string already set
%(fail)s; %(fail)s;
} }
if (CudaNdarray_alloc_contiguous(cnda_%(z)s, cnda_%(a)s->nd, cnda_%(a)s->dim)) if (CudaNdarray_alloc_contiguous(cnda_%(oname)s, %(nd)s, dims))
{ {
Py_XDECREF(cnda_%(z)s); //error string already set
cnda_%(z)s = NULL; Py_XDECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
%(fail)s; %(fail)s;
} }
} }
std::cerr << "ADD z nd" << cnda_%(z)s->nd << "\\n"; std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
std::cerr << "ADD z str" << cnda_%(z)s->str[0] << " "<< cnda_%(z)s->str[1] << "\\n"; std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
std::cerr << "ADD z data" << cnda_%(z)s->devdata << "\\n"; """ % locals()
{ //new block so that failure gotos don't skip over variable initialization print >> sio, """
{
//new block so that failure gotos don't skip over variable initialization
int log2_dims[%(nd)s]; int log2_dims[%(nd)s];
callkernel_%(nodename)s(1, 0, CudaNdarray_DIMS(cnda_%(z)s), log2_dims, callkernel_%(nodename)s(1, 0, dims, log2_dims
CudaNdarray_DEV_DATA(cnda_%(a)s), CudaNdarray_STRIDES(cnda_%(a)s), """ % locals()
CudaNdarray_DEV_DATA(cnda_%(b)s), CudaNdarray_STRIDES(cnda_%(b)s), for iname in inputs:
CudaNdarray_DEV_DATA(cnda_%(z)s), CudaNdarray_STRIDES(cnda_%(z)s)); print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_STRIDES(cnda_%(iname)s)
""" % locals()
for oname in outputs:
print >> sio, """
, CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_STRIDES(cnda_%(oname)s)
""" % locals()
print >> sio, """
);
cudaThreadSynchronize(); cudaThreadSynchronize();
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "kExp", cudaGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n", "Elemwise %(nodename)s", cudaGetErrorString(err));
Py_XDECREF(cnda_%(z)s); """ % locals()
cnda_%(z)s = NULL; for oname in outputs:
print >> sio, """
Py_XDECREF(cnda_%(oname)s);
cnda_%(oname)s = NULL;
""" % locals()
print >> sio, """
%(fail)s; %(fail)s;
} }
} }
""" % d std::cerr << "C_CODE %(opname)s END\\n";
""" % locals()
return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return () return ()
if 0:
class GpuAdd(GpuElemwise):
def __init__(self):
super(GpuAdd, self).__init__(scalar.add)
def perform(self, node, args, (z,)):
print "GpuAdd perform"
zval = numpy.asarray(args[0])
for a in args[1:]:
zval += numpy.asarray(a)
z[0] = type_support_filter(zval, (0,)*len(zval.shape), 0)
gpu_add = GpuAdd()
import sys
from theano.compile.sandbox.sharedvalue import shared from theano.compile.sandbox.sharedvalue import shared
from theano.compile.sandbox.pfunc import pfunc from theano.compile.sandbox.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -11,7 +12,7 @@ def test_elemwise0(): ...@@ -11,7 +12,7 @@ def test_elemwise0():
a = tcn.shared_constructor(numpy.random.rand(4,4), 'a') a = tcn.shared_constructor(numpy.random.rand(4,4), 'a')
b = tensor.dmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a+b)]) f = pfunc([b], [], updates=[(a, a+b)])
...@@ -27,11 +28,27 @@ def test_elemwise1(): ...@@ -27,11 +28,27 @@ def test_elemwise1():
""" Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """ """ Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """
shape = (3,4) shape = (3,4)
a = tcn.shared_constructor(numpy.random.rand(*shape), 'a') a = tcn.shared_constructor(numpy.random.rand(*shape)+0.5, 'a')
b = tensor.dmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))])
#let debugmode catch any mistakes #let debugmode catch any mistakes
f(numpy.ones(shape)) print >> sys.stderr, "STARTING FUNCTION 1"
f = pfunc([b], [], updates=[(a, b**a)])
for i, node in enumerate(f.maker.env.toposort()):
print i, node
f(numpy.random.rand(*shape)+0.3)
print >> sys.stderr, "STARTING FUNCTION 2"
#let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, tensor.exp(b**a))])
for i, node in enumerate(f.maker.env.toposort()):
print i, node
f(numpy.random.rand(*shape)+0.3)
print >> sys.stderr, "STARTING FUNCTION 3"
#let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))])
f(numpy.random.rand(*shape)+0.3)
def test_elemwise2(): def test_elemwise2():
""" Several kinds of elemwise expressions with dimension permutations """ """ Several kinds of elemwise expressions with dimension permutations """
...@@ -41,6 +58,11 @@ def test_elemwise2(): ...@@ -41,6 +58,11 @@ def test_elemwise2():
b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))() b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))()
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) *
tensor.exp(b**a).dimshuffle([2,0,3,1]))]) tensor.exp(b**a).dimshuffle([2,0,3,1]))])
has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()):
print i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise
#let debugmode catch errors #let debugmode catch errors
f(numpy.ones(shape)) f(numpy.ones(shape))
...@@ -54,3 +76,4 @@ def test_elemwise3(): ...@@ -54,3 +76,4 @@ def test_elemwise3():
b**a).dimshuffle([2,0,3,1]))]) b**a).dimshuffle([2,0,3,1]))])
#let debugmode catch errors #let debugmode catch errors
f(numpy.ones(6)) f(numpy.ones(6))
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论