提交 ff24c985 authored 作者: James Bergstra's avatar James Bergstra

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set…

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set stride 0 for dimensions of size 1
上级 43f97ea3
...@@ -19,7 +19,7 @@ class HostFromGpu(Op): ...@@ -19,7 +19,7 @@ class HostFromGpu(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return '<HostFromGpu@%i>' % id(self) return 'HostFromGpu'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType): if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x) raise TypeError(x)
...@@ -36,7 +36,7 @@ class GpuFromHost(Op): ...@@ -36,7 +36,7 @@ class GpuFromHost(Op):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return '<GpuFromHost@%i>' % id(self) return 'GpuFromHost'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
...@@ -102,9 +102,21 @@ class GpuElemwise(Op): ...@@ -102,9 +102,21 @@ class GpuElemwise(Op):
if self.nin > 0 and len(_inputs) != self.nin: if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs))) raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]: for i in _inputs[1:]:
if i.type.broadcastable != inputs[0].type.broadcastable: if i.type.ndim != inputs[0].type.ndim:
raise NotImplementedError('different bcastable') raise TypeError('different ranks among inputs')
otype = CudaNdarrayType(broadcastable=_inputs[0].broadcastable)
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout > 0 assert self.nout > 0
return Apply(self, _inputs, [otype() for o in xrange(self.nout)]) return Apply(self, _inputs, [otype() for o in xrange(self.nout)])
def c_support_code(self): def c_support_code(self):
...@@ -274,37 +286,38 @@ class GpuElemwise(Op): ...@@ -274,37 +286,38 @@ class GpuElemwise(Op):
nout = len(outputs) nout = len(outputs)
fail = sub['fail'] fail = sub['fail']
opname = str(self.scalar_op) opname = str(self.scalar_op)
initial_dims = ','.join('1' for i in xrange(nd))
if 1 or self.scalar_op == scalar.pow:
print >> sio, """ print >> sio, """
//std::cerr << "C_CODE %(opname)s START\\n"; std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks //standard elemwise size checks
const int * dims = NULL; """ %locals()
print >> sio, """
int dims[%(nd)s] = {%(initial_dims)s};
""" %locals() """ %locals()
for iname in inputs: for iname in inputs:
print >> sio, """ print >> sio, """
std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != cnda_%(iname)s->nd) if (%(nd)s != cnda_%(iname)s->nd)
{ {
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd); PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
%(fail)s; %(fail)s;
} }
""" %locals()
for iname0, iname1 in zip(inputs[1:], inputs[:-1]):
print >> sio, """
//standard elemwise dim checks
for (int i = 0; i< %(nd)s; ++i) for (int i = 0; i< %(nd)s; ++i)
{ {
if (cnda_%(iname0)s->dim[i] != cnda_%(iname1)s->dim[i]) dims[i] = (dims[i] == 1) ? cnda_%(iname)s->dim[i] : dims[i];
if ((cnda_%(iname)s->dim[i] != 1) && (dims[i] != cnda_%(iname)s->dim[i]))
{ {
PyErr_SetString(PyExc_TypeError, "need same dimensions"); std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
i,
cnda_%(iname)s->dim[i],
dims[i]
);
%(fail)s; %(fail)s;
} }
} }
""" %locals() """ %locals()
iname0 = inputs[0]
print >> sio, """
dims = cnda_%(iname0)s->dim;
//unsigned int size = CudaNdarray_SIZE(cnda_%(iname0)s);
//std::cerr << "ADD size " << size << "\\n";
""" %locals()
for oname in outputs: for oname in outputs:
print >> sio, """ print >> sio, """
...@@ -329,13 +342,14 @@ class GpuElemwise(Op): ...@@ -329,13 +342,14 @@ class GpuElemwise(Op):
%(fail)s; %(fail)s;
} }
} }
//std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n"; std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
""" % locals() """ % locals()
print >> sio, """ print >> sio, """
{ {
//new block so that failure gotos don't skip over variable initialization //new block so that failure gotos don't skip over variable initialization
int log2_dims[%(nd)s]; int log2_dims[%(nd)s];
std::cerr << "calling callkernel\\n";
callkernel_%(nodename)s(1, 0, dims, log2_dims callkernel_%(nodename)s(1, 0, dims, log2_dims
""" % locals() """ % locals()
for iname in inputs: for iname in inputs:
...@@ -349,6 +363,7 @@ class GpuElemwise(Op): ...@@ -349,6 +363,7 @@ class GpuElemwise(Op):
print >> sio, """ print >> sio, """
); );
std::cerr << "calling callkernel returned\\n";
cudaThreadSynchronize(); cudaThreadSynchronize();
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -462,6 +477,12 @@ class GpuDimShuffle(Op): ...@@ -462,6 +477,12 @@ class GpuDimShuffle(Op):
#alloc an output #alloc an output
print >> sio, """ print >> sio, """
if (cnda_%(res)s)
{
//TODO: re-use previously-allocated stuff
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
}
if (NULL == cnda_%(res)s) { if (NULL == cnda_%(res)s) {
cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null(); cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
if (NULL == cnda_%(res)s) if (NULL == cnda_%(res)s)
...@@ -493,16 +514,23 @@ class GpuDimShuffle(Op): ...@@ -493,16 +514,23 @@ class GpuDimShuffle(Op):
#reassign the dimension and strides in the host pointers #reassign the dimension and strides in the host pointers
for i, o in enumerate(self.new_order): for i, o in enumerate(self.new_order):
if o == 'x': if o == 'x':
assert node.outputs[0].type.broadcastable[i]
print >> sio, """ print >> sio, """
cnda_%(res)s->dim[%(i)s] = 1; cnda_%(res)s->dim[%(i)s] = 1;
cnda_%(res)s->str[%(i)s] = 0; cnda_%(res)s->str[%(i)s] = 0;
""" %locals() """ %locals()
else: else:
assert not node.outputs[0].type.broadcastable[i]
print >> sio, """ print >> sio, """
cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s]; cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s];
cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s]; cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s];
""" %locals() """ %locals()
for i, o in enumerate(self.new_order):
print >> sio, """
std::cerr << "GpuDimShuffle " << cnda_%(res)s << " str[%(i)s] = " << cnda_%(res)s->str[%(i)s] << "\\n";
""" %locals()
# copy the host dims and stride -> device # copy the host dims and stride -> device
print >> sio, """ print >> sio, """
if (CudaNdarray_copy_structure_to_device(cnda_%(res)s)) if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
......
...@@ -87,9 +87,34 @@ def test_elemwise3(): ...@@ -87,9 +87,34 @@ def test_elemwise3():
shape = (3,4,5,6) shape = (3,4,5,6)
a = tcn.shared_constructor(numpy.random.rand(*shape), 'a') a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
b = tensor.dvector() b = tensor.fvector()
print b.type
print tensor.constant(1).type
print (1 + b).type
print (1 + b**a).type
print tensor.exp((1 + b**a)).type
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 + f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
b**a).dimshuffle([2,0,3,1]))]) b**a).dimshuffle([2,0,3,1]))])
has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stderr, i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise
#let debugmode catch errors #let debugmode catch errors
f(numpy.random.rand(6)) f(numpy.random.rand(6))
def test_elemwise4():
    """Test that two vectors can be broadcast to form an outer product.

    The shared (3, 4) matrix `a` is updated with itself plus the product of
    `b` viewed as a row and `c` viewed as a column (a rank-1 matrix update).
    """
    shape = (3,4)
    a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    # b.dimshuffle('x', 0) adds a leading broadcastable axis (row) and
    # c.dimshuffle(0, 'x') adds a trailing one (column), so their product
    # broadcasts to the full matrix.  The column operand has to be `c`:
    # no `x` is defined in this function, and `c` is the second declared input.
    f = pfunc([b,c], [], updates=[(a, (a+b.dimshuffle('x', 0)*c.dimshuffle(0, 'x')))])
    # Dump the compiled graph to stderr and require that no node uses
    # tensor.Elemwise (presumably meaning every elementwise op was replaced
    # by its GPU counterpart -- confirm against the optimizer).
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stderr, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    #let debugmode catch errors
    # b has length 4 (the columns of `a`), c has length 3 (the rows of `a`)
    f(numpy.random.rand(4), numpy.random.rand(3))
import sys, os import sys, os, StringIO
import numpy import numpy
from theano import Op, Type, Apply, Variable, Constant from theano import Op, Type, Apply, Variable, Constant
...@@ -130,10 +130,45 @@ class CudaNdarrayType(Type): ...@@ -130,10 +130,45 @@ class CudaNdarrayType(Type):
return "cnda_%(name)s = NULL;" % locals() return "cnda_%(name)s = NULL;" % locals()
def c_extract(self, name, sub): def c_extract(self, name, sub):
return """ sio = StringIO.StringIO()
fail = sub['fail']
nd = self.ndim
print >> sio, """
if (CudaNdarray_Check(py_%(name)s)) if (CudaNdarray_Check(py_%(name)s))
{ {
cnda_%(name)s = (CudaNdarray*)py_%(name)s; cnda_%(name)s = (CudaNdarray*)py_%(name)s;
std::cerr << "c_extract " << cnda_%(name)s << '\\n';
if (cnda_%(name)s->nd != %(nd)s)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", cnda_%(name)s->nd);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << " nd check passed\\n";
""" %locals()
for i, b in enumerate(self.broadcastable):
if b:
print >> sio, """
if (cnda_%(name)s->dim[%(i)s] != 1)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", cnda_%(name)s->dim[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
if (cnda_%(name)s->str[%(i)s])
{
std::cerr << "c_extract bad stride detected...\\n";
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", cnda_%(name)s->str[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "bcast check %(i)s passed\\n";
""" %locals()
print >> sio, """
assert(cnda_%(name)s);
Py_INCREF(py_%(name)s); Py_INCREF(py_%(name)s);
} }
else else
...@@ -142,12 +177,19 @@ class CudaNdarrayType(Type): ...@@ -142,12 +177,19 @@ class CudaNdarrayType(Type):
cnda_%(name)s = NULL; cnda_%(name)s = NULL;
%(fail)s; %(fail)s;
} }
""" % dict(sub, name = name, type_num = self.dtype_specs()[2]) std::cerr << "c_extract done " << cnda_%(name)s << '\\n';
""" % locals()
#print sio.getvalue()
return sio.getvalue()
def c_cleanup(self, name, sub): def c_cleanup(self, name, sub):
return """ return """
//std::cerr << "cleanup " << py_%(name)s << "\\n"; std::cerr << "cleanup " << py_%(name)s << " " << cnda_%(name)s << "\\n";
Py_XDECREF(py_%(name)s); if (cnda_%(name)s)
{
Py_XDECREF(cnda_%(name)s);
}
std::cerr << "cleanup done" << py_%(name)s << "\\n";
""" % locals() """ % locals()
def c_sync(self, name, sub): def c_sync(self, name, sub):
...@@ -194,12 +236,6 @@ class CudaNdarrayType(Type): ...@@ -194,12 +236,6 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self): def c_code_cache_version(self):
return () #do not cache this stuff until it matures return () #do not cache this stuff until it matures
def c_compiler(self):
def c_compiler(self): return nvcc_module_compile_str return nvcc_module_compile_str
...@@ -49,10 +49,14 @@ class CudaNdarraySharedVariable(SharedVariable, _operators): ...@@ -49,10 +49,14 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
if hasattr(other, '_as_CudaNdarrayVariable'): if hasattr(other, '_as_CudaNdarrayVariable'):
return other._as_CudaNdarrayVariable() return other._as_CudaNdarrayVariable()
if isinstance(other.type, tensor.TensorType) and (other.type.dtype == self.dtype) and (other.broadcastable == self.broadcastable): if not isinstance(other.type, tensor.TensorType):
raise TypeError('Incompatible type', other.type)
if (other.type.dtype != self.dtype):
raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype))
if (other.type.broadcastable != self.broadcastable):
raise TypeError('Incompatible broadcastable', (self.broadcastable, other.type.broadcastable))
return GpuFromHost()(other) return GpuFromHost()(other)
else:
raise TypeError((other, other.type))
CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def shared_constructor(value, name, strict=False): def shared_constructor(value, name, strict=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论