提交 ff24c985 authored 作者: James Bergstra's avatar James Bergstra

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set stride 0 for dimensions of size 1

test_elemwise4 passed; modified setting of stride in alloc_contiguous to set stride 0 for dimensions of size 1
上级 43f97ea3
......@@ -19,7 +19,7 @@ class HostFromGpu(Op):
def __hash__(self):
return hash(type(self))
def __str__(self):
return '<HostFromGpu@%i>' % id(self)
return 'HostFromGpu'
def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
......@@ -36,7 +36,7 @@ class GpuFromHost(Op):
def __hash__(self):
return hash(type(self))
def __str__(self):
return '<GpuFromHost@%i>' % id(self)
return 'GpuFromHost'
def make_node(self, x):
if not isinstance(x.type, tensor.TensorType):
raise TypeError(x)
......@@ -102,9 +102,21 @@ class GpuElemwise(Op):
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.broadcastable != inputs[0].type.broadcastable:
raise NotImplementedError('different bcastable')
otype = CudaNdarrayType(broadcastable=_inputs[0].broadcastable)
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
# output is broadcastable only along dimensions where all inputs are broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout > 0
return Apply(self, _inputs, [otype() for o in xrange(self.nout)])
def c_support_code(self):
......@@ -274,37 +286,38 @@ class GpuElemwise(Op):
nout = len(outputs)
fail = sub['fail']
opname = str(self.scalar_op)
print >> sio, """
//std::cerr << "C_CODE %(opname)s START\\n";
initial_dims = ','.join('1' for i in xrange(nd))
if 1 or self.scalar_op == scalar.pow:
print >> sio, """
std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
const int * dims = NULL;
""" %locals()
print >> sio, """
int dims[%(nd)s] = {%(initial_dims)s};
""" %locals()
for iname in inputs:
print >> sio, """
std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != cnda_%(iname)s->nd)
{
PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", cnda_%(iname)s->nd);
%(fail)s;
}
""" %locals()
for iname0, iname1 in zip(inputs[1:], inputs[:-1]):
print >> sio, """
//standard elemwise dim checks
for (int i = 0; i< %(nd)s; ++i)
{
if (cnda_%(iname0)s->dim[i] != cnda_%(iname1)s->dim[i])
dims[i] = (dims[i] == 1) ? cnda_%(iname)s->dim[i] : dims[i];
if ((cnda_%(iname)s->dim[i] != 1) && (dims[i] != cnda_%(iname)s->dim[i]))
{
PyErr_SetString(PyExc_TypeError, "need same dimensions");
std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
i,
cnda_%(iname)s->dim[i],
dims[i]
);
%(fail)s;
}
}
""" %locals()
iname0 = inputs[0]
print >> sio, """
dims = cnda_%(iname0)s->dim;
//unsigned int size = CudaNdarray_SIZE(cnda_%(iname0)s);
//std::cerr << "ADD size " << size << "\\n";
""" %locals()
for oname in outputs:
print >> sio, """
......@@ -329,13 +342,14 @@ class GpuElemwise(Op):
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
std::cerr << "ELEMWISE NEW %(oname)s nd" << cnda_%(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << cnda_%(oname)s->devdata << "\\n";
""" % locals()
print >> sio, """
{
//new block so that failure gotos don't skip over variable initialization
int log2_dims[%(nd)s];
std::cerr << "calling callkernel\\n";
callkernel_%(nodename)s(1, 0, dims, log2_dims
""" % locals()
for iname in inputs:
......@@ -349,6 +363,7 @@ class GpuElemwise(Op):
print >> sio, """
);
std::cerr << "calling callkernel returned\\n";
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -462,6 +477,12 @@ class GpuDimShuffle(Op):
#alloc an output
print >> sio, """
if (cnda_%(res)s)
{
//TODO: re-use previously-allocated stuff
Py_DECREF(cnda_%(res)s);
cnda_%(res)s = NULL;
}
if (NULL == cnda_%(res)s) {
cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
if (NULL == cnda_%(res)s)
......@@ -493,16 +514,23 @@ class GpuDimShuffle(Op):
#reassign the dimension and strides in the host pointers
for i, o in enumerate(self.new_order):
if o == 'x':
assert node.outputs[0].type.broadcastable[i]
print >> sio, """
cnda_%(res)s->dim[%(i)s] = 1;
cnda_%(res)s->str[%(i)s] = 0;
""" %locals()
else:
assert not node.outputs[0].type.broadcastable[i]
print >> sio, """
cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s];
cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s];
""" %locals()
for i, o in enumerate(self.new_order):
print >> sio, """
std::cerr << "GpuDimShuffle " << cnda_%(res)s << " str[%(i)s] = " << cnda_%(res)s->str[%(i)s] << "\\n";
""" %locals()
# copy the host dims and stride -> device
print >> sio, """
if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
......
......@@ -87,9 +87,34 @@ def test_elemwise3():
shape = (3,4,5,6)
a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
b = tensor.dvector()
b = tensor.fvector()
print b.type
print tensor.constant(1).type
print (1 + b).type
print (1 + b**a).type
print tensor.exp((1 + b**a)).type
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 +
b**a).dimshuffle([2,0,3,1]))])
has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stderr, i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise
#let debugmode catch errors
f(numpy.random.rand(6))
def test_elemwise4():
    """Test that two vectors can be broadcast against each other to form an
    outer product (a rank-1 update applied to the shared matrix `a`).

    Verifies that after optimization no CPU `tensor.Elemwise` node remains
    in the compiled graph (i.e. the elemwise work was moved to the GPU),
    then runs the function so debugmode can catch runtime errors.
    """
    shape = (3, 4)
    a = tcn.shared_constructor(numpy.random.rand(*shape), 'a')
    b = tensor.fvector()
    c = tensor.fvector()
    # b is broadcast along rows (length 4), c along columns (length 3);
    # their product is the 3x4 rank-1 outer product added to `a`.
    # BUG FIX: the original wrote `x.dimshuffle(0, 'x')` -- `x` is undefined
    # and `c` was never used; the second factor must be `c`.
    f = pfunc([b, c], [],
              updates=[(a, (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))])
    has_elemwise = False
    for i, node in enumerate(f.maker.env.toposort()):
        print >> sys.stderr, i, node
        has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
    assert not has_elemwise
    # let debugmode catch errors; argument lengths match b (4) and c (3)
    f(numpy.random.rand(4), numpy.random.rand(3))
import sys, os
import sys, os, StringIO
import numpy
from theano import Op, Type, Apply, Variable, Constant
......@@ -130,10 +130,45 @@ class CudaNdarrayType(Type):
return "cnda_%(name)s = NULL;" % locals()
def c_extract(self, name, sub):
return """
sio = StringIO.StringIO()
fail = sub['fail']
nd = self.ndim
print >> sio, """
if (CudaNdarray_Check(py_%(name)s))
{
cnda_%(name)s = (CudaNdarray*)py_%(name)s;
std::cerr << "c_extract " << cnda_%(name)s << '\\n';
if (cnda_%(name)s->nd != %(nd)s)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", cnda_%(name)s->nd);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << " nd check passed\\n";
""" %locals()
for i, b in enumerate(self.broadcastable):
if b:
print >> sio, """
if (cnda_%(name)s->dim[%(i)s] != 1)
{
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", cnda_%(name)s->dim[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
if (cnda_%(name)s->str[%(i)s])
{
std::cerr << "c_extract bad stride detected...\\n";
PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", cnda_%(name)s->str[%(i)s], %(i)s);
cnda_%(name)s = NULL;
%(fail)s;
}
std::cerr << "c_extract " << cnda_%(name)s << "bcast check %(i)s passed\\n";
""" %locals()
print >> sio, """
assert(cnda_%(name)s);
Py_INCREF(py_%(name)s);
}
else
......@@ -142,12 +177,19 @@ class CudaNdarrayType(Type):
cnda_%(name)s = NULL;
%(fail)s;
}
""" % dict(sub, name = name, type_num = self.dtype_specs()[2])
std::cerr << "c_extract done " << cnda_%(name)s << '\\n';
""" % locals()
#print sio.getvalue()
return sio.getvalue()
def c_cleanup(self, name, sub):
return """
//std::cerr << "cleanup " << py_%(name)s << "\\n";
Py_XDECREF(py_%(name)s);
std::cerr << "cleanup " << py_%(name)s << " " << cnda_%(name)s << "\\n";
if (cnda_%(name)s)
{
Py_XDECREF(cnda_%(name)s);
}
std::cerr << "cleanup done" << py_%(name)s << "\\n";
""" % locals()
def c_sync(self, name, sub):
......@@ -194,12 +236,6 @@ class CudaNdarrayType(Type):
def c_code_cache_version(self):
return () #do not cache this stuff until it matures
def c_compiler(self): return nvcc_module_compile_str
def c_compiler(self):
return nvcc_module_compile_str
......@@ -49,10 +49,14 @@ class CudaNdarraySharedVariable(SharedVariable, _operators):
if hasattr(other, '_as_CudaNdarrayVariable'):
return other._as_CudaNdarrayVariable()
if isinstance(other.type, tensor.TensorType) and (other.type.dtype == self.dtype) and (other.broadcastable == self.broadcastable):
return GpuFromHost()(other)
else:
raise TypeError((other, other.type))
if not isinstance(other.type, tensor.TensorType):
raise TypeError('Incompatible type', other.type)
if (other.type.dtype != self.dtype):
raise TypeError('Incompatible dtype', (self.dtype, other.type.dtype))
if (other.type.broadcastable != self.broadcastable):
raise TypeError('Incompatible broadcastable', (self.broadcastable, other.type.broadcastable))
return GpuFromHost()(other)
CudaNdarrayType.SharedVariable = CudaNdarraySharedVariable
def shared_constructor(value, name, strict=False):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论