Commit a317f101 authored by James Bergstra

upgraded to use CudaNdarray_HOST_DIMS and family

Parent 71338c0b
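Editor's note: this commit is a mechanical API migration across the CUDA backend. Every direct read of the CudaNdarray struct's host-side fields (cnda->dim[i], cnda->str[i]) becomes a call to the CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES accessors, and direct writes go through CudaNdarray_set_dim / CudaNdarray_set_stride / CudaNdarray_set_nd, so the struct layout can later change without touching every call site. A minimal C++ sketch of the pattern, assuming a simplified stand-in layout (the real CudaNdarray differs):

    #include <cstdio>

    struct CudaNdarray             // simplified stand-in, not the real layout
    {
        int nd;                    // number of dimensions
        int host_dims[8];          // host-side mirror of the device dims
        int host_strides[8];       // host-side mirror of the device strides
    };

    // Accessors in the spirit of CudaNdarray_HOST_DIMS / CudaNdarray_HOST_STRIDES.
    static inline const int* CudaNdarray_HOST_DIMS(const CudaNdarray* a)    { return a->host_dims; }
    static inline const int* CudaNdarray_HOST_STRIDES(const CudaNdarray* a) { return a->host_strides; }

    int main()
    {
        CudaNdarray a = {2, {3, 4}, {4, 1}};
        // before this commit:  a.dim[0]   (direct field access)
        // after this commit:   CudaNdarray_HOST_DIMS(&a)[0]
        std::printf("dim[0]=%d stride[1]=%d\n",
                    CudaNdarray_HOST_DIMS(&a)[0],
                    CudaNdarray_HOST_STRIDES(&a)[1]);
        return 0;
    }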
@@ -329,13 +329,13 @@ class GpuElemwise(Op):
         }
         for (int i = 0; i< %(nd)s; ++i)
         {
-            dims[i] = (dims[i] == 1) ? cnda_%(iname)s->dim[i] : dims[i];
-            if ((cnda_%(iname)s->dim[i] != 1) && (dims[i] != cnda_%(iname)s->dim[i]))
+            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] : dims[i];
+            if ((CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i] != 1) && (dims[i] != CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i]))
             {
                 //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                 PyErr_Format(PyExc_TypeError, "GpuElemwise input has incompatible dim[%%i] == %%i, where output has size %%i",
                         i,
-                        cnda_%(iname)s->dim[i],
+                        CudaNdarray_HOST_DIMS(cnda_%(iname)s)[i],
                         dims[i]
                         );
                 %(fail)s;
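Aside: independent of the accessor change, the loop in this hunk is the elementwise broadcasting shape merge: a size-1 input dimension adopts whatever size the output has accumulated so far, and any other mismatch is rejected. A standalone sketch of that rule (illustrative names, not Theano's):

    #include <cstdio>

    // Merge one input's dims into the running output dims.
    // Returns 0 on success, -1 on an incompatible (non-broadcastable) shape.
    static int merge_dims(int* dims, const int* input_dims, int nd)
    {
        for (int i = 0; i < nd; ++i)
        {
            if (dims[i] == 1) dims[i] = input_dims[i];   // adopt the input's size
            if (input_dims[i] != 1 && dims[i] != input_dims[i])
                return -1;                               // mismatch: error out
        }
        return 0;
    }

    int main()
    {
        int dims[3] = {1, 4, 1};                         // output dims so far
        int in0[3]  = {5, 4, 1};
        int in1[3]  = {5, 1, 7};
        if (merge_dims(dims, in0, 3) || merge_dims(dims, in1, 3)) return 1;
        std::printf("%d %d %d\n", dims[0], dims[1], dims[2]);  // prints: 5 4 7
        return 0;
    }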
@@ -378,11 +378,11 @@ class GpuElemwise(Op):
         """ % locals()
         for iname in inputs:
             print >> sio, """
-                        , CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_STRIDES(cnda_%(iname)s)
+                        , CudaNdarray_DEV_DATA(cnda_%(iname)s), CudaNdarray_HOST_STRIDES(cnda_%(iname)s)
         """ % locals()
         for oname in outputs:
             print >> sio, """
-                        , CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_STRIDES(cnda_%(oname)s)
+                        , CudaNdarray_DEV_DATA(cnda_%(oname)s), CudaNdarray_HOST_STRIDES(cnda_%(oname)s)
         """ % locals()
         print >> sio, """
                         );
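Aside: each tensor reaches the generated kernel as a (device data pointer, host-side strides) pair; explicit per-tensor strides are what let a single elementwise kernel walk transposed layouts and stride-0 broadcast dimensions without any copy. A host-side C++ sketch of that addressing scheme for a hypothetical 2-D case (not the generated kernel itself):

    #include <cstdio>

    // Illustrative 2-D "kernel body": out[i][j] = a[i][j] + b[i][j],
    // with each array addressed through its own strides (in elements).
    static void add2d(int d0, int d1,
                      const float* a, const int* sa,
                      const float* b, const int* sb,
                      float* o, const int* so)
    {
        for (int i = 0; i < d0; ++i)
            for (int j = 0; j < d1; ++j)
                o[i*so[0] + j*so[1]] = a[i*sa[0] + j*sa[1]] + b[i*sb[0] + j*sb[1]];
    }

    int main()
    {
        float a[6] = {0, 1, 2, 3, 4, 5};      // 2x3, C-contiguous: strides {3,1}
        float b[3] = {10, 20, 30};            // 1x3 broadcast row: strides {0,1}
        float o[6];
        int sa[2] = {3, 1}, sb[2] = {0, 1}, so[2] = {3, 1};
        add2d(2, 3, a, sa, b, sb, o, so);
        std::printf("%g %g\n", o[0], o[5]);   // prints: 10 35
        return 0;
    }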
@@ -508,21 +508,20 @@ class GpuDimShuffle(Op):
         {
             if (cnda_%(res)s)
             {
-                Py_DECREF(cnda_%(res)s);
-                cnda_%(res)s = NULL;
-            }
-            cnda_%(res)s = (CudaNdarray*) CudaNdarray_new_null();
-            if (NULL == cnda_%(res)s)
-            {
-                PyErr_SetString(PyExc_MemoryError, "Failed to allocate result");
-                %(fail)s;
-            }
-            if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
-            {
-                // err message set
-                Py_DECREF(cnda_%(res)s);
-                cnda_%(res)s = NULL;
-                %(fail)s;
+                if (CudaNdarray_set_nd(cnda_%(res)s, %(nd_out)s))
+                {
+                    Py_DECREF(cnda_%(res)s);
+                    cnda_%(res)s = NULL;
+                    %(fail)s;
+                }
+            }
+            else
+            {
+                cnda_%(res)s = (CudaNdarray*) CudaNdarray_New(%(nd_out)s);
+                if (NULL == cnda_%(res)s)
+                {
+                    %(fail)s;
+                }
             }
         }
         """ %locals()
@@ -542,14 +541,14 @@ class GpuDimShuffle(Op):
             if o == 'x':
                 assert node.outputs[0].type.broadcastable[i]
                 print >> sio, """
-        cnda_%(res)s->dim[%(i)s] = 1;
-        cnda_%(res)s->str[%(i)s] = 0;
+        CudaNdarray_set_dim(cnda_%(res)s, %(i)s, 1);
+        CudaNdarray_set_stride(cnda_%(res)s, %(i)s, 0);
                 """ %locals()
             else:
                 assert not node.outputs[0].type.broadcastable[i]
                 print >> sio, """
-        cnda_%(res)s->dim[%(i)s] = cnda_%(input)s->dim[%(o)s];
-        cnda_%(res)s->str[%(i)s] = cnda_%(input)s->str[%(o)s];
+        CudaNdarray_set_dim(cnda_%(res)s, %(i)s, CudaNdarray_HOST_DIMS(cnda_%(input)s)[%(o)s]);
+        CudaNdarray_set_stride(cnda_%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(cnda_%(input)s)[%(o)s]);
                 """ %locals()
         for i, o in enumerate(self.new_order):
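Aside: those two branches are the entirety of dimshuffle's metadata work: a new broadcastable axis ('x') gets dim 1 and stride 0, while a kept axis copies the input's dim and stride, so the result is a zero-copy view of the same device memory. A sketch, assuming -1 encodes 'x' in the new order:

    #include <cstdio>

    // new_order entries: -1 means 'x' (new broadcast axis),
    // otherwise the index of the input axis to keep.
    static void dimshuffle_view(const int* in_dims, const int* in_str,
                                const int* new_order, int nd_out,
                                int* out_dims, int* out_str)
    {
        for (int i = 0; i < nd_out; ++i)
        {
            if (new_order[i] < 0) { out_dims[i] = 1; out_str[i] = 0; }
            else
            {
                out_dims[i] = in_dims[new_order[i]];
                out_str[i]  = in_str[new_order[i]];
            }
        }
    }

    int main()
    {
        int in_dims[2] = {4, 5}, in_str[2] = {5, 1};  // a 4x5 C-contiguous array
        int order[3] = {1, -1, 0};                    // i.e. new_order = (1, 'x', 0)
        int od[3], os[3];
        dimshuffle_view(in_dims, in_str, order, 3, od, os);
        std::printf("dims=(%d,%d,%d) str=(%d,%d,%d)\n",
                    od[0], od[1], od[2], os[0], os[1], os[2]);
        // prints: dims=(5,1,4) str=(1,0,5)
        return 0;
    }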
@@ -558,17 +557,18 @@ class GpuDimShuffle(Op):
         """ %locals()

         # copy the host dims and stride -> device
-        print >> sio, """
-        if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
-        {
-            //err msg set
-            Py_DECREF(cnda_%(res)s);
-            cnda_%(res)s = NULL;
-            %(fail)s;
-        }
-        """ %locals()
-        if 0:
+        if 0:
+            print >> sio, """
+            if (CudaNdarray_copy_structure_to_device(cnda_%(res)s))
+            {
+                //err msg set
+                Py_DECREF(cnda_%(res)s);
+                cnda_%(res)s = NULL;
+                %(fail)s;
+            }
+            """ %locals()
+        if 1:
             print '--------------------------------------'
             print 'C_CODE'
             print ''
...
@@ -39,13 +39,13 @@ class GpuDot22(Op):
             %(fail)s;
         }
         if ((NULL == cnda_%(z)s)
-            || (cnda_%(z)s->dim[0] != cnda_%(x)s->dim[0])
-            || (cnda_%(z)s->dim[1] != cnda_%(y)s->dim[1]))
+            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[0] != CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
+            || (CudaNdarray_HOST_DIMS(cnda_%(z)s)[1] != CudaNdarray_HOST_DIMS(cnda_%(y)s)[1]))
         {
             if (cnda_%(z)s) Py_DECREF(cnda_%(z)s);
             npy_intp dims[2];
-            dims[0] = cnda_%(x)s->dim[0];
-            dims[1] = cnda_%(y)s->dim[1];
+            dims[0] = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
+            dims[1] = CudaNdarray_HOST_DIMS(cnda_%(y)s)[1];
             cnda_%(z)s = (CudaNdarray*)CudaNdarray_new_null();
             if ((NULL == cnda_%(z)s) || CudaNdarray_alloc_contiguous(cnda_%(z)s, 2, dims))
             {
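Aside: this guard is GpuDot22's output check: for z = dot(x, y) with x of shape (m, k) and y of shape (k, n), an existing z is reusable only if it is (m, n); otherwise it is released and a contiguous (m, n) buffer is allocated. A compact sketch of just the shape predicate (hypothetical helper name, not Theano's):

    #include <cstdio>

    // z may be reused iff it exists and matches (rows of x, cols of y).
    static bool z_shape_ok(const int* zd, const int* xd, const int* yd)
    {
        return zd && zd[0] == xd[0] && zd[1] == yd[1];
    }

    int main()
    {
        int x[2] = {16, 64}, y[2] = {64, 10};
        int z[2] = {16, 10}, bad[2] = {16, 64};
        std::printf("%d %d\n", (int)z_shape_ok(z, x, y),
                              (int)z_shape_ok(bad, x, y));  // prints: 1 0
        return 0;
    }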
...
@@ -120,3 +120,66 @@ def test_conv_nnet1():
     rval_gpu = run_conv_nnet1(tcn.shared_constructor)
     assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
+
+
+def run_conv_nnet2(shared_fn):
+    n_batch = 16
+    shape_img = (n_batch, 1, 32, 32)
+    n_kern = 20
+    shape_kern = (n_kern, 1, 5, 5)
+    n_kern1 = 30
+    shape_kern1 = (n_kern1, n_kern, 5, 5)
+
+    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((32, 32), (5, 5), 'valid')
+    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), (5, 5), 'valid')
+    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
+    n_out = 10
+
+    w0 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
+    b0 = shared_fn(numpy.asarray(numpy.zeros((n_kern,1,1)), dtype='float32'), 'b0')
+    w1 = shared_fn(numpy.asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
+    b1 = shared_fn(numpy.asarray(numpy.zeros((n_kern1,1,1)), dtype='float32'), 'b1')
+    v = shared_fn(numpy.asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
+    c = shared_fn(numpy.asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+
+    x = tensor.Tensor(dtype='float32', broadcastable=(0,0,0,0))('x')
+    y = tensor.fmatrix('y')
+    lr = tensor.fscalar('lr')
+
+    conv_op = theano.sandbox.conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
+    conv_op1 = theano.sandbox.conv.ConvOp((n_kern, logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
+
+    hid = tensor.tanh(conv_op(x, w0)+b0)
+    hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1)
+    hid_flat = hid1.reshape((n_batch, n_hid))
+    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
+    loss = tensor.sum(0.5 * (out-y)**2 * lr)
+    print 'loss type', loss.type
+
+    params = [w0, b0, w1, b1, v, c]
+    gparams = tensor.grad(loss, params)
+
+    mode = theano.compile.ProfileMode()
+
+    print 'building pfunc ...'
+    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
+
+    for i, n in enumerate(train.maker.env.toposort()):
+        print i, n
+
+    xval = numpy.asarray(numpy.random.rand(*shape_img), dtype='float32')
+    yval = numpy.asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
+    lr = numpy.asarray(0.01, dtype='float32')
+
+    for i in xrange(10):
+        rval = train(xval, yval, lr)
+    mode.print_summary()
+    return rval
+
+
+def test_conv_nnet2():
+    numpy.random.seed(23456)
+    rval_cpu = run_conv_nnet2(shared)
+    numpy.random.seed(23456)
+    rval_gpu = run_conv_nnet2(tcn.shared_constructor)
+    assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
@@ -149,19 +149,19 @@ class CudaNdarrayType(Type):
         for i, b in enumerate(self.broadcastable):
             if b:
                 print >> sio, """
-        if (cnda_%(name)s->dim[%(i)s] != 1)
+        if (CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s] != 1)
         {
-            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", cnda_%(name)s->dim[%(i)s], %(i)s);
+            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(cnda_%(name)s)[%(i)s], %(i)s);
             cnda_%(name)s = NULL;
             %(fail)s;
         }
         //std::cerr << "c_extract " << cnda_%(name)s << "dim check %(i)s passed\\n";
         //std::cerr << "c_extract " << cnda_%(name)s << "checking bcast %(i)s <" << cnda_%(name)s->str<< ">\\n";
         //std::cerr << "c_extract " << cnda_%(name)s->str[%(i)s] << "\\n";
-        if (cnda_%(name)s->str[%(i)s])
+        if (CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s])
         {
             //std::cerr << "c_extract bad stride detected...\\n";
-            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", cnda_%(name)s->str[%(i)s], %(i)s);
+            PyErr_Format(PyExc_RuntimeError, "Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(cnda_%(name)s)[%(i)s], %(i)s);
             cnda_%(name)s = NULL;
             %(fail)s;
         }
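Aside: c_extract is enforcing the type-level invariant that a broadcastable axis has size 1 and stride 0, so every index along that axis resolves to the same memory location. A tiny sketch of the invariant:

    #include <cassert>

    // A broadcastable axis must have unit size and zero stride.
    static bool broadcast_ok(int dim, int stride) { return dim == 1 && stride == 0; }

    int main()
    {
        assert(broadcast_ok(1, 0));    // valid broadcastable axis
        assert(!broadcast_ok(3, 0));   // size > 1: rejected with the first error above
        assert(!broadcast_ok(1, 4));   // nonzero stride: rejected with the second
        return 0;
    }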
...
@@ -44,7 +44,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
     }
     for (int i = 0; i < cnda->nd; ++i)
     {
-        if ((cnda->dim[i] > 1) and PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
+        if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) and PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
         {
             std::cerr << "Non-unit size in bcastable dim:\n";
             PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
...