提交 6c955ecf authored 作者: Frederic's avatar Frederic

New version of GpuAdvancedSubtensor1 with gpu code when the input has up to 3 dimensions.

New version of GpuAdvancedSubtensor1 with gpu code when the input has up to 3 dimensions or is c_contiguous.
上级 6005ac2b
...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1891,6 +1891,8 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
assert_fast = None
def make_node(self, x, ilist): def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
...@@ -1908,11 +1910,35 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1908,11 +1910,35 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_):
    """Take rows of `x` selected by the index vector `idx` on the GPU.

    Parameters
    ----------
    node : Apply
        The apply node (unused directly; standard Theano perform signature).
    inp : (CudaNdarray, ndarray)
        ``x`` (the data) and ``idx`` (1-d integer indices into axis 0).
    out_ : ((CudaNdarray or None,),)
        Output storage; a previous output may be reused as destination.

    Notes
    -----
    Two code paths exist:
    * fast: ``x.ndim <= 3`` (or >3 dims but c_contiguous, in which case we
      flatten the trailing dims first) — delegates to the gpu ``take`` code.
    * slow: copies row by row on the gpu.
    ``self.assert_fast`` (set by tests) asserts which path was taken.
    """
    # super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
    x, idx = inp
    out, = out_
    # TODO: if more than 3 dims and not contiguous, we could still try
    # harder to reach the fast path.
    x_orig = x
    if x.ndim > 3 and x.is_c_contiguous():
        # Collapse the trailing dimensions so the gpu take code (which
        # supports at most 3 dims) can be used; undone after the take.
        x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
    if x.ndim <= 3:
        if self.assert_fast is not None:
            assert self.assert_fast is True, (
                "GpuAdvancedSubtensor1 used the fast version")
        # The gpu take code supports x with 1, 2 or 3 dimensions only.
        # Indices are passed as a float32 CudaNdarray (gpu storage format).
        o = x.take(
            cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")),
            0, out_[0][0])  # idx, axis, return[, clipmode]
        if x is not x_orig:
            # Restore the original trailing dimensions.
            o = o.reshape((len(idx),) + x_orig.shape[1:])
        out[0] = o
    else:
        if self.assert_fast is not None:
            assert self.assert_fast is False, (
                "GpuAdvancedSubtensor1 didn't use the fast version")
        out_shape = (len(idx),) + x.shape[1:]
        # Reuse the previous output buffer when its shape still matches.
        if out_[0][0] is None or out_[0][0].shape != out_shape:
            o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
        else:
            o = out_[0][0]
        # Slow fallback: one gpu-side row copy per index.
        for j, i in enumerate(idx):
            o[j] = x[i]
        out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
......
...@@ -338,6 +338,12 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A); ...@@ -338,6 +338,12 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern); DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
//PyObject* PyArray_TakeFrom(PyArrayObject* self, PyObject* indices, int axis, PyArrayObject* ret, NPY_CLIPMODE clipmode)
//PyObject*
//CudaNdarray_TakeFrom(CudaNdarray* self, PyObject* indices, int axis,
// PyArrayObject* ret, NPY_CLIPMODE clipmode);
// GPU analogue of numpy's PyArray_TakeFrom (signature kept above for
// reference). Takes a Python args tuple instead of explicit parameters;
// presumably (indices, axis[, ret[, clipmode]]) — confirm against the
// implementation in the .cu file.
PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
......
import sys, time, unittest import copy
import sys
import time
import unittest
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -846,6 +849,47 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -846,6 +849,47 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
return super(theano.tensor.tests.test_basic.T_subtensor, return super(theano.tensor.tests.test_basic.T_subtensor,
self).__init__(name) self).__init__(name)
def test_adv_sub1_fast(self):
    """Check that AdvancedSubtensor1 on the gpu takes the fast code path
    exactly when it is expected to."""
    rand = numpy.random.rand
    cases = [
        (rand(70000), range(70000), True),
        (rand(70000, 5), range(70000), True),
        (rand(70000, 2, 3), range(70000), True),
        (rand(4, 5), [2, 3], True),
        (rand(4, 2, 3), [0, 3], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], True),
        # With 4 dims the gpu code uses another algorithm, which is not
        # as optimized for that case, so the strided run is slow.
        (rand(4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], False),
    ]
    for data, idx, fast in cases:
        data = numpy.asarray(data, dtype=self.dtype)
        shared_x = self.shared(data)

        # c_contiguous input: always fast (>3 dims get reshaped first).
        t = self.adv_sub1()(shared_x, idx)
        t.owner.op.assert_fast = True
        val = numpy.asarray(self.eval_output_and_check(t, list=True))
        expected = data[idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))

        # Strided input: fast only when the case allows it.
        t = self.adv_sub1()(shared_x[::-1], idx)
        t.owner.op.assert_fast = fast
        val = numpy.asarray(theano.function([], t, mode=self.mode)())
        expected = data[::-1][idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论