Commit 572bf565, authored by James Bergstra

Merge pull request #715 from nouiz/GpuAdvSub1

New version of GpuAdvancedSubtensor1 with gpu code.
......@@ -1891,6 +1891,10 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
"""
Implement AdvancedSubtensor1 on the gpu.
"""
# Testing hook: if True or False, perform() asserts that the fast
# take-based code path was (or was not) used; if None, the best
# applicable path is chosen automatically. Users should not set this.
perform_using_take = None
# Build the Apply node: move `x` onto the GPU and keep the index list
# as a host tensor variable. NOTE(review): this definition is truncated
# by the diff hunk below — the remainder is not visible here.
def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x)
ilist_ = tensor.as_tensor_variable(ilist)
......@@ -1908,11 +1912,44 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_):
    """Select rows of the CudaNdarray `x` given the index vector `idx`.

    For ndim <= 3 inputs (after an optional flattening reshape) this
    uses the fast `CudaNdarray.take` GPU code path; otherwise it falls
    back to a Python loop copying one row at a time.

    Parameters (via Theano's Op.perform convention):
    - node: the Apply node.
    - inp: (x, idx) — the GPU array and the host integer index array.
    - out_: single-element output storage list; out_[0][0] may hold a
      previously allocated output that can be reused.
    """
    #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
    x, idx = inp
    out, = out_
    x_orig = x
    # TODO: if more than 3 dims, reshape the inputs even if not all
    # dimensions are c contiguous
    if x.ndim > 3 and x.is_c_contiguous():
        # Flatten trailing dims so take() (which supports ndim <= 3)
        # can be used; the result is reshaped back below.
        x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
    out_shape = (len(idx),) + x_orig.shape[1:]
    if x.ndim <= 3:
        # CudaNdarray.take only supports ndim <= 3
        if self.perform_using_take is not None:
            assert self.perform_using_take == True, (
                "GpuAdvancedSubtensor1 used the fast version")
        if idx.dtype != numpy.int64:
            # Fixed typo: `numpyt.int16` -> `numpy.int16` (was a NameError).
            if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
                             numpy.int64, numpy.uint8, numpy.uint16,
                             numpy.uint32]:
                idx = idx.astype(numpy.int64)
        if not idx.flags.c_contiguous:
            idx = numpy.ascontiguousarray(idx)
        # Reinterpret the int64 index bits as float32 so they can be
        # wrapped in a CudaNdarray (presumably float32-only storage;
        # take() decodes the bits on the device — confirm against
        # CudaNdarray_TakeFrom).
        idx = idx.view("float32")
        idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
        o = x.take(idx,
                   0,  # axis
                   out_[0][0])  # reuse previous output if compatible
        if x is not x_orig:
            # Undo the flattening reshape applied above.
            o = o.reshape(out_shape)
        out[0] = o
    else:
        # Slow fallback: row-by-row copy on the host side.
        if self.perform_using_take is not None:
            assert self.perform_using_take == False, (
                "GpuAdvancedSubtensor1 didn't use the fast version")
        if out_[0][0] is None or out_[0][0].shape != out_shape:
            o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
        else:
            o = out_[0][0]
        for (j, i) in enumerate(idx):
            o[j] = x[i]
        out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
......
......@@ -338,6 +338,8 @@ DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
......
import sys, time, unittest
import copy
import sys
import time
import unittest
from theano.compile.pfunc import pfunc
from theano import tensor
......@@ -846,6 +849,58 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
return super(theano.tensor.tests.test_basic.T_subtensor,
self).__init__(name)
def test_adv_sub1_fast(self):
    """We check that the special cases of advanced indexing that
    use CudaNdarrayTakeFrom are handled correctly
    """
    rand = numpy.random.rand
    # The variable fast is used to set the member perform_using_take of
    # the Op. It is only useful for testing that we use the fast
    # version when we should. Users should not use it.
    cases = [
        (rand(70000), range(70000), True),
        (rand(70000, 5), range(70000), True),
        (rand(70000, 2, 3), range(70000), True),
        (rand(1025, 1025), [5, 10], True),
        (rand(3, 1025, 1026), [1, 2], True),
        (rand(1025, 67000), [5, 10], True),
        (rand(3, 10, 68000), [1, 2], True),
        (rand(3, 69000, 11), [1, 2], True),
        (rand(4, 5), [2, 3], True),
        (rand(4, 2, 3), [0, 3], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0], True),
        (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4], True),
        # Test 4 dims as gpu. code use another algo
        # in that case. This new algo is not as much
        # optimized for that case.
        (rand(4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, -1, -2, -3, -4],
         False),
    ]
    for data, idx, fast in cases:
        data = numpy.asarray(data, dtype=self.dtype)
        n = self.shared(data)

        # Test with c_contiguous input: the fast take path must be used
        # (the op reshapes >3d c-contiguous inputs down to 2d).
        t = self.adv_sub1()(n, idx)
        t.owner.op.perform_using_take = True
        val = numpy.asarray(self.eval_output_and_check(t, list=True))
        expected = data[idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))

        # Test with a strided (reversed) input: whether the fast path
        # applies depends on the case, as recorded in `fast`.
        t = self.adv_sub1()(n[::-1], idx)
        t.owner.op.perform_using_take = fast
        val = numpy.asarray(theano.function([], t, mode=self.mode)())
        expected = data[::-1][idx]
        self.assertTrue(val.ndim == data.ndim)
        self.assertTrue(numpy.allclose(val, expected), (val, expected))
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment