提交 975e0d2b 作者：Frédéric Bastien

Merge pull request #3227 from abergeron/gpua_advsub1

Implement GpuAdvancedSubtensor1 for gpuarray
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <string.h> #include <string.h>
#include <gpuarray_api.h> #include <gpuarray_api.h>
#include <numpy_compat.h> #include <numpy_compat.h>
#include <gpuarray/util.h>
static int theano_size_check(PyGpuArrayObject *a, unsigned int nd, static int theano_size_check(PyGpuArrayObject *a, unsigned int nd,
const size_t *dims, int typecode) { const size_t *dims, int typecode) {
...@@ -42,9 +44,14 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out, ...@@ -42,9 +44,14 @@ static PyGpuArrayObject *theano_try_copy(PyGpuArrayObject *out,
return out; return out;
} }
/* This is guaranteed to work and return the raw CUDA/OpenCL object on static inline void *PyGpuArray_DEV_DATA(PyGpuArrayObject *a) {
/* This is guaranteed to work and return the raw CUDA/OpenCL object on
* all recent (as of June 2015) version of libgpuarray. This is also * all recent (as of June 2015) version of libgpuarray. This is also
* promised to keep working in future versions. */ * promised to keep working in future versions. */
#define PyGpuArray_DEV_DATA(ary) (*(void **)((ary)->ga.data)) char * p = *((char **)a->ga.data);
/* This only works on cuda since we have a real pointer. */
return (void *)(p + a->ga.offset);
}
#endif #endif
...@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, ...@@ -35,6 +35,7 @@ from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY) GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
...@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node): ...@@ -488,6 +489,12 @@ def local_gpua_incsubtensor(node):
node.op.destroyhandler_tolerate_aliased) node.op.destroyhandler_tolerate_aliased)
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedSubtensor1])
def local_gpua_advanced_subtensor(node):
return GpuAdvancedSubtensor1()
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node):
...@@ -496,7 +503,16 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -496,7 +503,16 @@ def local_gpua_advanced_incsubtensor(node):
if pygpu.get_default_context().kind != "cuda": if pygpu.get_default_context().kind != "cuda":
return None return None
x, y = node.inputs[0:2] x, y, ilist = node.inputs
# Gpu Ops needs both inputs to have the same dtype
if (x.type.dtype != y.type.dtype):
dtype = scalar.upcast(x.type.dtype, y.type.dtype)
if x.type.dtype != dtype:
x = tensor.cast(x, dtype)
if y.type.dtype != dtype:
y = tensor.cast(y, dtype)
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties device_properties = theano.sandbox.cuda.device_properties
...@@ -504,11 +520,11 @@ def local_gpua_advanced_incsubtensor(node): ...@@ -504,11 +520,11 @@ def local_gpua_advanced_incsubtensor(node):
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2): if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1( return [GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
else: else:
return GpuAdvancedIncSubtensor1_dev20( return [GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)(x, y, ilist)]
@register_opt('fast_compile') @register_opt('fast_compile')
......
...@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor ...@@ -7,6 +7,7 @@ from theano.tensor.tests import test_subtensor
from ..basic_ops import HostFromGpu, GpuFromHost from ..basic_ops import HostFromGpu, GpuFromHost
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1) GpuAdvancedIncSubtensor1)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
...@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor): ...@@ -24,6 +25,7 @@ class G_subtensor(test_subtensor.T_subtensor):
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor, inc_sub=GpuIncSubtensor,
adv_sub1=GpuAdvancedSubtensor1,
adv_incsub1=GpuAdvancedIncSubtensor1, adv_incsub1=GpuAdvancedIncSubtensor1,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
......
...@@ -515,8 +515,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -515,8 +515,8 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertRaises(IndexError, g, shp) self.assertRaises(IndexError, g, shp)
def test_adv_sub1_broadcast(self): def test_adv_sub1_broadcast(self):
ones = numpy.ones((1, 3), dtype=self.dtype) v = numpy.arange(3, dtype=self.dtype).reshape((1, 3))
n = self.shared(ones * 5, broadcastable=(True, False)) n = self.shared(v*5, broadcastable=(True, False))
idx = tensor.lvector() idx = tensor.lvector()
t = n[idx] t = n[idx]
self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1)) self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
...@@ -529,10 +529,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -529,10 +529,10 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(isinstance(topo_[0].op, self.adv_sub1)) self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
f_0 = f([0]) f_0 = f([0])
self.assertTrue(f_0.shape == (1, 3)) self.assertTrue(f_0.shape == (1, 3))
self.assertTrue(numpy.allclose(f_0, ones[0] * 5)) self.assertTrue(numpy.allclose(f_0, v*5))
f_00 = f([0, 0]) f_00 = f([0, 0])
self.assertTrue(f_00.shape == (2, 3)) self.assertTrue(f_00.shape == (2, 3))
self.assertTrue(numpy.allclose(f_00, 5)) self.assertTrue(numpy.allclose(f_00, v*5))
self.assertRaises(IndexError, f, [0, 1]) self.assertRaises(IndexError, f, [0, 1])
# Test the gradient # Test the gradient
......
...@@ -160,7 +160,6 @@ whitelist_flake8 = [ ...@@ -160,7 +160,6 @@ whitelist_flake8 = [
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py", "sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py", "sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/subtensor.py",
"sandbox/gpuarray/elemwise.py", "sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py", "sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论