提交 21c11dc1 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Corrected check for whether cuda is active or not and compile only one addition…

Corrected check for whether cuda is active or not and compile only one addition kernel instead of many in the perform() method.
上级 83eeda3a
import copy import copy
import theano import theano
import numpy import numpy
try:
import pygpu
except ImportError:
pass
from theano import tensor, scalar, gof from theano import tensor, scalar, gof
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, from theano.gof import (local_optimizer, EquilibriumDB,
...@@ -279,6 +285,11 @@ def local_gpua_incsubtensor(node): ...@@ -279,6 +285,11 @@ def local_gpua_incsubtensor(node):
@register_opt() @register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node): def local_gpua_advanced_incsubtensor(node):
# This optimization is disabled if cuda is not active
if pygpu.get_default_context().kind != "cuda":
return None
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
coords = node.inputs[2:] coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc set_instead_of_inc = node.op.set_instead_of_inc
......
...@@ -359,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor): ...@@ -359,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
return parent_version + elemwise_version + (0,) return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op): class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
...@@ -383,8 +383,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op): ...@@ -383,8 +383,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()]) return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b):
    """Compile a pygpu elementwise kernel performing ``a[i] += b[i]`` in place.

    Built once per perform() call so the same compiled kernel can be reused
    across every row update, instead of recompiling per row.
    ``a`` and ``b`` are GPU arrays; the kernel is compiled for ``a``'s context.
    """
    arguments = [pygpu.tools.as_argument(a, 'a'),
                 pygpu.tools.as_argument(b, 'b')]
    # NOTE(review): expression text must stay exactly as below — it is the
    # kernel source handed to pygpu for compilation.
    expression = "a[i] = a[i] + %(b)s" % {'b': arguments[1].expr()}
    return pygpu.elemwise.ElemwiseKernel(a.context, arguments, expression)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the # as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case. # parent doesn't loop in that case.
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
...@@ -413,18 +420,26 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op): ...@@ -413,18 +420,26 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
# jointly on `x` and `y`. Otherwise, it means `y` should be # jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`. # broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node` assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
if len(idx) == 0:
pass
elif y.ndim == x.ndim:
assert len(y) == len(idx) assert len(y) == len(idx)
firstIdxY, firstIdxX = enumerate(idx).next()
k = self.getInplElemwiseAdditionKernel(x[firstIdxX],
y[firstIdxY])
for (j, i) in enumerate(idx): for (j, i) in enumerate(idx):
#x[i] += y[j] k(x[i], y[j], broadcast=False)
pygpu.elemwise.ielemwise2(x[i], '+', y[j], broadcast=False)
else: else:
for i in idx: nb_dims_to_add = (x[idx[0]].ndim - y.ndim)
#x[i] += y
nb_dims_to_add = (x[i].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape) reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
pygpu.elemwise.ielemwise2(x[i], '+', reshaped_y, k = self.getInplElemwiseAdditionKernel(x[0],
broadcast=True) reshaped_y)
for i in idx:
k(x[i], reshaped_y, broadcast=True)
out[0] = x out[0] = x
......
...@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.allclose(val, good), (val, good)) self.assertTrue(numpy.allclose(val, good), (val, good))
# Test reuse of output memory # Test reuse of output memory
if isinstance(self.adv_sub1, tensor.AdvancedSubtensor1): if type(self.adv_sub1) == tensor.AdvancedSubtensor1:
op = self.adv_sub1() op = self.adv_sub1()
# When idx is a TensorConstant. # When idx is a TensorConstant.
if hasattr(idx, "data"): if hasattr(idx, "data"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论