提交 21c11dc1 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Corrected check for whether cuda is active or not and compile only one addition…

Corrected check for whether cuda is active or not and compile only one addition kernel instead of many in the perform() method.
上级 83eeda3a
import copy
import theano
import numpy
try:
import pygpu
except ImportError:
pass
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
......@@ -279,6 +285,11 @@ def local_gpua_incsubtensor(node):
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
# This optimization is disabled if cuda is not active
if pygpu.get_default_context().kind != "cuda":
return None
x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc
......
......@@ -359,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu.
"""
......@@ -383,8 +383,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
# CudaNdarray_Subscript() doesn't support Advanced slicing.
# But we can't use the parent version that loops on each index
def getInplElemwiseAdditionKernel(self, a, b):
    """Compile a pygpu elementwise kernel performing in-place addition.

    Builds an ``ElemwiseKernel`` on ``a``'s context whose operation is
    ``a[i] = a[i] + <expr(b)>``, i.e. it accumulates ``b`` into ``a``
    in place when called.

    :param a: GPU array updated in place (also provides the context).
    :param b: GPU array whose values are added to ``a``.
        NOTE(review): presumably broadcast behavior is decided at call
        time via the kernel's ``broadcast`` argument — confirm against
        pygpu's ElemwiseKernel docs.
    :return: a compiled ``pygpu.elemwise.ElemwiseKernel`` instance.
    """
    # Wrap both operands as kernel arguments; the string names 'a'/'b'
    # become the identifiers visible inside the kernel source.
    lhs = pygpu.tools.as_argument(a, 'a')
    rhs = pygpu.tools.as_argument(b, 'b')
    # rhs.expr() yields the per-element access expression for 'b'.
    operation = "a[i] = a[i] + " + rhs.expr()
    return pygpu.elemwise.ElemwiseKernel(a.context, [lhs, rhs], operation)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_):
......@@ -413,18 +420,26 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
if len(idx) == 0:
pass
elif y.ndim == x.ndim:
assert len(y) == len(idx)
firstIdxY, firstIdxX = enumerate(idx).next()
k = self.getInplElemwiseAdditionKernel(x[firstIdxX],
y[firstIdxY])
for (j, i) in enumerate(idx):
#x[i] += y[j]
pygpu.elemwise.ielemwise2(x[i], '+', y[j], broadcast=False)
k(x[i], y[j], broadcast=False)
else:
nb_dims_to_add = (x[idx[0]].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
k = self.getInplElemwiseAdditionKernel(x[0],
reshaped_y)
for i in idx:
#x[i] += y
nb_dims_to_add = (x[i].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
pygpu.elemwise.ielemwise2(x[i], '+', reshaped_y,
broadcast=True)
k(x[i], reshaped_y, broadcast=True)
out[0] = x
......
......@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.allclose(val, good), (val, good))
# Test reuse of output memory
if isinstance(self.adv_sub1, tensor.AdvancedSubtensor1):
if type(self.adv_sub1) == tensor.AdvancedSubtensor1:
op = self.adv_sub1()
# When idx is a TensorConstant.
if hasattr(idx, "data"):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论