提交 a4d2db75 authored 作者: Dustin Webb's avatar Dustin Webb

Started applying local_elemwise_alloc to GpuElemwise.

上级 c4a2fd88
...@@ -313,6 +313,13 @@ class GpuDimShuffle(GpuOp): ...@@ -313,6 +313,13 @@ class GpuDimShuffle(GpuOp):
" dimension.", " dimension.",
(input_broadcastable, new_order)) (input_broadcastable, new_order))
# this is the list of the original dimensions that we keep
self.shuffle = [x for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not
# in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]} self.view_map = {0: [0]}
self._rehash() self._rehash()
...@@ -486,6 +493,17 @@ class GpuDimShuffle(GpuOp): ...@@ -486,6 +493,17 @@ class GpuDimShuffle(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 0) return (1, 0)
def infer_shape(self, node, shapes):
    """Infer the output shape of the dimshuffle from the input shape.

    The kept input dimensions are permuted according to ``self.shuffle``,
    then a broadcastable dimension of size 1 is inserted at every index
    listed in ``self.augment``.
    """
    (in_shape,) = shapes
    # Permute the surviving input dimensions into their new order.
    out_shape = [in_shape[dim] for dim in self.shuffle]
    # Splice in the new broadcastable ('x') dimensions, each of size 1.
    for position in self.augment:
        out_shape.insert(position, 1)
    return [out_shape]
class GpuCAReduce(GpuOp): class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op. """GpuCAReduce is a Reduction along some dimensions by a scalar op.
......
...@@ -1814,6 +1814,15 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( ...@@ -1814,6 +1814,15 @@ gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
'fast_run', 'inplace', 'gpu_inplace') 'fast_run', 'inplace', 'gpu_inplace')
# Build the GPU variant of the elemwise-alloc optimization:
# local_elemwise_alloc_op is parameterized with the GPU ops (GpuElemwise,
# GpuAlloc, GpuDimShuffle), wrapped as a local optimizer that matches
# GpuElemwise nodes, and registered via register_specialize_device.
# NOTE(review): presumably this registers the optimizer in the
# device-specialization phase of the optimization database — confirm
# against tensor.opt.register_specialize_device.
gpu_local_elemwise_alloc = tensor.opt.register_specialize_device(
    gof.local_optimizer([GpuElemwise])(
        tensor.opt.local_elemwise_alloc_op(
            GpuElemwise,
            GpuAlloc,
            GpuDimShuffle
        )
    )
)
@register_opt() @register_opt()
@local_optimizer([tensor.alloc]) @local_optimizer([tensor.alloc])
......
import operator import operator
import sys import sys
import unittest
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -86,6 +87,74 @@ def test_gpualloc(): ...@@ -86,6 +87,74 @@ def test_gpualloc():
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l]) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l])
class Test_local_elemwise_alloc(unittest.TestCase):
    """Tests for the elemwise-alloc optimization on GPU graphs.

    Verifies that GpuAlloc nodes feeding an elemwise are removed (or kept)
    by the optimizer, depending on the compilation mode and on whether the
    Alloc's shape is derived from the allocated input itself.
    """

    dtype = config.floatX

    def setUp(self):
        self.vec = tensor.vector('vec', dtype=theano.config.floatX)
        self.mat = tensor.matrix('mat', dtype=theano.config.floatX)
        self.tens = tensor.tensor3('tens', dtype=theano.config.floatX)
        # Alloc whose shape is a constant, independent of the input.
        self.alloc_wo_dep = basic_ops.gpu_alloc(self.vec, 2)
        # Alloc whose shape depends on the input's own shape.
        self.alloc_w_dep = basic_ops.gpu_alloc(self.vec, *self.vec.shape)

    def _verify_alloc_count(self, f, count):
        # Exactly `count` GpuAlloc nodes must remain in the compiled graph.
        assert(
            sum([isinstance(elem.op, basic_ops.GpuAlloc)
                 for elem in f.maker.fgraph.toposort()
                 if elem.op is not None]) == count
        )

    def _verify_assert_count(self, f, count):
        # Exactly `count` Assert nodes must remain in the compiled graph.
        assert(
            sum([isinstance(elem.op, tensor.opt.Assert)
                 for elem in f.maker.fgraph.toposort()
                 if elem.op is not None]) == count
        )

    def test_remove_alloc_wo_dimshuffle(self):
        # FAST_COMPILE does not run the optimization: the Alloc survives.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_wo_dep + self.mat,
            mode='FAST_COMPILE'
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)

        # TODO(review): compiling `self.alloc_wo_dep + self.mat` with
        # mode_with_gpu should remove the Alloc and insert one Assert
        # (alloc_count == 0, assert_count == 1); enable that check once
        # the GPU optimization is complete.

        # FAST_COMPILE again: shape-dependent Alloc also survives.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_w_dep + self.mat,
            mode='FAST_COMPILE'
        )
        self._verify_alloc_count(func, 1)
        self._verify_assert_count(func, 0)

        # With the GPU optimizations enabled the shape-dependent Alloc is
        # removed and no Assert is needed.
        func = theano.function(
            [self.vec, self.mat],
            self.alloc_w_dep + self.mat,
            mode=mode_with_gpu
        )
        self._verify_alloc_count(func, 0)
        self._verify_assert_count(func, 0)
def test_alloc_memset_0(): def test_alloc_memset_0():
i = tensor.iscalar() i = tensor.iscalar()
z = numpy.zeros((1,), dtype='float32') z = numpy.zeros((1,), dtype='float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论