Commit 60ec239c authored by nouiz

Merge pull request #1295 from lamblin/fix_erfinvgpuopt

Fix bug in erfinv GPU optimization
@@ -8,15 +8,14 @@ import warnings
 import numpy
 import theano
-from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano import scalar as scal
-from theano import tensor, compile, gof
+from theano import tensor, gof
 import theano.ifelse
 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                         Optimizer, toolbox, DestroyHandler,
-                        InconsistencyError, EquilibriumOptimizer)
+                        EquilibriumOptimizer)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
@@ -34,9 +33,9 @@ from theano.sandbox.cuda.nnet import (
                                       GpuSoftmax, GpuSoftmaxWithBias)
 from theano.sandbox.cuda.elemwise import SupportCodeError
 from theano.scalar.basic_scipy import Erfinv
-from theano.sandbox.cuda.elemwise import ErfinvGPU, erfinv_gpu
+from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
-from theano.scan_module import scan_utils, scan_op
+from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix
 #optdb.print_summary()  # shows what is currently registered
@@ -241,7 +240,7 @@ def local_gpu_elemwise_1(node):
             # Don't set any inplace pattern.
             # gpu_inplace_elemwise_optimizer will do it later
-            if isinstance(node.op.scalar_op, Erfinv):
+            if isinstance(elemwise_node.op.scalar_op, Erfinv):
                 new_op = GpuElemwise(erfinv_gpu)
             else:
                 try:
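The one-line fix above swaps node.op.scalar_op for elemwise_node.op.scalar_op. A minimal, self-contained sketch of the likely failure mode (stand-in classes, not the real Theano API): in local_gpu_elemwise_1, node is the gpu_from_host apply that triggered the optimizer, while the Elemwise being moved to the GPU is elemwise_node; a GpuFromHost op carries no scalar_op attribute, so the old check could not work.

# Stand-in classes that only mirror the attribute layout relevant to the bug.
class GpuFromHost(object):          # transfer op: has no scalar_op
    pass

class Elemwise(object):             # wraps a scalar op
    def __init__(self, scalar_op):
        self.scalar_op = scalar_op

class Apply(object):                # a node applying an op
    def __init__(self, op):
        self.op = op

erfinv = object()                          # stand-in for the Erfinv instance
elemwise_node = Apply(Elemwise(erfinv))    # the Elemwise moved to the GPU
node = Apply(GpuFromHost())                # the node the optimizer matched

# Old check: node.op.scalar_op raises AttributeError, since GpuFromHost has
# no scalar_op; the corrected check inspects elemwise_node instead.
assert not hasattr(node.op, 'scalar_op')
assert elemwise_node.op.scalar_op is erfinv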
@@ -622,8 +621,8 @@ def local_gpu_careduce(node):
                 # Try to make a simpler pattern based on reshaping
                 # The principle is that if two adjacent dimensions have
                 # the same value in the reduce_mask, then we can reshape
-                # to make them a single dimension, do the reduction, and then
-                # reshape to get them back.
+                # to make them a single dimension, do the reduction, and
+                # then reshape to get them back.
                 shape_of = node.fgraph.shape_feature.shape_of
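To illustrate the comment above, a minimal numpy sketch (shapes are illustrative assumptions; numpy's sum stands in for the GPU reduction): under reduce_mask (1, 0, 0) the two kept axes are adjacent and share the same mask value, so they can be merged, reduced with the simpler mask (1, 0), and reshaped back.

import numpy

x = numpy.arange(24.0).reshape(2, 3, 4)
merged = x.reshape(2, 3 * 4)        # merge the two kept axes: mask (1, 0)
reduced = merged.sum(axis=0)        # do the single, simpler reduction
unreshaped = reduced.reshape(3, 4)  # reshape to get the axes back
assert numpy.allclose(unreshaped, x.sum(axis=0))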
@@ -641,7 +640,7 @@ def local_gpu_careduce(node):
                     new_greduce = GpuCAReduce(new_mask, scalar_op)
                     reshaped_x = x.reshape(tensor.stack(*new_in_shp))
                     gpu_reshaped_x = gpu_from_host(reshaped_x)
-                    reshaped_gpu_inputs = [ gpu_reshaped_x ]
+                    reshaped_gpu_inputs = [gpu_reshaped_x]
                     if new_greduce.supports_c_code(reshaped_gpu_inputs):
                         reduce_reshaped_x = host_from_gpu(
                             new_greduce(gpu_reshaped_x))
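The new_mask and new_in_shp used here come from collapsing adjacent dimensions whose reduce_mask entries are equal. A small hypothetical helper (not Theano code; assumes the convention that 1 marks a reduced axis and 0 a kept one) makes that collapse concrete:

def merge_mask(mask, shape):
    """Collapse adjacent axes that share the same reduce_mask value."""
    new_mask, new_shape = [mask[0]], [shape[0]]
    for m, s in zip(mask[1:], shape[1:]):
        if m == new_mask[-1]:
            new_shape[-1] *= s      # adjacent axes, same mask: merge them
        else:
            new_mask.append(m)
            new_shape.append(s)
    return tuple(new_mask), tuple(new_shape)

assert merge_mask((1, 0, 0), (2, 3, 4)) == ((1, 0), (2, 12))
assert merge_mask((1, 1, 0), (2, 3, 4)) == ((1, 0), (6, 4))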
@@ -655,11 +654,11 @@ def local_gpu_careduce(node):
                             return [unreshaped_reduce]
                         else:
                             print >> sys.stderr, \
                                 "WARNING: local_gpu_careduce got type wrong"
                             return None
                 raise Exception(
                     "GpuCAReduce does not yet implement this pattern:",
                     pattern)
     return False
@@ -1020,6 +1019,7 @@ def local_gpu_conv(node):
                                      float(op.imshp[1])))
             cstride = int(numpy.ceil(op.imshp_logical[2] /
                                      float(op.imshp[2])))
+
             def make_graph(img, kern):
                 buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
                                    img.shape[0], *op.imshp_logical)
@@ -1027,6 +1027,7 @@ def local_gpu_conv(node):
                     img)
                 img = gpu_from_host(img)
                 return ret(img, kern)
+
             return make_graph
         return ret
@@ -1344,7 +1345,6 @@ def local_gpualloc(node):
 @register_opt()
 @local_optimizer([tensor.Alloc])
 def local_gpualloc_memset_0(node):
-    replace = False
     if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
         inp = node.inputs[0]
         if (isinstance(inp, CudaNdarrayConstant) and
@@ -1522,9 +1522,11 @@ def gpuScanOptimization(node):
             local_fgraph = gof.FunctionGraph(tmp_in, tmp_out)
             _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
             info['gpu_hash'] = hash(_cmodule_key)
+
             def typeConstructor(broadcastable, dtype):
                 assert dtype == 'float32'
                 return CudaNdarrayType(broadcastable=broadcastable)
+
             _outputs = scan_op.Scan(
                 scan_ins,
                 scan_outs,
...