Merge pull request #1295 from lamblin/fix_erfinvgpuopt

Fix bug in erfinv GPU optimization

Merge pull request #1295 from lamblin/fix_erfinvgpuopt
60ec239c · nouiz · a8865593 · b4889a2b · 60ec239c
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -8,15 +8,14 @@ import warnings
 import numpy

 import theano
-from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano import scalar as scal
-from theano import tensor, compile, gof
+from theano import tensor, gof
 import theano.ifelse

 from theano.compile import optdb
 from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler,
-                        InconsistencyError, EquilibriumOptimizer)
+                        EquilibriumOptimizer)
 from theano.gof.python25 import all, any
 from theano.sandbox.cuda.basic_ops import *
 from theano.sandbox.cuda.type import CudaNdarrayType
@@ -34,9 +33,9 @@ from theano.sandbox.cuda.nnet import (
        GpuSoftmax, GpuSoftmaxWithBias)
 from theano.sandbox.cuda.elemwise import SupportCodeError
 from theano.scalar.basic_scipy import Erfinv
-from theano.sandbox.cuda.elemwise import ErfinvGPU, erfinv_gpu
+from theano.sandbox.cuda.elemwise import erfinv_gpu
 from theano.sandbox.cuda.var import CudaNdarrayConstant
-from theano.scan_module import scan_utils, scan_op
+from theano.scan_module import scan_utils, scan_op, scan_opt
 from theano.tensor.blas import _is_real_vector, _is_real_matrix

 #optdb.print_summary()  # shows what is currently registered
@@ -241,7 +240,7 @@ def local_gpu_elemwise_1(node):
            # Don't set any inplace pattern.
            # gpu_inplace_elemwise_optimizer will do it later

-            if isinstance(node.op.scalar_op, Erfinv):
+            if isinstance(elemwise_node.op.scalar_op, Erfinv):
                new_op = GpuElemwise(erfinv_gpu)
            else:
                try:
@@ -622,8 +621,8 @@ def local_gpu_careduce(node):
                    # Try to make a simpler pattern based on reshaping
                    # The principle is that if two adjacent dimensions have
                    # the same value in the reduce_mask, then we can reshape
-                    # to make them a single dimension, do the reduction, and then
-                    # reshape to get them back.
+                    # to make them a single dimension, do the reduction, and
+                    # then reshape to get them back.

                    shape_of = node.fgraph.shape_feature.shape_of

@@ -641,7 +640,7 @@ def local_gpu_careduce(node):
                    new_greduce = GpuCAReduce(new_mask, scalar_op)
                    reshaped_x = x.reshape(tensor.stack(*new_in_shp))
                    gpu_reshaped_x = gpu_from_host(reshaped_x)
-                    reshaped_gpu_inputs = [ gpu_reshaped_x ]
+                    reshaped_gpu_inputs = [gpu_reshaped_x]
                    if new_greduce.supports_c_code(reshaped_gpu_inputs):
                        reduce_reshaped_x = host_from_gpu(
                            new_greduce(gpu_reshaped_x))
@@ -655,11 +654,11 @@ def local_gpu_careduce(node):
                            return [unreshaped_reduce]
                        else:
                            print >> sys.stderr, \
-                                    "WARNING: local_gpu_careduce got type wrong"
+                                "WARNING: local_gpu_careduce got type wrong"
                            return None

                        raise Exception(
-                                "GpuCAReduce does not yet implement this pattern:",
+                            "GpuCAReduce does not yet implement this pattern:",
                            pattern)
    return False

@@ -1020,6 +1019,7 @@ def local_gpu_conv(node):
                                         float(op.imshp[1])))
                cstride = int(numpy.ceil(op.imshp_logical[2] /
                                         float(op.imshp[2])))
+
                def make_graph(img, kern):
                    buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
                                       img.shape[0], *op.imshp_logical)
@@ -1027,6 +1027,7 @@ def local_gpu_conv(node):
                                               img)
                    img = gpu_from_host(img)
                    return ret(img, kern)
+
                return make_graph
        return ret

@@ -1344,7 +1345,6 @@ def local_gpualloc(node):
 @register_opt()
 @local_optimizer([tensor.Alloc])
 def local_gpualloc_memset_0(node):
-    replace = False
    if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
        inp = node.inputs[0]
        if (isinstance(inp, CudaNdarrayConstant) and
@@ -1522,9 +1522,11 @@ def gpuScanOptimization(node):
            local_fgraph = gof.FunctionGraph(tmp_in, tmp_out)
            _cmodule_key = gof.CLinker().cmodule_key_(local_fgraph, [])
            info['gpu_hash'] = hash(_cmodule_key)
+
            def typeConstructor(broadcastable, dtype):
                assert dtype == 'float32'
                return CudaNdarrayType(broadcastable=broadcastable)
+
            _outputs = scan_op.Scan(
                scan_ins,
                scan_outs,