提交 24e73f67 authored 作者: abergeron's avatar abergeron

Merge pull request #3712 from nouiz/crash_gpu_from_host

Fix compilation crash
...@@ -504,6 +504,24 @@ def test_pdbbreakpoint_op(): ...@@ -504,6 +504,24 @@ def test_pdbbreakpoint_op():
assert topo[-1].op == cuda.host_from_gpu assert topo[-1].op == cuda.host_from_gpu
def test_local_gpu_elemwise_careduce():
    """Check that the elemwise square feeding a GPU sum reduction is
    fused into the reduction as ``pre_scalar_op=sqr``, for both the
    full reduction and the row-wise (axis=1) reduction.
    """
    x = theano.tensor.fmatrix()

    # Full reduction: the graph should compile to 3 nodes, with the
    # square folded into the CAReduce node (topo[1]).
    f = theano.function([x], (x * x).sum(), mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len(nodes) == 3
    assert nodes[1].op.pre_scalar_op == theano.scalar.sqr
    data = numpy.random.rand(3, 4).astype('float32')
    utt.assert_allclose(f(data), (data * data).sum())

    # Row-wise reduction: same fusion expected along axis 1.
    f = theano.function([x], (x * x).sum(axis=1), mode=mode_with_gpu)
    nodes = f.maker.fgraph.toposort()
    assert len(nodes) == 3
    assert nodes[1].op.pre_scalar_op == theano.scalar.sqr
    utt.assert_allclose(f(data), (data * data).sum(axis=1))
def test_huge_elemwise_fusion(): def test_huge_elemwise_fusion():
""" Test the the GpuElemwise fusion work correctly """ Test the the GpuElemwise fusion work correctly
We check that we fuse one node with part of its input We check that we fuse one node with part of its input
......
...@@ -403,6 +403,7 @@ class GpuFromHost(Op): ...@@ -403,6 +403,7 @@ class GpuFromHost(Op):
return """ return """
PyArrayObject *%(name)s_tmp; PyArrayObject *%(name)s_tmp;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s); %(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
int err;
if (%(name)s_tmp == NULL) if (%(name)s_tmp == NULL)
%(fail)s %(fail)s
...@@ -411,8 +412,8 @@ class GpuFromHost(Op): ...@@ -411,8 +412,8 @@ class GpuFromHost(Op):
(size_t *)PyArray_DIMS(%(name)s_tmp), (size_t *)PyArray_DIMS(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) { get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)))) {
Py_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
int err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp), err = GpuArray_write(&%(out)s->ga, PyArray_DATA(%(name)s_tmp),
PyArray_NBYTES(%(name)s_tmp)); PyArray_NBYTES(%(name)s_tmp));
Py_END_ALLOW_THREADS Py_END_ALLOW_THREADS
Py_DECREF(%(name)s_tmp); Py_DECREF(%(name)s_tmp);
if (err != GA_NO_ERROR) { if (err != GA_NO_ERROR) {
......
...@@ -867,12 +867,13 @@ def local_gpu_elemwise_careduce(node): ...@@ -867,12 +867,13 @@ def local_gpu_elemwise_careduce(node):
isinstance(node.inputs[0].owner.op, GpuElemwise) and isinstance(node.inputs[0].owner.op, GpuElemwise) and
# The Op support all scalar with 1 inputs. We don't # The Op support all scalar with 1 inputs. We don't
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably results
# to slow down. # in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op, return [GpuCAReduceCuda(scalar_op=op.scalar_op,
axis=op.axis,
reduce_mask=op.reduce_mask, reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
......
...@@ -212,7 +212,15 @@ def test_local_gpu_elemwise_careduce(): ...@@ -212,7 +212,15 @@ def test_local_gpu_elemwise_careduce():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 3 assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr assert topo[1].op.pre_scalar_op == theano.scalar.sqr
f(numpy.random.rand(3, 4).astype(theano.config.floatX)) data = numpy.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), (data * data).sum())
o = (x * x).sum(axis=1)
f = theano.function([x], o, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
utt.assert_allclose(f(data), (data * data).sum(axis=1))
def test_local_gpu_subtensor(): def test_local_gpu_subtensor():
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论