提交 7478227f authored 作者: lamblin's avatar lamblin

Merge pull request #904 from nouiz/small

Small
......@@ -749,7 +749,16 @@ class Test_preallocated_output(unittest.TestCase):
from theano.sandbox import cuda
if not cuda.cuda_available:
    # Nothing to test without a working CUDA installation.
    raise SkipTest("Optional package Cuda disabled")
if cuda.use.device_number is None:
    # We should normally set VecAsRowAndCol as a GPUOp.  But we
    # don't want to do this here as this will disable others
    # tests in this file.  So we manually init the GPU if
    # needed to remove warning.
    cuda.use("gpu",
             force=True,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False)
# Build and compile a graph that returns the vector as both a column
# and a row, to exercise the preallocated-output code path.
v = cuda.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
......
......@@ -497,101 +497,6 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(GpuOp):
    """ Implement outer on the gpu.

    Computes A = x * y' for two float32 vectors by zeroing the output
    and performing a BLAS rank-1 update (CudaNdarray_sger).
    """

    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
                x.type.dtype == 'float32' and y.type.dtype == 'float32')

        # Output broadcast pattern follows the input lengths:
        # A dimension is broadcastable iff the matching vector is.
        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    def __eq__(self, other):
        # The op has no parameters: any two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        # Kept consistent with __eq__ above (hash by type only).
        return hash(type(self))

    def c_code_cache_version(self):
        return (4,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']

        # The generated C code:
        #  - copies an input when its stride is negative (sger cannot
        #    handle negative strides),
        #  - reuses a pre-allocated output only when it has the right
        #    shape and is c-contiguous, zeroing it first because sger
        #    accumulates into its destination,
        #  - otherwise allocates a fresh zeroed output.
        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }

        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s &&
              CudaNdarray_HOST_DIMS(%(A)s)[0] ==
              CudaNdarray_HOST_DIMS(%(x)s)[0] &&
              CudaNdarray_HOST_DIMS(%(A)s)[1] ==
              CudaNdarray_HOST_DIMS(%(y)s)[0] &&
              CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }
        else
        {
            // sger accumulates into A. We need to zero it first.
            int total_size = (sizeof(real) *
                              CudaNdarray_HOST_DIMS(%(A)s)[0] *
                              CudaNdarray_HOST_DIMS(%(A)s)[1]);
            if (cudaSuccess != cudaMemset(%(A)s->devdata, 0, total_size))
            {
                PyErr_Format(PyExc_MemoryError,
                             "GpuOuter: Error memsetting %%d bytes of device memory.",
                             total_size);
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


# Module-level singleton; all GpuOuter instances compare equal anyway.
gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -128,6 +128,18 @@ class NVCC_compiler(object):
import theano.sandbox.cuda
if hasattr(theano.sandbox, 'cuda'):
n = theano.sandbox.cuda.use.device_number
if n is None:
_logger.warn("We try to get compilation arguments for CUDA"
" code, but the GPU device is not initialized."
" This is probably caused by an Op that work on"
" the GPU that don't inherit from GpuOp."
" We Initialize the GPU now.")
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
p = theano.sandbox.cuda.device_properties(n)
flags.append('-arch=sm_' + str(p['major']) + str(p['minor']))
return flags
......
......@@ -21,7 +21,7 @@ from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -579,26 +579,6 @@ def local_gpu_gemm(node):
return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """
    gpu_dot22(col, row) -> gpu_outer
    """
    if node.op != gpu_dot22:
        return False
    col, row = node.inputs
    # Only a (column, row) pair of a dot22 is really an outer product.
    if not (col.type.broadcastable[1] and row.type.broadcastable[0]):
        return False
    # TODO: we would like to remove the double-dimshuffle when
    # col or row is already the output of a GpuDimshuffle. To do
    # this, refactor the logic in tensor/opt.py that collapses
    # dimshuffle chains so that we can call it from here.
    col_vec = GpuDimShuffle(col.broadcastable, [0])(col)
    row_vec = GpuDimShuffle(row.broadcastable, [1])(row)
    return [gpu_outer(col_vec, row_vec)]
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
......
......@@ -200,31 +200,6 @@ class TestBlasStridesGpu(TestBlasStrides):
mode = mode_with_gpu
def test_outer():
    """Compare tensor.outer on the GPU with numpy.outer for several
    strided and reversed views of the inputs."""
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')
    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    def check(xs, ys, xv, yv):
        # Compile outer(xs, ys) in GPU mode and compare numerically.
        f = pfunc([], tensor.outer(xs, ys), mode=mode_with_gpu)
        assert numpy.allclose(numpy.outer(xv, yv), f())

    check(x, y, x_val, y_val)
    check(x[::2], y, x_val[::2], y_val)
    check(x, y[::3], x_val, y_val[::3])
    check(x[::2], y[::3], x_val[::2], y_val[::3])
    check(x[::-1], y, x_val[::-1], y_val)
    check(x, y[::-1], x_val, y_val[::-1])
if 0:
# This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论