提交 03d0e784 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2856 from abergeron/lstm_fixes

Lstm fixes
import os.path
from theano import Op, Apply, config
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable
try:
......@@ -13,14 +19,35 @@ except ImportError as e:
class BlasOp(HideC):
    """Base class for the gpuarray BLAS ops.

    Provides the shared C headers, include dirs, init code and the
    ``gpublas_try_copy()`` C helper used by GpuGemv/GpuGemm/GpuGer.
    """

    def c_headers(self):
        # NOTE: the stale pre-merge return (['<blas_api.h>'] only) was
        # removed; <numpy_compat.h> and <gpuarray_helper.h> are required
        # by the helper emitted in c_support_code().
        return ['<blas_api.h>', '<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        # gpuarray_helper.h lives next to this module, hence dirname(__file__).
        return [pygpu.get_include(), os.path.dirname(__file__)]

    def c_init_code(self):
        return ['import_pygpu__blas();']

    def c_support_code(self):
        # gpublas_try_copy(out, y): make `out` hold a copy of `y`, reusing
        # `out`'s buffer when it is C-contiguous and already matches y's
        # shape and typecode; otherwise it releases `out` and allocates a
        # fresh copy.  Returns NULL (with `out` released) on error.
        return """
        PyGpuArrayObject *gpublas_try_copy(PyGpuArrayObject *out,
                                           PyGpuArrayObject *y) {
          if (out &&
              GpuArray_CHKFLAGS(&out->ga, GA_CARRAY) &&
              theano_size_check(out, PyGpuArray_NDIM(y),
                                PyGpuArray_DIMS(y),
                                y->ga.typecode)) {
            if (pygpu_move(out, y)) {
              Py_XDECREF(out);
              return NULL;
            }
          } else {
            Py_XDECREF(out);
            out = pygpu_copy(y, GA_ANY_ORDER);
          }
          return out;
        }
        """
class GpuGemv(BlasOp, Gemv):
def make_node(self, y, alpha, A, x, beta):
......@@ -44,21 +71,20 @@ class GpuGemv(BlasOp, Gemv):
beta=inp[4], fail=sub['fail'], name=name)
if self.inplace:
code = """
Py_XDECREF(%(out)s);
if (%(y)s->ga.strides[0] <= 0) {
%(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) {
%(fail)s
}
} else {
Py_XDECREF(%(out)s);
%(out)s = %(y)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -79,7 +105,7 @@ class GpuGemv(BlasOp, Gemv):
return code
def c_code_cache_version(self):
    # Bumped to 3: the generated C code now reuses the preallocated
    # output through gpublas_try_copy().  The stale duplicate
    # `return (2,)` left by the merge made this bump unreachable.
    return (3,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
......@@ -107,13 +133,13 @@ class GpuGemm(BlasOp, Gemm):
beta=inp[4], fail=sub['fail'], name=name)
if self.inplace:
code = """
Py_XDECREF(%(out)s);
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
} else {
Py_XDECREF(%(out)s);
%(out)s = %(C)s;
Py_INCREF(%(out)s);
}
......@@ -121,7 +147,7 @@ class GpuGemm(BlasOp, Gemm):
else:
code = """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -142,7 +168,7 @@ class GpuGemm(BlasOp, Gemm):
return code
def c_code_cache_version(self):
    # Bumped to 3: the generated C code now reuses the preallocated
    # output through gpublas_try_copy().  The stale duplicate
    # `return (2,)` left by the merge made this bump unreachable.
    return (3,)
gpugemm_no_inplace = GpuGemm(inplace=False)
......@@ -171,21 +197,20 @@ class GpuGer(BlasOp, Ger):
fail=sub['fail'], name=name)
if self.destructive:
code = """
Py_XDECREF(%(out)s);
if (!GpuArray_ISONESEGMENT(&%(A)s->ga)) {
%(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) {
%(fail)s
}
} else {
Py_XDECREF(%(out)s);
%(out)s = %(A)s;
Py_INCREF(%(out)s);
}
""" % vars
else:
code = """
Py_XDECREF(%(out)s);
%(out)s = pygpu_copy(%(A)s, GA_ANY_ORDER);
%(out)s = gpublas_try_copy(%(out)s, %(A)s);
if (%(out)s == NULL) {
%(fail)s
}
......@@ -203,7 +228,7 @@ class GpuGer(BlasOp, Ger):
return code
def c_code_cache_version(self):
    # Bumped to 2: the generated C code now reuses the preallocated
    # output through gpublas_try_copy().  The stale duplicate
    # `return (1,)` left by the merge made this bump unreachable.
    return (2,)
gpuger_no_inplace = GpuGer(destructive=False)
......@@ -239,11 +264,8 @@ class GpuDot22(BlasOp, Dot22):
dims[0] = PyGpuArray_DIMS(%(A)s)[0];
dims[1] = PyGpuArray_DIMS(%(B)s)[1];
%(out)s = pygpu_empty(2, dims,
%(typecode)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(out)s) {
if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
pygpu_default_context())) {
%(fail)s
}
......@@ -262,19 +284,10 @@ class GpuDot22(BlasOp, Dot22):
return code
def c_code_cache_version(self):
    # Bumped to 3: output allocation now goes through theano_prep_output()
    # and <numpy_compat.h> is provided by the BlasOp base class, so the
    # old GpuDot22-specific c_headers() override was removed.
    return (3,)
gpu_dot22 = GpuDot22()
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out
@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace:
......
#ifndef THEANO_GPUARRAY_HELPER
#define THEANO_GPUARRAY_HELPER
#include <string.h>
#include <gpuarray_api.h>
#include <numpy_compat.h>
/* Nonzero iff `a` already has exactly `nd` dimensions of sizes `dims`
   and element typecode `typecode`. */
static int theano_size_check(PyGpuArrayObject *a, unsigned int nd,
                             const size_t *dims, int typecode) {
  if (a->ga.nd != nd || a->ga.typecode != typecode)
    return 0;
  return memcmp(a->ga.dimensions, dims, nd * sizeof(size_t)) == 0;
}
/* Ensure *out is a GpuArray with `nd` dimensions of sizes `dims` and
   element type `typecode`, allocating a fresh array of order `ord` on
   context `c` when the existing one cannot be reused.
   Returns 0 on success, 1 on allocation failure (with *out left NULL). */
static int theano_prep_output(PyGpuArrayObject **out, unsigned int nd,
const size_t *dims, int typecode, ga_order ord,
PyGpuContextObject *c) {
/* Reuse the existing output buffer when it already matches. */
if (*out != NULL &&
theano_size_check(*out, nd, dims, typecode)) {
return 0;
}
/* Drop the old output (if any) and allocate a new, uninitialized one. */
Py_XDECREF(*out);
*out = pygpu_empty(nd, dims, typecode, ord, c, Py_None);
return (*out == NULL) ? 1 : 0;
}
#endif
......@@ -10,7 +10,7 @@ from ..elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduceCuda, GpuCAReduceCPY)
from ..type import GpuArrayType
from pygpu.array import gpuarray
from pygpu import ndgpuarray as gpuarray
# This is actually a test for GpuElemwise
......
......@@ -40,7 +40,12 @@ class GpuArrayType(Type):
return "GpuArrayType(%s, %s)" % (self.dtype, self.broadcastable)
def filter(self, data, strict=False, allow_downcast=None):
if strict:
if (isinstance(data, gpuarray.GpuArray) and
data.typecode == self.typecode):
# This is just to make this condition not enter the
# following branches
pass
elif strict:
if not isinstance(data, gpuarray.GpuArray):
raise TypeError("%s expected a GpuArray object." % self,
data, type(data))
......@@ -50,13 +55,24 @@ class GpuArrayType(Type):
(self, self.typecode, self.dtype,
data.typecode, str(data.dtype)))
# fallthrough to ndim check
elif allow_downcast:
elif (allow_downcast or
(allow_downcast is None and
type(data) == float and
self.dtype == config.floatX)):
data = gpuarray.array(data, dtype=self.typecode, copy=False,
ndmin=len(self.broadcastable))
else:
if not hasattr(data, 'dtype'):
# This is to convert objects that don't have a dtype
# (like lists). We anticipate that the type below
# will match and we pass copy=False so it won't make a
# second object on the GPU.
data = gpuarray.array(data, copy=False)
up_dtype = scalar.upcast(self.dtype, data.dtype)
if up_dtype == self.dtype:
data = gpuarray.array(data, dtype=self.dtype, copy=False)
data = gpuarray.array(data, dtype=self.dtype,
copy=False)
else:
raise TypeError("%s cannot store a value of dtype %s "
"without risking loss of precision." %
......@@ -150,18 +166,15 @@ class GpuArrayType(Type):
def convert_variable(self, var):
    # Merge artifact fix: the diff left both the old and re-indented new
    # copies of the condition in place, which was not valid Python.
    #
    # Accept `var` when it is the same GpuArrayType subclass with the
    # same element typecode and ndim, and each dimension either matches
    # self's broadcastable pattern or is broadcastable in var; then
    # rebroadcast it to self's pattern.  Returns None otherwise.
    if (type(self) == type(var.type) and
            self.typecode == var.type.typecode and
            self.ndim == var.type.ndim and
            all(sb == ob or ob for sb, ob in zip(self.broadcastable,
                                                 var.type.broadcastable))):
        return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self):
    """Hash on the element typecode combined with the broadcast pattern."""
    typecode_hash = hash(self.typecode)
    bcast_hash = hash(self.broadcastable)
    return typecode_hash ^ bcast_hash
def __str__(self):
    """Readable form, e.g. ``GpuArray<float32>``."""
    dtype_name = self.dtype
    return "GpuArray<%s>" % (dtype_name,)
def dtype_specs(self):
"""Return a tuple (python type, c type, numpy typenum) that corresponds
to self.dtype.
......@@ -250,9 +263,9 @@ class GpuArrayType(Type):
def c_headers(self):
    # We need arrayobject for the PyArrayDescr struct def
    # (even if we just use a pointer to it in a function def).
    # The stale pre-reflow duplicate of this return (left by the merge
    # as dead code after the first return) was removed.
    return ['<gpuarray/array.h>', '<gpuarray/kernel.h>',
            '<gpuarray/error.h>', '<gpuarray/buffer_blas.h>',
            '<numpy/arrayobject.h>', '<gpuarray_api.h>']
def c_header_dirs(self):
    """Include dirs for the pygpu and numpy headers."""
    include_dirs = [pygpu.get_include()]
    include_dirs.append(numpy.get_include())
    return include_dirs
......@@ -284,8 +297,9 @@ GpuArrayType.Variable = GpuArrayVariable
class GpuArraySignature(tensor.TensorConstantSignature):
    # Merge artifact fix: the diff left both the old trailing-comment
    # form and the new comment-above form; keep a single copy.
    # might do something better if we can run the sum on the GPU, but
    # for now this will suffice.
    pass
class GpuArrayConstant(_operators, Constant):
......@@ -312,7 +326,9 @@ class GpuArraySharedVariable(_operators, SharedVariable):
return numpy.asarray(self.container.value)
def set_value(self, value, borrow=False):
    # Merge artifact fix: the stale unconditional
    # `self.container.value = pygpu.gpuarray.array(value, copy=(not borrow))`
    # line preceding the merged code was removed -- it made a wasted copy
    # and assigned the container twice.
    #
    # Only copy when handed an existing GpuArray that we may not borrow;
    # other value types are converted by the container's filter.
    if isinstance(value, pygpu.gpuarray.GpuArray):
        value = pygpu.gpuarray.array(value, copy=(not borrow))
    self.container.value = value
def __getitem__(self, *args):
    # Delegate indexing to the shared _operators implementation so shared
    # variables index exactly like other gpuarray variables.
    return _operators.__getitem__(self, *args)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论