提交 12088f8c authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

Modifications in the trunk since the last release
In trunk since 0.3.1 release
----------------------------
Partial of what is in trunk since the last release
--------------------------------------------------
Deprecation:
Bugs fixed:
* Bugfix in CudaNdarray.__iadd__. When it is not implemented, return the error.
Crash fixed:
* Work around a bug in gcc 4.3.0 that makes the compilation of the 2d convolution crash.
Optimization:
* Optimize 4 pattern of subtensor followed by subtensor.
GPU:
* Move to the GPU fused elemwise ops that have dtypes other than float32 in them (except float64), when the inputs and outputs are float32.
* This allows moving elemwise comparisons to the GPU if we cast the result to float32 afterward.
* Implemented CudaNdarray.ndim to have the same interface in ndarray.
New features:
* ProfileMode
* profile the scan overhead
* simple hook system to add profiler
* reordered the output to be in the order of more general to more specific
* var[vector of indices] now works (the gradient works recursively, the direct gradient works inplace, and it works on the GPU).
* Limitation: works only on the outermost dimension.
Documentation:
Unit tests:
* Reuse test for subtensor of tensor for gpu tensor(more gpu test)
Other:
* (possibly a bugfix) Correctly set the broadcast flag to True in the output variable of a Reshape op when we receive an int 1 in the new shape.
Theano 0.3.1 (2011-02-21)
----------------------------
......
......@@ -4,6 +4,13 @@
How to make a release
==================================================
Update files
============
Update the NEWS.txt and move the old stuff in the HISTORY.txt file.
To update the NEWS.txt file, check all ticket closed for this release
and all commit log messages.
Get a fresh copy of the repository
==================================
......
......@@ -123,9 +123,18 @@ ProfileMode
Beside checking for errors, another important task is to profile your
code. For this Theano uses a special mode called ProfileMode which has
to be passed as an argument to :func:`theano.function <function.function>`. Using the ProfileMode is a three-step process.
to be passed as an argument to :func:`theano.function <function.function>`.
Using the ProfileMode is a three-step process.
To change the default to it, put the theano flags mode to PROFILE_MODE.
.. note::
To change the default to it, put the Theano flags
:attr:`config.mode` to ProfileMode. In that case, when the python
process exit, it will automatically print the profiling
information on the stdout.
The memory profile of the output of each apply node can be enabled with the
Theano flag :attr:`config.ProfileMode.profile_memory`.
Creating a ProfileMode Instance
-------------------------------
......
......@@ -123,10 +123,6 @@ AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
AddConfigVar('cuda.root',
"directory with bin/, lib/, include/ for cuda utilities",
StrParam(os.getenv('CUDA_ROOT', "/usr/local/cuda")))
AddConfigVar('gpuelemwise.sync',
"when true, wait that the gpu fct finished and check it error code.",
BoolParam(True))
......
......@@ -208,17 +208,25 @@ def module_name_from_dir(dirname):
def get_module_hash(module_file, key):
"""
Return an MD5 hash that identifies a module.
Return an MD5 hash that uniquely identifies a module.
This hash takes into account:
1. The 'mod.cpp' or 'mod.cu' file used to compile `module_file`.
2. The compiler options defined in `key`.
2. The version part of the key.
3. The compiler options defined in `key` (command line parameters and
libraries to link against).
"""
source_code = os.path.join(os.path.dirname(module_file), 'mod.cpp')
if not os.path.exists(source_code):
source_code = os.path.join(os.path.dirname(module_file), 'mod.cu')
assert os.path.exists(source_code)
source_hash = hash_from_file(source_code)
# `to_hash` will contain any element such that we know for sure that if
# it changes, then the module hash should be different.
# We start with the source code itself (stripping blanks might avoid
# recompiling after a basic indentation fix for instance).
to_hash = map(str.strip, open(source_code).readlines())
# Get the version part of the key.
to_hash += map(str, key[0])
c_link_key = key[1]
# Currently, in order to catch potential bugs early, we are very
# convervative about the structure of the key and raise an exception
......@@ -231,9 +239,10 @@ def get_module_hash(module_file, key):
"AssertionError may be removed or modified to accomodate "
"this change)")
assert c_link_key[0] == 'CLinker.cmodule_key', error_msg
to_hash = [source_hash]
for key_element in c_link_key[1:]:
if isinstance(key_element, tuple):
# This should be the C++ compilation command line parameters or the
# libraries to link against.
to_hash += list(key_element)
elif isinstance(key_element, str):
if key_element.startswith('md5:'):
......
......@@ -247,6 +247,81 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuOuter(Op):
    """GPU outer product of two float32 vectors: A = x * y'.

    Takes two 1-d float32 CudaNdarray variables and produces a 2-d float32
    CudaNdarray.  The generated C code delegates the computation to
    CudaNdarray_sger (a cuBLAS sger wrapper defined in cuda_ndarray).
    """

    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
                x.type.dtype == 'float32' and y.type.dtype == 'float32')

        # Output broadcast pattern follows the two input vectors' patterns.
        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    # This op has no parameters: instances compare and hash by type only,
    # so the optimizer can merge equivalent applications.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def c_code_cache_version(self):
        # Bump when the generated C code below changes.
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']
        # NOTE(review): inputs with a negative stride are copied first,
        # presumably because cublasSger cannot handle negative strides —
        # confirm against CudaNdarray_sger's asserts.  The output buffer is
        # reused when it already has the right shape and is C-contiguous;
        # otherwise a zeroed buffer is allocated (sger accumulates into A).
        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }

        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s &&
              CudaNdarray_HOST_DIMS(%(A)s)[0] == CudaNdarray_HOST_DIMS(%(x)s)[0] &&
              CudaNdarray_HOST_DIMS(%(A)s)[1] == CudaNdarray_HOST_DIMS(%(y)s)[0] &&
              CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -328,6 +328,7 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
Py_DECREF(contiguous_self);
Py_DECREF(rval);
rval = NULL;
}
......@@ -353,7 +354,7 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
if (!rval)
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to new_null failed");
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
return NULL;
}
......@@ -2347,7 +2348,7 @@ CudaNdarray_New(int nd)
CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
if (self == NULL)
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_new_null failed to allocate self");
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
return NULL;
}
CudaNdarray_null_init(self);
......@@ -2717,6 +2718,48 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
|| (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
PyErr_Format(PyExc_ValueError,
"dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
CudaNdarray_HOST_DIMS(x)[0],
CudaNdarray_HOST_DIMS(y)[0],
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1]);
return -1;
}
// Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1;
}
// Same for this, be safe
assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0);
assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0);
// Since Sger expects A in col-major, we invert x and y to fake this.
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0],
CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0],
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err));
return -1;
}
return 0;
}
/**
*
* Precondition:
......@@ -3238,3 +3281,14 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
return NULL;
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
......@@ -478,13 +478,16 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject *
CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......
......@@ -10,13 +10,18 @@ _logger.setLevel(logging.WARN)
from theano.configparser import config, AddConfigVar, StrParam
AddConfigVar('nvcc.compiler_bindir',
"if defined, nvcc compiler driver will seek g++ and gcc in this directory",
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('cuda.nvccflags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('cuda.root',
"The directory with bin/, lib/, include/ for cuda utilities. Used to put this directory of nvidia lib in the compiled libraire. Usefull when people forget to update there LD_LIBRARY_PATH and LIBRARY_PATH environment variable. If AUTO, if nvcc is in the path, it will use one of this parent directory. Otherwise /usr/local/cuda. If empty, won't appen the directory in the compiled library",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
def error(*args):
    """Log the given arguments as a single ERROR-level message."""
    # Used to write straight to sys.stderr; now routed through the logger.
    message = ' '.join(str(arg) for arg in args)
    _logger.error("ERROR: " + message)
......@@ -43,7 +48,7 @@ def is_nvcc_available():
global nvcc_version
nvcc_version = s[1]
return True
except:
except Exception:
#try to find nvcc into cuda.root
p = os.path.join(config.cuda.root,'bin','nvcc')
if os.path.exists(p):
......@@ -51,6 +56,20 @@ def is_nvcc_available():
nvcc_path = p
return True
else: return False
def set_cuda_root():
    """Set config.cuda.root from the first PATH directory containing nvcc.

    Scans the PATH environment variable; when a directory with an 'nvcc'
    entry is found, config.cuda.root is set to that directory's parent
    (so that <root>/bin/nvcc exists).  Leaves the config untouched when
    PATH is unset/empty or nvcc cannot be found.
    """
    # BUGFIX: removed a leftover `import pdb;pdb.set_trace()` debugging
    # statement that halted every process importing this module.
    path = os.getenv("PATH")
    if not path:
        return
    # Renamed loop variable: `dir` shadowed the builtin.
    for directory in path.split(os.path.pathsep):
        if os.path.exists(os.path.join(directory, "nvcc")):
            # nvcc lives in <root>/bin, so the cuda root is the parent dir.
            config.cuda.root = os.path.split(directory)[0]
            return
# At import time: when the cuda.root flag is left at its "AUTO" default,
# try to locate the CUDA installation from PATH, then probe for nvcc so
# that nvcc_path and nvcc_version are initialized.
if config.cuda.root == "AUTO":
    set_cuda_root()
is_nvcc_available()  # sets nvcc_path and records the nvcc version
def nvcc_module_compile_str(
......@@ -66,11 +85,11 @@ def nvcc_module_compile_str(
:param preargs: a list of extra compiler arguments
:returns: dynamically-imported python module of the compiled code.
:note 1: On Windows 7 with nvcc 3.1 we need to compile in the real directory
Otherwise nvcc never finish.
"""
if sys.platform=="win32":
# Remove some compilation args that cl.exe does not understand.
# cl.exe is the compiler used by nvcc on Windows.
......@@ -91,7 +110,7 @@ def nvcc_module_compile_str(
include_dirs = include_dirs + std_include_dirs()
if os.path.abspath(os.path.split(__file__)[0]) not in include_dirs:
include_dirs.append(os.path.abspath(os.path.split(__file__)[0]))
libs = std_libs() + libs
if 'cudart' not in libs:
libs.append('cudart')
......@@ -107,11 +126,11 @@ def nvcc_module_compile_str(
if sys.platform == 'darwin':
# On the mac, nvcc is not able to link using -framework Python, so we have
# On the mac, nvcc is not able to link using -framework Python, so we have
# manually add the correct library and paths
darwin_python_lib = commands.getoutput('python-config --ldflags')
else:
# sometimes, the linker cannot find -lpython so we need to tell it
# sometimes, the linker cannot find -lpython so we need to tell it
# explicitly where it is located
# this returns somepath/lib/python2.x
python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
......@@ -133,14 +152,14 @@ def nvcc_module_compile_str(
(module_name, get_lib_extension()))
debug('Generating shared lib', lib_filename)
# TODO: Why do these args cause failure on gtx285 that has 1.3 compute capability? '--gpu-architecture=compute_13', '--gpu-code=compute_13',
# TODO: Why do these args cause failure on gtx285 that has 1.3 compute capability? '--gpu-architecture=compute_13', '--gpu-code=compute_13',
preargs1=[pa for pa in preargs if pa.startswith('-O') or pa.startswith('--maxrregcount=')]#nvcc argument
preargs2=[pa for pa in preargs if pa not in preargs1]#other arguments
cmd = [nvcc_path, '-shared', '-g'] + preargs1
if config.nvcc.compiler_bindir:
cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
if sys.platform!='win32':
if local_bitwidth() == 64:
cmd.append('-m64')
......@@ -148,11 +167,11 @@ def nvcc_module_compile_str(
else:
cmd.append('-m32')
preargs2.append('-m32')
if len(preargs2)>0:
cmd.extend(['-Xcompiler', ','.join(preargs2)])
if os.path.exists(os.path.join(config.cuda.root,'lib')):
if config.cuda.root and os.path.exists(os.path.join(config.cuda.root,'lib')):
cmd.extend(['-Xlinker',','.join(['-rpath',os.path.join(config.cuda.root,'lib')])])
if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above
......@@ -168,7 +187,7 @@ def nvcc_module_compile_str(
cmd.extend(['-l%s'%l for l in libs])
if sys.platform == 'darwin':
cmd.extend(darwin_python_lib.split())
if sys.platform == 'darwin':
done = False
while not done:
......@@ -213,7 +232,7 @@ def nvcc_module_compile_str(
nvcc_stdout, nvcc_stderr = p.communicate()[:2]
finally:
os.chdir(orig_dir)
if nvcc_stdout:
# this doesn't happen to my knowledge
print >> sys.stderr, "DEBUG: nvcc STDOUT", nvcc_stdout
......@@ -229,7 +248,7 @@ def nvcc_module_compile_str(
continue
_logger.info("NVCC: "+eline)
if p.returncode:
if p.returncode:
# filter the output from the compiler
for l in nvcc_stderr.split('\n'):
if not l:
......@@ -241,7 +260,7 @@ def nvcc_module_compile_str(
continue
if l[l.index(':'):].startswith(': warning: label'):
continue
except:
except:
pass
print >> sys.stderr, l
print >> sys.stderr, '==============================='
......@@ -252,4 +271,3 @@ def nvcc_module_compile_str(
#touch the __init__ file
file(os.path.join(location, "__init__.py"),'w').close()
return dlimport(lib_filename)
......@@ -14,7 +14,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import (
......@@ -378,6 +378,29 @@ def local_gpu_gemm(node):
return [host_from_gpu(gemms[node.op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """Move an outer product to the GPU.

    gpu_from_host(outer) -> gpu_outer(gpu_from_host)
    outer(host_from_gpu) -> host_from_gpu(gpu_outer)

    Returns the replacement list, or False when the rewrite does not apply.
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.basic.outer:
            x, y = host_input.owner.inputs
            # gpu_outer will refuse to work with float64, so future-proof.
            if x.type.dtype == 'float32' and y.type.dtype == 'float32':
                return [gpu_outer(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.basic.outer:
        x, y = node.inputs
        # BUGFIX: the y-side test used to check x.type.dtype (copy-paste
        # error), which could send a non-float32 y to gpu_outer and make
        # its make_node assert fail.  Require both operands to be float32,
        # and apply when either one already lives on the GPU.
        both_float32 = (x.type.dtype == 'float32' and
                        y.type.dtype == 'float32')
        x_on_gpu = x.owner and x.owner.op == host_from_gpu
        y_on_gpu = y.owner and y.owner.op == host_from_gpu
        if both_float32 and (x_on_gpu or y_on_gpu):
            return [host_from_gpu(gpu_outer(as_cuda_ndarray_variable(x),
                                            as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
......
......@@ -117,6 +117,31 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
def test_outer():
    """Check tensor.outer on the GPU against numpy.outer, with and
    without strided/reversed input views."""
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')
    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    def check(x_var, y_var, x_ref, y_ref):
        # Compile the outer product in GPU mode and compare with numpy.
        fn = pfunc([], tensor.outer(x_var, y_var), mode=mode_with_gpu)
        assert numpy.allclose(numpy.outer(x_ref, y_ref), fn())

    check(x, y, x_val, y_val)
    check(x[::2], y, x_val[::2], y_val)
    check(x, y[::3], x_val, y_val[::3])
    check(x[::2], y[::3], x_val[::2], y_val[::3])
    check(x[::-1], y, x_val[::-1], y_val)
    check(x, y[::-1], x_val, y_val[::-1])
if 0:
# This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论