Commit 2d77ade5 authored by Arnaud Bergeron

Reuse output when possible for GpuDot22.

Parent a73b9210
import os.path
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.blas import Dot22, Gemv, Gemm, Ger from theano.tensor.blas import Dot22, Gemv, Gemm, Ger
from theano.tensor.opt import in2out
from .basic_ops import HideC, as_gpuarray_variable from .basic_ops import HideC, as_gpuarray_variable
try: try:
...@@ -239,12 +245,8 @@ class GpuDot22(BlasOp, Dot22): ...@@ -239,12 +245,8 @@ class GpuDot22(BlasOp, Dot22):
dims[0] = PyGpuArray_DIMS(%(A)s)[0]; dims[0] = PyGpuArray_DIMS(%(A)s)[0];
dims[1] = PyGpuArray_DIMS(%(B)s)[1]; dims[1] = PyGpuArray_DIMS(%(B)s)[1];
Py_XDECREF(%(out)s); if (theano_prep_output(&%(out)s, 2, dims, %(typecode)s, GA_C_ORDER,
%(out)s = pygpu_empty(2, dims, pygpu_default_context()))
%(typecode)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(out)s) {
%(fail)s %(fail)s
} }
...@@ -265,17 +267,16 @@ class GpuDot22(BlasOp, Dot22): ...@@ -265,17 +267,16 @@ class GpuDot22(BlasOp, Dot22):
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (2,)
def c_header_dirs(self):
    """Return the list of directories to search for C headers.

    Extends the parent's list with the directory containing this
    module, so the generated C code can locate the bundled
    ``gpuarray_helper.h`` (included via ``c_headers``).
    """
    # NOTE: the hook invoked by Theano's C-code machinery is
    # `c_header_dirs` (plural). The method was originally (mis)named
    # `c_header_dir`, which would never be called; the super call below
    # already used the plural name, confirming the intended spelling.
    ret = super(GpuDot22, self).c_header_dirs()
    return ret + [os.path.dirname(__file__)]
def c_headers(self): def c_headers(self):
ret = super(GpuDot22, self).c_headers() ret = super(GpuDot22, self).c_headers()
return ret + ['<numpy_compat.h>'] return ret + ['<numpy_compat.h>', '"gpuarray_helper.h"']
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out
@local_optimizer([gpugemv_no_inplace], inplace=True) @local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node): def local_inplace_gpuagemv(node):
if node.op == gpugemv_no_inplace: if node.op == gpugemv_no_inplace:
......
#ifndef THEANO_GPUARRAY_HELPER
#define THEANO_GPUARRAY_HELPER
#include <string.h>
#include <pygpu_api.h>
static int theano_size_check(PyGpuArray *a, unsigned int nd,
const size_t *dims, int typecode) {
return (a->ga.nd == nd && a->ga.typecode == typecode &&
memcmp(a->dims, dims, nd * sizeof(size_t)) == 0);
}
/* Ensure `*out` is a GPU array of rank `nd` with sizes `dims` and type
 * `typecode`.  If the array already referenced by `*out` matches, it is
 * reused untouched; otherwise its reference is dropped and a fresh
 * array with ordering `ord` is allocated in context `c`.
 * Returns 1 on success, 0 if the allocation failed (then *out is NULL). */
static int theano_prep_output(PyGpuArrayObject **out, unsigned int nd,
                              const size_t *dims, int typecode, ga_order ord,
                              PyGpuContextObject *c) {
  PyGpuArrayObject *cur = *out;

  /* Fast path: the previously produced output already fits. */
  if (cur != NULL && theano_size_check(cur, nd, dims, typecode))
    return 1;

  /* Mismatch (or no previous output): release and reallocate. */
  Py_XDECREF(cur);
  *out = pygpu_empty(nd, dims, typecode, ord, c, Py_None);
  return (*out != NULL);
}
#endif
Markdown is supported
0%
You are about to add 0 people to this discussion. Proceed with caution.
Finish editing this comment first!
Register or sign in to comment