提交 f9c8d096 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #6119 from notoraptor/get-rid-of-get-op-params

Get rid of get_op_params()
......@@ -22,9 +22,6 @@ Blas Op
.. automodule:: theano.gpuarray.blas
:members:
.. automodule:: theano.gpuarray.nerv
:members:
Elemwise Op
===========
......
......@@ -1388,11 +1388,10 @@ class COp(Op):
raise ValueError("No valid section marker was found in file "
"%s" % func_files[i])
def get_op_params(self):
def __get_op_params(self):
"""
Returns a list of (name, value) pairs that will be turned into
macros for use within the op code. This is intended to allow
an op's properties to influence the generated C code.
macros for use within the op code.
The names must be strings that are not a C keyword and the
values must be strings of literal C representations.
......@@ -1412,6 +1411,10 @@ class COp(Op):
params = [('PARAMS_TYPE', wrapper.name)]
for i in range(wrapper.length):
try:
# NB (reminder): These macros are currently used only in ParamsType example test
# (`theano/gof/tests/test_quadratic_function.c`), to demonstrate how we can
# access params dtypes when dtypes may change (e.g. if based on theano.config.floatX).
# But in practice, params types generally have fixed types per op.
params.append(('DTYPE_PARAM_' + wrapper.fields[i], wrapper.types[i].c_element_type()))
except utils.MethodNotDefined:
pass
......@@ -1506,7 +1509,7 @@ class COp(Op):
"str##_%s" % name))
undef_macros.append(undef_template % "APPLY_SPECIFIC")
for n, v in self.get_op_params():
for n, v in self.__get_op_params():
define_macros.append(define_template % (n, v))
undef_macros.append(undef_template % (n,))
......
......@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction, rng_mrg
from . import fft, dnn, opt, extra_ops, multinomial, reduction, rng_mrg
def transfer(x, target):
......
......@@ -4,19 +4,19 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
PyGpuArrayObject *h, PyArrayObject *inputIdx,
PyArrayObject *outputIdx,
PyGpuArrayObject **_out,
PyGpuContextObject *ctx) {
PARAMS_TYPE* params) {
PyGpuArrayObject *out = *_out;
#ifdef INPLACE
if (params->inplace) {
Py_XDECREF(out);
out = o;
Py_INCREF(out);
#else
} else {
out = theano_try_copy(out, o);
if (out == NULL) {
// Error already set
return -1;
}
#endif
}
gpudata **W_list = NULL;
gpudata **inp_list = NULL;
......@@ -26,7 +26,7 @@ int APPLY_SPECIFIC(blockgemv)(PyGpuArrayObject *o, PyGpuArrayObject *W,
size_t *offOut = NULL;
int err;
err = gpublas_setup(ctx->ctx);
err = gpublas_setup(params->context->ctx);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
return -1;
......
......@@ -4,7 +4,7 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
PyGpuArrayObject *y, PyArrayObject *xIdx,
PyArrayObject *yIdx, PyArrayObject *alpha,
PyGpuArrayObject **_out,
PyGpuContextObject *ctx) {
PARAMS_TYPE* params) {
PyGpuArrayObject *out = *_out;
gpudata **o_list = NULL;
gpudata **x_list = NULL;
......@@ -14,21 +14,21 @@ int APPLY_SPECIFIC(blockger)(PyGpuArrayObject *o, PyGpuArrayObject *x,
size_t *offY = NULL;
int err;
err = gpublas_setup(ctx->ctx);
err = gpublas_setup(params->context->ctx);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
return -1;
}
#ifdef INPLACE
if (params->inplace) {
Py_XDECREF(out);
out = o;
Py_INCREF(out);
#else
} else {
out = theano_try_copy(out, o);
if (out == NULL)
return -1;
#endif
}
size_t maxi = PyGpuArray_DIMS(x)[1];
size_t maxj = PyGpuArray_DIMS(y)[1];
size_t maxb = PyGpuArray_DIMS(x)[0];
......
......@@ -4,8 +4,9 @@ import os
import numpy as np
from theano import Apply, tensor
from theano.gof import COp
from theano.gof import COp, ParamsType
from theano.tensor import discrete_dtypes, as_tensor_variable
from theano.scalar import bool as bool_t
from theano.gradient import grad_undefined
......@@ -25,7 +26,8 @@ class GpuSparseBlockGemv(COp):
function for a stable interface.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
# NB: DTYPE_INPUT_* is used in C code, so I think we should not set check_input to False.
def __init__(self, inplace=False):
COp.__init__(self, "blockgemv.c", "APPLY_SPECIFIC(blockgemv)")
......@@ -34,13 +36,7 @@ class GpuSparseBlockGemv(COp):
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def c_header_dirs(self):
return [os.path.dirname(__file__)]
......@@ -102,7 +98,7 @@ class GpuSparseBlockOuter(COp):
of GpuSparseBlockGemv. The gradient is not implemented.
"""
__props__ = ('inplace',)
params_type = gpu_context_type
params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
def __init__(self, inplace=False):
COp.__init__(self, ["blockger.c"], "APPLY_SPECIFIC(blockger)")
......@@ -111,13 +107,7 @@ class GpuSparseBlockOuter(COp):
self.destroy_map = {0: [0]}
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
ctx = infer_context_name(o, x, y)
......
#section init_code_struct
/*
 * Allocate a 1-D GA_UINT buffer of 2048 * 32 elements in the context given by
 * PARAMS.  gemm16() passes the device pointer of this buffer as the first
 * kernel argument.
 *
 * Why do we need this?
 * NOTE(review): presumably scratch space for an optional rounding mode of the
 * kernels; `flags` is always 0 in gemm16(), so it may be unused in practice --
 * confirm before removing.
 */
size_t dim = 2048 * 32;
rand_buf = pygpu_empty(1, &dim, GA_UINT, GA_C_ORDER, PARAMS,
Py_None);
if (rand_buf == NULL) {
FAIL;
}
#section support_code_struct
/* Per-struct buffer allocated in init_code_struct and read by gemm16(). */
PyGpuArrayObject *rand_buf;
/*
 * Launch one of the precompiled float16 "hgemm" kernels on A and B, writing
 * into *out (presumably *out = alpha * A . B + beta * C -- the arithmetic
 * itself lives in the binary kernels, not in this file).
 *
 *   C      input accumulator; reused as the output when GEMM16_INPLACE is
 *          set and C is C-contiguous, otherwise copied into *out.
 *   alpha  forwarded unchanged to the kernel.
 *   A, B   operands.  An F-contiguous operand is passed as "transposed";
 *          when both are F-contiguous the smaller one is copied to C order
 *          because no kernel handles the trans/trans case.
 *   beta   forwarded unchanged to the kernel.
 *   out    output slot; replaced by C (with a new reference) or by a copy.
 *   c      context object; not used by the body (the context is taken from
 *          A), kept because it is part of the call signature.
 *
 * Returns 0 on success and 1 on failure.  Every failure path leaves a Python
 * exception set (either by the helper that failed or explicitly here).
 */
int gemm16(PyGpuArrayObject *C, float alpha,
           PyGpuArrayObject *A, PyGpuArrayObject *B,
           float beta, PyGpuArrayObject **out,
           PyGpuContextObject *c) {
  PyGpuArrayObject *_A = NULL;
  PyGpuArrayObject *_B = NULL;
  /* Stays NULL when no kernel matches the layout; checked before launch. */
  GpuKernel *gk = NULL;
  char *prand, *pA, *pB, *pout;
  void *params[13];
  size_t grid[2];
  size_t threads[2];
  int res = 0;
  int flags = 0;
  int lda, ldb, ldc, n, m, k;
  int n128, n64;
  int size = 0;
  int vec = 0;
  /* Cached across calls so the context is only queried once. */
  static unsigned int nprocs = 0;
  char opA, opB;

  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY) &&
      GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
    /*
     * The nervana kernels do not cover the case where both inputs are
     * trans so we need to copy one of them.  We choose the smallest
     * one.
     */
    if (PyGpuArray_DIM(A, 0) * PyGpuArray_DIM(A, 1) <
        PyGpuArray_DIM(B, 0) * PyGpuArray_DIM(B, 1)) {
      _A = pygpu_copy(A, GA_C_ORDER);
      if (_A == NULL) {
        res = 1;
        goto cleanup;
      }
      /*
       * This is not an extra reference on _A so don't add an INCREF.
       * Also, we don't lose the ref on A since our caller will deal
       * with it.
       */
      A = _A;
    } else {
      _B = pygpu_copy(B, GA_C_ORDER);
      if (_B == NULL) {
        res = 1;
        goto cleanup;
      }
      /*
       * This is not an extra reference on _B so don't add an INCREF.
       * Also, we don't lose the ref on B since our caller will deal
       * with it.
       */
      B = _B;
    }
  }

  if (GEMM16_INPLACE && GpuArray_CHKFLAGS(&C->ga, GA_CARRAY)) {
    Py_XDECREF(*out);
    *out = C;
    Py_INCREF(*out);
  } else {
    *out = theano_try_copy(*out, C);
    if (*out == NULL) {
      res = 1;
      goto cleanup;
    }
  }

  if (GpuArray_CHKFLAGS(&A->ga, GA_FARRAY)) {
    opA = 't';
    lda = PyGpuArray_STRIDE(A, 1);
  } else {
    opA = 'n';
    lda = PyGpuArray_STRIDE(A, 0);
  }

  if (GpuArray_CHKFLAGS(&B->ga, GA_FARRAY)) {
    opB = 't';
    ldb = PyGpuArray_STRIDE(B, 1);
  } else {
    opB = 'n';
    ldb = PyGpuArray_STRIDE(B, 0);
  }

  ldc = PyGpuArray_STRIDE(*out, 0);

  /* lda and friends are in number of elements, not bytes */
  lda /= 2;
  ldb /= 2;
  ldc /= 2;

  m = PyGpuArray_DIM(*out, 0);
  n = PyGpuArray_DIM(*out, 1);
  k = PyGpuArray_DIM(B, 0);

  /* Tuning code adapted from the python version */
  grid[0] = (m + 127) / 128;

  if (opA == 'n' && opB == 't')
    size = 128;
  else {
    if (n < 384-16) {
      n128 = n % 128;
      if (n128 < 112) {
        if (48 < n128 && n128 <= 64) {
          n64 = n / 64;
          if (nprocs == 0) {
            /*
             * A failed query, or a query that reports zero processors,
             * must not reach the division below; report it as a Python
             * error instead of returning a bare failure code.
             */
            if (gpucontext_property(A->context->ctx,
                                    GA_CTX_PROP_NUMPROCS, &nprocs) ||
                nprocs == 0) {
              nprocs = 0;
              PyErr_SetString(PyExc_RuntimeError,
                              "gemm16: could not query the number of "
                              "processors of the context");
              res = 1;
              goto cleanup;
            }
          }
          n64 *= (grid[0] / nprocs);
          if (n64 > 1 || (opA == 't' && opB == 'n'))
            size = 64;
          else
            size = 32;
        } else {
          size = 32;
        }
      } else {
        size = 128;
      }
    } else {
      size = 128;
    }
  }

  grid[1] = (n + (size-1)) / size;
  if (size == 128)
    threads[0] = 256;
  else
    threads[0] = 128;
  threads[1] = 1;

  if ((opA == 't' && opB == 'n' && m % 8 == 0 && n % 8 == 0) ||
      (opA == 'n' && opB == 'n' && k % 16 == 0 && n % 8 == 0) ||
      (opA == 'n' && opB == 't' && k % 16 == 0))
    vec = 1;

  switch (size) {
  case 128:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x128;
      else
        gk = &k_nn_128x128;
    } else if (opA == 'n' && opB == 't') {
      if (vec)
        gk = &k_nt_vec_128x128;
      else
        gk = &k_nt_128x128;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x128;
      else
        gk = &k_tn_128x128;
    }
    break;
  case 64:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x64;
      else
        gk = &k_nn_128x64;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x64;
      else
        gk = &k_tn_128x64;
    }
    break;
  case 32:
    if (opA == 'n' && opB == 'n') {
      if (vec)
        gk = &k_nn_vec_128x32;
      else
        gk = &k_nn_128x32;
    } else if (opA == 't' && opB == 'n') {
      if (vec)
        gk = &k_tn_vec_128x32;
      else
        gk = &k_tn_128x32;
    }
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
    res = 1;
    goto cleanup;
  }

  /*
   * The if/else-if chains above have no final else, so an (opA, opB)
   * combination without a kernel for the chosen size leaves gk unset.
   * Refuse to launch rather than call through an invalid pointer.
   */
  if (gk == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "error selecting kernel");
    res = 1;
    goto cleanup;
  }

  prand = *((char **)rand_buf->ga.data);
  prand += rand_buf->ga.offset;
  pA = *((char **)A->ga.data);
  pA += A->ga.offset;
  pB = *((char **)B->ga.data);
  pB += B->ga.offset;
  pout = *((char **)(*out)->ga.data);
  pout += (*out)->ga.offset;

  params[0] = &prand;
  params[1] = &pA;
  params[2] = &pB;
  params[3] = &pout;
  params[4] = &lda;
  params[5] = &ldb;
  params[6] = &ldc;
  params[7] = &m;
  params[8] = &n;
  params[9] = &k;
  params[10] = &alpha;
  params[11] = &beta;
  params[12] = &flags;

  if (GpuKernel_call(gk, 2, grid, threads, 0, params) != GA_NO_ERROR) {
    PyErr_SetString(PyExc_RuntimeError, "error in gemm16 kernel call");
    res = 1;
  }

cleanup:
  /* Drop the temporary C-order copies, if any were made. */
  Py_XDECREF(_A);
  Py_XDECREF(_B);
  return res;
}
......@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.gof import COp
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
......@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True):
class GpuMagmaSVD(COp):
"""Computes the svd of a matrix :math:`A` using magma library.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
to get them in expected order ``U, S, VT``.
"""
__props__ = ('full_matrices', 'compute_uv')
params_type = gpu_context_type
_cop_num_inputs = 1
_cop_num_outputs = 3
check_input = False
params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)
def __init__(self, full_matrices=True, compute_uv=True):
self.full_matrices = full_matrices
......@@ -385,25 +396,28 @@ class GpuMagmaSVD(COp):
assert A.dtype == 'float32'
if self.compute_uv:
return theano.Apply(self, [A],
[A.type(),
GpuArrayType(A.dtype, broadcastable=[False],
# return S, U, VT
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type(),
A.type()])
else:
return theano.Apply(self, [A],
# return only S
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()])
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
params = []
def prepare_node(self, node, storage_map, compute_map, impl):
# Check node to prevent eventual errors with old pickled nodes.
if self.compute_uv:
params.append(('COMPUTE_UV', '1'))
if self.full_matrices:
params.append(('FULL_MATRICES', '1'))
return params
A, B, C = node.outputs
# We expect order: S (vector), U (matrix), VT (matrix)
assert A.type.ndim == 1 and B.type.ndim == C.type.ndim == 2, \
"Due to implementation constraints, GpuMagmaSVD interface has changed and now returns (S, U, VT) " \
"instead of (U, S, VT). Either update your code, or use gpu_svd() to get the expected (U, S, VT) order."
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
x_shape, = shapes
......@@ -413,7 +427,7 @@ class GpuMagmaSVD(COp):
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape]
return [s_shape, u_shape, vt_shape]
else:
return [s_shape]
......@@ -438,14 +452,19 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
U, V, D : matrices
"""
return GpuMagmaSVD(full_matrices, compute_uv)(a)
out = GpuMagmaSVD(full_matrices, compute_uv)(a)
if compute_uv:
S, U, VT = out
out = [U, S, VT]
return out
class GpuMagmaMatrixInverse(COp):
"""Computes the inverse of a matrix :math:`A` using magma library.
"""
__props__ = ('inplace', )
params_type = gpu_context_type
check_input = False
params_type = ParamsType(inplace=bool_t, context=gpu_context_type)
def __init__(self, inplace=False):
COp.__init__(self, ['magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
......@@ -483,13 +502,7 @@ class GpuMagmaMatrixInverse(COp):
return theano.Apply(self, [x], [x.type()])
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
if self.inplace:
return [('INPLACE', '1')]
else:
return []
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
return shapes
......
......@@ -5,7 +5,7 @@ setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
PyGpuContextObject *c) {
PARAMS_TYPE* params) {
const size_t *dims;
magma_int_t N, ldwork, info;
magma_int_t *piv = NULL;
......@@ -19,7 +19,7 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
......@@ -38,11 +38,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
"GpuMagmaMatrixInverse: matrix is not square");
goto fail;
}
#ifdef INPLACE
if (params->inplace) {
Py_XDECREF(*A_inv);
*A_inv = A;
Py_INCREF(*A_inv);
#else
} else {
*A_inv = theano_try_copy(*A_inv, A);
if (*A_inv == NULL) {
PyErr_SetString(
......@@ -50,13 +50,13 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
"GpuMagmaMatrixInverse: failed to allocate memory for the output");
goto fail;
}
#endif
}
// magma matrix inverse
N = dims[0];
ldwork = N * magma_get_sgetri_nb(N);
dwork = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
dwork = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
if (dwork == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaMatrixInverse: failed to allocate working memory");
......@@ -94,6 +94,6 @@ fail:
if (dwork != NULL)
gpudata_release(dwork);
magma_finalize();
cuda_exit(c->ctx);
cuda_exit(params->context->ctx);
return res;
}
......@@ -5,14 +5,11 @@ setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
PyGpuArrayObject **U,
#endif
PyGpuArrayObject **S,
#ifdef COMPUTE_UV
PyGpuArrayObject **VT,
#endif
PyGpuContextObject *c) {
PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) {
bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
magma_vec_t jobz;
......@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
......@@ -63,12 +60,12 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
goto fail;
}
#ifdef COMPUTE_UV
#ifdef FULL_MATRICES
if (compute_uv) {
if (params->full_matrices) {
jobz = MagmaAllVec;
#else
} else {
jobz = MagmaSomeVec;
#endif
}
M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M;
......@@ -84,11 +81,11 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
#else
} else {
jobz = MagmaNoVec;
ldu = M;
ldv = N;
#endif
}
// query for workspace size
magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
......@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
s_dims[0] = K;
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -132,9 +129,9 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice);
#ifdef COMPUTE_UV
if (compute_uv) {
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -145,7 +142,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -154,7 +151,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
#endif
}
res = 0;
fail:
if (a_data != NULL)
......@@ -170,6 +167,6 @@ fail:
if (iwork != NULL)
magma_free_cpu(iwork);
magma_finalize();
cuda_exit(c->ctx);
cuda_exit(params->context->ctx);
return res;
}
from __future__ import absolute_import, print_function, division
import os.path
import theano
from theano import Apply, Variable, tensor
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import local_optimizer, COp
from theano.scalar import as_scalar, constant
from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name)
from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge
try:
from nervanagpu.nervanagpu import GPUTensor, NervanaGPU
nerv = NervanaGPU()
except ImportError:
GPUTensor = None
nerv = None
def to_gputensor(a):
    """
    Wrap the array `a` in a nervanagpu ``GPUTensor`` that shares its memory.

    `a` must be C- or F-contiguous; an F-contiguous array is flagged as
    transposed (``is_trans``) in the resulting tensor.
    """
    layout = a.flags
    assert layout.c_contiguous or layout.f_contiguous
    # The tensor starts at the array's data pointer shifted by its offset.
    return GPUTensor(
        a.shape,
        dtype=a.dtype,
        base=a,
        gpudata=a.gpudata + a.offset,
        strides=a.strides,
        is_trans=layout.f_contiguous,
    )
def ensure_float(val, name):
    """
    Return `val` as a float32 theano scalar variable.

    Non-``Variable`` values are first wrapped with ``constant``; anything
    exposing ``ndim == 0`` is converted with ``as_scalar``.

    Raises
    ------
    TypeError
        If the result is not of a ``theano.scalar.Scalar`` type, or if its
        dtype is not float32.  `name` is used as the prefix of the message.
    """
    if not isinstance(val, Variable):
        val = constant(val)

    zero_dim = hasattr(val, 'ndim') and val.ndim == 0
    if zero_dim:
        val = as_scalar(val)

    if isinstance(val.type, theano.scalar.Scalar):
        if val.type.dtype == 'float32':
            return val
        raise TypeError("%s: type is not float32" % (name,))
    raise TypeError("%s: expected a scalar value" % (name,))
class Gemm16(COp):
    """
    Gemm for float16 using the nervana kernels.

    Parameters
    ----------
    relu : bool
        Must be False; the constructor asserts it (relu = True would
        require more work in the optimizations).
    inplace : bool
        When True, the op declares that output 0 destroys input 0.

    """
    __props__ = ('relu', 'inplace')
    _f16_ok = True
    params_type = gpu_context_type
    # One precompiled ``hgemm_<name>.cubin`` binary is embedded per entry.
    KERN_NAMES = ('nn_128x128', 'nn_128x64', 'nn_128x32',
                  'nn_vec_128x128', 'nn_vec_128x64', 'nn_vec_128x32',
                  'tn_128x128', 'tn_128x64', 'tn_128x32',
                  'tn_vec_128x128', 'tn_vec_128x64', 'tn_vec_128x32',
                  'tn_vec_128x16', 'nt_128x128', 'nt_vec_128x128')

    def __init__(self, relu=False, inplace=False):
        COp.__init__(self, ["gemm16.c"], "gemm16")
        self.relu = relu
        # relu = True will require more work in optimizations.
        assert self.relu is False
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, C, alpha, A, B, beta):
        """
        Build the Apply node for inputs ``[C, alpha, A, B, beta]``.

        A, B and C are moved to the inferred GPU context and must all be
        float16; alpha and beta must be float32 scalars (see
        ``ensure_float``).  Raises RuntimeError if nervanagpu could not be
        imported.
        """
        if GPUTensor is None:
            raise RuntimeError("Can't use Gemm16: nervanagpu not found")
        ctx_name = infer_context_name(C, A, B)
        A = as_gpuarray_variable(A, ctx_name)
        B = as_gpuarray_variable(B, ctx_name)
        C = as_gpuarray_variable(C, ctx_name)
        alpha = ensure_float(alpha, 'alpha')
        beta = ensure_float(beta, 'beta')
        assert C.dtype == A.dtype == B.dtype == 'float16'
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def get_params(self, node):
        """Return the GPU context of the node's first input."""
        return node.inputs[0].type.context

    def c_headers(self):
        return ['gpuarray/types.h', 'numpy_compat.h', 'gpuarray_helper.h',
                'string.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def get_op_params(self):
        """Expose ``inplace`` to the C code as the GEMM16_INPLACE macro."""
        return [('GEMM16_INPLACE', '1' if self.inplace else '0')]

    @staticmethod
    def cubin_to_code(name):
        """
        Return C source defining ``bin_<name>``, a static char array holding
        the bytes of ``hgemm_<name>.cubin`` from nervanagpu's cubin directory.
        """
        fname = 'hgemm_{0}.cubin'.format(name)
        # The cubin is binary data: read it in binary mode so that no text
        # decoding or newline translation is applied.  Iterating a bytearray
        # yields integers on both Python 2 and Python 3.
        with open(os.path.join(nerv.cubin_path, fname), 'rb') as f:
            cubin = bytearray(f.read())
        bcode = ','.join(hex(b) for b in cubin)
        return "static const char bin_%s[] = { %s };" % (name, bcode)

    @staticmethod
    def init_gpukernel(name, fail):
        """
        Return the C snippet that initializes kernel ``k_<name>`` from the
        embedded binary ``bin_<name>`` and runs `fail` on error.
        """
        return """
bcode = bin_%(name)s;
sz = sizeof(bin_%(name)s);
if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
"hgemm_%(name)s", 13, types, GA_USE_BINARY, NULL)
!= GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not initialize kernel %(name)s");
%(fail)s;
}
""" % dict(name=name, fail=fail)

    def c_support_code(self):
        """Embed every kernel binary listed in KERN_NAMES."""
        return '\n'.join(Gemm16.cubin_to_code(name)
                         for name in self.KERN_NAMES)

    def c_support_code_struct(self, node, nodename):
        """Declare one GpuKernel per kernel name, then the COp file's code."""
        codel = ["GpuKernel k_{0};".format(name) for name in self.KERN_NAMES]
        codel.append(super(Gemm16, self).c_support_code_struct(node, nodename))
        return '\n'.join(codel)

    def c_init_code_struct(self, node, nodename, sub):
        """
        Emit the struct init code: the COp file's own init section, zeroing
        of every kernel struct, then initialization of each kernel.
        """
        codel = [super(Gemm16, self).c_init_code_struct(node, nodename, sub)]
        for name in self.KERN_NAMES:
            codel.append("memset(&k_{0}, 0, sizeof(GpuKernel));".format(name))
        codel.append("const char *bcode;")
        codel.append("size_t sz;")
        codel.append("PyGpuContextObject *c = %s;" % (sub['params'],))
        codel.append("int types[13] = {GA_BUFFER, GA_BUFFER, GA_BUFFER, "
                     "GA_BUFFER, GA_INT, GA_INT, GA_INT, GA_INT, GA_INT, "
                     "GA_INT, GA_FLOAT, GA_FLOAT, GA_INT};")
        for name in self.KERN_NAMES:
            codel.append(self.init_gpukernel(name, sub['fail']))
        return '\n'.join(codel)

    def c_cleanup_code_struct(self, node, nodename):
        """Release every kernel initialized in ``c_init_code_struct``."""
        return '\n'.join("GpuKernel_clear(&k_{0});".format(name)
                         for name in self.KERN_NAMES)
@opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot])
@opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
    """
    Replace a ``Dot`` of two float16 matrices by a ``Gemm16``.

    Returns None (no replacement) when nervanagpu is unavailable or when the
    inputs are not both 2-d float16.
    """
    if nerv is None:
        return
    A, B = inputs[0], inputs[1]
    both_matrices = A.ndim == 2 and B.ndim == 2
    if not (both_matrices and
            A.dtype == 'float16' and B.dtype == 'float16'):
        return
    fgraph = getattr(outputs[0], 'fgraph', None)
    # Uninitialized (rows of A) x (columns of B) buffer; beta is 0.0 below.
    n_rows = shape_i(A, 0, fgraph)
    n_cols = shape_i(B, 1, fgraph)
    C = GpuAllocEmpty('float16', ctx_name)(n_rows, n_cols)
    return Gemm16()(C, 1.0, A, B, 0.0)
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Rebuild the Gemm16 on the inputs supplied by ``alpha_merge``."""
    rebuilt_op = Gemm16(relu=node.op.relu)
    return [rebuilt_op(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Rebuild the Gemm16 on the inputs supplied by ``output_merge``."""
    rebuilt_op = Gemm16(relu=node.op.relu)
    return [rebuilt_op(*inputs)]
@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """
    Replace a non-inplace ``Gemm16`` node by its inplace version.

    Returns None when the node's op is not exactly ``Gemm16`` or is already
    inplace.
    """
    op = node.op
    if type(op) != Gemm16 or op.inplace:
        return
    new_inputs = list(node.inputs)
    out_buf = new_inputs[0]
    producer = out_buf.owner
    if (producer and
            isinstance(producer.op, GpuAllocEmpty) and
            len(out_buf.clients) > 1):
        # The empty buffer has other clients: allocate a fresh one for this
        # node (presumably so the inplace op does not overwrite a buffer
        # that is shared with them).
        new_inputs[0] = producer.op(*producer.inputs)
    return [Gemm16(relu=op.relu, inplace=True)(*new_inputs)]
# Register the inplace rewrite in the global optimizer database, wrapped with
# in2out, at position 70.0 and under the tags 'fast_run', 'inplace' and
# 'gpuarray'.
optdb.register('local_gemm16_inplace',
               tensor.opt.in2out(local_gemm16_inplace,
                                 name='local_gemm16_inplace'),
               70.0, 'fast_run', 'inplace', 'gpuarray')
# This module is only a stub: importing it always raises ImportError with an
# explanation of why it was removed.
# The __future__ import below is kept only to prevent a flake8 error.
from __future__ import print_function, absolute_import, division
raise ImportError(
    "You are importing theano.gpuarray.nerv. "
    "This module was removed as it was based on nervanagpu that is now deprecated. "
    "To still get this module, use Theano 0.9. "
    "More info about nervanagpu here: https://github.com/NervanaSystems/nervanagpu "
    "(viewed on 2017/07/05).")
......@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
cusolver_available, GpuMagmaMatrixInverse, gpu_svd)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs):
return
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
x = inputs[0]
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
x = inputs[0].astype('float32')
out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
if inputs[0].dtype == 'float16':
if op.compute_uv:
out = [o.astype('float16') for o in out]
else:
out = [out.astype('float16')]
return out
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
......@@ -217,8 +217,8 @@ KERNEL void ave_pool3d_kernel(const ga_size nthreads,
// output shape for a given input padded shape, window shape and stride
// We use ssize_t in the max since this is done to avoid negative results.
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
(ignore_border ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
......@@ -229,7 +229,10 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **z,
PyGpuContextObject *ctx) {
PARAMS_TYPE* params) {
bool max_pool = (params->mode == POOLING_MAX);
bool inc_pad = (params->mode != POOLING_AVERAGE_COUNT_EXCLUDE_PADDING);
bool sum_mode = (params->mode == POOLING_SUM);
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga))
{
PyErr_Format(PyExc_ValueError,
......@@ -253,19 +256,19 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
if (p[i] > 0) {
nonzero_padding = 1;
}
}
if (!IGNORE_BORDER && nonzero_padding) {
if (!params->ignore_border && nonzero_padding) {
PyErr_SetString(PyExc_ValueError,
"GpuPool: padding works only with ignore_border=True");
return 1;
}
if (theano_prep_output(z, PyGpuArray_NDIM(x), z_dims,
x->ga.typecode, GA_C_ORDER, ctx) != 0)
x->ga.typecode, GA_C_ORDER, params->context) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuPool: failed to allocate memory");
......@@ -277,7 +280,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
if (ndims == 2) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3];
if (MAX_POOL) {
if (max_pool) {
err = max_pool2d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3],
x_dims[2], x_dims[3],
......@@ -295,7 +298,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
x_dims[2], x_dims[3],
x->ga.data, x->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE,
inc_pad, sum_mode,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -307,7 +310,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
}
else if (ndims == 3) {
size_t num_kernels = z_dims[0] * z_dims[1] * z_dims[2] * z_dims[3] * z_dims[4];
if (MAX_POOL) {
if (max_pool) {
err = max_pool3d_kernel_scall(1, &num_kernels, 0, num_kernels,
z_dims[0], z_dims[1], z_dims[2], z_dims[3], z_dims[4],
x_dims[2], x_dims[3], x_dims[4],
......@@ -326,7 +329,7 @@ int APPLY_SPECIFIC(pool)(PyGpuArrayObject *x,
x->ga.data, x->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2],
INC_PAD, SUM_MODE,
inc_pad, sum_mode,
(*z)->ga.data, (*z)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -3,9 +3,12 @@ import os.path
import theano
from theano import Apply
from theano.gof import ParamsType
from theano.scalar import bool as bool_t
from theano.tensor.basic import as_tensor_variable
from theano.tensor.signal.pool import Pool
from theano.tensor.signal.pool import Pool, PoolingMode_t
from .type import gpu_context_type
from .basic_ops import (CGpuKernelBase, infer_context_name,
as_gpuarray_variable, gpu_contiguous)
......@@ -22,6 +25,9 @@ class GpuPool(CGpuKernelBase):
"""
__props__ = ('ignore_border', 'mode', 'ndim')
params_type = ParamsType(ignore_border=bool_t,
mode=PoolingMode_t,
context=gpu_context_type)
def __init__(self, ignore_border, mode='max', ndim=2):
self.ndim = ndim
......@@ -31,9 +37,12 @@ class GpuPool(CGpuKernelBase):
self.mode = mode
CGpuKernelBase.__init__(self, ['pool.c'],
'APPLY_SPECIFIC(pool)')
assert mode in ('max', 'sum', 'average_inc_pad', 'average_exc_pad')
assert PoolingMode_t.has_alias(self.mode)
assert self.ndim in [2, 3]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def c_headers(self):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
......@@ -74,16 +83,6 @@ class GpuPool(CGpuKernelBase):
return Apply(self, [inp, ws, stride, pad], [inp.type()])
def get_op_params(self):
ignore_border = int(self.ignore_border)
max_pool = int(self.mode == 'max')
inc_pad = int(self.mode != 'average_exc_pad')
sum_mode = int(self.mode == 'sum')
return [('IGNORE_BORDER', ignore_border),
('INC_PAD', inc_pad),
('MAX_POOL', max_pool),
('SUM_MODE', sum_mode)]
def infer_shape(self, node, in_shapes):
ws, stride, pad = [node.inputs[1], node.inputs[2], node.inputs[3]]
shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
......@@ -214,6 +213,7 @@ class GpuAveragePoolGrad(CGpuKernelBase):
"""
__props__ = ('ignore_border', 'mode', 'ndim')
params_type = ParamsType(mode=PoolingMode_t, context=gpu_context_type)
def __init__(self, ignore_border, mode='max', ndim=2):
self.ndim = ndim
......@@ -226,6 +226,9 @@ class GpuAveragePoolGrad(CGpuKernelBase):
assert mode in ('sum', 'average_inc_pad', 'average_exc_pad')
assert ndim in [2, 3]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def c_headers(self):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
......@@ -267,12 +270,6 @@ class GpuAveragePoolGrad(CGpuKernelBase):
return Apply(self, [inp, out_grad, ws, stride, pad], [inp.type()])
def get_op_params(self):
inc_pad = int(self.mode == 'average_inc_pad')
sum_mode = int(self.mode == 'sum')
return [('INC_PAD', inc_pad),
('SUM_MODE', sum_mode)]
def infer_shape(self, node, in_shapes):
return [in_shapes[0]]
......@@ -369,6 +366,7 @@ class GpuMaxPoolRop(CGpuKernelBase):
"""
__props__ = ('ignore_border', 'mode', 'ndim')
params_type = ParamsType(ignore_border=bool_t, context=gpu_context_type)
def __init__(self, ignore_border, mode='max', ndim=2):
self.ndim = ndim
......@@ -379,6 +377,9 @@ class GpuMaxPoolRop(CGpuKernelBase):
assert mode == 'max'
assert ndim in [2, 3]
def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context)
def c_headers(self):
return ['gpuarray_api.h', 'gpuarray_helper.h', 'numpy_compat.h']
......@@ -422,10 +423,6 @@ class GpuMaxPoolRop(CGpuKernelBase):
return Apply(self, [inp, eval_point, ws, stride, pad], [eval_point.type()])
def get_op_params(self):
ignore_border = int(self.ignore_border)
return [('IGNORE_BORDER', ignore_border)]
def infer_shape(self, node, in_shapes):
ws, stride, pad = [node.inputs[2], node.inputs[3], node.inputs[4]]
shp = Pool.out_shape(in_shapes[0], ws, self.ignore_border, stride,
......
......@@ -115,7 +115,9 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **gx,
PyGpuContextObject *ctx) {
PARAMS_TYPE* params) {
bool inc_pad = (params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING);
bool sum_mode = (params->mode == POOLING_SUM);
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga)
|| !GpuArray_IS_C_CONTIGUOUS(&gz->ga))
{
......@@ -131,7 +133,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
return 1;
}
if (theano_prep_output(gx, PyGpuArray_NDIM(x), PyGpuArray_DIMS(x),
x->ga.typecode, GA_C_ORDER, ctx) != 0)
x->ga.typecode, GA_C_ORDER, params->context) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuMaxPoolGrad: failed to allocate memory");
......@@ -161,7 +163,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], s[0], s[1], p[0], p[1],
INC_PAD, SUM_MODE,
inc_pad, sum_mode,
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......@@ -177,7 +179,7 @@ int APPLY_SPECIFIC(ave_pool_grad)(PyGpuArrayObject *x,
x->ga.data, x->ga.offset,
gz->ga.data, gz->ga.offset,
w[0], w[1], w[2], s[0], s[1], s[2],
p[0], p[1], p[2], INC_PAD, SUM_MODE,
p[0], p[1], p[2], inc_pad, sum_mode,
(*gx)->ga.data, (*gx)->ga.offset);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
......
......@@ -109,8 +109,8 @@ KERNEL void max_pool3d_rop_kernel(const ga_size nthreads,
#section support_code
// output shape for a given input padded shape, window shape and stride
#define OUTPUT_DIMS(in_dim, ws, st) \
(IGNORE_BORDER ? (in_dim - ws)/st + 1 : \
#define OUTPUT_DIMS(in_dim, ws, st, ignore_border) \
(ignore_border ? (in_dim - ws)/st + 1 : \
(st > ws ? (in_dim - 1)/st + 1 : \
std::max<ssize_t>(0, (in_dim - 1 - ws + st)/st) + 1))
......@@ -122,7 +122,7 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
PyArrayObject *stride,
PyArrayObject *pad,
PyGpuArrayObject **z,
PyGpuContextObject *ctx) {
PARAMS_TYPE* params) {
if (!GpuArray_IS_C_CONTIGUOUS(&x->ga) || !GpuArray_IS_C_CONTIGUOUS(&ex->ga))
{
PyErr_Format(PyExc_ValueError,
......@@ -146,19 +146,19 @@ int APPLY_SPECIFIC(max_pool_rop)(PyGpuArrayObject *x,
w[i] = *((npy_int64*)PyArray_GETPTR1(ws, i));
s[i] = *((npy_int64*)PyArray_GETPTR1(stride, i));
p[i] = *((npy_int64*)PyArray_GETPTR1(pad, i));
z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i]);
z_dims[2 + i] = OUTPUT_DIMS(x_dims[2 + i] + 2*p[i], w[i], s[i], params->ignore_border);
if (p[i] > 0) {
nonzero_padding = 1;
}
}
if (!IGNORE_BORDER && nonzero_padding) {
if (!params->ignore_border && nonzero_padding) {
PyErr_SetString(PyExc_ValueError,
"GpuMaxPoolRop: padding works only with ignore_border=True");
return 1;
}
if (theano_prep_output(z, PyGpuArray_NDIM(ex), z_dims,
ex->ga.typecode, GA_C_ORDER, ctx) != 0)
ex->ga.typecode, GA_C_ORDER, params->context) != 0)
{
PyErr_SetString(PyExc_RuntimeError,
"GpuMaxPoolRop: failed to allocate memory");
......
......@@ -4,10 +4,12 @@ from six.moves import xrange
import theano
from theano import tensor, config, Apply, Op
from theano.scalar import int32 as int_t
from theano.gof import ParamsType
from theano.gradient import grad_undefined
from ..basic_ops import CGpuKernelBase
from ..type import GpuArrayType, get_context
from ..type import GpuArrayType, get_context, gpu_context_type
# This is an implementation to test that CGpuKernelBase works and also
......@@ -18,6 +20,7 @@ class GpuEye(CGpuKernelBase, Op):
"""
__props__ = ('dtype', 'context_name')
params_type = ParamsType(typecode=int_t, context=gpu_context_type)
def __init__(self, dtype=None, context_name=None):
if dtype is None:
......@@ -28,7 +31,9 @@ class GpuEye(CGpuKernelBase, Op):
'APPLY_SPECIFIC(tstgpueye)')
def get_params(self, node):
    """Build the params object passed to this op's C implementation.

    Supplies the two fields declared by ``params_type``: ``typecode``,
    obtained from ``self.dtype`` via ``dtype_to_typecode``, and ``context``,
    looked up from ``self.context_name``. ``node`` is not used.
    """
    # Kept as a function-local import, as in the original code; presumably
    # so that importing this module does not itself require pygpu.
    from pygpu.gpuarray import dtype_to_typecode
    # The earlier bare ``return get_context(...)`` was dropped: it made the
    # params_type-based return below unreachable and did not provide the
    # ``typecode`` field.
    return self.params_type.get_params(
        typecode=dtype_to_typecode(self.dtype),
        context=get_context(self.context_name))
def c_headers(self):
    """Return the list of header names required by this op's C code."""
    return ['<gpuarray/types.h>', '<gpuarray/kernel.h>']
......@@ -52,11 +57,6 @@ class GpuEye(CGpuKernelBase, Op):
return [grad_undefined(self, i, inp[i])
for i in xrange(2)]
def get_op_params(self):
    """Return the ``TYPECODE`` macro pair for this op's dtype.

    The value is the string form of ``dtype_to_typecode(self.dtype)``.
    """
    # Kept as a function-local import, as in the original code.
    from pygpu.gpuarray import dtype_to_typecode
    # NOTE(review): the COp base class shown at the top of this change now
    # calls the name-mangled ``__get_op_params``, so this override is
    # presumably no longer consulted -- confirm before relying on it.
    return [('TYPECODE', str(dtype_to_typecode(self.dtype)))]
def test_cgpukernelbase():
# Import inside the function to prevent the back-end from being
......@@ -69,4 +69,5 @@ def test_cgpukernelbase():
r = f()
assert r.dtype == 'int32'
assert (np.asarray(r) == np.eye(4, 5, dtype='int32')).all()
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import numpy as np
from theano import function
from theano.tests import unittest_tools as utt
from theano.tensor import vector, matrix, dot
from .config import mode_with_gpu
from ..nerv import Gemm16, nerv
def test_gemm16_swap():
    """Check which dot products get compiled into a ``Gemm16`` node.

    Skipped when ``nerv`` is None. Only the float16 matrix x matrix product
    is expected to yield exactly one ``Gemm16`` apply node.
    """
    if nerv is None:
        raise SkipTest("nervanagpu not available")

    def count_gemm16(fn):
        # Number of apply nodes in the compiled graph whose op is a Gemm16.
        return len([node for node in fn.maker.fgraph.apply_nodes
                    if isinstance(node.op, Gemm16)])

    v = vector(dtype='float16')
    m = matrix(dtype='float16')
    m2 = matrix(dtype='float16')
    m32 = matrix(dtype='float32')

    # test that we don't try to replace anything but matrix x matrix in float16
    f = function([v, m], dot(v, m), mode=mode_with_gpu)
    assert count_gemm16(f) == 0

    f = function([m32, m], dot(m32, m), mode=mode_with_gpu)
    assert count_gemm16(f) == 0

    f = function([m, m2], dot(m, m2), mode=mode_with_gpu)
    assert count_gemm16(f) == 1
def test_gemm16_value():
    """Compare the compiled float16 matrix product against ``np.dot``.

    Skipped when ``nerv`` is None.
    """
    if nerv is None:
        raise SkipTest("nervanagpu not available")

    lhs = matrix(dtype='float16')
    rhs = matrix(dtype='float16')
    f = function([lhs, rhs], dot(lhs, rhs), mode=mode_with_gpu)

    # Random (3, 4) x (4, 2) float16 operands.
    v1 = np.random.random((3, 4)).astype('float16')
    v2 = np.random.random((4, 2)).astype('float16')

    compiled_result = f(v1, v2)
    numpy_result = np.dot(v1, v2)

    utt.assert_allclose(compiled_result, numpy_result)
......@@ -18,7 +18,7 @@ KERNEL void eye(GLOBAL_MEM DTYPE_OUTPUT_0 *a, ga_size a_off, ga_size n, ga_size
#section support_code_struct
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
PyGpuArrayObject **z, PyGpuContextObject *ctx) {
PyGpuArrayObject **z, PARAMS_TYPE* params) {
size_t dims[2] = {0, 0};
size_t ls, gs;
void *args[3];
......@@ -29,9 +29,9 @@ int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
Py_XDECREF(*z);
*z = pygpu_zeros(2, dims,
TYPECODE,
params->typecode,
GA_C_ORDER,
ctx, Py_None);
params->context, Py_None);
if (*z == NULL)
return -1;
......
from nose.plugins.skip import SkipTest
# NB: We raise a SkipTest (instead of another type of exception) because we're in a folder,
# thus nosetests will look for test files into this folder. With a SkipTest raised,
# the folder will be skipped by nosetests without failing.
raise SkipTest(
"You are importing theano.sandbox.cuda. This is the old GPU back-end and "
"is removed from Theano. Use Theano 0.9 to use it. Even better, "
......
......@@ -14,7 +14,7 @@ from six.moves import xrange
import six.moves.builtins as builtins
import theano
from theano import gof, OpenMPOp, tensor, Variable, Apply
from theano.gof.params_type import ParamsType
from theano.gof import ParamsType, EnumList
from theano.gradient import DisconnectedType
from theano.scalar import bool as bool_t
......@@ -258,6 +258,16 @@ def pool_3d(input, ws=None, ignore_border=None, stride=None, pad=(0, 0, 0),
return output
# NB: This enum type is currently used in gpuarray/pool.py.
# It may later also be used as an op param in this file.
# The enum name and its constant names are modelled on the cuDNN type
# `cudnnPoolingMode_t` (cf. `theano/gpuarray/cudnn_defs.py`).
# Each entry pairs a constant name with a pooling mode string
# (presumably an alias for that constant -- confirm against EnumList).
PoolingMode_t = EnumList(('POOLING_MAX', 'max'),
('POOLING_SUM', 'sum'),
('POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'))
class Pool(OpenMPOp):
"""
sum or average over different patches.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论