提交 7dd20e1d authored 作者: vdumoulin's avatar vdumoulin

Merge pull request #1 from nouiz/vdumoulin-new_backend

Vdumoulin new backend
import numpy
from theano import Op, Apply from theano import Op, Apply
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray, elemwise
...@@ -13,6 +13,7 @@ except ImportError: ...@@ -13,6 +13,7 @@ except ImportError:
pass pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
...@@ -36,7 +37,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -36,7 +37,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
x = as_gpuarray_variable(x) x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b) b = as_gpuarray_variable(b)
y_idx = as_gpuarray_variable(y_idx) y_idx = as_gpuarray_variable(y_idx)
nll = y_idx.type() nll = GpuArrayType(x.type.dtype,
y_idx.type.broadcastable)()
sm = x.type() sm = x.type()
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
...@@ -44,31 +46,31 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -44,31 +46,31 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
def c_headers(self): def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>'] return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']
def c_support_code_apply(self, node): def c_support_code_apply(self, node, nodename):
dtype0 = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
dtype1 = node.inputs[1].dtype dtype_b = node.inputs[1].dtype
dtype2 = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
return """ return """
__global__ void k_xent_sm_1hot_bias(int M, int N, __global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype0)s* x_data, int xs0, int xs1, const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype1)s* b, int bs0, const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype2)s* y_idx_data, int y_idxs0, const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype)s* nll_data, int nlls0, npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype)s* sm_data, int sms0, int sms1, npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype)s* am_data, int ams0) npy_%(dtype_y_idx)s* am_data, int ams0)
{ {
for (int row = blockIdx.x; row < M; row += gridDim.x){ for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype0)s* x = x_data + xs0 * row; const npy_%(dtype_x)s* x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0]; const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype0)s* sm = sm_data + sms0 * row; npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype0)s sum = 0.0; npy_%(dtype_x)s sum = 0.0;
int row_max_j = 0; int row_max_j = 0;
npy_%(dtype0)s row_max = x[0] + b[0]; npy_%(dtype_x)s row_max = x[0] + b[0];
for (int j = 1; j < N; ++j) for (int j = 1; j < N; ++j)
{ {
npy_%(dtype0)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
//todo: store to shared memory //todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j; row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max; row_max = (row_ij > row_max) ? row_ij : row_max;
...@@ -76,12 +78,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -76,12 +78,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp //compute the exp
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
npy_%(dtype0)s row_ij = x[j*xs1] + b[j*bs0]; npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype0)s sm_ij = exp(row_ij - row_max); npy_%(dtype_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij; sum += sm_ij;
sm[j * sms1] = sm_ij; sm[j * sms1] = sm_ij;
} }
npy_%(dtype0)s sum_inv = 1.0 / sum; npy_%(dtype_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j) for (int j = 0; j < N; ++j)
{ {
sm[j * sms1] *= sum_inv; sm[j * sms1] *= sum_inv;
...@@ -103,14 +105,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -103,14 +105,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
} }
CUdeviceptr (*cuda_get_ptr)(gpudata *g); CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" """ % locals()
def c_init_code(self): def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");'] return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode = pygpu.gpuarray.dtype_to_typecode(dtype) typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize
itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
x, b, y_idx = inp x, b, y_idx = inp
nll, sm, am = out nll, sm, am = out
classname = self.__class__.__name__ classname = self.__class__.__name__
...@@ -151,7 +160,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -151,7 +160,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{ {
Py_XDECREF(%(nll)s); Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode)s, %(typecode_x)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(nll)s) { if (!%(nll)s) {
...@@ -166,7 +175,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -166,7 +175,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{ {
Py_XDECREF(%(sm)s); Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s), %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s, %(typecode_b)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if(!%(sm)s) if(!%(sm)s)
...@@ -183,7 +192,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -183,7 +192,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{ {
Py_XDECREF(%(am)s); Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode)s, %(typecode_y_idx)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if(!%(am)s) if(!%(am)s)
...@@ -196,37 +205,35 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -196,37 +205,35 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0], int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
256);
//TODO: launch more threads per row and do parallel sum and max reductions //TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1; int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(%(dtype)s); int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>( k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1], PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + (dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset); %(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0], PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1], PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + (dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset); %(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0], PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset); %(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0], PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(dtype_%(nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) + (dtype_%(nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset); %(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0], PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) + (dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset); %(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0], PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1], PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(dtype_%(am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) + (dtype_%(am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset); %(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0]); PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
...@@ -245,7 +252,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op): ...@@ -245,7 +252,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (4,) return (5,)
def c_compiler(self): def c_compiler(self):
return NVCC_compiler return NVCC_compiler
...@@ -273,11 +280,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -273,11 +280,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def make_node(self, dy, sm, y_idx): def make_node(self, dnll, sm, y_idx):
dy = as_gpuarray_variable(dy) dnll = as_gpuarray_variable(dnll)
sm = as_gpuarray_variable(sm) sm = as_gpuarray_variable(sm)
y_idx = as_gpuarray_variable(y_idx) y_idx = as_gpuarray_variable(y_idx)
return Apply(self, [dy, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
...@@ -290,7 +297,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -290,7 +297,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return NVCC_compiler return NVCC_compiler
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
dx, = out dx, = out
fail = sub['fail'] fail = sub['fail']
...@@ -326,7 +337,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -326,7 +337,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
{ {
Py_XDECREF(%(dx)s); Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s), %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode)s, %(typecode_dx)s,
GA_C_ORDER, GA_C_ORDER,
pygpu_default_context(), Py_None); pygpu_default_context(), Py_None);
if (!%(dx)s) { if (!%(dx)s) {
...@@ -334,9 +345,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -334,9 +345,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
} }
} }
{ {
int n_blocks = std::min(PyGpuArray_DIMS(%(dx)s)[0], int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
256); int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
int n_threads = std::min(PyGpuArray_DIMS(%(dx)s)[1],256);
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>( <<<n_blocks, n_threads>>>(
...@@ -344,24 +354,23 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -344,24 +354,23 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
PyGpuArray_DIMS(%(dx)s)[1], PyGpuArray_DIMS(%(dx)s)[1],
(dtype_%(dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) + (dtype_%(dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset); %(dnll)s->ga.offset),
PyGpuArray_STRIDES(%(dnll)s)[0], PyGpuArray_STRIDES(%(dnll)s)[0] / %(itemsize_dnll)s,
(dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) + (dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset); %(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0], PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1], PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + (dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset); %(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0], PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(dtype_%(dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) + (dtype_%(dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset); %(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0], PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
); );
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
...@@ -378,21 +387,22 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op): ...@@ -378,21 +387,22 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
dtype0 = node.inputs[0].dtype dtype_dnll = node.inputs[0].dtype
dtype1 = node.inputs[1].dtype dtype_sm = node.inputs[1].dtype
dtype2 = node.inputs[2].dtype dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
return """ return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s( __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K, int N, int K,
const npy_%(dtype0)s* dnll, const int dnll_s0, const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype1)s* sm, const int sm_s0, const int sm_s1, const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype2)s* y_idx, const int y_idx_s0, const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype1)s* dx, const int dx_s0, const int dx_s1) npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
{ {
for (int i = blockIdx.x; i < N; i += gridDim.x) for (int i = blockIdx.x; i < N; i += gridDim.x)
{ {
npy_%(dtype0)s dnll_i = dnll[i * dnll_s0]; npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0];
int y_i = (int)y_idx[i * y_idx_s0]; npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x) for (int j = threadIdx.x; j < K; j += blockDim.x)
{ {
......
...@@ -60,7 +60,6 @@ def op_lifter(OP): ...@@ -60,7 +60,6 @@ def op_lifter(OP):
def local_opt(node): def local_opt(node):
if type(node.op) in OP: if type(node.op) in OP:
# This does not support nodes that have more than one output. # This does not support nodes that have more than one output.
assert len(node.outputs) == 1
# either one of our inputs is on the gpu or # either one of our inputs is on the gpu or
# all of our client are on the gpu # all of our client are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu if (any([i.owner and i.owner.op == host_from_gpu
...@@ -71,7 +70,9 @@ def op_lifter(OP): ...@@ -71,7 +70,9 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherit from OP. # This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op: if new_op and new_op != node.op:
if isinstance(new_op, theano.Op): if isinstance(new_op, theano.Op):
return [host_from_gpu(new_op(*node.inputs))] return [host_from_gpu(o) for o in new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op]
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)] return [host_from_gpu(new_op)]
return False return False
...@@ -281,4 +282,3 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node): ...@@ -281,4 +282,3 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx]) @op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node): def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx() return GpuCrossentropySoftmax1HotWithBiasDx()
...@@ -6,7 +6,7 @@ from theano.gof.python25 import any ...@@ -6,7 +6,7 @@ from theano.gof.python25 import any
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
import theano.sandbox.gpuarray from theano.sandbox import gpuarray
if theano.sandbox.gpuarray.pygpu is None: if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed") raise SkipTest("pygpu not installed")
...@@ -20,21 +20,21 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: ...@@ -20,21 +20,21 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
default_to_move_computation_to_gpu=False, default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False, move_shared_float32_to_gpu=False,
enable_cuda=False) enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda') gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated: if not gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled") raise SkipTest("pygpu disabled")
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias, from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode( mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
'FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
...@@ -87,7 +87,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -87,7 +87,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
mode=mode_without_gpu) mode=mode_without_gpu)
classify_gpu = theano.function(inputs=[y, b, dot_result], classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW], outputs=[loss, y_pred, dW],
mode=mode_with_gpu) mode=mode_with_gpu)
#theano.printing.debugprint(classify) #theano.printing.debugprint(classify)
#theano.printing.debugprint(classify_gpu) #theano.printing.debugprint(classify_gpu)
...@@ -95,7 +95,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -95,7 +95,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
T.nnet.CrossentropySoftmaxArgmax1HotWithBias) T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
for node in classify.maker.fgraph.toposort()]) for node in classify.maker.fgraph.toposort()])
assert any([isinstance(node.op, assert any([isinstance(node.op,
theano.sandbox.gpuarray.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) GpuCrossentropySoftmaxArgmax1HotWithBias)
for node in classify_gpu.maker.fgraph.toposort()]) for node in classify_gpu.maker.fgraph.toposort()])
out = classify(yy, b_values, dot_value) out = classify(yy, b_values, dot_value)
...@@ -104,7 +104,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -104,7 +104,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
assert len(out) == len(gout) == 3 assert len(out) == len(gout) == 3
assert numpy.allclose(out[0], gout[0]) assert numpy.allclose(out[0], gout[0])
assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute( assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
gout - out).max() gout[2] - out[2]).max()
assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val) assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
for id, val in enumerate(out[1] - for id, val in enumerate(out[1] -
gout[1]) gout[1])
...@@ -150,7 +150,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -150,7 +150,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx) assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
for node in cpu_f.maker.fgraph.toposort()]) for node in cpu_f.maker.fgraph.toposort()])
assert any([isinstance(node.op, assert any([isinstance(node.op,
theano.sandbox.gpuarray.nnet.GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
for node in gpu_f.maker.fgraph.toposort()]) for node in gpu_f.maker.fgraph.toposort()])
cpu_out = cpu_f(softmax_output_value) cpu_out = cpu_f(softmax_output_value)
...@@ -164,7 +164,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -164,7 +164,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
max_i = scaled_err.argmax() max_i = scaled_err.argmax()
print 'max err index:', max_i, max_i / batch_size, print 'max err index:', max_i, max_i / batch_size,
print max_i % batch_size, max_i / n_out, max_i & n_out print max_i % batch_size, max_i / n_out, max_i & n_out
print 'At that index:' print 'At that index:'
print 'err:', scaled_err.flatten()[max_i] print 'err:', scaled_err.flatten()[max_i]
print 'absolute error:', abs_err.flatten()[max_i] print 'absolute error:', abs_err.flatten()[max_i]
...@@ -176,4 +176,4 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -176,4 +176,4 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
print 'y_idx_value:', y_idx_value[max_i / n_out] print 'y_idx_value:', y_idx_value[max_i / n_out]
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % ( assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol) rtol, atol)
...@@ -138,7 +138,13 @@ class GpuArrayType(Type): ...@@ -138,7 +138,13 @@ class GpuArrayType(Type):
return numpy.dtype(self.dtype).itemsize return numpy.dtype(self.dtype).itemsize
def c_declare(self, name, sub): def c_declare(self, name, sub):
return "PyGpuArrayObject *%s;" % (name,) dtype = theano.tensor.TensorType(
dtype=self.dtype,
broadcastable=self.broadcastable).dtype_specs()[1]
return """
PyGpuArrayObject *%(name)s;
typedef %(dtype)s dtype_%(name)s;
""" % locals()
def c_init(self, name, sub): def c_init(self, name, sub):
return "%s = NULL;" % (name,) return "%s = NULL;" % (name,)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论