提交 7dd20e1d authored 作者: vdumoulin's avatar vdumoulin

Merge pull request #1 from nouiz/vdumoulin-new_backend

Vdumoulin new backend
import numpy
from theano import Op, Apply
from theano.compat.six import StringIO
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
try:
import pygpu
from pygpu import gpuarray, elemwise
......@@ -13,6 +13,7 @@ except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
......@@ -36,7 +37,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
x = as_gpuarray_variable(x)
b = as_gpuarray_variable(b)
y_idx = as_gpuarray_variable(y_idx)
nll = y_idx.type()
nll = GpuArrayType(x.type.dtype,
y_idx.type.broadcastable)()
sm = x.type()
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
......@@ -44,31 +46,31 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']
def c_support_code_apply(self, node):
dtype0 = node.inputs[0].dtype
dtype1 = node.inputs[1].dtype
dtype2 = node.inputs[2].dtype
def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_b = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
return """
__global__ void k_xent_sm_1hot_bias(int M, int N,
const npy_%(dtype0)s* x_data, int xs0, int xs1,
const npy_%(dtype1)s* b, int bs0,
const npy_%(dtype2)s* y_idx_data, int y_idxs0,
npy_%(dtype)s* nll_data, int nlls0,
npy_%(dtype)s* sm_data, int sms0, int sms1,
npy_%(dtype)s* am_data, int ams0)
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype_y_idx)s* am_data, int ams0)
{
for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype0)s* x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0];
npy_%(dtype0)s* sm = sm_data + sms0 * row;
const npy_%(dtype_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype0)s sum = 0.0;
npy_%(dtype_x)s sum = 0.0;
int row_max_j = 0;
npy_%(dtype0)s row_max = x[0] + b[0];
npy_%(dtype_x)s row_max = x[0] + b[0];
for (int j = 1; j < N; ++j)
{
npy_%(dtype0)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
......@@ -76,12 +78,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(dtype0)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype0)s sm_ij = exp(row_ij - row_max);
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = sm_ij;
}
npy_%(dtype0)s sum_inv = 1.0 / sum;
npy_%(dtype_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
sm[j * sms1] *= sum_inv;
......@@ -103,14 +105,21 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
""" % locals()
def c_init_code(self):
return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']
def c_code(self, node, nodename, inp, out, sub):
dtype = self.dtype
typecode = pygpu.gpuarray.dtype_to_typecode(dtype)
typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize
itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
x, b, y_idx = inp
nll, sm, am = out
classname = self.__class__.__name__
......@@ -151,7 +160,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{
Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode)s,
%(typecode_x)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(nll)s) {
......@@ -166,7 +175,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{
Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode)s,
%(typecode_b)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(sm)s)
......@@ -183,7 +192,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
{
Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode)s,
%(typecode_y_idx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(am)s)
......@@ -196,37 +205,35 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(x)s)[0],
256);
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(%(dtype)s);
int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(dtype_%(x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset);
PyGpuArray_STRIDES(%(x)s)[0],
PyGpuArray_STRIDES(%(x)s)[1],
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(dtype_%(b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset);
PyGpuArray_STRIDES(%(b)s)[0],
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset);
PyGpuArray_STRIDES(%(y_idx)s)[0],
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(dtype_%(nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset);
PyGpuArray_STRIDES(%(nll)s)[0],
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset);
PyGpuArray_STRIDES(%(sm)s)[0],
PyGpuArray_STRIDES(%(sm)s)[1],
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(dtype_%(am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset);
PyGpuArray_STRIDES(%(am)s)[0]);
CNDA_THREAD_SYNC;
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
......@@ -245,7 +252,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
def c_code_cache_version(self):
#return ()
return (4,)
return (5,)
def c_compiler(self):
return NVCC_compiler
......@@ -273,11 +280,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
def __str__(self):
return self.__class__.__name__
def make_node(self, dy, sm, y_idx):
dy = as_gpuarray_variable(dy)
def make_node(self, dnll, sm, y_idx):
dnll = as_gpuarray_variable(dnll)
sm = as_gpuarray_variable(sm)
y_idx = as_gpuarray_variable(y_idx)
return Apply(self, [dy, sm, y_idx], [sm.type()])
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
#return ()
......@@ -290,7 +297,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
return NVCC_compiler
def c_code(self, node, nodename, inp, out, sub):
typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
dnll, sm, y_idx = inp
dx, = out
fail = sub['fail']
......@@ -326,7 +337,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode)s,
%(typecode_dx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(dx)s) {
......@@ -334,9 +345,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
}
}
{
int n_blocks = std::min(PyGpuArray_DIMS(%(dx)s)[0],
256);
int n_threads = std::min(PyGpuArray_DIMS(%(dx)s)[1],256);
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>(
......@@ -344,24 +354,23 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
PyGpuArray_DIMS(%(dx)s)[1],
(dtype_%(dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset);
PyGpuArray_STRIDES(%(dnll)s)[0],
%(dnll)s->ga.offset),
PyGpuArray_STRIDES(%(dnll)s)[0] / %(itemsize_dnll)s,
(dtype_%(sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset);
PyGpuArray_STRIDES(%(sm)s)[0],
PyGpuArray_STRIDES(%(sm)s)[1],
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(dtype_%(y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset);
PyGpuArray_STRIDES(%(y_idx)s)[0],
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(dtype_%(dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset);
PyGpuArray_STRIDES(%(dx)s)[0],
PyGpuArray_STRIDES(%(dx)s)[1]
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
......@@ -378,21 +387,22 @@ class GpuCrossentropySoftmax1HotWithBiasDx(Op):
""" % locals()
def c_support_code_apply(self, node, nodename):
dtype0 = node.inputs[0].dtype
dtype1 = node.inputs[1].dtype
dtype2 = node.inputs[2].dtype
dtype_dnll = node.inputs[0].dtype
dtype_sm = node.inputs[1].dtype
dtype_y_idx = node.inputs[2].dtype
dtype_dx = node.outputs[0].dtype
return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
const npy_%(dtype0)s* dnll, const int dnll_s0,
const npy_%(dtype1)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype2)s* y_idx, const int y_idx_s0,
npy_%(dtype1)s* dx, const int dx_s0, const int dx_s1)
const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
{
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(dtype0)s dnll_i = dnll[i * dnll_s0];
int y_i = (int)y_idx[i * y_idx_s0];
npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0];
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
{
......
......@@ -60,7 +60,6 @@ def op_lifter(OP):
def local_opt(node):
if type(node.op) in OP:
# This does not support nodes that have more than one output.
assert len(node.outputs) == 1
# either one of our inputs is on the gpu or
# all of our client are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu
......@@ -71,7 +70,9 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [host_from_gpu(new_op(*node.inputs))]
return [host_from_gpu(o) for o in new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
return False
......@@ -281,4 +282,3 @@ def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return GpuCrossentropySoftmax1HotWithBiasDx()
......@@ -6,7 +6,7 @@ from theano.gof.python25 import any
import theano.tensor as T
import theano.tests.unittest_tools as utt
import theano.sandbox.gpuarray
from theano.sandbox import gpuarray
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
......@@ -20,21 +20,21 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
theano.sandbox.gpuarray.init_dev('cuda')
gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
if not gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
......@@ -87,7 +87,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
mode=mode_without_gpu)
classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
mode=mode_with_gpu)
#theano.printing.debugprint(classify)
#theano.printing.debugprint(classify_gpu)
......@@ -95,7 +95,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
for node in classify.maker.fgraph.toposort()])
assert any([isinstance(node.op,
theano.sandbox.gpuarray.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias)
GpuCrossentropySoftmaxArgmax1HotWithBias)
for node in classify_gpu.maker.fgraph.toposort()])
out = classify(yy, b_values, dot_value)
......@@ -104,7 +104,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
assert len(out) == len(gout) == 3
assert numpy.allclose(out[0], gout[0])
assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
gout - out).max()
gout[2] - out[2]).max()
assert numpy.allclose(out[1], gout[1]), [(id, out[1][id], gout[1][id], val)
for id, val in enumerate(out[1] -
gout[1])
......@@ -150,7 +150,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
assert any([isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
for node in cpu_f.maker.fgraph.toposort()])
assert any([isinstance(node.op,
theano.sandbox.gpuarray.nnet.GpuCrossentropySoftmax1HotWithBiasDx)
GpuCrossentropySoftmax1HotWithBiasDx)
for node in gpu_f.maker.fgraph.toposort()])
cpu_out = cpu_f(softmax_output_value)
......@@ -164,7 +164,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
max_i = scaled_err.argmax()
print 'max err index:', max_i, max_i / batch_size,
print max_i % batch_size, max_i / n_out, max_i & n_out
print max_i % batch_size, max_i / n_out, max_i & n_out
print 'At that index:'
print 'err:', scaled_err.flatten()[max_i]
print 'absolute error:', abs_err.flatten()[max_i]
......@@ -176,4 +176,4 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
print 'y_idx_value:', y_idx_value[max_i / n_out]
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
rtol, atol)
......@@ -138,7 +138,13 @@ class GpuArrayType(Type):
return numpy.dtype(self.dtype).itemsize
def c_declare(self, name, sub):
return "PyGpuArrayObject *%s;" % (name,)
dtype = theano.tensor.TensorType(
dtype=self.dtype,
broadcastable=self.broadcastable).dtype_specs()[1]
return """
PyGpuArrayObject *%(name)s;
typedef %(dtype)s dtype_%(name)s;
""" % locals()
def c_init(self, name, sub):
return "%s = NULL;" % (name,)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论