提交 51a6bbc6，作者：abergeron

Merge pull request #1634 from vdumoulin/new_backend

New backend: GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmax1HotWithBiasDx
import numpy
from theano import Op, Apply
from theano.compat.six import StringIO
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try:
import pygpu
from pygpu import gpuarray, elemwise
except ImportError:
pass
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable
from theano.sandbox.gpuarray.type import GpuArrayType
class GpuCrossentropySoftmaxArgmax1HotWithBias(Op):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.

    Inputs are ``x`` (2d), ``b`` (1d) and ``y_idx`` (1d); the generated C
    code raises ``ValueError`` for any other rank or for mismatching
    lengths.  For every row of ``x`` the kernel produces:

    - ``nll``: ``-(x + b)[y_idx] + max(x + b) + log(sum(exp(x + b - max)))``,
      or 0.0 when ``y_idx`` is outside ``[0, x.shape[1])``;
    - ``sm``: ``exp(x + b - max)`` divided by its row sum;
    - ``am``: the column index of the largest ``x + b`` entry.
    """
    nin = 3
    nout = 3

    def __eq__(self, other):
        # The op has no parameters, so two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, b, y_idx):
        """
        Build the Apply node.

        ``nll`` takes the dtype of ``x`` and the broadcast pattern of
        ``y_idx``; ``sm`` has the type of ``x``; ``am`` has the type of
        ``y_idx``.
        """
        #N.B. won't work when we don't cast y_idx to float anymore
        x = as_gpuarray_variable(x)
        b = as_gpuarray_variable(b)
        y_idx = as_gpuarray_variable(y_idx)
        nll = GpuArrayType(x.type.dtype,
                           y_idx.type.broadcastable)()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_support_code_apply(self, node, nodename):
        """
        Return the CUDA kernel for this node.

        The kernel is specialised on the dtypes of the three inputs and on
        ``nodename``.  Each block walks rows ``blockIdx.x, blockIdx.x +
        gridDim.x, ...`` and handles a whole row by itself.  The template is
        filled from the local variables of this method.
        """
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        return """
__global__ void k_xent_sm_1hot_bias_%(nodename)s(int M, int N,
const npy_%(dtype_x)s* x_data, int xs0, int xs1,
const npy_%(dtype_b)s* b, int bs0,
const npy_%(dtype_y_idx)s* y_idx_data, int y_idxs0,
npy_%(dtype_x)s* nll_data, int nlls0,
npy_%(dtype_x)s* sm_data, int sms0, int sms1,
npy_%(dtype_y_idx)s* am_data, int ams0)
{
for (int row = blockIdx.x; row < M; row += gridDim.x){
const npy_%(dtype_x)s* x = x_data + xs0 * row;
const npy_%(dtype_y_idx)s y_idx = y_idx_data[row * y_idxs0];
npy_%(dtype_x)s* sm = sm_data + sms0 * row;
npy_%(dtype_x)s sum = 0.0;
int row_max_j = 0;
npy_%(dtype_x)s row_max = x[0] + b[0];
for (int j = 1; j < N; ++j)
{
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
//todo: store to shared memory
row_max_j = (row_ij > row_max) ? j : row_max_j;
row_max = (row_ij > row_max) ? row_ij : row_max;
}
//compute the exp
for (int j = 0; j < N; ++j)
{
npy_%(dtype_x)s row_ij = x[j*xs1] + b[j*bs0];
npy_%(dtype_x)s sm_ij = exp(row_ij - row_max);
sum += sm_ij;
sm[j * sms1] = sm_ij;
}
npy_%(dtype_x)s sum_inv = 1.0 / sum;
for (int j = 0; j < N; ++j)
{
sm[j * sms1] *= sum_inv;
}
if ((y_idx >= N) || (y_idx < 0))
{
//TODO: set raise an error bit in a global var?
nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
}
else
{
nll_data[row*nlls0] = - x[y_idx*xs1]
- b[y_idx*bs0]
+ row_max
+ log(sum);
}
am_data[row*ams0] = row_max_j;
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()

    def c_init_code(self):
        # Resolve the ``cuda_get_ptr`` function pointer declared in the
        # support code through compyte's extension lookup.
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']

    def c_code(self, node, nodename, inp, out, sub):
        """
        Return the C code that validates the inputs, (re)allocates the three
        outputs when they are missing or have the wrong shape, and launches
        the kernel with at most 256 blocks of one thread.

        The template is filled from the local variables of this method, so
        every ``%(name)s`` placeholder must match a local name below.
        """
        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(
            node.inputs[2].dtype)
        itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize
        itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
        x, b, y_idx = inp
        nll, sm, am = out
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_nll = node.outputs[0].dtype
        dtype_sm = node.outputs[1].dtype
        dtype_am = node.outputs[2].dtype
        classname = self.__class__.__name__
        fail = sub['fail']
        # ``sm`` is declared as ``x.type()`` in make_node and the kernel
        # writes ``npy_<dtype_x>`` values through it, so its buffer must be
        # allocated with the typecode of ``x`` (not the one of ``b``).
        code = """
if (PyGpuArray_NDIM(%(y_idx)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(x)s) != 2)
{
PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
%(fail)s;
}
if (PyGpuArray_NDIM(%(b)s) != 1)
{
PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
%(fail)s;
}
if (PyGpuArray_DIMS(%(x)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"dimension mismatch in x,y_idx arguments");
%(fail)s;
}
if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"dimension mismatch in x,b arguments");
%(fail)s;
}
if ((NULL == %(nll)s) //initial condition
|| (PyGpuArray_DIMS(%(nll)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(nll)s);
%(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_x)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(nll)s) {
%(fail)s
}
}
if ((NULL == %(sm)s)
|| (PyGpuArray_DIMS(%(sm)s)[0] !=
PyGpuArray_DIMS(%(x)s)[0])
|| (PyGpuArray_DIMS(%(sm)s)[1] !=
PyGpuArray_DIMS(%(x)s)[1]))
{
Py_XDECREF(%(sm)s);
%(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
%(typecode_x)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(sm)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc sm output");
// no need to decref cnda_nll, the cleanup code should do it up
%(fail)s;
}
}
if ((NULL == %(am)s)
|| (PyGpuArray_DIMS(%(am)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0]))
{
Py_XDECREF(%(am)s);
%(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
%(typecode_y_idx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if(!%(am)s)
{
PyErr_SetString(PyExc_MemoryError,
"failed to alloc am output");
// no need to decref nll and sm,
// the cleanup code should do it up
%(fail)s;
}
}
{
int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(dtype);
k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
PyGpuArray_DIMS(%(x)s)[0],
PyGpuArray_DIMS(%(x)s)[1],
(npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
%(x)s->ga.offset),
PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
(npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
%(b)s->ga.offset),
PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
%(nll)s->ga.offset),
PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
%(am)s->ga.offset),
PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
}
""" % locals()
        # The trailing newline matches what printing the template into a
        # string buffer used to produce.
        return code + "\n"

    def c_code_cache_version(self):
        #return ()
        # Bumped from 5: the generated C code changed (dtype used to
        # allocate ``sm``), so previously cached modules must not be reused.
        return (6,)

    def c_compiler(self):
        return NVCC_compiler


gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(Op):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.

    Gradient wrt x of the CrossentropySoftmax1Hot Op.

    Inputs are ``dnll`` (1d), ``sm`` (2d) and ``y_idx`` (1d); the single
    output ``dx`` has the type of ``sm``.  The kernel computes
    ``dx[i, j] = dnll[i] * (sm[i, j] - 1)`` where ``j == y_idx[i]`` and
    ``dx[i, j] = dnll[i] * sm[i, j]`` everywhere else.
    """
    nin = 3
    nout = 1

    def __init__(self, **kwargs):
        Op.__init__(self, **kwargs)

    def __eq__(self, other):
        # The op has no parameters, so two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, dnll, sm, y_idx):
        """Build the Apply node; the output has the same type as ``sm``."""
        dnll = as_gpuarray_variable(dnll)
        sm = as_gpuarray_variable(sm)
        y_idx = as_gpuarray_variable(y_idx)
        return Apply(self, [dnll, sm, y_idx], [sm.type()])

    def c_code_cache_version(self):
        #return ()
        # Bumped from 6: the generated C code changed (format specifiers of
        # the shape-mismatch error), so cached modules must not be reused.
        return (7,)

    def c_headers(self):
        return ['cuda.h', '<compyte/extension.h>', '<compyte/numpy_compat.h>']

    def c_compiler(self):
        return NVCC_compiler

    def c_code(self, node, nodename, inp, out, sub):
        """
        Return the C code that checks ranks and leading dimensions,
        (re)allocates ``dx`` when it is missing or has the wrong shape, and
        launches the kernel with at most 256 blocks of at most 256 threads.

        The template is filled from the local variables of this method, so
        every ``%(name)s`` placeholder must match a local name below.
        """
        typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        # The shape values handed to PyErr_Format are cast to ``long`` and
        # printed with ``%ld`` so the variadic argument type matches the
        # conversion specifier whatever integer type pygpu uses for
        # dimensions.
        return """
if ((PyGpuArray_NDIM(%(dnll)s) != 1)
|| (PyGpuArray_NDIM(%(sm)s) != 2)
|| (PyGpuArray_NDIM(%(y_idx)s) != 1))
{
PyErr_SetString(PyExc_ValueError, "rank error");
%(fail)s;
}
if (PyGpuArray_DIMS(%(dnll)s)[0] !=
PyGpuArray_DIMS(%(sm)s)[0])
{
PyErr_Format(PyExc_ValueError,
"dnll.shape[0] == %%ld, but sm.shape[0] == %%ld",
(long)PyGpuArray_DIMS(%(dnll)s)[0],
(long)PyGpuArray_DIMS(%(sm)s)[0]);
%(fail)s;
}
if (PyGpuArray_DIMS(%(dnll)s)[0] !=
PyGpuArray_DIMS(%(y_idx)s)[0])
{
PyErr_SetString(PyExc_ValueError,
"dnll.shape[0] != y_idx.shape[0]");
%(fail)s;
}
if ((NULL == %(dx)s)
|| (PyGpuArray_DIMS(%(dx)s)[0] !=
PyGpuArray_DIMS(%(sm)s)[0])
|| (PyGpuArray_DIMS(%(dx)s)[1] !=
PyGpuArray_DIMS(%(sm)s)[1]))
{
Py_XDECREF(%(dx)s);
%(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s),
%(typecode_dx)s,
GA_C_ORDER,
pygpu_default_context(), Py_None);
if (!%(dx)s) {
%(fail)s
}
}
{
int n_blocks = PyGpuArray_DIMS(%(dx)s)[0] < 256 ? PyGpuArray_DIMS(%(dx)s)[0] : 256;
int n_threads = PyGpuArray_DIMS(%(dx)s)[1] < 256 ? PyGpuArray_DIMS(%(dx)s)[1] : 256;
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(%(dx)s)[0],
PyGpuArray_DIMS(%(dx)s)[1],
(npy_%(dtype_dnll)s*)(((char *)cuda_get_ptr(%(dnll)s->ga.data)) +
%(dnll)s->ga.offset),
PyGpuArray_STRIDES(%(dnll)s)[0] / %(itemsize_dnll)s,
(npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
%(sm)s->ga.offset),
PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
(npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
%(y_idx)s->ga.offset),
PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
(npy_%(dtype_dx)s*)(((char *)cuda_get_ptr(%(dx)s->ga.data)) +
%(dx)s->ga.offset),
PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s,
PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s
);
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
}
assert(%(dx)s);
""" % locals()

    def c_support_code_apply(self, node, nodename):
        """
        Return the CUDA kernel for this node.

        The kernel is specialised on the dtypes of the inputs/output and on
        ``nodename``.  Blocks step over rows by ``gridDim.x`` and the
        threads of a block step over columns by ``blockDim.x``.
        """
        dtype_dnll = node.inputs[0].dtype
        dtype_sm = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_dx = node.outputs[0].dtype
        return """
__global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
int N, int K,
const npy_%(dtype_dnll)s* dnll, const int dnll_s0,
const npy_%(dtype_sm)s* sm, const int sm_s0, const int sm_s1,
const npy_%(dtype_y_idx)s* y_idx, const int y_idx_s0,
npy_%(dtype_dx)s* dx, const int dx_s0, const int dx_s1)
{
for (int i = blockIdx.x; i < N; i += gridDim.x)
{
npy_%(dtype_dnll)s dnll_i = dnll[i * dnll_s0];
npy_%(dtype_y_idx)s y_i = y_idx[i * y_idx_s0];
for (int j = threadIdx.x; j < K; j += blockDim.x)
{
if (y_i == j)
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
}
else
{
dx[i * dx_s0 + j * dx_s1] =
dnll_i * sm[i * sm_s0 + j * sm_s1];
}
//dx[i * dx_s0 + j * dx_s1] =
// dnll_i * sm[i * sm_s0 + j * sm_s1];
//dx[i*dx_s0+j*dx_s1] = 0;
}
}
}
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
""" % locals()

    def c_init_code(self):
        # Resolve the ``cuda_get_ptr`` function pointer declared in the
        # support code through compyte's extension lookup.
        return ['cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))compyte_get_extension("cuda_get_ptr");']


gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
......@@ -18,6 +18,8 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
GpuReshape,
GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
......@@ -58,7 +60,6 @@ def op_lifter(OP):
def local_opt(node):
if type(node.op) in OP:
# This does not support nodes that have more than one output.
assert len(node.outputs) == 1
# either one of our inputs is on the gpu or
# all of our client are on the gpu
if (any([i.owner and i.owner.op == host_from_gpu
......@@ -69,7 +70,9 @@ def op_lifter(OP):
# This is needed as sometimes new_op inherit from OP.
if new_op and new_op != node.op:
if isinstance(new_op, theano.Op):
return [host_from_gpu(new_op(*node.inputs))]
return [host_from_gpu(o) for o in new_op(*node.inputs, return_list=True)]
elif isinstance(new_op, (tuple, list)):
return [host_from_gpu(o) for o in new_op]
else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)]
return False
......@@ -267,3 +270,15 @@ def local_gpua_dot22(node):
@op_lifter([tensor.basic.Eye])
def local_gpua_eye(node):
return GpuEye(dtype=node.op.dtype)
@register_opt()
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
def local_gpua_crossentropysoftmaxargmax1hotwithbias(node):
    """
    Return a new GpuCrossentropySoftmaxArgmax1HotWithBias instance.

    ``node`` is not inspected: the GPU op takes no parameters.  Registered
    for ``tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias`` nodes through
    the ``op_lifter`` decorator.
    """
    gpu_op = GpuCrossentropySoftmaxArgmax1HotWithBias()
    return gpu_op
@register_opt()
@op_lifter([tensor.nnet.CrossentropySoftmax1HotWithBiasDx])
def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
    """
    Return a new GpuCrossentropySoftmax1HotWithBiasDx instance.

    ``node`` is not inspected: the GPU op takes no parameters.  Registered
    for ``tensor.nnet.CrossentropySoftmax1HotWithBiasDx`` nodes through the
    ``op_lifter`` decorator.
    """
    gpu_op = GpuCrossentropySoftmax1HotWithBiasDx()
    return gpu_op
from nose.plugins.skip import SkipTest
import numpy
import theano
from theano.gof.python25 import any
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# We let that import do the init of the back-end if needed.
from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
mode_without_gpu)
if not gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from theano.sandbox.gpuarray.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx)
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    Basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that we loop when there are too many threads: the same
    loss / argmax / gradient graph is compiled with and without the GPU
    mode and both results are compared.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250

    # Bigger shapes are only used outside DebugMode.
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    y = T.lvector('y')
    b = T.fvector('b')

    # We precompute the dot with big shapes beforehand so that the test of
    # GpuCrossentropySoftmax1HotWithBiasDx does not fail with the error
    # "the launch timed out and was terminated" on GPU cards that are not
    # powerful enough.  We need the big shapes to check for corner cases.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
                       dtype=numpy.float32)
    yy = numpy.ones((batch_size,), dtype='int32')
    b_values = numpy.zeros((n_out,), dtype='float32')
    W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
    dot_value = numpy.asarray(numpy.dot(xx, W_values), dtype='float32')
    # Only the product is needed from here on.
    del W_values

    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)

    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)

    # Each compiled graph must contain the op of its own backend.
    assert any(isinstance(node.op,
                          T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
               for node in classify.maker.fgraph.toposort())
    assert any(isinstance(node.op,
                          GpuCrossentropySoftmaxArgmax1HotWithBias)
               for node in classify_gpu.maker.fgraph.toposort())

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)

    assert len(out) == len(gout) == 3
    assert numpy.allclose(out[0], gout[0])
    assert numpy.allclose(out[2], gout[2], atol=3e-6), numpy.absolute(
        gout[2] - out[2]).max()
    # On failure, list every position where the two argmax vectors differ.
    assert numpy.allclose(out[1], gout[1]), [
        (idx, out[1][idx], gout[1][idx], val)
        for idx, val in enumerate(out[1] - gout[1])
        if val != 0]
def test_GpuCrossentropySoftmax1HotWithBiasDx():
    """
    Basic test for GpuCrossentropySoftmax1HotWithBiasDx.

    We check that we loop when there are too many threads: the same graph
    is compiled with and without the GPU mode and both results are
    compared.  On mismatch, the assertion message describes the entry with
    the largest scaled error.
    """
    batch_size = 4097
    n_out = 1250

    # A bigger number of columns is only used outside DebugMode.
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_out = 4099

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    softmax_output_value = numpy.random.rand(batch_size,
                                             n_out).astype('float32')
    dnll_value = numpy.asarray(numpy.random.rand(batch_size), dtype='float32')
    y_idx_value = numpy.random.randint(low=0, high=5, size=batch_size)

    softmax_output = T.fmatrix()
    # NOTE(review): ``softmax_output`` is rebound to the normalised
    # expression and that rebound variable is what is handed to
    # theano.function as the input below, so the normalisation (including
    # the reshape to ``(shape[1], 1)``, which looks like it should use
    # ``shape[0]``) is presumably never evaluated -- confirm the intent
    # before changing it.
    softmax_output /= softmax_output.sum(axis=1).reshape(
        softmax_output.shape[1], 1)
    op = theano.tensor.nnet.crossentropy_softmax_1hot_with_bias_dx(
        dnll_value,
        softmax_output,
        y_idx_value)

    cpu_f = theano.function([softmax_output], op, mode=mode_without_gpu)
    gpu_f = theano.function([softmax_output], op, mode=mode_with_gpu)

    # Each compiled graph must contain the op of its own backend.
    assert any(isinstance(node.op, T.nnet.CrossentropySoftmax1HotWithBiasDx)
               for node in cpu_f.maker.fgraph.toposort())
    assert any(isinstance(node.op,
                          GpuCrossentropySoftmax1HotWithBiasDx)
               for node in gpu_f.maker.fgraph.toposort())

    cpu_out = cpu_f(softmax_output_value)
    gpu_out = gpu_f(softmax_output_value)

    rtol = 1e-5
    atol = 1e-6
    if numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
        return

    abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
    scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
    max_i = scaled_err.argmax()
    # ``max_i`` indexes the flattened (batch_size, n_out) array; floor
    # division and modulo recover the row and the column.
    row, col = divmod(max_i, n_out)
    report = [
        "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (rtol, atol),
        "max err index: %s %s %s %s %s" % (
            max_i, max_i // batch_size, max_i % batch_size, row, col),
        "At that index:",
        "err: %s" % scaled_err.flatten()[max_i],
        "absolute error: %s" % abs_err.flatten()[max_i],
        "relative error: %s" % rel_err.flatten()[max_i],
        "cpu_out: %s" % cpu_out.flatten()[max_i],
        "gpu_out: %s" % gpu_out.flatten()[max_i],
        "softmax_output_value: %s" % softmax_output_value.flatten()[max_i],
        "dnll_value: %s" % dnll_value[row],
        "y_idx_value: %s" % y_idx_value[row],
    ]
    raise AssertionError("\n".join(report))
......@@ -138,7 +138,9 @@ class GpuArrayType(Type):
return numpy.dtype(self.dtype).itemsize
def c_declare(self, name, sub):
    """
    Return the C declaration of the ``PyGpuArrayObject`` pointer ``name``.

    ``sub`` is accepted for interface compatibility and is not used.
    """
    # Only the multi-line template is kept: the older one-line return that
    # preceded it made this template unreachable.
    return """
PyGpuArrayObject *%(name)s;
""" % {"name": name}
def c_init(self, name, sub):
    """
    Return the C statement that sets the variable ``name`` to NULL.

    ``sub`` is accepted for interface compatibility and is not used.
    """
    init_stmt = "%s = NULL;" % (name,)
    return init_stmt
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论