提交 85db6f61 authored 作者: lamblin's avatar lamblin

Merge pull request #1442 from nouiz/mixed2

[WIP] Use the new grad interface.
......@@ -124,6 +124,27 @@ Do like in the section "Updating Theano", but use
.. _install_ubuntu_gpu:
Manual OpenBLAS instructions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The OpenBLAS included in Ubuntu is limited to 2 threads. If you want
to use more cores at the same time, you will need to compile it
yourself. Here is some code that will help you.
.. code-block:: bash
# remove openblas if you installed it
sudo apt-get remove libopenblas-base
# Download the development version of OpenBLAS
git clone git://github.com/xianyi/OpenBLAS
cd OpenBLAS
make FC=gfortran
sudo make PREFIX=/usr/local/ install
cd /usr/local/lib
ln -s libopenblas.so /usr/lib/libblas.so
ln -s libopenblas.so.0 /usr/lib/libblas.so.3gf
Contributed GPU instruction
~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
差异被折叠。
......@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
float * sm_data, int sms0, int sms1,
float * am_data, int ams0)
{
const int row = blockIdx.x;
for (int row = blockIdx.x; row < M; row += gridDim.x){
const float * x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0];
......@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
+ log(sum);
}
am_data[row*ams0] = row_max_j;
}
}
"""
......@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
}
}
{
int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0];
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_BLOCKS);
//TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(float);
......@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
if (cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n",
cudaGetErrorString(err));
"Cuda error: %(classname)s %(nodename)s: %%s.\\n"
"The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it
%(fail)s;
}
......@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
def c_code_cache_version(self):
#return ()
return (3,)
return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
def c_code_cache_version(self):
#return ()
return (5,)
return (6,)
def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp
......@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
}
}
{
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(%(dx)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
......@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n",
"Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err));
cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s;
}
}
......
......@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks (>32*1024)
"""
......@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
This is a basic test for GpuCrossentropySoftmax1HotWithBiasDx
We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks (>32*1024)
"""
n_in = 1000
batch_size = 4097
n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed
utt.seed_rng()
......
......@@ -715,10 +715,9 @@ class ExtractDiag(Op):
implemented our own. """
x, = ins
z, = outs
# zero-dimensional matrices ...
if x.shape[0] == 0 or x.shape[1] == 0:
z[0] = numpy.zeros(0, dtype=x.dtype)
z[0] = node.outputs[0].type.value_zeros((0,))
return
if x.shape[0] < x.shape[1]:
......
......@@ -204,8 +204,8 @@ def test_rop_lop():
rop_f = function([mx, mv], yv)
sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
sequences=tensor.arange(y.shape[0]),
non_sequences=[y, mx, mv])
sequences=tensor.arange(y.shape[0]),
non_sequences=[y, mx, mv])
scan_f = function([mx, mv], sy)
rng = numpy.random.RandomState(utt.fetch_seed())
......@@ -561,6 +561,7 @@ class test_Eigh(test_Eig):
class test_Eigh_float32(test_Eigh):
dtype = 'float32'
def test_matrix_inverse_solve():
if not imported_scipy:
raise SkipTest("Scipy needed for the Solve op.")
......
......@@ -144,7 +144,17 @@ class ArgSortOp(theano.Op):
def grad(self, inputs, output_grads):
        #No grad defined for integers.
return [None, None]
inp, axis = inputs
inp_grad = theano.gradient.grad_not_implemented(
self, 0, axis,
"I'm not sure if argsort should have its gradient"
" implemented or is should be marked as undefined."
" So I mark it as not implemented for now.")
axis_grad = theano.gradient.grad_undefined(
self, 1, axis,
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined")
return [inp_grad, axis_grad]
"""
def R_op(self, inputs, eval_points):
# R_op can receive None as eval_points.
......
......@@ -185,7 +185,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
subprocess_extra_args.update(dict(
stdout=dummy_out.fileno(),
stderr=dummy_out.fileno()))
t0 = time.time()
subprocess.call(cmd, **subprocess_extra_args)
t1 = time.time()
# Recover failed test indices from the 'failed' field of the
# '.noseids' file. We need to do it after each batch because
# otherwise this field may get erased. We use a set because it
......@@ -193,8 +195,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
# to avoid duplicates.
failed = failed.union(cPickle.load(open(noseids_file, 'rb'))
['failed'])
print '%s%% done (failed: %s)' % ((test_range[-1] * 100) //
n_tests, len(failed))
print '%s%% done in %.3fs (failed: %s)' % (
(test_range[-1] * 100) // n_tests, t1 - t0, len(failed))
# Sort for cosmetic purpose only.
failed = sorted(failed)
if failed:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论