提交 85db6f61 authored 作者: lamblin's avatar lamblin

Merge pull request #1442 from nouiz/mixed2

[WIP] Use the new grad interface.
...@@ -124,6 +124,27 @@ Do like in the section "Updating Theano", but use ...@@ -124,6 +124,27 @@ Do like in the section "Updating Theano", but use
.. _install_ubuntu_gpu: .. _install_ubuntu_gpu:
Manual Openblas instruction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The openblas included in Ubuntu is limited to 2 threads. If you want
to use more cores at the same time, you will need to compile it
yourself. Here is some code that will help you.
.. code-block:: bash
# remove openblas if you installed it
sudo apt-get remove libopenblas-base
# Download the development version of OpenBLAS
git clone git://github.com/xianyi/OpenBLAS
cd OpenBLAS
make FC=gfortran
sudo make PREFIX=/usr/local/ install
cd /usr/local/lib
ln -s libopenblas.so /usr/lib/libblas.so
ln -s libopenblas.so.0 /usr/lib/libblas.so.3gf
Contributed GPU instruction Contributed GPU instruction
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
......
差异被折叠。
...@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -41,7 +41,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
float * sm_data, int sms0, int sms1, float * sm_data, int sms0, int sms1,
float * am_data, int ams0) float * am_data, int ams0)
{ {
const int row = blockIdx.x; for (int row = blockIdx.x; row < M; row += gridDim.x){
const float * x = x_data + xs0 * row; const float * x = x_data + xs0 * row;
const int y_idx = (int)y_idx_data[row * y_idxs0]; const int y_idx = (int)y_idx_data[row * y_idxs0];
...@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -83,6 +83,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
+ log(sum); + log(sum);
} }
am_data[row*ams0] = row_max_j; am_data[row*ams0] = row_max_j;
}
} }
""" """
...@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -168,7 +169,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
} }
} }
{ {
int n_blocks = CudaNdarray_HOST_DIMS(%(sm)s)[0]; int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
NUM_VECTOR_OP_BLOCKS);
//TODO: launch more threads per row and do parallel sum and max reductions //TODO: launch more threads per row and do parallel sum and max reductions
int n_threads = 1; int n_threads = 1;
int n_shared_bytes = 0; //n_threads * sizeof(float); int n_shared_bytes = 0; //n_threads * sizeof(float);
...@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -195,8 +197,11 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %(classname)s %(nodename)s: %%s.\\n", "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
cudaGetErrorString(err)); "The kernel was launched with %%d threads,"
" %%d blocks and %%d shared memory\\n",
cudaGetErrorString(err),
n_threads, n_blocks, n_shared_bytes);
// no need to decref output vars the cleanup code will do it // no need to decref output vars the cleanup code will do it
%(fail)s; %(fail)s;
} }
...@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp): ...@@ -206,7 +211,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (3,) return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
...@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -235,7 +240,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (5,) return (6,)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dnll, sm, y_idx = inp dnll, sm, y_idx = inp
...@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -283,11 +288,12 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
} }
} }
{ {
int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<< <<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0], CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1], CudaNdarray_HOST_DIMS(%(dx)s)[1],
...@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp): ...@@ -310,9 +316,11 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s.\\n", "Cuda error: %%s: %%s.\\n"
"The kernel was launched with %%d threads and"
" %%d blocks\\n",
"kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s", "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
cudaGetErrorString(err)); cudaGetErrorString(err), n_threads, n_blocks);
%(fail)s; %(fail)s;
} }
} }
......
...@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -25,7 +25,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias
We check that we loop when there are too many threads We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks(>32*1024)
""" """
...@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -100,13 +99,16 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
This is basic test for GpuCrossentropySoftmax1HotWithBiasDx This is basic test for GpuCrossentropySoftmax1HotWithBiasDx
We check that we loop when there are too many threads We check that we loop when there are too many threads
TODO: check that we loop when there are too many blocks(>32*1024)
""" """
n_in = 1000 n_in = 1000
batch_size = 4097 batch_size = 4097
n_out = 1250 n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed # Seed numpy.random with config.unittests.rseed
utt.seed_rng() utt.seed_rng()
......
...@@ -715,10 +715,9 @@ class ExtractDiag(Op): ...@@ -715,10 +715,9 @@ class ExtractDiag(Op):
implemented our own. """ implemented our own. """
x, = ins x, = ins
z, = outs z, = outs
# zero-dimensional matrices ... # zero-dimensional matrices ...
if x.shape[0] == 0 or x.shape[1] == 0: if x.shape[0] == 0 or x.shape[1] == 0:
z[0] = numpy.zeros(0, dtype=x.dtype) z[0] = node.outputs[0].type.value_zeros((0,))
return return
if x.shape[0] < x.shape[1]: if x.shape[0] < x.shape[1]:
......
...@@ -204,8 +204,8 @@ def test_rop_lop(): ...@@ -204,8 +204,8 @@ def test_rop_lop():
rop_f = function([mx, mv], yv) rop_f = function([mx, mv], yv)
sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(), sy, _ = theano.scan(lambda i, y, x, v: (tensor.grad(y[i], x) * v).sum(),
sequences=tensor.arange(y.shape[0]), sequences=tensor.arange(y.shape[0]),
non_sequences=[y, mx, mv]) non_sequences=[y, mx, mv])
scan_f = function([mx, mv], sy) scan_f = function([mx, mv], sy)
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
...@@ -561,6 +561,7 @@ class test_Eigh(test_Eig): ...@@ -561,6 +561,7 @@ class test_Eigh(test_Eig):
class test_Eigh_float32(test_Eigh): class test_Eigh_float32(test_Eigh):
dtype = 'float32' dtype = 'float32'
def test_matrix_inverse_solve(): def test_matrix_inverse_solve():
if not imported_scipy: if not imported_scipy:
raise SkipTest("Scipy needed for the Solve op.") raise SkipTest("Scipy needed for the Solve op.")
......
...@@ -144,7 +144,17 @@ class ArgSortOp(theano.Op): ...@@ -144,7 +144,17 @@ class ArgSortOp(theano.Op):
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
#No grad defined for integers. #No grad defined for integers.
return [None, None] inp, axis = inputs
inp_grad = theano.gradient.grad_not_implemented(
self, 0, axis,
"I'm not sure if argsort should have its gradient"
" implemented or is should be marked as undefined."
" So I mark it as not implemented for now.")
axis_grad = theano.gradient.grad_undefined(
self, 1, axis,
"argsort is not defined for non-integer axes so"
" argsort(x, axis+eps) is undefined")
return [inp_grad, axis_grad]
""" """
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
# R_op can receive None as eval_points. # R_op can receive None as eval_points.
......
...@@ -185,7 +185,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile, ...@@ -185,7 +185,9 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
subprocess_extra_args.update(dict( subprocess_extra_args.update(dict(
stdout=dummy_out.fileno(), stdout=dummy_out.fileno(),
stderr=dummy_out.fileno())) stderr=dummy_out.fileno()))
t0 = time.time()
subprocess.call(cmd, **subprocess_extra_args) subprocess.call(cmd, **subprocess_extra_args)
t1 = time.time()
# Recover failed test indices from the 'failed' field of the # Recover failed test indices from the 'failed' field of the
# '.noseids' file. We need to do it after each batch because # '.noseids' file. We need to do it after each batch because
# otherwise this field may get erased. We use a set because it # otherwise this field may get erased. We use a set because it
...@@ -193,8 +195,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile, ...@@ -193,8 +195,8 @@ def run(stdout, stderr, argv, theano_nose, batch_size, time_profile,
# to avoid duplicates. # to avoid duplicates.
failed = failed.union(cPickle.load(open(noseids_file, 'rb')) failed = failed.union(cPickle.load(open(noseids_file, 'rb'))
['failed']) ['failed'])
print '%s%% done (failed: %s)' % ((test_range[-1] * 100) // print '%s%% done in %.3fs (failed: %s)' % (
n_tests, len(failed)) (test_range[-1] * 100) // n_tests, t1 - t0, len(failed))
# Sort for cosmetic purpose only. # Sort for cosmetic purpose only.
failed = sorted(failed) failed = sorted(failed)
if failed: if failed:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论