提交 7478227f authored 作者: lamblin's avatar lamblin

Merge pull request #904 from nouiz/small

Small
......@@ -749,7 +749,16 @@ class Test_preallocated_output(unittest.TestCase):
from theano.sandbox import cuda
if not cuda.cuda_available:
    # Nothing to test without a working CUDA installation.
    raise SkipTest("Optional package Cuda disabled")
if cuda.use.device_number is None:
    # We should normally set VecAsRowAndCol as a GPUOp.  But we
    # don't want to do this here as this will disable others
    # tests in this file.  So we manually init the GPU if
    # needed to remove warning.
    cuda.use("gpu",
             force=True,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False)
# Build and compile a graph that returns the vector as both a column
# and a row, to exercise the preallocated-output code path.
v = cuda.fvector('v')
c, r = VecAsRowAndCol()(v)
f = theano.function([v], [c, r])
......
......@@ -497,101 +497,6 @@ gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(GpuOp):
    """ Implement outer on the gpu.

    Computes A = x * y' for two float32 vectors by zeroing the output
    and performing a BLAS rank-1 update (CudaNdarray_sger).
    """

    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
                x.type.dtype == 'float32' and y.type.dtype == 'float32')

        # Output broadcast pattern follows the input lengths:
        # A dimension is broadcastable iff the matching vector is.
        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    def __eq__(self, other):
        # The op has no parameters: any two instances are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        # Kept consistent with __eq__ above (hash by type only).
        return hash(type(self))

    def c_code_cache_version(self):
        return (4,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']

        # The generated C code:
        #  - copies an input when its stride is negative (sger cannot
        #    handle negative strides),
        #  - reuses a pre-allocated output only when it has the right
        #    shape and is c-contiguous, zeroing it first because sger
        #    accumulates into its destination,
        #  - otherwise allocates a fresh zeroed output.
        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }

        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s &&
              CudaNdarray_HOST_DIMS(%(A)s)[0] ==
              CudaNdarray_HOST_DIMS(%(x)s)[0] &&
              CudaNdarray_HOST_DIMS(%(A)s)[1] ==
              CudaNdarray_HOST_DIMS(%(y)s)[0] &&
              CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }
        else
        {
            // sger accumulates into A. We need to zero it first.
            int total_size = (sizeof(real) *
                              CudaNdarray_HOST_DIMS(%(A)s)[0] *
                              CudaNdarray_HOST_DIMS(%(A)s)[1]);
            if (cudaSuccess != cudaMemset(%(A)s->devdata, 0, total_size))
            {
                PyErr_Format(PyExc_MemoryError,
                             "GpuOuter: Error memsetting %%d bytes of device memory.",
                             total_size);
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


# Module-level singleton; all GpuOuter instances compare equal anyway.
gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -128,6 +128,18 @@ class NVCC_compiler(object):
import theano.sandbox.cuda
if hasattr(theano.sandbox, 'cuda'):
n = theano.sandbox.cuda.use.device_number
if n is None:
_logger.warn("We try to get compilation arguments for CUDA"
" code, but the GPU device is not initialized."
" This is probably caused by an Op that work on"
" the GPU that don't inherit from GpuOp."
" We Initialize the GPU now.")
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
p = theano.sandbox.cuda.device_properties(n)
flags.append('-arch=sm_' + str(p['major']) + str(p['minor']))
return flags
......
......@@ -21,7 +21,7 @@ from theano.gof.python25 import all, any
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -579,26 +579,6 @@ def local_gpu_gemm(node):
return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """
    gpu_dot22(col, row) -> gpu_outer
    """
    if node.op != gpu_dot22:
        return False
    col, row = node.inputs
    # Only a (column, row) pair of a dot22 is really an outer product.
    if not (col.type.broadcastable[1] and row.type.broadcastable[0]):
        return False
    # TODO: we would like to remove the double-dimshuffle when
    # col or row is already the output of a GpuDimshuffle. To do
    # this, refactor the logic in tensor/opt.py that collapses
    # dimshuffle chains so that we can call it from here.
    col_vec = GpuDimShuffle(col.broadcastable, [0])(col)
    row_vec = GpuDimShuffle(row.broadcastable, [1])(row)
    return [gpu_outer(col_vec, row_vec)]
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
......
......@@ -200,31 +200,6 @@ class TestBlasStridesGpu(TestBlasStrides):
mode = mode_with_gpu
def test_outer():
    """Compare tensor.outer on the GPU with numpy.outer for several
    strided and reversed views of the inputs."""
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')
    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    def check(xs, ys, xv, yv):
        # Compile outer(xs, ys) in GPU mode and compare numerically.
        f = pfunc([], tensor.outer(xs, ys), mode=mode_with_gpu)
        assert numpy.allclose(numpy.outer(xv, yv), f())

    check(x, y, x_val, y_val)
    check(x[::2], y, x_val[::2], y_val)
    check(x, y[::3], x_val, y_val[::3])
    check(x[::2], y[::3], x_val[::2], y_val[::3])
    check(x[::-1], y, x_val[::-1], y_val)
    check(x, y[::-1], x_val, y_val[::-1])
if 0:
# This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax
......
Markdown 格式
0%
您将 0 人添加到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论