提交 12088f8c authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

Modifications in the trunk since the last release
In trunk since 0.3.1 release
----------------------------
Partial of what is in trunk since the last release
--------------------------------------------------
Deprecation:
Bugs fixed:
* Bugfix in CudaNdarray.__iadd__. When it is not implemented, return the error.
Crash fixed:
* Work around a bug in gcc 4.3.0 that makes the compilation of the 2d convolution crash.
Optimization:
* Optimize 4 pattern of subtensor followed by subtensor.
GPU:
* Move to the GPU fused elemwise ops that have dtypes other than float32 in them (except float64), when the inputs and outputs are float32.
* This allows moving elemwise comparisons to the GPU if we cast the result to float32 afterward.
* Implemented CudaNdarray.ndim to have the same interface in ndarray.
New features:
* ProfileMode
* profile the scan overhead
* simple hook system to add profiler
* reordered the output to be in the order of more general to more specific
* var[vector of indices] now works (the gradient works recursively, the direct gradient works inplace, and it works on the GPU).
* Limitation: works only on the outermost dimension.
Documentation:
Unit tests:
* Reuse test for subtensor of tensor for gpu tensor(more gpu test)
Other:
* (possibly a bugfix) Correctly set the broadcast flag to True in the output variable of a Reshape op when we receive an int 1 in the new shape.
Theano 0.3.1 (2011-02-21)
----------------------------
......
......@@ -4,6 +4,13 @@
How to make a release
==================================================
Update files
============
Update the NEWS.txt and move the old stuff in the HISTORY.txt file.
To update the NEWS.txt file, check all ticket closed for this release
and all commit log messages.
Get a fresh copy of the repository
==================================
......
......@@ -123,9 +123,18 @@ ProfileMode
Beside checking for errors, another important task is to profile your
code. For this Theano uses a special mode called ProfileMode which has
to be passed as an argument to :func:`theano.function <function.function>`. Using the ProfileMode is a three-step process.
to be passed as an argument to :func:`theano.function <function.function>`.
Using the ProfileMode is a three-step process.
To change the default to it, put the theano flags mode to PROFILE_MODE.
.. note::
To change the default to it, put the Theano flags
:attr:`config.mode` to ProfileMode. In that case, when the python
process exit, it will automatically print the profiling
information on the stdout.
The memory profile of the output of each apply node can be enabled with the
Theano flag :attr:`config.ProfileMode.profile_memory`.
Creating a ProfileMode Instance
-------------------------------
......
......@@ -123,10 +123,6 @@ AddConfigVar('nvcc.fastmath',
"",
BoolParam(False))
AddConfigVar('cuda.root',
"directory with bin/, lib/, include/ for cuda utilities",
StrParam(os.getenv('CUDA_ROOT', "/usr/local/cuda")))
AddConfigVar('gpuelemwise.sync',
"when true, wait that the gpu fct finished and check it error code.",
BoolParam(True))
......
......@@ -208,17 +208,25 @@ def module_name_from_dir(dirname):
def get_module_hash(module_file, key):
"""
Return an MD5 hash that identifies a module.
Return an MD5 hash that uniquely identifies a module.
This hash takes into account:
1. The 'mod.cpp' or 'mod.cu' file used to compile `module_file`.
2. The compiler options defined in `key`.
2. The version part of the key.
3. The compiler options defined in `key` (command line parameters and
libraries to link against).
"""
source_code = os.path.join(os.path.dirname(module_file), 'mod.cpp')
if not os.path.exists(source_code):
source_code = os.path.join(os.path.dirname(module_file), 'mod.cu')
assert os.path.exists(source_code)
source_hash = hash_from_file(source_code)
# `to_hash` will contain any element such that we know for sure that if
# it changes, then the module hash should be different.
# We start with the source code itself (stripping blanks might avoid
# recompiling after a basic indentation fix for instance).
to_hash = map(str.strip, open(source_code).readlines())
# Get the version part of the key.
to_hash += map(str, key[0])
c_link_key = key[1]
# Currently, in order to catch potential bugs early, we are very
# convervative about the structure of the key and raise an exception
......@@ -231,9 +239,10 @@ def get_module_hash(module_file, key):
"AssertionError may be removed or modified to accomodate "
"this change)")
assert c_link_key[0] == 'CLinker.cmodule_key', error_msg
to_hash = [source_hash]
for key_element in c_link_key[1:]:
if isinstance(key_element, tuple):
# This should be the C++ compilation command line parameters or the
# libraries to link against.
to_hash += list(key_element)
elif isinstance(key_element, str):
if key_element.startswith('md5:'):
......
......@@ -247,6 +247,81 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuOuter(Op):
    """GPU outer product of two float32 vectors: A = x * y'.

    Takes two 1-d float32 CudaNdarray variables and produces a 2-d float32
    CudaNdarray.  The generated C code delegates the computation to
    CudaNdarray_sger (a cuBLAS sger wrapper defined in cuda_ndarray).
    """

    def make_node(self, x, y):
        # we suppose type checking has been done, but make sure.
        assert (x.type.ndim == 1 and y.type.ndim == 1 and
                x.type.dtype == 'float32' and y.type.dtype == 'float32')

        # Output broadcast pattern follows the two input vectors' patterns.
        bz = [x.type.broadcastable[0], y.type.broadcastable[0]]
        outputs = [CudaNdarrayType(dtype='float32', broadcastable=bz)()]
        return Apply(self, [x, y], outputs)

    # This op has no parameters: instances compare and hash by type only,
    # so the optimizer can merge equivalent applications.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def c_code_cache_version(self):
        # Bump when the generated C code below changes.
        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        # A = x * y'
        x, y = inputs
        A, = outputs
        fail = sub['fail']
        # NOTE(review): inputs with a negative stride are copied first,
        # presumably because cublasSger cannot handle negative strides —
        # confirm against CudaNdarray_sger's asserts.  The output buffer is
        # reused when it already has the right shape and is C-contiguous;
        # otherwise a zeroed buffer is allocated (sger accumulates into A).
        return """
        CudaNdarray *%(name)sx = NULL, *%(name)sy = NULL;
        int %(name)sres;

        if (CudaNdarray_HOST_STRIDES(%(x)s)[0] < 0) {
            %(name)sx = (CudaNdarray *)CudaNdarray_Copy(%(x)s);
            if (!%(name)sx) {
                %(fail)s;
            }
        } else {
            %(name)sx = %(x)s;
            Py_INCREF(%(name)sx);
        }

        if (CudaNdarray_HOST_STRIDES(%(y)s)[0] < 0) {
            %(name)sy = (CudaNdarray *)CudaNdarray_Copy(%(y)s);
            if (!%(name)sy) {
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        } else {
            %(name)sy = %(y)s;
            Py_INCREF(%(name)sy);
        }

        if (!(%(A)s &&
              CudaNdarray_HOST_DIMS(%(A)s)[0] == CudaNdarray_HOST_DIMS(%(x)s)[0] &&
              CudaNdarray_HOST_DIMS(%(A)s)[1] == CudaNdarray_HOST_DIMS(%(y)s)[0] &&
              CudaNdarray_is_c_contiguous(%(A)s))) {
            Py_XDECREF(%(A)s);
            int dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
            dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[0];
            %(A)s = (CudaNdarray *)CudaNdarray_ZEROS(2, dims);
            if (!%(A)s) {
                Py_DECREF(%(name)sy);
                Py_DECREF(%(name)sx);
                %(fail)s;
            }
        }

        %(name)sres = CudaNdarray_sger(1.0, %(name)sx, %(name)sy, %(A)s);
        Py_DECREF(%(name)sy);
        Py_DECREF(%(name)sx);
        if (%(name)sres) {
            %(fail)s;
        }
        """ % dict(x=x, y=y, A=A, fail=fail, name=name)


gpu_outer = GpuOuter()
##
# Not really a BLAS operation, but whatever.
#
......
......@@ -328,6 +328,7 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self)
if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
Py_DECREF(contiguous_self);
Py_DECREF(rval);
rval = NULL;
}
......@@ -353,7 +354,7 @@ PyObject* CudaNdarray_ZEROS(int n, int * dims)
CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
if (!rval)
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to new_null failed");
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
return NULL;
}
......@@ -2347,7 +2348,7 @@ CudaNdarray_New(int nd)
CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
if (self == NULL)
{
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_new_null failed to allocate self");
PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
return NULL;
}
CudaNdarray_null_init(self);
......@@ -2717,6 +2718,48 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
|| (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
PyErr_Format(PyExc_ValueError,
"dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
CudaNdarray_HOST_DIMS(x)[0],
CudaNdarray_HOST_DIMS(y)[0],
CudaNdarray_HOST_DIMS(A)[0],
CudaNdarray_HOST_DIMS(A)[1]);
return -1;
}
// Maybe this could work, but be safe for now
if (!CudaNdarray_is_c_contiguous(A)) {
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1;
}
// Same for this, be safe
assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0);
assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0);
// Since Sger expects A in col-major, we invert x and y to fake this.
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0],
CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0],
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err)
{
PyErr_Format(PyExc_RuntimeError, "cublasSger failed (%s)",cudaGetErrorString(err));
return -1;
}
return 0;
}
/**
*
* Precondition:
......@@ -3238,3 +3281,14 @@ CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
return NULL;
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=79 :
......@@ -478,13 +478,16 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, CudaNdarray * other, boo
PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self);
PyObject *
CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
......
......@@ -10,13 +10,18 @@ _logger.setLevel(logging.WARN)
from theano.configparser import config, AddConfigVar, StrParam
AddConfigVar('nvcc.compiler_bindir',
"if defined, nvcc compiler driver will seek g++ and gcc in this directory",
"If defined, nvcc compiler driver will seek g++ and gcc in this directory",
StrParam(""))
AddConfigVar('cuda.nvccflags',
"Extra compiler flags for nvcc",
StrParam(""))
AddConfigVar('cuda.root',
"The directory with bin/, lib/, include/ for cuda utilities. Used to put this directory of nvidia lib in the compiled libraire. Usefull when people forget to update there LD_LIBRARY_PATH and LIBRARY_PATH environment variable. If AUTO, if nvcc is in the path, it will use one of this parent directory. Otherwise /usr/local/cuda. If empty, won't appen the directory in the compiled library",
StrParam(os.getenv('CUDA_ROOT', "AUTO")))
def error(*args):
    """Log the given arguments as a single ERROR-level message."""
    # Used to write straight to sys.stderr; now routed through the logger.
    message = ' '.join(str(arg) for arg in args)
    _logger.error("ERROR: " + message)
......@@ -43,7 +48,7 @@ def is_nvcc_available():
global nvcc_version
nvcc_version = s[1]
return True
except:
except Exception:
#try to find nvcc into cuda.root
p = os.path.join(config.cuda.root,'bin','nvcc')
if os.path.exists(p):
......@@ -51,6 +56,20 @@ def is_nvcc_available():
nvcc_path = p
return True
else: return False
def set_cuda_root():
    """Set config.cuda.root from the first PATH directory containing nvcc.

    Scans the PATH environment variable; when a directory with an 'nvcc'
    entry is found, config.cuda.root is set to that directory's parent
    (so that <root>/bin/nvcc exists).  Leaves the config untouched when
    PATH is unset/empty or nvcc cannot be found.
    """
    # BUGFIX: removed a leftover `import pdb;pdb.set_trace()` debugging
    # statement that halted every process importing this module.
    path = os.getenv("PATH")
    if not path:
        return
    # Renamed loop variable: `dir` shadowed the builtin.
    for directory in path.split(os.path.pathsep):
        if os.path.exists(os.path.join(directory, "nvcc")):
            # nvcc lives in <root>/bin, so the cuda root is the parent dir.
            config.cuda.root = os.path.split(directory)[0]
            return
# At import time: when the cuda.root flag is left at its "AUTO" default,
# try to locate the CUDA installation from PATH, then probe for nvcc so
# that nvcc_path and nvcc_version are initialized.
if config.cuda.root == "AUTO":
    set_cuda_root()
is_nvcc_available()  # sets nvcc_path and records the nvcc version
def nvcc_module_compile_str(
......@@ -66,11 +85,11 @@ def nvcc_module_compile_str(
:param preargs: a list of extra compiler arguments
:returns: dynamically-imported python module of the compiled code.
:note 1: On Windows 7 with nvcc 3.1 we need to compile in the real directory
Otherwise nvcc never finish.
"""
if sys.platform=="win32":
# Remove some compilation args that cl.exe does not understand.
# cl.exe is the compiler used by nvcc on Windows.
......@@ -91,7 +110,7 @@ def nvcc_module_compile_str(
include_dirs = include_dirs + std_include_dirs()
if os.path.abspath(os.path.split(__file__)[0]) not in include_dirs:
include_dirs.append(os.path.abspath(os.path.split(__file__)[0]))
libs = std_libs() + libs
if 'cudart' not in libs:
libs.append('cudart')
......@@ -107,11 +126,11 @@ def nvcc_module_compile_str(
if sys.platform == 'darwin':
# On the mac, nvcc is not able to link using -framework Python, so we have
# On the mac, nvcc is not able to link using -framework Python, so we have
# manually add the correct library and paths
darwin_python_lib = commands.getoutput('python-config --ldflags')
else:
# sometimes, the linker cannot find -lpython so we need to tell it
# sometimes, the linker cannot find -lpython so we need to tell it
# explicitly where it is located
# this returns somepath/lib/python2.x
python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
......@@ -133,14 +152,14 @@ def nvcc_module_compile_str(
(module_name, get_lib_extension()))
debug('Generating shared lib', lib_filename)
# TODO: Why do these args cause failure on gtx285 that has 1.3 compute capability? '--gpu-architecture=compute_13', '--gpu-code=compute_13',
# TODO: Why do these args cause failure on gtx285 that has 1.3 compute capability? '--gpu-architecture=compute_13', '--gpu-code=compute_13',
preargs1=[pa for pa in preargs if pa.startswith('-O') or pa.startswith('--maxrregcount=')]#nvcc argument
preargs2=[pa for pa in preargs if pa not in preargs1]#other arguments
cmd = [nvcc_path, '-shared', '-g'] + preargs1
if config.nvcc.compiler_bindir:
cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])
if sys.platform!='win32':
if local_bitwidth() == 64:
cmd.append('-m64')
......@@ -148,11 +167,11 @@ def nvcc_module_compile_str(
else:
cmd.append('-m32')
preargs2.append('-m32')
if len(preargs2)>0:
cmd.extend(['-Xcompiler', ','.join(preargs2)])
if os.path.exists(os.path.join(config.cuda.root,'lib')):
if config.cuda.root and os.path.exists(os.path.join(config.cuda.root,'lib')):
cmd.extend(['-Xlinker',','.join(['-rpath',os.path.join(config.cuda.root,'lib')])])
if sys.platform != 'darwin':
# the 64bit CUDA libs are in the same files as are named by the function above
......@@ -168,7 +187,7 @@ def nvcc_module_compile_str(
cmd.extend(['-l%s'%l for l in libs])
if sys.platform == 'darwin':
cmd.extend(darwin_python_lib.split())
if sys.platform == 'darwin':
done = False
while not done:
......@@ -213,7 +232,7 @@ def nvcc_module_compile_str(
nvcc_stdout, nvcc_stderr = p.communicate()[:2]
finally:
os.chdir(orig_dir)
if nvcc_stdout:
# this doesn't happen to my knowledge
print >> sys.stderr, "DEBUG: nvcc STDOUT", nvcc_stdout
......@@ -229,7 +248,7 @@ def nvcc_module_compile_str(
continue
_logger.info("NVCC: "+eline)
if p.returncode:
if p.returncode:
# filter the output from the compiler
for l in nvcc_stderr.split('\n'):
if not l:
......@@ -241,7 +260,7 @@ def nvcc_module_compile_str(
continue
if l[l.index(':'):].startswith(': warning: label'):
continue
except:
except:
pass
print >> sys.stderr, l
print >> sys.stderr, '==============================='
......@@ -252,4 +271,3 @@ def nvcc_module_compile_str(
#touch the __init__ file
file(os.path.join(location, "__init__.py"),'w').close()
return dlimport(lib_filename)
......@@ -14,7 +14,7 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv)
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import (
......@@ -378,6 +378,29 @@ def local_gpu_gemm(node):
return [host_from_gpu(gemms[node.op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False
@register_opt()
@local_optimizer([])
def local_gpu_outer(node):
    """Move an outer product to the GPU.

    gpu_from_host(outer) -> gpu_outer(gpu_from_host)
    outer(host_from_gpu) -> host_from_gpu(gpu_outer)

    Returns the replacement list, or False when the rewrite does not apply.
    """
    if node.op == gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and host_input.owner.op == tensor.basic.outer:
            x, y = host_input.owner.inputs
            # gpu_outer will refuse to work with float64, so future-proof.
            if x.type.dtype == 'float32' and y.type.dtype == 'float32':
                return [gpu_outer(gpu_from_host(x), gpu_from_host(y))]
    if node.op == tensor.basic.outer:
        x, y = node.inputs
        # BUGFIX: the y-side test used to check x.type.dtype (copy-paste
        # error), which could send a non-float32 y to gpu_outer and make
        # its make_node assert fail.  Require both operands to be float32,
        # and apply when either one already lives on the GPU.
        both_float32 = (x.type.dtype == 'float32' and
                        y.type.dtype == 'float32')
        x_on_gpu = x.owner and x.owner.op == host_from_gpu
        y_on_gpu = y.owner and y.owner.op == host_from_gpu
        if both_float32 and (x_on_gpu or y_on_gpu):
            return [host_from_gpu(gpu_outer(as_cuda_ndarray_variable(x),
                                            as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
......
......@@ -117,6 +117,31 @@ def test_gemm_no_inplace():
assert numpy.allclose(numpy.dot(a0, bval)+cval, a.get_value())
assert numpy.allclose(numpy.dot(a0, bval2)+cval, rval)
def test_outer():
    """Check tensor.outer on the GPU against numpy.outer, with and
    without strided/reversed input views."""
    x = tcn.shared_constructor(my_rand(8,), 'x')
    y = tcn.shared_constructor(my_rand(6,), 'y')
    x_val = x.get_value().copy()
    y_val = y.get_value().copy()

    def check(x_var, y_var, x_ref, y_ref):
        # Compile the outer product in GPU mode and compare with numpy.
        fn = pfunc([], tensor.outer(x_var, y_var), mode=mode_with_gpu)
        assert numpy.allclose(numpy.outer(x_ref, y_ref), fn())

    check(x, y, x_val, y_val)
    check(x[::2], y, x_val[::2], y_val)
    check(x, y[::3], x_val, y_val[::3])
    check(x[::2], y[::3], x_val[::2], y_val[::3])
    check(x[::-1], y, x_val[::-1], y_val)
    check(x, y[::-1], x_val, y_val[::-1])
if 0:
# This is commented out because it doesn't make sense...
# tcn.blas has no op called DownsampleFactorMax
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论