Commit 645557f9 authored by Pascal Lamblin

Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu
[nosetest]
match=^test
nocapture=1
[flake8]
ignore=E501,E123,E133
......@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
if (config.device.startswith('cuda') or
config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
......
......@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self.default = default
def filter(val):
if val.startswith('cpu') or val.startswith('gpu') \
if val == self.default or val.startswith('gpu') \
or val.startswith('opencl') or val.startswith('cuda'):
return val
else:
raise ValueError(('Invalid value ("%s") for configuration '
'variable "%s". Valid options start with '
'one of "cpu", "gpu", "opencl", "cuda"'
% (val, self.fullname)))
'one of "%s", "gpu", "opencl", "cuda"'
% (self.default, val, self.fullname)))
over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over)
def __str__(self):
return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar(
'device',
......@@ -94,15 +94,7 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."),
DeviceParam('cpu', allow_override=False),
in_c_key=False,)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False)
in_c_key=False)
AddConfigVar(
'init_gpu_device',
......@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."),
EnumStr('', 'gpu',
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
DeviceParam('', allow_override=False),
in_c_key=False)
AddConfigVar(
......@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key=False)
def default_cuda_root():
    """Compute the default value for the ``cuda.root`` config variable.

    Returns the ``CUDA_ROOT`` environment variable when it is set and
    non-empty.  Otherwise, scan every directory on ``PATH`` for an
    ``nvcc`` executable and return that directory's parent (``nvcc``
    conventionally lives in ``<cuda_root>/bin``).  Returns the empty
    string when nothing is found.
    """
    env_root = os.getenv('CUDA_ROOT', "")
    if env_root:
        return env_root
    search_path = os.getenv("PATH")
    if not search_path:
        return ''
    # `dir` renamed to avoid shadowing the builtin; os.pathsep is the
    # direct spelling of the PATH separator constant.
    for path_entry in search_path.split(os.pathsep):
        if os.path.exists(os.path.join(path_entry, "nvcc")):
            # Parent of the bin/ directory holding nvcc.
            return os.path.split(path_entry)[0]
    return ''
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalize the ``nvcc.flags`` config value.

    Splits *s* on single spaces, drops empty fragments and checks that
    every remaining flag starts with ``-`` (i.e. that parameter/value
    pairs are joined with ``=`` rather than a space).

    Parameters
    ----------
    s : str
        Raw flag string from the Theano configuration.

    Returns
    -------
    str
        The flags re-joined with single spaces.

    Raises
    ------
    ValueError
        If any fragment does not start with ``-``.
    """
    assert isinstance(s, str)
    flags = [flag for flag in s.split(' ') if flag]
    # Generator form: no throwaway list, short-circuits on the first
    # malformed flag.
    if any(not flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
def default_dnn_path(suffix):
    """Build a lazy default for the cudnn path config variables.

    Returns a zero-argument callable that, when evaluated, joins the
    configured ``config.cuda.root`` with *suffix* (e.g. ``'include'``
    or ``'lib64'``), or yields the empty string when no CUDA root is
    configured.
    """
    def _default(suffix=suffix):
        root = config.cuda.root
        if root == '':
            return ''
        return os.path.join(root, suffix)
    return _default
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(default_dnn_path('include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(default_dnn_path('lib64')))
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
AddConfigVar(
......
......@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l = [v for v in theano.configparser._config_var_list
if v.fullname == k]
assert len(l) == 1
old_val[k] = l[0].__get__()
old_val[k] = l[0].__get__(True, None)
try:
for k in kwargs:
l = [v for v in theano.configparser._config_var_list
......@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for cv in _config_var_list:
print(cv, file=buf)
print(" Doc: ", cv.doc, file=buf)
print(" Value: ", cv.__get__(), file=buf)
print(" Value: ", cv.__get__(True, None), file=buf)
print("", file=buf)
......@@ -182,7 +182,7 @@ def get_config_md5():
all_opts = sorted([c for c in _config_var_list if c.in_c_key],
key=lambda cv: cv.fullname)
return theano.gof.utils.hash_from_code('\n'.join(
['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts]))
['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
class TheanoConfigParser(object):
......@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user.
if not callable(configparam.default):
configparam.__get__()
configparam.__get__(root, type(root))
else:
# We do not want to evaluate now the default value
# when it is a callable.
try:
fetch_val_for_key(configparam.fullname)
# The user provided a value, filter it now.
configparam.__get__()
configparam.__get__(root, type(root))
except KeyError:
pass
setattr(root.__class__, sections[0], configparam)
......@@ -294,6 +294,7 @@ class ConfigParam(object):
self.default = default
self.filter = filter
self.allow_override = allow_override
self.is_default = True
# N.B. --
# self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar
......@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects.
def __get__(self, *args):
def __get__(self, cls, type_):
if cls is None:
return self
if not hasattr(self, 'val'):
try:
val_str = fetch_val_for_key(self.fullname)
self.is_default = False
except KeyError:
if callable(self.default):
val_str = self.default()
else:
val_str = self.default
self.__set__(None, val_str)
self.__set__(cls, val_str)
# print "RVAL", self.val
return self.val
......
......@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code = self.code_sections[tag]
define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, code,
return os.linesep.join(['', define_macros, code,
undef_macros])
else:
raise utils.MethodNotDefined(
......@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
return os.linesep.join([def_macros, def_sub,
return os.linesep.join(['', def_macros, def_sub,
op_code,
undef_sub, undef_macros])
else:
......@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros, undef_macros = self.get_c_macros(node, name,
check_input=False)
ctx = ""
if 'context' in sub:
ctx = ", %s" % (sub['context'],)
# Generate the C code
return """
%(define_macros)s
{
if (%(func_name)s(%(func_args)s) != 0) {
if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
%(fail)s
}
}
%(undef_macros)s
""" % dict(func_name=self.func_name,
fail=sub['fail'],
fail=sub['fail'], ctx=ctx,
func_args=self.format_c_function_args(inp, out),
define_macros=define_macros,
undef_macros=undef_macros)
......
......@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device:
elif config.init_gpu_device.startswith('gpu'):
assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'")
......
......@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available():
if dnn_available.avail is None:
......@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params=params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......
......@@ -8,6 +8,7 @@ import warnings
import numpy
from theano import config
from theano.compat import decode, decode_iter
from theano.gof import local_bitwidth
from theano.gof.utils import hash_from_file
......@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
from theano.configparser import (config, AddConfigVar, StrParam,
BoolParam, ConfigParam)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
# Tracks whether cuda.root came from the user rather than this default.
# NOTE(review): consumers of this flag (e.g. rpath handling) are outside
# this view — confirm they read it only after the config default ran.
user_provided_cuda_root = True


def default_cuda_root():
    # Default value for the cuda.root config variable: prefer the
    # CUDA_ROOT environment variable, else fall back to scanning PATH
    # via find_cuda_root() (defined elsewhere in this module).
    global user_provided_cuda_root
    v = os.getenv('CUDA_ROOT', "")
    # NOTE(review): the flag is cleared even when CUDA_ROOT *is* set,
    # so "user provided" here appears to mean "set via the Theano flag,
    # not via this default" — confirm against the flag's consumers.
    user_provided_cuda_root = False
    if v:
        return v
    return find_cuda_root()
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Check every nvcc flag is well-formed and return them re-joined.

    Each space-separated fragment must begin with ``-``; values must be
    attached with ``=`` (e.g. ``--machine=64``), never with a space.
    Raises ``ValueError`` on the first malformed fragment.
    """
    assert isinstance(s, str)
    cleaned = []
    for flag in s.split(' '):
        if not flag:
            continue
        if not flag.startswith("-"):
            raise ValueError(
                "Theano nvcc.flags support only parameter/value pairs without"
                " space between them. e.g.: '--machine 64' is not supported,"
                " but '--machine=64' is supported. Please add the '=' symbol."
                " nvcc.flags value is '%s'" % s)
        cleaned.append(flag)
    return ' '.join(cleaned)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
nvcc_path = 'nvcc'
nvcc_version = None
......@@ -115,14 +55,6 @@ def is_nvcc_available():
return False
def find_cuda_root():
    """Search ``PATH`` for an ``nvcc`` executable and return its parent.

    ``nvcc`` conventionally lives in ``<cuda_root>/bin``, so the parent
    of the directory containing it is taken as the CUDA installation
    root.

    Returns
    -------
    str or None
        The CUDA root directory, or ``None`` when ``PATH`` is unset or
        empty, or no ``nvcc`` is found.  ``None`` (not ``''``) is kept
        for backward compatibility with existing callers.
    """
    search_path = os.getenv("PATH")
    if not search_path:
        return None
    # `dir` renamed to avoid shadowing the builtin; the final return
    # makes the previously implicit None explicit.
    for entry in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(entry, "nvcc")):
            return os.path.split(entry)[0]
    return None
rpath_defaults = []
......@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
......@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details.
if (user_provided_cuda_root and
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib'))
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
......@@ -19,13 +19,6 @@ try:
except ImportError:
pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
......@@ -57,8 +50,9 @@ if pygpu:
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
init_dev(config.init_gpu_device)
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
......@@ -70,7 +64,8 @@ if pygpu:
except Exception:
error("Could not initialize pygpu, support disabled", exc_info=True)
else:
if (config.gpuarray.init_device != '' or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
error("pygpu was configured but could not be imported", exc_info=True)
......@@ -2,11 +2,9 @@ import os
import numpy
import theano
from theano import Op, Apply
from theano import tensor, scalar, config
from theano import Op, Apply, Type, Variable
from theano import tensor, config
from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split
from theano.gof import HideC
......@@ -17,7 +15,7 @@ from six.moves import xrange
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
......@@ -293,7 +291,6 @@ class GpuFromHost(Op):
def perform(self, node, inp, out):
x, = inp
z, = out
type = node.outputs[0].type
z[0] = gpuarray.array(x)
def grad(self, inputs, grads):
......@@ -312,254 +309,29 @@ class GpuFromHost(Op):
def c_code(self, node, name, inputs, outputs, sub):
return """
PyGpuArrayObject *%(name)s_tmp;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL)
%(fail)s
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s),
get_typecode((PyObject *)PyArray_DESCR(%(inp)s)),
PyArray_NDIM(%(inp)s),
(size_t *)PyArray_DIMS(%(inp)s),
(ssize_t *)PyArray_STRIDES(%(inp)s),
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(name)s_tmp),
pygpu_default_context(),
Py_None);
if (%(out)s == NULL) {
%(fail)s
}
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL)
%(fail)s
""" % {'name': name, 'inp': inputs[0],
'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self):
return (4,)
gpu_from_host = GpuFromHost()
class GpuFromCuda(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
dtype=x.dtype)()])
def perform(self, node, inp, out):
x, = inp
z, = out
z[0] = gpuarray.array(numpy.asarray(x))
def grad(self, inputs, grads):
gz, = grads
return [cuda_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
if isinstance(ev, GpuArrayType):
return [cuda_from_gpu(ev)]
else:
return ev
def infer_shape(self, node, xshp):
return xshp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>',
'<gpuarray/types.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");',
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr;
gpudata *%(name)sdata;
CUcontext %(name)scur;
size_t *%(name)sdims;
ssize_t *%(name)sstr;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
%(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
if (%(name)sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)s
}
%(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
if (%(name)sstr == NULL) {
free(%(name)sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)s
}
for (unsigned int i = 0; i < %(in)s->nd; i++) {
%(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
%(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
}
%(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
(CUdeviceptr)%(in)s->devdata,
((size_t)%(in)s->data_allocated)*4);
if (%(name)sdata == NULL) {
Py_DECREF(%(out)s);
free(%(name)sdims);
free(%(name)sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
%(name)sdims, %(name)sstr,
pygpu_default_context(), 1,
(PyObject *)%(in)s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(%(name)sdata);
free(%(name)sdims);
free(%(name)sstr);
if (%(out)s == NULL) {
%(fail)s
}
""" % {'name': name, 'in': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self):
return (5,)
gpu_from_cuda = GpuFromCuda()
class CudaFromGpu(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, GpuArrayType):
raise TypeError(x)
if x.type.dtype != 'float32':
raise TypeError(x)
return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])
def perform(self, node, inp, out):
from theano.sandbox.cuda import filter as cuda_filter
x, = inp
z, = out
z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
tuple([0] * x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points
if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)]
else:
return [ev]
def infer_shape(self, node, shp):
return shp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");',
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr = 0, %(name)si;
CUcontext %(name)scur;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
if (!%(out)s) {
%(fail)s
}
for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
}
%(name)serr = CudaNdarray_set_device_data(%(out)s,
(float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
(PyObject *)%(inp)s);
if (%(name)serr) {
%(fail)s
}
""" % {'name': name, 'inp': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self):
return (3,)
cuda_from_gpu = CudaFromGpu()
gpu_from_host = GpuFromHost()
class GpuAlloc(HideC, Alloc):
......@@ -572,7 +344,7 @@ class GpuAlloc(HideC, Alloc):
value is always 0, so the c code call memset as it is faster.
"""
__props__ = ('memset_0',)
_f16_ok = True
......@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
sh, bcast = self.validate_shape(shape)
if value.ndim > len(sh):
TypeError("The GpuAlloc value to use has more dimensions "
"than the specified shape", v.ndim, len(sh))
"than the specified shape", value.ndim, len(sh))
otype = value.type.clone(broadcastable=bcast)
return Apply(self, [value] + sh, [otype()])
......@@ -686,29 +458,28 @@ class GpuAlloc(HideC, Alloc):
return (2,)
def do_constant_folding(self, node):
from . import subtensor, blas
for client in node.outputs[0].clients:
if client[0] == 'output':
# If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold.
return False
elif ( # The following ops work inplace of their input id 0.
client[1] == 0 and
isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20,
theano.sandbox.gpuarray.blas.GpuGemm,
theano.sandbox.gpuarray.blas.GpuGemv,
theano.sandbox.gpuarray.blas.GpuGer,
))):
# The following ops work inplace of their input id 0.
elif (client[1] == 0 and
# Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the
# constant and this is less efficients.
# Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't
# always exists.
isinstance(client[0].op,
(subtensor.GpuIncSubtensor,
subtensor.GpuAdvancedIncSubtensor1,
subtensor.GpuAdvancedIncSubtensor1_dev20,
blas.GpuGemm, blas.GpuGemv,
blas.GpuGer)
)):
return False
# If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do.
......@@ -795,7 +566,7 @@ class GpuContiguous(Op):
"""
Always return a c contiguous output. Copy the input only if it is
not already c contiguous.
"""
__props__ = ()
......@@ -980,7 +751,7 @@ class GpuJoin(HideC, Join):
node = Join.make_node(self, axis, *tensors)
return Apply(self, [node.inputs[0]] + list(map(as_gpuarray_variable,
tensors)),
tensors)),
[GpuArrayType(broadcastable=node.outputs[0].broadcastable,
dtype=node.outputs[0].dtype)()])
......@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_eye_'+name,
)]
objvar='k_eye_' + name)]
def c_code(self, node, name, inp, out, sub):
n, m = inp
......
......@@ -5,17 +5,15 @@ import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
raise ValueError(mode)
def __init__(self, border_mode,
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
......@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def flops(self, inputs, outputs):
"""
Useful with the hack in profilemode to print the MFlops.
"""
images, kerns = inputs
out, = outputs
......@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
return ['-DTHEANO_KERN_WID=' + str(nb)]
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return (0, 22)
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......
......@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge
# We need to import this to define the flags.
from theano.sandbox import dnn_flags # noqa
from .opt_util import alpha_merge, output_merge, inplace_allocempty
def dnn_available():
......@@ -50,7 +47,6 @@ def dnn_available():
dnn_available.avail = False
preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
......@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"),
name="local_dnna_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
from __future__ import print_function
import copy
import os
from theano.compat import izip
import numpy
import theano
from theano import Apply, scalar, config
from theano import scalar as scal
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
......@@ -94,7 +91,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
try:
support_code = self.scalar_op.c_support_code()
if (support_code.strip() != "#define THEANO_MACRO_MOD(x,y) (x % y)" and
support_code.strip() != ""):
support_code.strip() != ""):
# The macro is fine, the C++ struct is not.
raise SupportCodeError(support_code)
except MethodNotDefined:
......@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern]
enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
......@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else:
scal_out.append(arg.name + '[i]')
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar',
kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
scal_in, scal_out,
dict(fail='return;'))
......@@ -169,27 +166,12 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float16", "ga_half"),
("npy_float32", "ga_float"),
("npy_float64", "ga_double"),
]:
]:
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_support_code(self):
return self.scalar_op.c_support_code()
......@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......@@ -399,7 +376,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
param.append("(void *)&%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0],
i=i))
for n, (name, var) in enumerate(zip(inputs + outputs,
node.inputs + node.outputs)):
node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern:
continue
dtype = dtype_to_ctype(var.dtype)
......@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
""" % dict(kname=kname,fail=fail)
""" % dict(kname=kname, fail=fail)
if config.gpuarray.sync:
code += """
err = GpuArray_sync(&%(z)s->ga);
......@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (3, ver)
return (4, ver)
else:
return ver
......@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res = input
res = res.transpose(self.shuffle+self.drop)
res = res.transpose(self.shuffle + self.drop)
shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment:
......@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp);
return res;
}
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)),
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
name=name, nd_out=len(self.new_order),
copy_shape=copy_shape(len(self.new_order)))
......@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return process
def c_code_cache_version(self):
return (4,)
return (5,)
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
......@@ -581,7 +558,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
pre_scalar_op
If present, must be a scalar op with only 1 input. We will execute it
on the input value before reduction.
Examples
--------
When scalar_op is a theano.scalar.basic.Add instance:
......@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause
# upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None:
if self.axis is None:
......@@ -687,8 +662,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i" % len(self.reduce_mask))
if ("complex" in x.dtype or
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
"complex" in ret.outputs[0].dtype or
"complex" in self._acc_dtype(x.dtype)):
raise NotImplementedError("We don't support complex in gpu reduction")
return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
ret.outputs[0].type.broadcastable)()])
......@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return False
return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, name, inp, out, sub):
x, = inp
......@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sio = StringIO()
fail = sub['fail']
ctx = sub['context']
# check input
print("""
......@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(%(nd_out)s, new_dims,
%(out_typecode)s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
pygpu_default_context(), Py_None);
if (NULL == %(z)s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
......@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims.
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio)
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
getattr(self, 'c_code_reduce_%s' %
(''.join(str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio)
else:
getattr(self, 'c_code_reduce_%s'%(''.join(
getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ...
......@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else:
assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype!
#dtype = node.inputs[0].dtype
if self.pre_scalar_op: # TODO: multiple dtypes
# dtype = node.inputs[0].dtype
dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)()
......@@ -1171,7 +1139,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Parameters
----------
node, name, sub
node, name, sub
These should be passed through from the original call to c_code.
"""
......@@ -1411,7 +1379,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
"""
Parameters
----------
N
......@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [16] # the version corresponding to the c code in this Op
version = [17] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
......@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype)
......@@ -2106,10 +2071,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
]
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
#01, 011, 0111
# 01, 011, 0111
if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]):
# this kernel uses one block for each row.
# threads per block for each element per row.
......@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
# thread.x = dim 2 contiguous
# thread.y = dim 1
# block.x = dim 0
# block.y = dim 1 rest
init = self._k_init(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
......@@ -2515,7 +2480,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var))
if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row,
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={})
......@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename
k_var = "kernel_reduce_1011_" + nodename
sio = StringIO()
print("""
KERNEL void %(kname)s(
......@@ -2712,7 +2677,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
# cache the kernel object
self.get_kernel_cache(node)
return super(GpuCAReduceCPY, self).make_thunk(node, storage_map,
compute_map, no_recycling)
compute_map, no_recycling)
def get_kernel_cache(self, node):
attr = '@cache_reduction_k'
......@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype,
node.outputs[0].type.dtype),
objvar='k_reduk_'+name)]
objvar='k_reduk_' + name)]
def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
......@@ -2768,8 +2733,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
......@@ -2816,8 +2781,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else:
code += """
if (%(output)s == NULL || %(output)s->ga.nd != 0) {
......@@ -2828,8 +2793,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
""" % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype:
code += """
......@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None);
if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype))
""" % dict(output=output, fail=sub['fail'],
acc_type=dtype_to_typecode(acc_dtype))
else:
code += """
tmp = %(output)s;
Py_INCREF(tmp);
""" % dict(output=output)
""" % dict(output=output)
# We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the
......@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """
args[0] = &n;
args[1] = tmp->ga.data;
""" % dict(output=output)
""" % dict(output=output)
p = 2
for i in range(node.inputs[0].ndim):
......@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input)
""" % dict(i=i, p=p, input=input)
p += 1
if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
......@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[%(p)s] = %(input)s->ga.data;
proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input)
""" % dict(p=p, input=input)
p += 2
for i in range(node.inputs[0].ndim):
code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input)
""" % dict(p=p, i=i, input=input)
p += 1
code += """
......@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
}
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
""" % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version(self):
return (1, self.GpuKernelBase_version)
return (2, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux = self.redux
if any(redux):
output[0] = self.get_kernel_cache(node)(input).astype(copy=False,
dtype=node.outputs[0].type.dtype)
output[0] = self.get_kernel_cache(node)(input).astype(
copy=False, dtype=node.outputs[0].type.dtype)
else:
output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype)
......
......@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
# get max of buf (trashing all but buf[0])
return [inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
......@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
......@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
......@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
Parameters
----------
N
N
Length of the buffer, atleast waprSize(32).
buf
A shared memory buffer of size warpSize * sizeof(dtype).
......
import os
import numpy
from theano import Op, Apply, config
from theano.gof import local_optimizer
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
......@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (10,1)
return (11,)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
......
......@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
......
from __future__ import print_function
import numpy
import os
from theano import Op, Apply, config
from six import StringIO
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .type import GpuArrayType
from .kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
inline_softmax,
inline_softmax_fixed_shared)
from .fp16_help import work_dtype, load_w, write_w
......@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype
......@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue()
def c_code_cache_version(self):
return (7,)
return (8,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (10,)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
return (11,)
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/types.h>']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
......@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape
def c_code_cache_version(self):
return (14,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
return (15,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype
......@@ -665,60 +637,60 @@ class GpuSoftmax(GpuKernelBase, Op):
]
kernels = []
kname = "kSoftmax"
k_var= "kSoftmax_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
k_var = "kSoftmax_" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x',
'blockDim.x', dtype=work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This set all value correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
k_var = "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
dtype=work_sm),
"__syncthreads()",
"}",
])
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
dtype=work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......@@ -743,26 +715,13 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return Apply(self, [x, b], [x.type()])
def infer_shape(self, node, shape):
return [shape[0]]
return [shape[0]]
def c_code_cache_version(self):
return (13,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
return (14,) + inline_softmax.code_version
def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype
......@@ -892,65 +851,65 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
kernels = []
kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x,
"buf[tx] += %s(b[tx * sb0])" % load_b,
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x', work_sm),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm,
"}",
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1',
'const %s * b' % type_b, 'const ga_size offset_b',
'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=["extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const %s *x_ptr = &x[blockIDX * sx0]" % type_x,
"%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm,
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
load_x,
'sm_ptr', 'sm_s1', write_sm,
'threadIdx.x', 'blockDim.x',
'b', 'sb0', load_b, work_sm),
"__syncthreads()",
"}",
])
kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var))
return kernels
......
......@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt()
# NOTE(review): the two @alpha_merge lines below are the old (with
# `nd=2`) and new versions of the same decorator shown side by side by
# the diff rendering; a real source file has only one of them.
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs):
    # Rebuild the GEMM with the scalar multiplication folded into alpha;
    # `inputs` already carry the merged alpha/beta (provided by the
    # alpha_merge decorator).
    return [gpugemm_no_inplace(*inputs)]
@register_opt()
# NOTE(review): the two @output_merge lines below are the old (with
# `nd=2`) and new versions of the same decorator shown side by side by
# the diff rendering; a real source file has only one of them.
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs):
    # Rebuild the GEMM with the output addition folded into beta/out;
    # `inputs` already carry the merged values (provided by the
    # output_merge decorator).
    return [gpugemm_no_inplace(*inputs)]
......
......@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if v.owner is not None:
n = v.owner
if (isinstance(n.op, GpuDimShuffle) and
if (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif isinstance(n.op, GpuFromHost):
return grab_cpu_scalar(n.inputs[0], nd=nd)
return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
return grab_cpu_scalar(n.inputs[0], nd)
else:
return None
else:
......@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def find_node(v, cls, ignore_clients=False):
# This digs through possibly redundant transfers to for the node
# that has the op class specified. If ignore_clients is False (the
# default) it will only dig through nodes that have a single
# client.
"""
Find the node that has an op of of type `cls` in `v`.
This digs through possibly redundant transfers to for the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls):
return v.owner
......@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def is_equal(var, val):
# Returns True if var is always equal to val (python value), False
# otherwise (including if var is not constant)
"""
Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try:
v = get_scalar_constant_value(var)
return v == val
......@@ -59,7 +97,57 @@ def is_equal(var, val):
return False
def alpha_merge(cls, alpha_in, beta_in, nd):
def alpha_merge(cls, alpha_in, beta_in):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ = find_node(node.inputs[0], cls)
if targ is None:
targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd)
if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0],
nd=targ.outputs[0].ndim)
else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd)
if (lr is None or targ is None or
lr.dtype != targ.outputs[0].dtype):
lr = grab_cpu_scalar(node.inputs[1],
nd=targ.outputs[0].ndim)
if lr is None or lr.dtype != targ.outputs[0].dtype:
return None
inputs = list(targ.inputs)
try:
......@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd):
def output_merge(cls, alpha_in, beta_in, out_in):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val * <yourop>(some, params, alpha,
beta, out_like) and update it so that the addtition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you recieve is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameters contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def wrapper(maker):
@local_optimizer([GpuElemwise])
@wraps(maker)
......@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
return maker(targ, *inputs)
return opt
return wrapper
def inplace_allocempty(op, idx):
    """
    Build an inplace local optimizer that copes with AllocEmpty inputs.

    If the input at position `idx` is produced by a GpuAllocEmpty that
    has more than one client, that allocation is duplicated first so the
    op can safely overwrite its own copy inplace.

    The decorated function must have this signature:

        maker(node, inputs)

    where `node` is the original apply node (read any properties you
    need from it, and switch the op to inplace mode) and `inputs` are
    the inputs the rebuilt op MUST use.  A minimal maker is:

        def maker(node, inputs):
            return node.op.__class__(inplace=True)(*inputs)

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    An unregistered inplace local optimizer with the same name as the
    decorated function.
    """
    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(node):
            # Only rewrite exact instances of `op` that are not already
            # inplace.
            if type(node.op) != op or node.op.inplace:
                return
            new_inputs = list(node.inputs)
            target = new_inputs[idx]
            owner = target.owner
            if (owner and isinstance(owner.op, GpuAllocEmpty) and
                    len(target.clients) > 1):
                # The empty buffer is shared with other clients: give
                # this node its own allocation so overwriting it inplace
                # is safe.
                dup = GpuAllocEmpty(owner.op.dtype)
                new_inputs[idx] = dup(*owner.inputs)
            return maker(node, new_inputs)
        return opt
    return wrapper
......@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def _f16_ok(self):
    # float16 support is decided by the wrapped inplace-add node's op:
    # this op simply forwards its capability flag.
    # NOTE(review): likely decorated with @property in the full file —
    # the decorator line is outside this diff hunk; confirm upstream.
    return self.iadd_node.op._f16_ok
def c_header_dirs(self):
    """Return the CUDA include directory if a CUDA root is configured.

    Gives [<cuda_root>/include] when config.cuda.root is set, otherwise
    an empty list.
    """
    root = config.cuda.root
    return [os.path.join(root, 'include')] if root else []
def c_headers(self):
    """Forward the C header list of the wrapped inplace-add node's op."""
    inner_op = self.iadd_node.op
    return inner_op.c_headers()
def c_compiler(self):
    """Use the same C compiler as the wrapped inplace-add node's op."""
    inner_op = self.iadd_node.op
    return inner_op.c_compiler()
def c_init_code(self):
    """Forward the C initialization code of the wrapped inplace-add op."""
    inner_op = self.iadd_node.op
    return inner_op.c_init_code()
......@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (2,)
return parent_version + elemwise_version + (3,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
import unittest
from theano.compat import izip
from copy import copy, deepcopy
from six import iteritems
......@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file
from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu,
......@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuContiguous,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
......@@ -63,22 +61,6 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
    """Decorate a test that needs very specific conditions to run.

    If the wrapped test raises an exception of type `EClass`, it is
    converted into a SkipTest carrying `msg` and the original error;
    any other exception propagates unchanged.  The wrapper keeps the
    original test's __name__ so test discovery still reports it.
    """
    def deco(f):
        def run():
            try:
                f()
            except Exception as err:
                if not isinstance(err, EClass):
                    raise
                raise SkipTest(msg, err)
        run.__name__ = f.__name__
        return run
    return deco
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
if mode is None:
......@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else:
err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference "
"call (got: %s, expected %s)") % \
"call (got: %s, expected %s)" %
(self.gpu_op, testname, type(exc),
type(ref_e))
type(ref_e)))
exc.args += (err_msg,)
raise
......@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
"(dtype %s), got %s (dtype %s)." %
(self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype)))
for description, check in iteritems(self.checks):
if not check(inputs, variables):
......@@ -250,36 +232,6 @@ def test_transfer_strided():
assert numpy.all(fv == av)
@may_fail("Op fails if both contexts are not the same and it's rare "
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
assert GpuArrayType.values_eq_approx(fv, gv)
fvs = f(cvs)
assert GpuArrayType.values_eq_approx(fvs, gvs)
f = theano.function([g], cuda_from_gpu(g))
fv = f(gv)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
fvs = f(gvs)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype)
g[:] = x
......@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op=gpu_alloc,
cases=dict(
correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4),
......@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32')
gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()]
......
......@@ -6,8 +6,7 @@ import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu)
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemm_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(
dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
......@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(
test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
class TestGpuSger(TestGer):
......@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def test_f32_0_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self):
raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self):
raise SkipTest('0-sized objects not supported')
......@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester = makeTester(
'GpuGemmTester',
'GpuDot22Tester',
op=_dot22, gpu_op=gpu_dot22,
cases=dict(
test1=[rand(3, 4), rand(4, 5)],
test2=[rand(1, 4), rand(4, 5)],
test3=[rand(3, 1), rand(1, 5)],
test4=[rand(3, 4), rand(4, 1)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_hgemm_swap():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......
......@@ -6,37 +6,36 @@ import sys
import time
import unittest
import numpy
from six.moves import xrange
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .test_basic_ops import mode_with_gpu
from ..type import GpuArrayType
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
import pygpu
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4)
def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1]
outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]):
......@@ -60,7 +59,7 @@ def py_conv_full_numpy(img, kern):
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
dtype=img.dtype)
padded_img[:, :, kern.shape[2] - 1: kern.shape[2] - 1 + img.shape[2],
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
return py_conv_valid_numpy(padded_img, kern)
......@@ -73,10 +72,10 @@ def py_conv(img, kern, mode, subsample):
return py_conv_scipy(img, kern, mode, subsample)
elif mode == 'valid':
return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
::subsample[1]]
elif mode == 'full':
return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
::subsample[1]]
else:
raise Exception("Can't execute this kernel.")
......@@ -85,12 +84,12 @@ def py_conv_scipy(img, kern, mode, subsample):
assert img.shape[1] == kern.shape[1]
if mode == 'valid':
outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1)
else:
outshp = (img.shape[0], kern.shape[0],
img.shape[2] + kern.shape[2] - 1,
img.shape[3] + kern.shape[3] - 1)
img.shape[2] + kern.shape[2] - 1,
img.shape[3] + kern.shape[3] - 1)
out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]):
......@@ -133,9 +132,9 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
dtype='float32')
else:
npy_img = theano._asarray(numpy.arange(
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = pygpu.array(npy_img)
kern = pygpu.array(npy_kern)
......@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
cpu_mflops = approx_fp / (t1 - t0)
gpu_mflops = approx_fp / (t2 - t1)
if verbose > 0:
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ', file=sys.stdout)
print('%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)), file=sys.stdout)
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
file=sys.stdout)
print('%12.5f %7.2f %7.2f %7.1f' %
(approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
file=sys.stdout)
if not rval:
print(('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version), file=sys.stdout)
print('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version, file=sys.stdout)
diff = cpuval - gpuval
diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval)
......@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
nb_close, "/", diff.size))
print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff)))
if not rval and print_ != False:
if not rval and print_ is not False:
if npy_img.shape[0] > 5:
print("img", npy_img[0])
print("kern", npy_kern[0])
......@@ -242,19 +243,20 @@ def exec_conv(version, shapes, verbose, random, mode,
istride, kstride) in enumerate(shapes):
ret = False
try:
ret = _params_allgood(ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
ret = _params_allgood(
ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones)
except Exception as e:
print(ver, id, (ishape, kshape, subshape, istride, kstride))
print(e)
......@@ -273,11 +275,11 @@ def exec_conv(version, shapes, verbose, random, mode,
def get_basic_shapes():
# basic test of image and kernel shape
# basic test of image and kernel shape
return [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
# basic test for unsquare kernel and image
# basic test for unsquare kernel and image
((1, 1, 2, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
......@@ -297,17 +299,17 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
# nkern only
((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and nkern
# batch and nkern
((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and stack
# batch and stack
((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
#stack and nkern
# stack and nkern
((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack
# batch, nkern and stack
((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack
# batch, nkern and stack
((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
]
]
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
......@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
def get_valid_shapes():
# img shape, kern shape, subsample shape
shapes = get_basic_shapes()
......@@ -360,39 +361,36 @@ def get_valid_shapes():
# test subsample done in a separate fct
shapes += [
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
, ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)) # added a test case that fail from test_nnet.py.test_conv_nnet2
, ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)) # test precedent but reduced that triger the error
# Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
]
shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 1 layers
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 2 layers
, ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full
, ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 1 layers
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
]
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)), # added a test case that fail from test_nnet.py.test_conv_nnet2
((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)), # test precedent but reduced that triger the error
# Test more than maxThreadsDim0
((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
]
shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 1 layers
((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
]
return shapes
......@@ -428,43 +426,35 @@ def test_full():
shapes += [
# other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1))
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1))
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
# other test
, ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) # kernel bigger then image
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
]
((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
]
shapes += [
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
]
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
# Test more than maxThreadsDim0
((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
]
version = [-1]
verbose = 0
......@@ -481,7 +471,7 @@ def test_subsample():
((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1))
]
]
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
......@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns.
]:
......@@ -580,47 +569,45 @@ def benchmark():
shapes_valid = [
# test_lenet_28 shape
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)) # valid
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # valid
((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_32 shape
, ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)) # valid
((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)), # valid
((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_64 shape
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)) # valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)) # valid
((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)), # valid
((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_108 shape
, ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)) # valid
, ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)) # valid
((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)), # valid
((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_256 shape
, ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)) # valid
, ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)) # valid
, ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)) # valid
, ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)) # valid
]
((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)), # valid
((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)), # valid
]
shapes_full = [
# test_lenet_28 shape
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full
# test_lenet_32 shape
, ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=1
((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=3
((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape
, ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full 'conv_full_patch_stack_padded' N=9
((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape
, ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)) # full conv_reference_full
]
((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)), # full conv_reference_full
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1]
verbose = 1
random = True
......@@ -640,6 +627,6 @@ def test_stack_rows_segfault_070312():
kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
nkern=1, bsize=1)
nkern=1, bsize=1)
f = theano.function([], [], updates=[(out, op(img, kern))], mode=mode_with_gpu)
f()
import unittest
from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .test_basic_ops import mode_with_gpu
from ..neighbours import GpuImages2Neibs
......
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
......@@ -7,8 +7,6 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
......@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in = 4098
n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y')
b = T.fvector('b')
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not
# (the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner
# case.
dot_result = T.fmatrix('dot_result')
......@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
......@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
......@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads
"""
n_in = 1000
batch_size = 4097
n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed
......@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol = 1e-5
atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print('max err index:', max_i, max_i / batch_size, end=' ')
print(max_i % batch_size, max_i / n_out, max_i & n_out)
print('At that index:')
print('err:', scaled_err.flatten()[max_i])
print('absolute error:', abs_err.flatten()[max_i])
print('relative error:', rel_err.flatten()[max_i])
print('cpu_out:', cpu_out.flatten()[max_i])
print('gpu_out:', gpu_out.flatten()[max_i])
print('softmax_output_value:', softmax_output_value.flatten()[max_i])
print('dnll_value:', dnll_value[max_i / n_out])
print('y_idx_value:', y_idx_value[max_i / n_out])
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
......@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float16')
def test_softmax_with_bias_float32():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float32')
......@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x = T.matrix('x', dtype=dtypeInput)
b = T.vector('b', dtype=dtypeBias)
......@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def test_softmax_float16():
softmax_unittest_template('float16')
def test_softmax_float32():
softmax_unittest_template('float32')
def test_softmax_float64():
softmax_unittest_template('float64')
......
import operator
import numpy
import theano
......@@ -25,7 +23,6 @@ def test_deep_copy():
def test_values_eq_approx():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
assert GpuArrayType.values_eq_approx(a, a)
b = a.copy()
b[0] = numpy.asarray(b[0]) + 1.
......
......@@ -184,7 +184,7 @@ class GpuArrayType(Type):
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
......@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self.broadcastable == other.broadcastable)
def convert_variable(self, var):
if (type(self) == type(var.type) and
self.typecode == var.type.typecode and
self.ndim == var.type.ndim and
vt = var.type
if (type(self) == type(vt) and
self.typecode == vt.typecode and
self.ndim == vt.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable,
var.type.broadcastable))):
vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self):
......
......@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
"sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_neighbours.py",
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
"sandbox/gpuarray/tests/test_type.py",
"sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_blas.py",
"sandbox/gpuarray/tests/test_elemwise.py",
"sandbox/gpuarray/tests/test_nnet.py",
"sandbox/gpuarray/tests/test_basic_ops.py",
"scan_module/scan_utils.py",
"scan_module/scan_views.py",
"scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论