Commit 645557f9, authored by Pascal Lamblin

Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu
[nosetest]
match=^test
nocapture=1
[flake8]
ignore=E501,E123,E133
......@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \
config.gpuarray.init_device != '':
if (config.device.startswith('cuda') or
config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr
......
......@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self.default = default
def filter(val):
if val.startswith('cpu') or val.startswith('gpu') \
if val == self.default or val.startswith('gpu') \
or val.startswith('opencl') or val.startswith('cuda'):
return val
else:
raise ValueError(('Invalid value ("%s") for configuration '
'variable "%s". Valid options start with '
'one of "cpu", "gpu", "opencl", "cuda"'
% (val, self.fullname)))
'one of "%s", "gpu", "opencl", "cuda"'
% (self.default, val, self.fullname)))
over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over)
def __str__(self):
return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,)
return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar(
'device',
......@@ -94,15 +94,7 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."),
DeviceParam('cpu', allow_override=False),
in_c_key=False,)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False)
in_c_key=False)
AddConfigVar(
'init_gpu_device',
......@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."),
EnumStr('', 'gpu',
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
DeviceParam('', allow_override=False),
in_c_key=False)
AddConfigVar(
......@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key=False)
def default_cuda_root():
    """Return the default value for the ``cuda.root`` configuration flag.

    The ``CUDA_ROOT`` environment variable is used when it is set and
    non-empty.  Otherwise every entry of ``PATH`` is checked for an
    ``nvcc`` entry, and the parent of the first directory that contains
    one is returned.

    Returns
    -------
    str
        The detected root directory, or ``''`` when ``CUDA_ROOT`` is not
        set and no ``nvcc`` can be found through ``PATH``.
    """
    root = os.getenv('CUDA_ROOT', "")
    if root:
        return root
    search_path = os.getenv("PATH")
    if not search_path:
        return ''
    separators = os.sep + (os.altsep or '')
    for bin_dir in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(bin_dir, "nvcc")):
            # A PATH entry may end with a separator ("/opt/cuda/bin/").
            # Taking the parent of that string as-is would give back the
            # bin directory itself, so trailing separators are dropped
            # first.  The `or bin_dir` keeps a bare root ("/") intact.
            trimmed = bin_dir.rstrip(separators) or bin_dir
            return os.path.dirname(trimmed)
    return ''
# Register the `cuda.root` flag.  The default is passed as a callable
# (default_cuda_root), not as a value, and the flag is kept out of the
# C key (in_c_key=False).
# NOTE(review): the help text below mentions an "AUTO" value and a
# /usr/local/cuda fallback; the default_cuda_root function visible in this
# change returns CUDA_ROOT, the parent of an nvcc directory found on PATH,
# or '' -- confirm whether the help text is still accurate.
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalise the value given for the ``nvcc.flags`` option.

    The value is split on single spaces and empty pieces are discarded, so
    repeated, leading and trailing spaces are ignored.  Every remaining
    piece must start with ``-``; a bare word means the user wrote a
    flag and its value as two separate tokens (``--machine 64``), which is
    rejected.

    Parameters
    ----------
    s : str
        Raw flag string.

    Returns
    -------
    str
        The accepted flags joined by exactly one space.

    Raises
    ------
    ValueError
        If any piece does not start with ``-``.
    """
    assert isinstance(s, str)
    flags = [flag for flag in s.split(' ') if flag]
    # Test the predicate directly instead of building a list of offenders
    # and relying on the truthiness of its elements.
    if any(not flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
# Flags for the nvcc compiler driver.  The value of nvcc.flags is validated
# and normalised by filter_nvcc_flags.
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Kept out of the C key: according to the original note, the flag is
# already added to the key elsewhere.  Leaving it out also means the md5
# of the config does not change depending on whether theano.sandbox.cuda
# is loaded.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Kept out of the C key: according to the original note, the flag is
# already added to the key elsewhere.  Leaving it out also means the md5
# of the config does not change depending on whether theano.sandbox.cuda
# is loaded.
in_c_key=False)
# Unlike the surrounding flags, gpuarray.sync is registered with
# in_c_key=True.
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# Deprecated cuDNN work-memory flags; their help text points to the
# dnn.conv.algo_* flags that replace them.
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
# Algorithm selection for cuDNN convolutions.
# NOTE(review): presumably the first value passed to EnumStr is the
# default -- confirm against EnumStr.
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
def default_dnn_path(suffix):
    """Build a lazily evaluated default for a cuDNN path option.

    Parameters
    ----------
    suffix : str
        Sub-directory name to append to ``config.cuda.root``.

    Returns
    -------
    callable
        A function that, when called, returns ``''`` if
        ``config.cuda.root`` is the empty string and the root joined with
        `suffix` otherwise.  ``config.cuda.root`` is read at call time,
        not when this factory runs.
    """
    def f(suffix=suffix):
        cuda_root = config.cuda.root
        # An empty root must stay empty: joining it with the suffix would
        # produce a relative path such as 'include'.
        return os.path.join(cuda_root, suffix) if cuda_root != '' else ''
    return f
# cuDNN search paths.  Both defaults are callables built by
# default_dnn_path: they resolve to config.cuda.root joined with the given
# sub-directory, or to '' when config.cuda.root is empty.
AddConfigVar('dnn.include_path',
             "Location of the cudnn header (defaults to the cuda root)",
             StrParam(default_dnn_path('include')))

# The help text previously repeated the header description; this flag
# points at the directory holding the library to link against.
AddConfigVar('dnn.library_path',
             "Location of the cudnn link library (defaults to the cuda root)",
             StrParam(default_dnn_path('lib64')))
# This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph.
AddConfigVar(
......
......@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l = [v for v in theano.configparser._config_var_list
if v.fullname == k]
assert len(l) == 1
old_val[k] = l[0].__get__()
old_val[k] = l[0].__get__(True, None)
try:
for k in kwargs:
l = [v for v in theano.configparser._config_var_list
......@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for cv in _config_var_list:
print(cv, file=buf)
print(" Doc: ", cv.doc, file=buf)
print(" Value: ", cv.__get__(), file=buf)
print(" Value: ", cv.__get__(True, None), file=buf)
print("", file=buf)
......@@ -182,7 +182,7 @@ def get_config_md5():
all_opts = sorted([c for c in _config_var_list if c.in_c_key],
key=lambda cv: cv.fullname)
return theano.gof.utils.hash_from_code('\n'.join(
['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts]))
['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
class TheanoConfigParser(object):
......@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user.
if not callable(configparam.default):
configparam.__get__()
configparam.__get__(root, type(root))
else:
# We do not want to evaluate now the default value
# when it is a callable.
try:
fetch_val_for_key(configparam.fullname)
# The user provided a value, filter it now.
configparam.__get__()
configparam.__get__(root, type(root))
except KeyError:
pass
setattr(root.__class__, sections[0], configparam)
......@@ -294,6 +294,7 @@ class ConfigParam(object):
self.default = default
self.filter = filter
self.allow_override = allow_override
self.is_default = True
# N.B. --
# self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar
......@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects.
def __get__(self, *args):
def __get__(self, cls, type_):
if cls is None:
return self
if not hasattr(self, 'val'):
try:
val_str = fetch_val_for_key(self.fullname)
self.is_default = False
except KeyError:
if callable(self.default):
val_str = self.default()
else:
val_str = self.default
self.__set__(None, val_str)
self.__set__(cls, val_str)
# print "RVAL", self.val
return self.val
......
......@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code = self.code_sections[tag]
define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, code,
return os.linesep.join(['', define_macros, code,
undef_macros])
else:
raise utils.MethodNotDefined(
......@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub)
return os.linesep.join([def_macros, def_sub,
return os.linesep.join(['', def_macros, def_sub,
op_code,
undef_sub, undef_macros])
else:
......@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros, undef_macros = self.get_c_macros(node, name,
check_input=False)
ctx = ""
if 'context' in sub:
ctx = ", %s" % (sub['context'],)
# Generate the C code
return """
%(define_macros)s
{
if (%(func_name)s(%(func_args)s) != 0) {
if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
%(fail)s
}
}
%(undef_macros)s
""" % dict(func_name=self.func_name,
fail=sub['fail'],
fail=sub['fail'], ctx=ctx,
func_args=self.format_c_function_args(inp, out),
define_macros=define_macros,
undef_macros=undef_macros)
......
......@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device:
elif config.init_gpu_device.startswith('gpu'):
assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'")
......
......@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available():
if dnn_available.avail is None:
......@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run the test program here. It would run on the
# default GPU, not the one selected by the user. If mixed
# GPUs are installed, or if the GPUs are configured in
# exclusive mode, this causes bad detection.
comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params=params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......
......@@ -8,6 +8,7 @@ import warnings
import numpy
from theano import config
from theano.compat import decode, decode_iter
from theano.gof import local_bitwidth
from theano.gof.utils import hash_from_file
......@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
from theano.configparser import (config, AddConfigVar, StrParam,
BoolParam, ConfigParam)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
user_provided_cuda_root = True
def default_cuda_root():
global user_provided_cuda_root
v = os.getenv('CUDA_ROOT', "")
user_provided_cuda_root = False
if v:
return v
return find_cuda_root()
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
assert isinstance(s, str)
flags = [flag for flag in s.split(' ') if flag]
if any([f for f in flags if not f.startswith("-")]):
raise ValueError(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '%s'" % s)
return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
nvcc_path = 'nvcc'
nvcc_version = None
......@@ -115,14 +55,6 @@ def is_nvcc_available():
return False
def find_cuda_root():
s = os.getenv("PATH")
if not s:
return
for dir in s.split(os.path.pathsep):
if os.path.exists(os.path.join(dir, "nvcc")):
return os.path.split(dir)[0]
rpath_defaults = []
......@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
include_dirs
A list of include directory names (each gets prefixed with -I).
lib_dirs
A list of library search path directory names (each gets
A list of library search path directory names (each gets
prefixed with -L).
libs
A list of libraries to link with (each gets prefixed with -l).
......@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details.
if (user_provided_cuda_root and
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib'))
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
......@@ -19,13 +19,6 @@ try:
except ImportError:
pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
......@@ -57,8 +50,9 @@ if pygpu:
import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '':
init_dev(config.gpuarray.init_device)
elif (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
init_dev(config.init_gpu_device)
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
......@@ -70,7 +64,8 @@ if pygpu:
except Exception:
error("Could not initialize pygpu, support disabled", exc_info=True)
else:
if (config.gpuarray.init_device != '' or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or
config.device.startswith('cuda')):
error("pygpu was configured but could not be imported", exc_info=True)
......@@ -5,17 +5,15 @@ import theano
from theano import config, gof
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
......@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
raise ValueError(mode)
def __init__(self, border_mode,
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
subsample=(1, 1),
logical_img_hw=None,
logical_kern_hw=None,
logical_kern_align_top=True,
version=-1,
direction_hint=None,
verbose=0,
kshp=None,
imshp=None,
max_threads_dim0=None,
nkern=None,
bsize=None,
fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
......@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def flops(self, inputs, outputs):
"""
Useful with the hack in profilemode to print the MFlops.
"""
images, kerns = inputs
out, = outputs
......@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb = 0
if self.kshp is not None:
nb = self.kshp[1]
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
return ['-DTHEANO_KERN_WID=' + str(nb)]
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files
return (0, 21)
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return (0, 22)
def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp
......
......@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge
# We need to import this to define the flags.
from theano.sandbox import dnn_flags # noqa
from .opt_util import alpha_merge, output_merge, inplace_allocempty
def dnn_available():
......@@ -50,7 +47,6 @@ def dnn_available():
dnn_available.avail = False
preambule = """
#include <stdio.h>
#include <cuda.h>
#include <cudnn.h>
#include <cudnn_helper.h>
"""
......@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1;
}
"""
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run the test program here. It would run on the
# default GPU, not the one selected by the user. If mixed
# GPUs are installed, or if the GPUs are configured in
# exclusive mode, this causes bad detection.
comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__),
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
params, preambule=preambule, body=body,
try_run=False, output=True)
dnn_available.avail = comp
......@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True)
def local_dnn_conv_inplace(node):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs):
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True)
def local_dnn_convgw_inplace(node):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace,
local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"),
name="local_dnna_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4)
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4)
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
......@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
"""
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
def nvcc_kernel(name, params, body):
"""
Return the c code of a kernel function.
......@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
"""
ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
# get max of buf (trashing all but buf[0])
return [inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
......@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
......@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Optional, the dtype of the output.
manner_fn
A function that accepts strings of arguments a and b, and returns c code
for their reduction.
for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init
A function that accepts strings of arguments a and return c code for its
......@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
Parameters
----------
N
N
Length of the buffer, at least warpSize (32).
buf
A shared memory buffer of size warpSize * sizeof(dtype).
......
import os
import numpy
from theano import Op, Apply, config
from theano.gof import local_optimizer
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
try:
import pygpu
from pygpu import gpuarray, elemwise
from pygpu import gpuarray
except ImportError:
pass
from .basic_ops import (as_gpuarray_variable,
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
......@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype=ten4.type.dtype)()])
def c_code_cache_version(self):
return (10,1)
return (11,)
def c_headers(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
return ['<numpy_compat.h>', '<gpuarray/types.h>']
def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype
......
......@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)]
......
......@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2)
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)]
@register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2)
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)]
......
......@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def _f16_ok(self):
return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self):
return self.iadd_node.op.c_headers()
def c_compiler(self):
return self.iadd_node.op.c_compiler()
def c_init_code(self):
return self.iadd_node.op.c_init_code()
......@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (2,)
return parent_version + elemwise_version + (3,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
import unittest
from theano.compat import izip
from copy import copy, deepcopy
from six import iteritems
......@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file
from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu,
......@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuContiguous,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
......@@ -63,22 +61,6 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
    """Build a decorator that reports an expected failure as a skipped test.

    The decorated callable is run unchanged.  If it raises an exception that
    is an instance of ``EClass``, that error is replaced by
    ``SkipTest(msg, e)``; every other exception propagates untouched.

    Parameters
    ----------
    msg
        Explanation handed to ``SkipTest`` when the expected error occurs.
    EClass
        Exception class (or tuple of classes, as accepted by ``isinstance``)
        to mask.

    Returns
    -------
    callable
        A decorator.  The wrapper it produces forwards all positional and
        keyword arguments to the wrapped callable and returns its result.

    """
    # Local import: keeps this helper self-contained.
    import functools

    def test_decorator(f):
        # functools.wraps copies __name__, __doc__ and __module__, so the
        # wrapped test keeps its identity instead of only its name.
        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            try:
                return f(*args, **kwargs)
            except Exception as e:
                if isinstance(e, EClass):
                    raise SkipTest(msg, e)
                # Not the exception we were asked to mask: let it through.
                raise
        return wrapper
    return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None):
if mode is None:
......@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else:
err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference "
"call (got: %s, expected %s)") % \
"call (got: %s, expected %s)" %
(self.gpu_op, testname, type(exc),
type(ref_e))
type(ref_e)))
exc.args += (err_msg,)
raise
......@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected):
self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % (
self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype))
"(dtype %s), got %s (dtype %s)." %
(self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype)))
for description, check in iteritems(self.checks):
if not check(inputs, variables):
......@@ -250,36 +232,6 @@ def test_transfer_strided():
assert numpy.all(fv == av)
@may_fail("Op fails if both contexts are not the same and it's rare "
          "that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
    """Check gpu_from_cuda and cuda_from_gpu on plain and strided inputs.

    A 5x4 float32 array is loaded as both a pygpu array and a CudaNdarray;
    each transfer op is then applied to the full array and to a
    ``[:, ::-2]`` view of it, and the result is compared with the matching
    value of the other type.
    """
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available is False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")

    gpu_var = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    cuda_var = cuda_ndarray.CudaNdarrayType((False, False))('c')

    host_val = theano._asarray(rng.rand(5, 4), dtype='float32')
    gpu_val = gpuarray.array(host_val)
    cuda_val = cuda_ndarray.CudaNdarray(host_val)

    # Reversed, step-2 views along the second axis.
    gpu_strided = gpu_val[:, ::-2]
    cuda_strided = cuda_val[:, ::-2]

    # cuda -> gpuarray
    to_gpu = theano.function([cuda_var], gpu_from_cuda(cuda_var))
    for source, expected in ((cuda_val, gpu_val),
                             (cuda_strided, gpu_strided)):
        assert GpuArrayType.values_eq_approx(to_gpu(source), expected)

    # gpuarray -> cuda
    to_cuda = theano.function([gpu_var], cuda_from_gpu(gpu_var))
    for source, expected in ((gpu_val, cuda_val),
                             (gpu_strided, cuda_strided)):
        assert cuda_ndarray.CudaNdarrayType.values_eq_approx(
            to_cuda(source), expected)
def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype)
g[:] = x
......@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op=gpu_alloc,
cases=dict(
correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)),
correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4),
......@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32')
gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()]
......
......@@ -6,8 +6,7 @@ import numpy
import theano
from theano import tensor
from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive,
_dot22)
from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor
......@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu)
from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace,
gpugemm_inplace,
gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(
dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
)
)
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
......@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(
test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
)
)
class TestGpuSger(TestGer):
......@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def test_f32_0_0(self):
    """Always skipped: 0-sized objects are not supported."""
    raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self):
    """Always skipped: 0-sized objects are not supported."""
    raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self):
    """Always skipped: 0-sized objects are not supported."""
    raise SkipTest('0-sized objects not supported')
......@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester = makeTester(
'GpuGemmTester',
'GpuDot22Tester',
op=_dot22, gpu_op=gpu_dot22,
cases=dict(
test1=[rand(3, 4), rand(4, 5)],
test2=[rand(1, 4), rand(4, 5)],
test3=[rand(3, 1), rand(1, 5)],
test4=[rand(3, 4), rand(4, 1)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
# test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)],
)
)
def test_hgemm_swap():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge():
from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5':
......
import unittest
from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
from .test_basic_ops import mode_with_gpu
from ..neighbours import GpuImages2Neibs
......
from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy
import unittest
......@@ -7,8 +7,6 @@ import theano
import theano.tensor as T
import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu,
mode_without_gpu)
......@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in = 4098
n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y')
b = T.fvector('b')
#W = T.fmatrix('W')
# We precompute the dot product with the big shape beforehand so that the test of
# GpuCrossentropySoftmax1HotWithBiasDx does not fail with the error
#(the launch timed out and was terminated) on GPU card not
# (the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner
# case.
dot_result = T.fmatrix('dot_result')
......@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
......@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW],
mode=mode_with_gpu)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
......@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when there are too many threads
"""
n_in = 1000
batch_size = 4097
n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099
# Seed numpy.random with config.unittests.rseed
......@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol = 1e-5
atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol):
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print('max err index:', max_i, max_i / batch_size, end=' ')
print(max_i % batch_size, max_i / n_out, max_i & n_out)
print('At that index:')
print('err:', scaled_err.flatten()[max_i])
print('absolute error:', abs_err.flatten()[max_i])
print('relative error:', rel_err.flatten()[max_i])
print('cpu_out:', cpu_out.flatten()[max_i])
print('gpu_out:', gpu_out.flatten()[max_i])
print('softmax_output_value:', softmax_output_value.flatten()[max_i])
print('dnll_value:', dnll_value[max_i / n_out])
print('y_idx_value:', y_idx_value[max_i / n_out])
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
def test_softmax_with_bias_float16():
......@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float16')
def test_softmax_with_bias_float32():
    """Run the softmax-with-bias template with float32 input and bias."""
    dtype = 'float32'
    softmax_with_bias_unittest_template(dtypeInput=dtype, dtypeBias=dtype)
......@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED)
"""
x = T.matrix('x', dtype=dtypeInput)
b = T.vector('b', dtype=dtypeBias)
......@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def test_softmax_float16():
    """Run the shared softmax test template with 'float16'."""
    softmax_unittest_template('float16')
def test_softmax_float32():
    """Run the shared softmax test template with 'float32'."""
    softmax_unittest_template('float32')
def test_softmax_float64():
    """Run the shared softmax test template with 'float64'."""
    softmax_unittest_template('float64')
......
import operator
import numpy
import theano
......@@ -25,7 +23,6 @@ def test_deep_copy():
def test_values_eq_approx():
a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
assert GpuArrayType.values_eq_approx(a, a)
b = a.copy()
b[0] = numpy.asarray(b[0]) + 1.
......
......@@ -184,7 +184,7 @@ class GpuArrayType(Type):
@staticmethod
def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)):
not isinstance(b, gpuarray.GpuArray)):
return False
return pygpu.gpuarray.may_share_memory(a, b)
......@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self.broadcastable == other.broadcastable)
def convert_variable(self, var):
if (type(self) == type(var.type) and
self.typecode == var.type.typecode and
self.ndim == var.type.ndim and
vt = var.type
if (type(self) == type(vt) and
self.typecode == vt.typecode and
self.ndim == vt.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable,
var.type.broadcastable))):
vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self):
......
......@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
"sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_neighbours.py",
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
"sandbox/gpuarray/tests/test_type.py",
"sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_blas.py",
"sandbox/gpuarray/tests/test_elemwise.py",
"sandbox/gpuarray/tests/test_nnet.py",
"sandbox/gpuarray/tests/test_basic_ops.py",
"scan_module/scan_utils.py",
"scan_module/scan_views.py",
"scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论