Commit 645557f9, authored by Pascal Lamblin

Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu
[nosetest] [nosetest]
match=^test match=^test
nocapture=1 nocapture=1
[flake8]
ignore=E501,E123,E133
...@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): ...@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \ if (config.device.startswith('cuda') or
config.gpuarray.init_device != '': config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr # Use config.numpy to call numpy.seterr
......
...@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam): ...@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self.default = default self.default = default
def filter(val): def filter(val):
if val.startswith('cpu') or val.startswith('gpu') \ if val == self.default or val.startswith('gpu') \
or val.startswith('opencl') or val.startswith('cuda'): or val.startswith('opencl') or val.startswith('cuda'):
return val return val
else: else:
raise ValueError(('Invalid value ("%s") for configuration ' raise ValueError(('Invalid value ("%s") for configuration '
'variable "%s". Valid options start with ' 'variable "%s". Valid options start with '
'one of "cpu", "gpu", "opencl", "cuda"' 'one of "%s", "gpu", "opencl", "cuda"'
% (val, self.fullname))) % (self.default, val, self.fullname)))
over = kwargs.get("allow_override", True) over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over) super(DeviceParam, self).__init__(default, filter, over)
def __str__(self): def __str__(self):
return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,) return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar( AddConfigVar(
'device', 'device',
...@@ -94,15 +94,7 @@ AddConfigVar( ...@@ -94,15 +94,7 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if " "on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."), "NVIDIA use capital letters."),
DeviceParam('cpu', allow_override=False), DeviceParam('cpu', allow_override=False),
in_c_key=False,) in_c_key=False)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False)
AddConfigVar( AddConfigVar(
'init_gpu_device', 'init_gpu_device',
...@@ -110,12 +102,7 @@ AddConfigVar( ...@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, " "Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. " "nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."), "It can be used to run GPU-specific tests on a particular GPU."),
EnumStr('', 'gpu', DeviceParam('', allow_override=False),
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
in_c_key=False) in_c_key=False)
AddConfigVar( AddConfigVar(
...@@ -131,6 +118,112 @@ AddConfigVar( ...@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key=False) in_c_key=False)
def default_cuda_root():
    """Compute the default value of the ``cuda.root`` option.

    Returns
    -------
    str
        The ``CUDA_ROOT`` environment variable when it is set and
        non-empty; otherwise the parent directory of the first ``PATH``
        entry that contains an entry named ``nvcc``; otherwise ``''``
        (also when ``PATH`` is unset or empty).
    """
    cuda_root = os.getenv('CUDA_ROOT', "")
    if cuda_root:
        return cuda_root
    search_path = os.getenv("PATH")
    if not search_path:
        return ''
    for entry in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(entry, "nvcc")):
            # Normalize first: for an entry written with a trailing
            # separator ("/usr/local/cuda/bin/"), os.path.split() would
            # return the bin directory itself rather than its parent.
            return os.path.dirname(os.path.normpath(entry))
    return ''
# Register the ``cuda.root`` option.  The default is given as the callable
# default_cuda_root (not its result), so it is not evaluated here.
# NOTE(review): the help text below mentions an "AUTO" value and a
# /usr/local/cuda fallback, but default_cuda_root() returns '' when neither
# CUDA_ROOT nor an nvcc on PATH is found -- confirm and update the text.
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalize the value of the ``nvcc.flags`` option.

    Parameters
    ----------
    s : str
        Whitespace-separated nvcc flags.

    Returns
    -------
    str
        The flags joined by single spaces, with empty items dropped.

    Raises
    ------
    ValueError
        If any item does not start with ``-``, i.e. a flag value was
        separated from its flag by whitespace instead of ``=``.
    """
    assert isinstance(s, str)
    # split() without an argument splits on any run of whitespace, so a
    # tab- or newline-separated value ("--machine\t64") is rejected just
    # like a space-separated one instead of slipping through as one flag.
    flags = s.split()
    if any(not flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
# nvcc-related options and the gpuarray synchronisation switch.

# Values of nvcc.flags are passed through filter_nvcc_flags.
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in the C key since it is already added.
# We leave it out because we don't want the md5 of the config to
# change depending on whether theano.sandbox.cuda is loaded.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
# NOTE(review): this option is registered with an empty help string.
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in the C key since it is already added.
# We leave it out because we don't want the md5 of the config to
# change depending on whether theano.sandbox.cuda is loaded.
in_c_key=False)
# Unlike the options above, gpuarray.sync is registered with in_c_key=True.
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# cuDNN convolution options.

# Deprecated flags (see their help strings); the only value listed for
# them is the empty string.
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
# Algorithm selection for the forward and backward convolutions.
# presumably the first EnumStr argument is the default -- TODO confirm
# against EnumStr.
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
def default_dnn_path(suffix):
    """Build a deferred default for a cuDNN path option.

    The returned callable needs no arguments.  When called it yields ''
    if ``config.cuda.root`` is the empty string, and otherwise
    ``config.cuda.root`` joined with ``suffix``.
    """
    def f(suffix=suffix):
        root = config.cuda.root
        # An empty cuda root must not turn into a bare relative suffix.
        return os.path.join(root, suffix) if root != '' else ''
    return f
# cuDNN search paths.  Both defaults are deferred callables built by
# default_dnn_path from config.cuda.root.
AddConfigVar('dnn.include_path',
             "Location of the cudnn header (defaults to the cuda root)",
             StrParam(default_dnn_path('include')))

# The help text used to be a copy of the include_path one ("header");
# this option is the library directory.
AddConfigVar('dnn.library_path',
             "Location of the cudnn library (defaults to the cuda root)",
             StrParam(default_dnn_path('lib64')))
# This flag determines whether or not to raise error/warning message if # This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph. # there is a CPU Op in the computational graph.
AddConfigVar( AddConfigVar(
......
...@@ -102,7 +102,7 @@ def change_flags(**kwargs): ...@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l = [v for v in theano.configparser._config_var_list l = [v for v in theano.configparser._config_var_list
if v.fullname == k] if v.fullname == k]
assert len(l) == 1 assert len(l) == 1
old_val[k] = l[0].__get__() old_val[k] = l[0].__get__(True, None)
try: try:
for k in kwargs: for k in kwargs:
l = [v for v in theano.configparser._config_var_list l = [v for v in theano.configparser._config_var_list
...@@ -167,7 +167,7 @@ def _config_print(thing, buf): ...@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for cv in _config_var_list: for cv in _config_var_list:
print(cv, file=buf) print(cv, file=buf)
print(" Doc: ", cv.doc, file=buf) print(" Doc: ", cv.doc, file=buf)
print(" Value: ", cv.__get__(), file=buf) print(" Value: ", cv.__get__(True, None), file=buf)
print("", file=buf) print("", file=buf)
...@@ -182,7 +182,7 @@ def get_config_md5(): ...@@ -182,7 +182,7 @@ def get_config_md5():
all_opts = sorted([c for c in _config_var_list if c.in_c_key], all_opts = sorted([c for c in _config_var_list if c.in_c_key],
key=lambda cv: cv.fullname) key=lambda cv: cv.fullname)
return theano.gof.utils.hash_from_code('\n'.join( return theano.gof.utils.hash_from_code('\n'.join(
['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts])) ['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
class TheanoConfigParser(object): class TheanoConfigParser(object):
...@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True): ...@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars # Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user. # This allow to filter wrong value from the user.
if not callable(configparam.default): if not callable(configparam.default):
configparam.__get__() configparam.__get__(root, type(root))
else: else:
# We do not want to evaluate now the default value # We do not want to evaluate now the default value
# when it is a callable. # when it is a callable.
try: try:
fetch_val_for_key(configparam.fullname) fetch_val_for_key(configparam.fullname)
# The user provided a value, filter it now. # The user provided a value, filter it now.
configparam.__get__() configparam.__get__(root, type(root))
except KeyError: except KeyError:
pass pass
setattr(root.__class__, sections[0], configparam) setattr(root.__class__, sections[0], configparam)
...@@ -294,6 +294,7 @@ class ConfigParam(object): ...@@ -294,6 +294,7 @@ class ConfigParam(object):
self.default = default self.default = default
self.filter = filter self.filter = filter
self.allow_override = allow_override self.allow_override = allow_override
self.is_default = True
# N.B. -- # N.B. --
# self.fullname # set by AddConfigVar # self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar # self.doc # set by AddConfigVar
...@@ -304,16 +305,19 @@ class ConfigParam(object): ...@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is # Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects. # invalid and causes a crash or has unwanted side effects.
def __get__(self, *args): def __get__(self, cls, type_):
if cls is None:
return self
if not hasattr(self, 'val'): if not hasattr(self, 'val'):
try: try:
val_str = fetch_val_for_key(self.fullname) val_str = fetch_val_for_key(self.fullname)
self.is_default = False
except KeyError: except KeyError:
if callable(self.default): if callable(self.default):
val_str = self.default() val_str = self.default()
else: else:
val_str = self.default val_str = self.default
self.__set__(None, val_str) self.__set__(cls, val_str)
# print "RVAL", self.val # print "RVAL", self.val
return self.val return self.val
......
...@@ -1171,7 +1171,7 @@ def apply_meth(tag): ...@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code = self.code_sections[tag] code = self.code_sections[tag]
define_macros, undef_macros = self.get_c_macros(node, name) define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, code, return os.linesep.join(['', define_macros, code,
undef_macros]) undef_macros])
else: else:
raise utils.MethodNotDefined( raise utils.MethodNotDefined(
...@@ -1428,7 +1428,7 @@ class COp(Op): ...@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros, undef_macros = self.get_c_macros(node, name) def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub) def_sub, undef_sub = self.get_sub_macros(sub)
return os.linesep.join([def_macros, def_sub, return os.linesep.join(['', def_macros, def_sub,
op_code, op_code,
undef_sub, undef_macros]) undef_sub, undef_macros])
else: else:
...@@ -1442,17 +1442,21 @@ class COp(Op): ...@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros, undef_macros = self.get_c_macros(node, name, define_macros, undef_macros = self.get_c_macros(node, name,
check_input=False) check_input=False)
ctx = ""
if 'context' in sub:
ctx = ", %s" % (sub['context'],)
# Generate the C code # Generate the C code
return """ return """
%(define_macros)s %(define_macros)s
{ {
if (%(func_name)s(%(func_args)s) != 0) { if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
%(fail)s %(fail)s
} }
} }
%(undef_macros)s %(undef_macros)s
""" % dict(func_name=self.func_name, """ % dict(func_name=self.func_name,
fail=sub['fail'], fail=sub['fail'], ctx=ctx,
func_args=self.format_c_function_args(inp, out), func_args=self.format_c_function_args(inp, out),
define_macros=define_macros, define_macros=define_macros,
undef_macros=undef_macros) undef_macros=undef_macros)
......
...@@ -535,7 +535,7 @@ def handle_shared_float32(tf): ...@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py # import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'): if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device, test_driver=False) use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device: elif config.init_gpu_device.startswith('gpu'):
assert config.device == "cpu", ( assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device" "We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'") " only when the Theano flag device=='cpu'")
......
...@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1; return 1;
} }
""" """
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the # Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags( comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), params=params, preambule=preambule, body=body,
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
dnn_available.avail = comp dnn_available.avail = comp
......
...@@ -8,6 +8,7 @@ import warnings ...@@ -8,6 +8,7 @@ import warnings
import numpy import numpy
from theano import config
from theano.compat import decode, decode_iter from theano.compat import decode, decode_iter
from theano.gof import local_bitwidth from theano.gof import local_bitwidth
from theano.gof.utils import hash_from_file from theano.gof.utils import hash_from_file
...@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen ...@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler") _logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
from theano.configparser import (config, AddConfigVar, StrParam,
BoolParam, ConfigParam)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
user_provided_cuda_root = True
def default_cuda_root():
    """Compute the default ``cuda.root`` and note that a default was used.

    Sets the module-level ``user_provided_cuda_root`` flag to False, then
    returns the ``CUDA_ROOT`` environment variable when it is non-empty and
    the result of ``find_cuda_root()`` otherwise.
    """
    global user_provided_cuda_root
    env_root = os.getenv('CUDA_ROOT', "")
    # Record that the value came from this default computation.
    user_provided_cuda_root = False
    return env_root if env_root else find_cuda_root()
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalize the value of the ``nvcc.flags`` option.

    Parameters
    ----------
    s : str
        Whitespace-separated nvcc flags.

    Returns
    -------
    str
        The flags joined by single spaces, with empty items dropped.

    Raises
    ------
    ValueError
        If any item does not start with ``-``, i.e. a flag value was
        separated from its flag by whitespace instead of ``=``.
    """
    assert isinstance(s, str)
    # split() without an argument splits on any run of whitespace, so a
    # tab- or newline-separated value ("--machine\t64") is rejected just
    # like a space-separated one instead of slipping through as one flag.
    flags = s.split()
    if any(not flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
nvcc_path = 'nvcc' nvcc_path = 'nvcc'
nvcc_version = None nvcc_version = None
...@@ -115,14 +55,6 @@ def is_nvcc_available(): ...@@ -115,14 +55,6 @@ def is_nvcc_available():
return False return False
def find_cuda_root():
    """Locate the CUDA installation from the directories listed in PATH.

    Returns
    -------
    str or None
        The parent directory of the first ``PATH`` entry that contains an
        entry named ``nvcc``; None when ``PATH`` is unset or empty, or when
        no entry contains it.
    """
    search_path = os.getenv("PATH")
    if not search_path:
        return None
    for entry in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(entry, "nvcc")):
            # Normalize first: for an entry written with a trailing
            # separator ("/usr/local/cuda/bin/"), os.path.split() would
            # return the bin directory itself rather than its parent.
            return os.path.dirname(os.path.normpath(entry))
    return None
rpath_defaults = [] rpath_defaults = []
...@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler): ...@@ -229,7 +161,7 @@ class NVCC_compiler(Compiler):
include_dirs include_dirs
A list of include directory names (each gets prefixed with -I). A list of include directory names (each gets prefixed with -I).
lib_dirs lib_dirs
A list of library search path directory names (each gets A list of library search path directory names (each gets
prefixed with -L). prefixed with -L).
libs libs
A list of libraries to link with (each gets prefixed with -l). A list of libraries to link with (each gets prefixed with -l).
...@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler): ...@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but # provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and # otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details. # https://wiki.debian.org/RpathIssue for details.
if (user_provided_cuda_root and
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))): os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib')) rpaths.append(os.path.join(config.cuda.root, 'lib'))
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
# cuDNN search paths; the defaults are evaluated lazily from
# config.cuda.root.
AddConfigVar('dnn.include_path',
             "Location of the cudnn header (defaults to the cuda root)",
             StrParam(lambda: os.path.join(config.cuda.root, 'include')))

# The help text used to be a copy of the include_path one ("header");
# this option is the library directory.
AddConfigVar('dnn.library_path',
             "Location of the cudnn library (defaults to the cuda root)",
             StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
...@@ -19,13 +19,6 @@ try: ...@@ -19,13 +19,6 @@ try:
except ImportError: except ImportError:
pygpu = None pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
...@@ -57,8 +50,9 @@ if pygpu: ...@@ -57,8 +50,9 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '': elif (config.init_gpu_device.startswith('cuda') or
init_dev(config.gpuarray.init_device) config.init_gpu_device.startswith('opencl')):
init_dev(config.init_gpu_device)
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost, from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu) GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
...@@ -70,7 +64,8 @@ if pygpu: ...@@ -70,7 +64,8 @@ if pygpu:
except Exception: except Exception:
error("Could not initialize pygpu, support disabled", exc_info=True) error("Could not initialize pygpu, support disabled", exc_info=True)
else: else:
if (config.gpuarray.init_device != '' or if (config.init_gpu_device.startswith('cuda') or
config.device.startswith('opencl') or config.init_gpu_device.startswith('opencl') or
config.device.startswith('cuda')): config.device.startswith('opencl') or
config.device.startswith('cuda')):
error("pygpu was configured but could not be imported", exc_info=True) error("pygpu was configured but could not be imported", exc_info=True)
...@@ -5,17 +5,15 @@ import theano ...@@ -5,17 +5,15 @@ import theano
from theano import config, gof from theano import config, gof
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel) from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from theano.gof import utils from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op): class GpuConv(GpuKernelBase, gof.Op):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
...@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -70,19 +68,19 @@ class GpuConv(GpuKernelBase, gof.Op):
raise ValueError(mode) raise ValueError(mode)
def __init__(self, border_mode, def __init__(self, border_mode,
subsample=(1, 1), subsample=(1, 1),
logical_img_hw=None, logical_img_hw=None,
logical_kern_hw=None, logical_kern_hw=None,
logical_kern_align_top=True, logical_kern_align_top=True,
version=-1, version=-1,
direction_hint=None, direction_hint=None,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None, imshp=None,
max_threads_dim0=None, max_threads_dim0=None,
nkern=None, nkern=None,
bsize=None, bsize=None,
fft_opt=True): fft_opt=True):
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
if logical_img_hw is not None: if logical_img_hw is not None:
...@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -182,7 +180,7 @@ class GpuConv(GpuKernelBase, gof.Op):
def flops(self, inputs, outputs): def flops(self, inputs, outputs):
""" """
Useful with the hack in profilemode to print the MFlops. Useful with the hack in profilemode to print the MFlops.
""" """
images, kerns = inputs images, kerns = inputs
out, = outputs out, = outputs
...@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
nb = self.kshp[1] nb = self.kshp[1]
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)]
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 21) return (0, 22)
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
......
...@@ -26,10 +26,7 @@ from .conv import GpuConv ...@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge, inplace_allocempty
# We need to import this to define the flags.
from theano.sandbox import dnn_flags # noqa
def dnn_available(): def dnn_available():
...@@ -50,7 +47,6 @@ def dnn_available(): ...@@ -50,7 +47,6 @@ def dnn_available():
dnn_available.avail = False dnn_available.avail = False
preambule = """ preambule = """
#include <stdio.h> #include <stdio.h>
#include <cuda.h>
#include <cudnn.h> #include <cudnn.h>
#include <cudnn_helper.h> #include <cudnn_helper.h>
""" """
...@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1; return 1;
} }
""" """
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the # Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags( comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), params, preambule=preambule, body=body,
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
dnn_available.avail = comp dnn_available.avail = comp
...@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20, ...@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn') 'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node): def local_dnn_conv_inplace(node, inputs):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node): def local_dnn_convgw_inplace(node, inputs):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node): def local_dnn_convgi_inplace(node, inputs):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace, local_dnn_convgw_inplace,
local_dnn_convgi_inplace, local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"), name="local_dnna_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn') 70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
...@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -174,16 +174,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
ctype = gpuarray.dtype_to_ctype(dtype) ctype = gpuarray.dtype_to_ctype(dtype)
return [ # get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0]) return [inline_reduce_max(N, buf, threadPos, threadCount),
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)', buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]', buf2 + '[__i] = ' + buf + '[__i]',
'}', '}',
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
...@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -192,8 +191,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
'__syncthreads()', '__syncthreads()',
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}', '}',
'__syncthreads()', '__syncthreads()',
] ]
...@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -232,7 +231,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
Optional, the dtype of the output. Optional, the dtype of the output.
manner_fn manner_fn
A function that accepts strings of arguments a and b, and returns c code A function that accepts strings of arguments a and b, and returns c code
for their reduction. for their reduction.
Example: return "%(a)s + %(b)s" for a sum reduction. Example: return "%(a)s + %(b)s" for a sum reduction.
manner_init manner_init
A function that accepts strings of arguments a and return c code for its A function that accepts strings of arguments a and return c code for its
...@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, ...@@ -259,7 +258,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count,
loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" % loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf) "%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos)) r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos)) r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
...@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, ...@@ -324,7 +323,7 @@ def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x,
Parameters Parameters
---------- ----------
N N
Length of the buffer, atleast waprSize(32). Length of the buffer, atleast waprSize(32).
buf buf
A shared memory buffer of size warpSize * sizeof(dtype). A shared memory buffer of size warpSize * sizeof(dtype).
......
import os
import numpy import numpy
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.gof import local_optimizer
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T import theano.tensor as T
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,1) return (11,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<numpy_compat.h>', '<gpuarray/types.h>']
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
......
...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node): ...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt() @opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2) @alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs): def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt() @opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2) @output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs): def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
......
...@@ -645,13 +645,13 @@ def local_gpua_hgemm(node): ...@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs): def local_gpuagemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2) @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs): def local_gpuagemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
......
...@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def _f16_ok(self): def _f16_ok(self):
return self.iadd_node.op._f16_ok return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return self.iadd_node.op.c_headers() return self.iadd_node.op.c_headers()
def c_compiler(self):
return self.iadd_node.op.c_compiler()
def c_init_code(self): def c_init_code(self):
return self.iadd_node.op.c_init_code() return self.iadd_node.op.c_init_code()
...@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version() elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version: if not parent_version or not elemwise_version:
return return
return parent_version + elemwise_version + (2,) return parent_version + elemwise_version + (3,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
import unittest import unittest
from theano.compat import izip from theano.compat import izip
from copy import copy, deepcopy
from six import iteritems from six import iteritems
...@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc ...@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file # Don't import test classes otherwise they get tested as part of the file
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None: if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed") raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather # If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it. # import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number: if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu, # We should not enable all the use like the flag device=gpu,
...@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: ...@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not theano.sandbox.gpuarray.pygpu_activated: if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled") raise SkipTest("pygpu disabled")
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuContiguous,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed()) rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
...@@ -63,22 +61,6 @@ else: ...@@ -63,22 +61,6 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
"""Mark a test that requires very specific conditions to work to
mask a specific exception class."""
def test_decorator(f):
def wrapper():
try:
f()
except Exception as e:
if isinstance(e, EClass):
raise SkipTest(msg, e)
raise
wrapper.__name__ = f.__name__
return wrapper
return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False, def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None): on_unused_input='raise', name=None):
if mode is None: if mode is None:
...@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else: else:
err_msg = ("Test %s::%s: exception raised during test " err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference " "call was not the same as the reference "
"call (got: %s, expected %s)") % \ "call (got: %s, expected %s)" %
(self.gpu_op, testname, type(exc), (self.gpu_op, testname, type(exc),
type(ref_e)) type(ref_e)))
exc.args += (err_msg,) exc.args += (err_msg,)
raise raise
...@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected): expected):
self.fail(("Test %s::%s: Output %s gave the wrong " self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s " "value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % ( "(dtype %s), got %s (dtype %s)." %
self.op, testname, i, inputs, expected, (self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype)) expected.dtype, variable, variable.dtype)))
for description, check in iteritems(self.checks): for description, check in iteritems(self.checks):
if not check(inputs, variables): if not check(inputs, variables):
...@@ -250,36 +232,6 @@ def test_transfer_strided(): ...@@ -250,36 +232,6 @@ def test_transfer_strided():
assert numpy.all(fv == av) assert numpy.all(fv == av)
@may_fail("Op fails if both contexts are not the same and it's rare "
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
assert GpuArrayType.values_eq_approx(fv, gv)
fvs = f(cvs)
assert GpuArrayType.values_eq_approx(fvs, gvs)
f = theano.function([g], cuda_from_gpu(g))
fv = f(gv)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
fvs = f(gvs)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def gpu_alloc_expected(x, *shp): def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype) g = gpuarray.empty(shp, dtype=x.dtype)
g[:] = x g[:] = x
...@@ -291,8 +243,8 @@ GpuAllocTester = makeTester( ...@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op=gpu_alloc, gpu_op=gpu_alloc,
cases=dict( cases=dict(
correct01=(rand(), numpy.int32(7)), correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU # just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)), # correct01_bcast=(rand(1), numpy.int32(7)),
correct02=(rand(), numpy.int32(4), numpy.int32(7)), correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct12=(rand(7), numpy.int32(4), numpy.int32(7)), correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4), correct13=(rand(7), numpy.int32(2), numpy.int32(4),
...@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i(): ...@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv = gpuarray.asarray(numpy.random.rand(5, 4), cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32') dtype='float32')
gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
f = theano.function([a], gpu_from_host(a), mode=m) f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()] for x in f.maker.fgraph.toposort()]
......
...@@ -6,8 +6,7 @@ import numpy ...@@ -6,8 +6,7 @@ import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive, from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
...@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand, ...@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu) mode_with_gpu)
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm) GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester(
op=gemv_inplace, gpu_op=gpugemv_inplace, 'GpuGemvTester',
cases=dict( op=gemv_inplace, gpu_op=gpugemv_inplace,
dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0], cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0], dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0], # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0], # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0], # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0], test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
) )
) )
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin): class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
...@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin): ...@@ -48,24 +47,24 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return theano.shared(val) return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester', GpuGemmTester = makeTester(
op=gemm_inplace, gpu_op=gpugemm_inplace, 'GpuGemmTester',
cases=dict( op=gemm_inplace, gpu_op=gpugemm_inplace,
test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0], cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0], test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0], test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0], test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6], test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0], test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0], test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1], test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.1],
test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1], test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.1],
# test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0], # test10=[rand(0, 4), -1.0, rand(0, 5), rand(5, 4), 0.0],
# test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1], # test11=[rand(3, 0), -1.0, rand(3, 5), rand(5, 0), 1.1],
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1], # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1], # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
) )
) )
class TestGpuSger(TestGer): class TestGpuSger(TestGer):
...@@ -84,8 +83,10 @@ class TestGpuSger(TestGer): ...@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def test_f32_0_0(self): def test_f32_0_0(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self): def test_f32_1_0(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self): def test_f32_0_1(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
...@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin): ...@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester = makeTester( GpuDot22Tester = makeTester(
'GpuGemmTester', 'GpuDot22Tester',
op=_dot22, gpu_op=gpu_dot22, op=_dot22, gpu_op=gpu_dot22,
cases=dict( cases=dict(
test1=[rand(3, 4), rand(4, 5)], test1=[rand(3, 4), rand(4, 5)],
test2=[rand(1, 4), rand(4, 5)], test2=[rand(1, 4), rand(4, 5)],
test3=[rand(3, 1), rand(1, 5)], test3=[rand(3, 1), rand(1, 5)],
test4=[rand(3, 4), rand(4, 1)], test4=[rand(3, 4), rand(4, 1)],
# test5=[rand(0, 4), rand(4, 5)], # test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)], # test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)], # test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)], # test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def test_hgemm_swap(): def test_hgemm_swap():
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
...@@ -149,6 +151,7 @@ def test_hgemm_swap(): ...@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt.assert_allclose(of, on) utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge(): def test_hgemm_alpha_output_merge():
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
......
import unittest
from theano.tensor.nnet.tests import test_neighbours from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import mode_with_gpu
mode_without_gpu)
from ..neighbours import GpuImages2Neibs from ..neighbours import GpuImages2Neibs
......
from __future__ import print_function from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy import numpy
import unittest import unittest
...@@ -7,8 +7,6 @@ import theano ...@@ -7,8 +7,6 @@ import theano
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import (mode_with_gpu,
mode_without_gpu) mode_without_gpu)
...@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in = 4098 n_in = 4098
n_out = 4099 n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y') y = T.lvector('y')
b = T.fvector('b') b = T.fvector('b')
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of # we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not # (the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner # powerful enough. We need the big shape to check for corner
# case. # case.
dot_result = T.fmatrix('dot_result') dot_result = T.fmatrix('dot_result')
...@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx = numpy.asarray(numpy.random.rand(batch_size, n_in), xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32) dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32') yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32') b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32') W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
...@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu = theano.function(inputs=[y, b, dot_result], classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW], outputs=[loss, y_pred, dW],
mode=mode_with_gpu) mode=mode_with_gpu)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op, assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias) T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
...@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads We check that we loop when their is too much threads
""" """
n_in = 1000
batch_size = 4097 batch_size = 4097
n_out = 1250 n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode): if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099 n_out = 4099
# Seed numpy.random with config.unittests.rseed # Seed numpy.random with config.unittests.rseed
...@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol = 1e-5 rtol = 1e-5
atol = 1e-6 atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol): utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print('max err index:', max_i, max_i / batch_size, end=' ')
print(max_i % batch_size, max_i / n_out, max_i & n_out)
print('At that index:')
print('err:', scaled_err.flatten()[max_i])
print('absolute error:', abs_err.flatten()[max_i])
print('relative error:', rel_err.flatten()[max_i])
print('cpu_out:', cpu_out.flatten()[max_i])
print('gpu_out:', gpu_out.flatten()[max_i])
print('softmax_output_value:', softmax_output_value.flatten()[max_i])
print('dnll_value:', dnll_value[max_i / n_out])
print('y_idx_value:', y_idx_value[max_i / n_out])
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
def test_softmax_with_bias_float16(): def test_softmax_with_bias_float16():
...@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16(): ...@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template(dtypeInput='float32', softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float16') dtypeBias='float16')
def test_softmax_with_bias_float32(): def test_softmax_with_bias_float32():
softmax_with_bias_unittest_template(dtypeInput='float32', softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float32') dtypeBias='float32')
...@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias): ...@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED) NOT IMPLEMENTED)
""" """
x = T.matrix('x', dtype=dtypeInput) x = T.matrix('x', dtype=dtypeInput)
b = T.vector('b', dtype=dtypeBias) b = T.vector('b', dtype=dtypeBias)
...@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias): ...@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def test_softmax_float16(): def test_softmax_float16():
softmax_unittest_template('float16') softmax_unittest_template('float16')
def test_softmax_float32(): def test_softmax_float32():
softmax_unittest_template('float32') softmax_unittest_template('float32')
def test_softmax_float64(): def test_softmax_float64():
softmax_unittest_template('float64') softmax_unittest_template('float64')
......
import operator
import numpy import numpy
import theano import theano
...@@ -25,7 +23,6 @@ def test_deep_copy(): ...@@ -25,7 +23,6 @@ def test_deep_copy():
def test_values_eq_approx(): def test_values_eq_approx():
a = rand_gpuarray(20, dtype='float32') a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
assert GpuArrayType.values_eq_approx(a, a) assert GpuArrayType.values_eq_approx(a, a)
b = a.copy() b = a.copy()
b[0] = numpy.asarray(b[0]) + 1. b[0] = numpy.asarray(b[0]) + 1.
......
...@@ -184,7 +184,7 @@ class GpuArrayType(Type): ...@@ -184,7 +184,7 @@ class GpuArrayType(Type):
@staticmethod @staticmethod
def may_share_memory(a, b): def may_share_memory(a, b):
if (not isinstance(a, gpuarray.GpuArray) or if (not isinstance(a, gpuarray.GpuArray) or
not isinstance(b, gpuarray.GpuArray)): not isinstance(b, gpuarray.GpuArray)):
return False return False
return pygpu.gpuarray.may_share_memory(a, b) return pygpu.gpuarray.may_share_memory(a, b)
...@@ -200,11 +200,12 @@ class GpuArrayType(Type): ...@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self.broadcastable == other.broadcastable) self.broadcastable == other.broadcastable)
def convert_variable(self, var): def convert_variable(self, var):
if (type(self) == type(var.type) and vt = var.type
self.typecode == var.type.typecode and if (type(self) == type(vt) and
self.ndim == var.type.ndim and self.typecode == vt.typecode and
self.ndim == vt.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable, all(sb == ob or ob for sb, ob in zip(self.broadcastable,
var.type.broadcastable))): vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self): def __hash__(self):
......
...@@ -157,24 +157,11 @@ whitelist_flake8 = [ ...@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py", "sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py", "sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
"sandbox/gpuarray/tests/test_subtensor.py", "sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py", "sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_neighbours.py",
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
"sandbox/gpuarray/tests/test_type.py",
"sandbox/gpuarray/tests/test_opt.py", "sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_blas.py",
"sandbox/gpuarray/tests/test_elemwise.py", "sandbox/gpuarray/tests/test_elemwise.py",
"sandbox/gpuarray/tests/test_nnet.py",
"sandbox/gpuarray/tests/test_basic_ops.py",
"scan_module/scan_utils.py", "scan_module/scan_utils.py",
"scan_module/scan_views.py", "scan_module/scan_views.py",
"scan_module/scan.py", "scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论