提交 645557f9 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3476 from abergeron/move_config

Multiple fixes preparing for multi-gpu
[nosetest] [nosetest]
match=^test match=^test
nocapture=1 nocapture=1
[flake8]
ignore=E501,E123,E133
...@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): ...@@ -109,8 +109,10 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if config.device.startswith('cuda') or config.device.startswith('opencl') or \ if (config.device.startswith('cuda') or
config.gpuarray.init_device != '': config.device.startswith('opencl') or
config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl')):
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
# Use config.numpy to call numpy.seterr # Use config.numpy to call numpy.seterr
......
...@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam): ...@@ -73,19 +73,19 @@ class DeviceParam(ConfigParam):
self.default = default self.default = default
def filter(val):
    """Validate a device name for this parameter.

    Accepts the parameter's own default value, or any string that
    starts with "gpu", "opencl" or "cuda", and returns it unchanged.

    Raises ValueError for any other value.
    """
    if val == self.default or val.startswith(('gpu', 'opencl', 'cuda')):
        return val
    # The arguments must follow the placeholder order of the message:
    # the rejected value, the variable name, then the accepted default.
    raise ValueError(('Invalid value ("%s") for configuration '
                      'variable "%s". Valid options start with '
                      'one of "%s", "gpu", "opencl", "cuda"'
                      % (val, self.fullname, self.default)))
over = kwargs.get("allow_override", True) over = kwargs.get("allow_override", True)
super(DeviceParam, self).__init__(default, filter, over) super(DeviceParam, self).__init__(default, filter, over)
def __str__(self): def __str__(self):
return '%s (cpu, gpu*, opencl*, cuda*) ' % (self.fullname,) return '%s (%s, gpu*, opencl*, cuda*) ' % (self.fullname, self.default)
AddConfigVar( AddConfigVar(
'device', 'device',
...@@ -94,14 +94,6 @@ AddConfigVar( ...@@ -94,14 +94,6 @@ AddConfigVar(
"on it. Do not use upper case letters, only lower case even if " "on it. Do not use upper case letters, only lower case even if "
"NVIDIA use capital letters."), "NVIDIA use capital letters."),
DeviceParam('cpu', allow_override=False), DeviceParam('cpu', allow_override=False),
in_c_key=False,)
AddConfigVar('gpuarray.init_device',
"""
Device to initialize for gpuarray use without moving
computations automatically.
""",
StrParam(''),
in_c_key=False) in_c_key=False)
AddConfigVar( AddConfigVar(
...@@ -110,12 +102,7 @@ AddConfigVar( ...@@ -110,12 +102,7 @@ AddConfigVar(
"Unlike 'device', setting this option will NOT move computations, " "Unlike 'device', setting this option will NOT move computations, "
"nor shared variables, to the specified GPU. " "nor shared variables, to the specified GPU. "
"It can be used to run GPU-specific tests on a particular GPU."), "It can be used to run GPU-specific tests on a particular GPU."),
EnumStr('', 'gpu', DeviceParam('', allow_override=False),
'gpu0', 'gpu1', 'gpu2', 'gpu3',
'gpu4', 'gpu5', 'gpu6', 'gpu7',
'gpu8', 'gpu9', 'gpu10', 'gpu11',
'gpu12', 'gpu13', 'gpu14', 'gpu15',
allow_override=False),
in_c_key=False) in_c_key=False)
AddConfigVar( AddConfigVar(
...@@ -131,6 +118,112 @@ AddConfigVar( ...@@ -131,6 +118,112 @@ AddConfigVar(
in_c_key=False) in_c_key=False)
def default_cuda_root():
    """Guess the CUDA installation root.

    Returns the ``CUDA_ROOT`` environment variable when it is set and
    non-empty. Otherwise scans the directories listed in ``PATH`` for an
    ``nvcc`` entry and returns the parent of the first directory that
    contains one. Returns '' when ``PATH`` is unset/empty or no such
    directory is found.
    """
    root = os.getenv('CUDA_ROOT', "")
    if root:
        return root
    search_path = os.getenv("PATH")
    if not search_path:
        return ''
    for bin_dir in search_path.split(os.path.pathsep):
        if os.path.exists(os.path.join(bin_dir, "nvcc")):
            head, tail = os.path.split(bin_dir)
            if not tail:
                # The PATH entry ended with a separator ("/cuda/bin/"), so
                # split() only dropped that separator; go up once more to
                # reach the real parent directory.
                head = os.path.dirname(head)
            return head
    return ''
# Registers the 'cuda.root' option. The default is passed as a callable
# (default_cuda_root), not as an already computed value.
# NOTE(review): the help text below mentions an "AUTO" default and a
# /usr/local/cuda fallback, but default_cuda_root as written returns ''
# when nvcc is not found on PATH -- confirm which of the two is stale.
AddConfigVar(
'cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
    """Validate and normalize the value of the ``nvcc.flags`` option.

    The string is split on whitespace; every resulting token must start
    with "-". Returns the tokens joined by single spaces.

    Raises ValueError if any token does not start with "-", which is
    what happens when a flag and its value are separated by whitespace
    instead of "=".
    """
    assert isinstance(s, str)
    # split() with no argument breaks on any whitespace run (tabs and
    # newlines included) and never yields empty tokens.
    flags = s.split()
    if not all(flag.startswith("-") for flag in flags):
        raise ValueError(
            "Theano nvcc.flags support only parameter/value pairs without"
            " space between them. e.g.: '--machine 64' is not supported,"
            " but '--machine=64' is supported. Please add the '=' symbol."
            " nvcc.flags value is '%s'" % s)
    return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
def default_dnn_path(suffix):
    """Build a deferred default for a cuDNN path option.

    Returns a callable that, when invoked without arguments, yields ''
    if ``config.cuda.root`` is the empty string and otherwise
    ``config.cuda.root`` joined with `suffix`.
    """
    def resolve(suffix=suffix):
        # config.cuda.root is read when the default is requested, not
        # when default_dnn_path itself is called.
        cuda_root = config.cuda.root
        return os.path.join(cuda_root, suffix) if cuda_root != '' else ''
    return resolve
# Search locations for cuDNN. Both defaults come from default_dnn_path,
# i.e. a sub-directory of config.cuda.root ('' when that root is empty).
AddConfigVar('dnn.include_path',
             "Location of the cudnn header (defaults to the cuda root)",
             StrParam(default_dnn_path('include')))

# The help text used to be a copy of the include_path one and described
# this option as the header location as well.
AddConfigVar('dnn.library_path',
             "Location of the cudnn library (defaults to the cuda root)",
             StrParam(default_dnn_path('lib64')))
# This flag determines whether or not to raise error/warning message if # This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph. # there is a CPU Op in the computational graph.
AddConfigVar( AddConfigVar(
......
...@@ -102,7 +102,7 @@ def change_flags(**kwargs): ...@@ -102,7 +102,7 @@ def change_flags(**kwargs):
l = [v for v in theano.configparser._config_var_list l = [v for v in theano.configparser._config_var_list
if v.fullname == k] if v.fullname == k]
assert len(l) == 1 assert len(l) == 1
old_val[k] = l[0].__get__() old_val[k] = l[0].__get__(True, None)
try: try:
for k in kwargs: for k in kwargs:
l = [v for v in theano.configparser._config_var_list l = [v for v in theano.configparser._config_var_list
...@@ -167,7 +167,7 @@ def _config_print(thing, buf): ...@@ -167,7 +167,7 @@ def _config_print(thing, buf):
for cv in _config_var_list: for cv in _config_var_list:
print(cv, file=buf) print(cv, file=buf)
print(" Doc: ", cv.doc, file=buf) print(" Doc: ", cv.doc, file=buf)
print(" Value: ", cv.__get__(), file=buf) print(" Value: ", cv.__get__(True, None), file=buf)
print("", file=buf) print("", file=buf)
...@@ -182,7 +182,7 @@ def get_config_md5(): ...@@ -182,7 +182,7 @@ def get_config_md5():
all_opts = sorted([c for c in _config_var_list if c.in_c_key], all_opts = sorted([c for c in _config_var_list if c.in_c_key],
key=lambda cv: cv.fullname) key=lambda cv: cv.fullname)
return theano.gof.utils.hash_from_code('\n'.join( return theano.gof.utils.hash_from_code('\n'.join(
['%s = %s' % (cv.fullname, cv.__get__()) for cv in all_opts])) ['%s = %s' % (cv.fullname, cv.__get__(True, None)) for cv in all_opts]))
class TheanoConfigParser(object): class TheanoConfigParser(object):
...@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True): ...@@ -270,14 +270,14 @@ def AddConfigVar(name, doc, configparam, root=config, in_c_key=True):
# Trigger a read of the value from config files and env vars # Trigger a read of the value from config files and env vars
# This allow to filter wrong value from the user. # This allow to filter wrong value from the user.
if not callable(configparam.default): if not callable(configparam.default):
configparam.__get__() configparam.__get__(root, type(root))
else: else:
# We do not want to evaluate now the default value # We do not want to evaluate now the default value
# when it is a callable. # when it is a callable.
try: try:
fetch_val_for_key(configparam.fullname) fetch_val_for_key(configparam.fullname)
# The user provided a value, filter it now. # The user provided a value, filter it now.
configparam.__get__() configparam.__get__(root, type(root))
except KeyError: except KeyError:
pass pass
setattr(root.__class__, sections[0], configparam) setattr(root.__class__, sections[0], configparam)
...@@ -294,6 +294,7 @@ class ConfigParam(object): ...@@ -294,6 +294,7 @@ class ConfigParam(object):
self.default = default self.default = default
self.filter = filter self.filter = filter
self.allow_override = allow_override self.allow_override = allow_override
self.is_default = True
# N.B. -- # N.B. --
# self.fullname # set by AddConfigVar # self.fullname # set by AddConfigVar
# self.doc # set by AddConfigVar # self.doc # set by AddConfigVar
...@@ -304,16 +305,19 @@ class ConfigParam(object): ...@@ -304,16 +305,19 @@ class ConfigParam(object):
# Calling `filter` here may actually be harmful if the default value is # Calling `filter` here may actually be harmful if the default value is
# invalid and causes a crash or has unwanted side effects. # invalid and causes a crash or has unwanted side effects.
def __get__(self, *args): def __get__(self, cls, type_):
if cls is None:
return self
if not hasattr(self, 'val'): if not hasattr(self, 'val'):
try: try:
val_str = fetch_val_for_key(self.fullname) val_str = fetch_val_for_key(self.fullname)
self.is_default = False
except KeyError: except KeyError:
if callable(self.default): if callable(self.default):
val_str = self.default() val_str = self.default()
else: else:
val_str = self.default val_str = self.default
self.__set__(None, val_str) self.__set__(cls, val_str)
# print "RVAL", self.val # print "RVAL", self.val
return self.val return self.val
......
...@@ -1171,7 +1171,7 @@ def apply_meth(tag): ...@@ -1171,7 +1171,7 @@ def apply_meth(tag):
code = self.code_sections[tag] code = self.code_sections[tag]
define_macros, undef_macros = self.get_c_macros(node, name) define_macros, undef_macros = self.get_c_macros(node, name)
return os.linesep.join([define_macros, code, return os.linesep.join(['', define_macros, code,
undef_macros]) undef_macros])
else: else:
raise utils.MethodNotDefined( raise utils.MethodNotDefined(
...@@ -1428,7 +1428,7 @@ class COp(Op): ...@@ -1428,7 +1428,7 @@ class COp(Op):
def_macros, undef_macros = self.get_c_macros(node, name) def_macros, undef_macros = self.get_c_macros(node, name)
def_sub, undef_sub = self.get_sub_macros(sub) def_sub, undef_sub = self.get_sub_macros(sub)
return os.linesep.join([def_macros, def_sub, return os.linesep.join(['', def_macros, def_sub,
op_code, op_code,
undef_sub, undef_macros]) undef_sub, undef_macros])
else: else:
...@@ -1442,17 +1442,21 @@ class COp(Op): ...@@ -1442,17 +1442,21 @@ class COp(Op):
define_macros, undef_macros = self.get_c_macros(node, name, define_macros, undef_macros = self.get_c_macros(node, name,
check_input=False) check_input=False)
ctx = ""
if 'context' in sub:
ctx = ", %s" % (sub['context'],)
# Generate the C code # Generate the C code
return """ return """
%(define_macros)s %(define_macros)s
{ {
if (%(func_name)s(%(func_args)s) != 0) { if (%(func_name)s(%(func_args)s%(ctx)s) != 0) {
%(fail)s %(fail)s
} }
} }
%(undef_macros)s %(undef_macros)s
""" % dict(func_name=self.func_name, """ % dict(func_name=self.func_name,
fail=sub['fail'], fail=sub['fail'], ctx=ctx,
func_args=self.format_c_function_args(inp, out), func_args=self.format_c_function_args(inp, out),
define_macros=define_macros, define_macros=define_macros,
undef_macros=undef_macros) undef_macros=undef_macros)
......
...@@ -535,7 +535,7 @@ def handle_shared_float32(tf): ...@@ -535,7 +535,7 @@ def handle_shared_float32(tf):
# import dependency. So we also test it in the file theano/__init__.py # import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'): if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device, test_driver=False) use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device: elif config.init_gpu_device.startswith('gpu'):
assert config.device == "cpu", ( assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device" "We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'") " only when the Theano flag device=='cpu'")
......
...@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -27,8 +27,6 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
import theano.sandbox.dnn_flags
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -57,15 +55,17 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1; return 1;
} }
""" """
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the # Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = NVCC_compiler.try_flags( comp, out, err = NVCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), params=params, preambule=preambule, body=body,
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
dnn_available.avail = comp dnn_available.avail = comp
......
...@@ -8,6 +8,7 @@ import warnings ...@@ -8,6 +8,7 @@ import warnings
import numpy import numpy
from theano import config
from theano.compat import decode, decode_iter from theano.compat import decode, decode_iter
from theano.gof import local_bitwidth from theano.gof import local_bitwidth
from theano.gof.utils import hash_from_file from theano.gof.utils import hash_from_file
...@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen ...@@ -19,67 +20,6 @@ from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler") _logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
from theano.configparser import (config, AddConfigVar, StrParam,
BoolParam, ConfigParam)
AddConfigVar('nvcc.compiler_bindir',
"If defined, nvcc compiler driver will seek g++ and gcc"
" in this directory",
StrParam(""),
in_c_key=False)
user_provided_cuda_root = True
def default_cuda_root():
global user_provided_cuda_root
v = os.getenv('CUDA_ROOT', "")
user_provided_cuda_root = False
if v:
return v
return find_cuda_root()
AddConfigVar('cuda.root',
"""directory with bin/, lib/, include/ for cuda utilities.
This directory is included via -L and -rpath when linking
dynamically compiled modules. If AUTO and nvcc is in the
path, it will use one of nvcc parent directory. Otherwise
/usr/local/cuda will be used. Leave empty to prevent extra
linker directives. Default: environment variable "CUDA_ROOT"
or else "AUTO".
""",
StrParam(default_cuda_root),
in_c_key=False)
def filter_nvcc_flags(s):
assert isinstance(s, str)
flags = [flag for flag in s.split(' ') if flag]
if any([f for f in flags if not f.startswith("-")]):
raise ValueError(
"Theano nvcc.flags support only parameter/value pairs without"
" space between them. e.g.: '--machine 64' is not supported,"
" but '--machine=64' is supported. Please add the '=' symbol."
" nvcc.flags value is '%s'" % s)
return ' '.join(flags)
AddConfigVar('nvcc.flags',
"Extra compiler flags for nvcc",
ConfigParam("", filter_nvcc_flags),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
AddConfigVar('nvcc.fastmath',
"",
BoolParam(False),
# Not needed in c key as it is already added.
# We remove it as we don't make the md5 of config to change
# if theano.sandbox.cuda is loaded or not.
in_c_key=False)
nvcc_path = 'nvcc' nvcc_path = 'nvcc'
nvcc_version = None nvcc_version = None
...@@ -115,14 +55,6 @@ def is_nvcc_available(): ...@@ -115,14 +55,6 @@ def is_nvcc_available():
return False return False
def find_cuda_root():
s = os.getenv("PATH")
if not s:
return
for dir in s.split(os.path.pathsep):
if os.path.exists(os.path.join(dir, "nvcc")):
return os.path.split(dir)[0]
rpath_defaults = [] rpath_defaults = []
...@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler): ...@@ -359,7 +291,8 @@ class NVCC_compiler(Compiler):
# provided an cuda.root flag, we need to add one, but # provided an cuda.root flag, we need to add one, but
# otherwise, we don't add it. See gh-1540 and # otherwise, we don't add it. See gh-1540 and
# https://wiki.debian.org/RpathIssue for details. # https://wiki.debian.org/RpathIssue for details.
if (user_provided_cuda_root and
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))): os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib')) rpaths.append(os.path.join(config.cuda.root, 'lib'))
......
"""
This module contains the configuration flags for cudnn support.
Those are shared between the cuda and gpuarray backend which is why
they are in this file.
"""
import os.path
from theano.configparser import AddConfigVar, EnumStr, StrParam
from theano import config
AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''),
in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd',
"Default implementation to use for CuDNN backward convolution.",
EnumStr('none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.include_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'include')))
AddConfigVar('dnn.library_path',
"Location of the cudnn header (defaults to the cuda root)",
StrParam(lambda: os.path.join(config.cuda.root, 'lib64')))
...@@ -19,13 +19,6 @@ try: ...@@ -19,13 +19,6 @@ try:
except ImportError: except ImportError:
pygpu = None pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# This is for documentation not to depend on the availability of pygpu # This is for documentation not to depend on the availability of pygpu
from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant, from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor) GpuArraySharedVariable, gpuarray_shared_constructor)
...@@ -57,8 +50,9 @@ if pygpu: ...@@ -57,8 +50,9 @@ if pygpu:
import theano.compile import theano.compile
theano.compile.shared_constructor(gpuarray_shared_constructor) theano.compile.shared_constructor(gpuarray_shared_constructor)
optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile') optdb.add_tags('gpuarray_opt', 'fast_run', 'fast_compile')
elif config.gpuarray.init_device != '': elif (config.init_gpu_device.startswith('cuda') or
init_dev(config.gpuarray.init_device) config.init_gpu_device.startswith('opencl')):
init_dev(config.init_gpu_device)
from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost, from .basic_ops import (GpuAlloc, GpuContiguous, GpuEye, GpuFromHost,
GpuJoin, GpuReshape, GpuSplit, HostFromGpu) GpuJoin, GpuReshape, GpuSplit, HostFromGpu)
...@@ -70,7 +64,8 @@ if pygpu: ...@@ -70,7 +64,8 @@ if pygpu:
except Exception: except Exception:
error("Could not initialize pygpu, support disabled", exc_info=True) error("Could not initialize pygpu, support disabled", exc_info=True)
else: else:
if (config.gpuarray.init_device != '' or if (config.init_gpu_device.startswith('cuda') or
config.init_gpu_device.startswith('opencl') or
config.device.startswith('opencl') or config.device.startswith('opencl') or
config.device.startswith('cuda')): config.device.startswith('cuda')):
error("pygpu was configured but could not be imported", exc_info=True) error("pygpu was configured but could not be imported", exc_info=True)
...@@ -2,11 +2,9 @@ import os ...@@ -2,11 +2,9 @@ import os
import numpy import numpy
import theano from theano import Op, Apply, Type, Variable
from theano import Op, Apply from theano import tensor, config
from theano import tensor, scalar, config
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import Scalar
from theano.tensor.basic import Alloc, Join, Split from theano.tensor.basic import Alloc, Join, Split
from theano.gof import HideC from theano.gof import HideC
...@@ -17,7 +15,7 @@ from six.moves import xrange ...@@ -17,7 +15,7 @@ from six.moves import xrange
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
...@@ -293,7 +291,6 @@ class GpuFromHost(Op): ...@@ -293,7 +291,6 @@ class GpuFromHost(Op):
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
z, = out z, = out
type = node.outputs[0].type
z[0] = gpuarray.array(x) z[0] = gpuarray.array(x)
def grad(self, inputs, grads): def grad(self, inputs, grads):
...@@ -312,254 +309,29 @@ class GpuFromHost(Op): ...@@ -312,254 +309,29 @@ class GpuFromHost(Op):
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
return """ return """
PyGpuArrayObject *%(name)s_tmp;
%(name)s_tmp = PyArray_GETCONTIGUOUS(%(inp)s);
if (%(name)s_tmp == NULL)
%(fail)s
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = pygpu_fromhostdata(PyArray_DATA(%(inp)s), %(out)s = pygpu_fromhostdata(PyArray_DATA(%(name)s_tmp),
get_typecode((PyObject *)PyArray_DESCR(%(inp)s)), get_typecode((PyObject *)PyArray_DESCR(%(name)s_tmp)),
PyArray_NDIM(%(inp)s), PyArray_NDIM(%(name)s_tmp),
(size_t *)PyArray_DIMS(%(inp)s), (size_t *)PyArray_DIMS(%(name)s_tmp),
(ssize_t *)PyArray_STRIDES(%(inp)s), (ssize_t *)PyArray_STRIDES(%(name)s_tmp),
pygpu_default_context(), pygpu_default_context(),
Py_None); Py_None);
if (%(out)s == NULL) {
Py_DECREF(%(name)s_tmp);
if (%(out)s == NULL)
%(fail)s %(fail)s
}
""" % {'name': name, 'inp': inputs[0], """ % {'name': name, 'inp': inputs[0],
'out': outputs[0], 'fail': sub['fail']} 'out': outputs[0], 'fail': sub['fail']}
def c_code_cache_version(self):
return (4,)
gpu_from_host = GpuFromHost()
class GpuFromCuda(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x)
return Apply(self, [x], [GpuArrayType(broadcastable=x.broadcastable,
dtype=x.dtype)()])
def perform(self, node, inp, out):
x, = inp
z, = out
z[0] = gpuarray.array(numpy.asarray(x))
def grad(self, inputs, grads):
gz, = grads
return [cuda_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
if isinstance(ev, GpuArrayType):
return [cuda_from_gpu(ev)]
else:
return ev
def infer_shape(self, node, xshp):
return xshp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>',
'<gpuarray/types.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
gpudata *(*cuda_make_buf)(void *c, CUdeviceptr p, size_t sz);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *))gpuarray_get_extension("cuda_get_ctx");',
'cuda_make_buf = (gpudata *(*)(void *, CUdeviceptr, size_t))gpuarray_get_extension("cuda_make_buf");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr;
gpudata *%(name)sdata;
CUcontext %(name)scur;
size_t *%(name)sdims;
ssize_t *%(name)sstr;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
%(name)sdims = (size_t *)calloc(%(in)s->nd, sizeof(size_t));
if (%(name)sdims == NULL) {
PyErr_SetString(PyExc_MemoryError, "Can't allocate dimensions.");
%(fail)s
}
%(name)sstr = (ssize_t *)calloc(%(in)s->nd, sizeof(ssize_t));
if (%(name)sstr == NULL) {
free(%(name)sdims);
PyErr_SetString(PyExc_MemoryError, "Can't allocate strides.");
%(fail)s
}
for (unsigned int i = 0; i < %(in)s->nd; i++) {
%(name)sdims[i] = (size_t)CudaNdarray_HOST_DIMS(%(in)s)[i];
%(name)sstr[i] = (ssize_t)CudaNdarray_HOST_STRIDES(%(in)s)[i]*4;
}
%(name)sdata = cuda_make_buf(pygpu_default_context()->ctx,
(CUdeviceptr)%(in)s->devdata,
((size_t)%(in)s->data_allocated)*4);
if (%(name)sdata == NULL) {
Py_DECREF(%(out)s);
free(%(name)sdims);
free(%(name)sstr);
PyErr_SetString(PyExc_MemoryError, "Could not allocate gpudata structure.");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = pygpu_fromgpudata(%(name)sdata, 0, GA_FLOAT, %(in)s->nd,
%(name)sdims, %(name)sstr,
pygpu_default_context(), 1,
(PyObject *)%(in)s,
(PyObject *)&PyGpuArrayType);
pygpu_default_context()->ops->buffer_release(%(name)sdata);
free(%(name)sdims);
free(%(name)sstr);
if (%(out)s == NULL) {
%(fail)s
}
""" % {'name': name, 'in': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (5,)
gpu_from_cuda = GpuFromCuda() gpu_from_host = GpuFromHost()
class CudaFromGpu(Op):
view_map = {0: [0]}
__props__ = ()
def make_node(self, x):
from theano.sandbox.cuda import CudaNdarrayType
if not isinstance(x.type, GpuArrayType):
raise TypeError(x)
if x.type.dtype != 'float32':
raise TypeError(x)
return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable)()])
def perform(self, node, inp, out):
from theano.sandbox.cuda import filter as cuda_filter
x, = inp
z, = out
z[0] = cuda_filter(theano._asarray(x, dtype='float32'),
tuple([0] * x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
return [gpu_from_cuda(gz)]
def R_op(self, inputs, eval_points):
from theano.sandbox.cuda import CudaNdarrayType
ev, = eval_points
if (isinstance(ev, CudaNdarrayType)):
return [gpu_from_cuda(ev)]
else:
return [ev]
def infer_shape(self, node, shp):
return shp
def c_headers(self):
return ['<cuda_ndarray.cuh>', '<gpuarray/extension.h>', '<cuda.h>']
def c_header_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'include'))
return ret
def c_lib_dirs(self):
import cuda_ndarray
ret = [os.path.dirname(cuda_ndarray.__file__)]
cuda_root = config.cuda.root
if cuda_root:
ret.append(os.path.join(cuda_root, 'lib'))
return ret
def c_libraries(self):
return ['cudart', 'cublas', 'cuda']
def c_support_code(self):
return """
CUcontext (*cuda_get_ctx)(void *ctx);
CUdeviceptr (*cuda_get_ptr)(gpudata *g);
"""
def c_init_code(self):
return ['cuda_get_ctx = (CUcontext (*)(void *ctx))gpuarray_get_extension("cuda_get_ctx");',
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))gpuarray_get_extension("cuda_get_ptr");']
def c_code(self, node, name, inputs, outputs, sub):
return """
int %(name)serr = 0, %(name)si;
CUcontext %(name)scur;
cuCtxGetCurrent(&%(name)scur);
if (%(name)scur != cuda_get_ctx(pygpu_default_context()->ctx)) {
PyErr_SetString(PyExc_ValueError, "Ambient cuda context is not the same as output context.");
%(fail)s
}
if (GpuArray_sync(&%(inp)s->ga) != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Could not sync GpuArray");
%(fail)s
}
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray *)CudaNdarray_new_nd(%(inp)s->ga.nd);
if (!%(out)s) {
%(fail)s
}
for (%(name)si = 0; %(name)si < %(inp)s->ga.nd; %(name)si++) {
CudaNdarray_set_dim(%(out)s, %(name)si, %(inp)s->ga.dimensions[%(name)si]);
CudaNdarray_set_stride(%(out)s, %(name)si, %(inp)s->ga.strides[%(name)si]/4);
}
%(name)serr = CudaNdarray_set_device_data(%(out)s,
(float *)(((char *)cuda_get_ptr(%(inp)s->ga.data))+%(inp)s->ga.offset),
(PyObject *)%(inp)s);
if (%(name)serr) {
%(fail)s
}
""" % {'name': name, 'inp': inputs[0], 'out': outputs[0],
'fail': sub['fail']}
def c_code_cache_version(self):
return (3,)
cuda_from_gpu = CudaFromGpu()
class GpuAlloc(HideC, Alloc): class GpuAlloc(HideC, Alloc):
...@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -592,7 +364,7 @@ class GpuAlloc(HideC, Alloc):
sh, bcast = self.validate_shape(shape) sh, bcast = self.validate_shape(shape)
if value.ndim > len(sh): if value.ndim > len(sh):
TypeError("The GpuAlloc value to use has more dimensions " TypeError("The GpuAlloc value to use has more dimensions "
"than the specified shape", v.ndim, len(sh)) "than the specified shape", value.ndim, len(sh))
otype = value.type.clone(broadcastable=bcast) otype = value.type.clone(broadcastable=bcast)
return Apply(self, [value] + sh, [otype()]) return Apply(self, [value] + sh, [otype()])
...@@ -686,14 +458,14 @@ class GpuAlloc(HideC, Alloc): ...@@ -686,14 +458,14 @@ class GpuAlloc(HideC, Alloc):
return (2,) return (2,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas
for client in node.outputs[0].clients: for client in node.outputs[0].clients:
if client[0] == 'output': if client[0] == 'output':
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif ( # The following ops work inplace of their input id 0. # The following ops work inplace of their input id 0.
client[1] == 0 and elif (client[1] == 0 and
isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they # Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the # get constant_folded, they would copy the
# constant and this is less efficients. # constant and this is less efficients.
...@@ -701,14 +473,13 @@ class GpuAlloc(HideC, Alloc): ...@@ -701,14 +473,13 @@ class GpuAlloc(HideC, Alloc):
# Not doing the constant folding could also lower # Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't # the peak memory usage, as we the "constant" won't
# always exists. # always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor, isinstance(client[0].op,
theano.sandbox.gpuarray.subtensor.GpuIncSubtensor, (subtensor.GpuIncSubtensor,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1, subtensor.GpuAdvancedIncSubtensor1,
theano.sandbox.gpuarray.subtensor.GpuAdvancedIncSubtensor1_dev20, subtensor.GpuAdvancedIncSubtensor1_dev20,
theano.sandbox.gpuarray.blas.GpuGemm, blas.GpuGemm, blas.GpuGemv,
theano.sandbox.gpuarray.blas.GpuGemv, blas.GpuGer)
theano.sandbox.gpuarray.blas.GpuGer, )):
))):
return False return False
# If the clients is a transfer, we don't want to fold. We # If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do. # let the moving opt finish before deciding what to do.
...@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) { ...@@ -1089,8 +860,7 @@ KERNEL void k(GLOBAL_MEM %(ctype)s *a, ga_size n, ga_size m) {
code=code, name="k", code=code, name="k",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE], params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE],
flags=Kernel.get_flags(self.dtype), flags=Kernel.get_flags(self.dtype),
objvar='k_eye_'+name, objvar='k_eye_' + name)]
)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
n, m = inp n, m = inp
......
...@@ -5,17 +5,15 @@ import theano ...@@ -5,17 +5,15 @@ import theano
from theano import config, gof from theano import config, gof
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from six.moves import reduce
from .comp import NVCC_compiler
from .type import GpuArrayType from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel) from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
from theano.gof import utils from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op): class GpuConv(GpuKernelBase, gof.Op):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
...@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op): ...@@ -227,32 +225,14 @@ class GpuConv(GpuKernelBase, gof.Op):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
nb = self.kshp[1] nb = self.kshp[1]
return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)]
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
raise MethodNotDefined('cuda only')
return ['<stdint.h>', '<stdio.h>', 'cuda.h',
'<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 21) return (0, 22)
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
......
...@@ -26,10 +26,7 @@ from .conv import GpuConv ...@@ -26,10 +26,7 @@ from .conv import GpuConv
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from .nnet import GpuSoftmax from .nnet import GpuSoftmax
from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter from .opt import gpu_seqopt, register_opt, conv_groupopt, op_lifter
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge, inplace_allocempty
# We need to import this to define the flags.
from theano.sandbox import dnn_flags # noqa
def dnn_available(): def dnn_available():
...@@ -50,7 +47,6 @@ def dnn_available(): ...@@ -50,7 +47,6 @@ def dnn_available():
dnn_available.avail = False dnn_available.avail = False
preambule = """ preambule = """
#include <stdio.h> #include <stdio.h>
#include <cuda.h>
#include <cudnn.h> #include <cudnn.h>
#include <cudnn_helper.h> #include <cudnn_helper.h>
""" """
...@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { ...@@ -64,15 +60,18 @@ if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
return 1; return 1;
} }
""" """
params = ["-l", "cudnn", "-I" + os.path.dirname(__file__)]
if config.dnn.include_path:
params.append("-I" + config.dnn.include_path)
if config.dnn.library_path:
params.append("-L" + config.dnn.library_path)
# Do not run here the test program. It would run on the # Do not run here the test program. It would run on the
# default gpu, not the one selected by the user. If mixed # default gpu, not the one selected by the user. If mixed
# GPU are installed or if the GPUs are configured in # GPU are installed or if the GPUs are configured in
# exclusive mode, this cause bad detection. # exclusive mode, this cause bad detection.
comp, out, err = GCC_compiler.try_flags( comp, out, err = GCC_compiler.try_flags(
["-l", "cudnn", "-I" + os.path.dirname(__file__), params, preambule=preambule, body=body,
"-I" + config.dnn.include_path,
"-L" + config.dnn.library_path],
preambule=preambule, body=body,
try_run=False, output=True) try_run=False, output=True)
dnn_available.avail = comp dnn_available.avail = comp
...@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20, ...@@ -1242,86 +1241,62 @@ conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
'conv_dnn', 'fast_compile', 'fast_run', 'cudnn') 'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
@local_optimizer([GpuDnnConv], inplace=True) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node): def local_dnn_conv_inplace(node, inputs):
if type(node.op) != GpuDnnConv or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node): def local_dnn_convgw_inplace(node, inputs):
if type(node.op) != GpuDnnConvGradW or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradI], inplace=True) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node): def local_dnn_convgi_inplace(node, inputs):
if type(node.op) != GpuDnnConvGradI or node.op.inplace:
return
inputs = list(node.inputs)
dest = inputs[2]
if (dest.owner and
isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
local_dnn_convgw_inplace, local_dnn_convgw_inplace,
local_dnn_convgi_inplace, local_dnn_convgi_inplace,
name="local_dnn_conv_inplace"), name="local_dnna_conv_inplace"),
70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn') 70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, nd=4) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2, nd=4) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
......
from __future__ import print_function from __future__ import print_function
import copy import copy
import os
from theano.compat import izip from theano.compat import izip
import numpy import numpy
import theano
from theano import Apply, scalar, config from theano import Apply, scalar, config
from theano import scalar as scal from theano import scalar as scal
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.gof.cmodule import GCC_compiler
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype) from theano.tensor.elemwise import (Elemwise, DimShuffle, CAReduceDtype)
...@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -108,7 +105,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs] scal_v_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]
outs = [make_argument(o, 'o%d' % (n,)) for n, o in outs = [make_argument(o, 'o%d' % (n,)) for n, o in
enumerate(node.outputs) if not n in self.inplace_pattern] enumerate(node.outputs) if n not in self.inplace_pattern]
scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs] scal_v_outs = [scalar.get_scalar_type(o.dtype) for o in node.outputs]
fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins], fake_node = Apply(self.scalar_op, [i() for i in scal_v_ins],
...@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -132,7 +129,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
else: else:
scal_out.append(arg.name + '[i]') scal_out.append(arg.name + '[i]')
kop = self.scalar_op.c_code(fake_node, nodename+'_scalar', kop = self.scalar_op.c_code(fake_node, nodename + '_scalar',
scal_in, scal_out, scal_in, scal_out,
dict(fail='return;')) dict(fail='return;'))
...@@ -171,25 +168,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -171,25 +168,10 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
("npy_float64", "ga_double"), ("npy_float64", "ga_double"),
]: ]:
kop = kop.replace(npy, ga) kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps + outs, kop, preamble=support_code)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_compiler(self):
return GCC_compiler
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<numpy_compat.h>', '<gpuarray/types.h>']
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_support_code(self): def c_support_code(self):
return self.scalar_op.c_support_code() return self.scalar_op.c_support_code()
...@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -231,11 +213,6 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
node.outputs[0].type.dtype), node.outputs[0].type.dtype),
objvar='elem_%d_%s' % (nd, nodename))] objvar='elem_%d_%s' % (nd, nodename))]
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
if pygpu.get_default_context().kind == 'opencl': if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only') raise MethodNotDefined('cuda only')
...@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -417,7 +394,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
GpuKernel_error(&%(kname)s, err)); GpuKernel_error(&%(kname)s, err));
%(fail)s; %(fail)s;
} }
""" % dict(kname=kname,fail=fail) """ % dict(kname=kname, fail=fail)
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
err = GpuArray_sync(&%(z)s->ga); err = GpuArray_sync(&%(z)s->ga);
...@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise): ...@@ -460,7 +437,7 @@ class GpuElemwise(GpuKernelBase, HideC, Elemwise):
def c_code_cache_version(self): def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (3, ver) return (4, ver)
else: else:
return ver return ver
...@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -495,7 +472,7 @@ class GpuDimShuffle(HideC, DimShuffle):
res = input res = input
res = res.transpose(self.shuffle+self.drop) res = res.transpose(self.shuffle + self.drop)
shape = list(res.shape[:len(self.shuffle)]) shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment: for augm in self.augment:
...@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -533,7 +510,7 @@ class GpuDimShuffle(HideC, DimShuffle):
Py_DECREF(tmp); Py_DECREF(tmp);
return res; return res;
} }
""" % dict(shuffle=', '.join(str(a) for a in (self.shuffle+self.drop)), """ % dict(shuffle=', '.join(str(a) for a in (self.shuffle + self.drop)),
name=name, nd_out=len(self.new_order), name=name, nd_out=len(self.new_order),
copy_shape=copy_shape(len(self.new_order))) copy_shape=copy_shape(len(self.new_order)))
...@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle): ...@@ -565,7 +542,7 @@ class GpuDimShuffle(HideC, DimShuffle):
return process return process
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
...@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -671,8 +648,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
if self.pre_scalar_op: if self.pre_scalar_op:
# Currently we only tested pre_scalar_op that don't cause # Currently we only tested pre_scalar_op that don't cause
# upcast. # upcast.
d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
assert d1.dtype == ret.outputs[0].dtype
assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
if self.reduce_mask is None: if self.reduce_mask is None:
if self.axis is None: if self.axis is None:
...@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -732,17 +707,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return False return False
return True return True
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['<numpy_compat.h>', '<gpuarray/types.h>']
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, = inp x, = inp
...@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -760,6 +726,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
sio = StringIO() sio = StringIO()
fail = sub['fail'] fail = sub['fail']
ctx = sub['context']
# check input # check input
print(""" print("""
...@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -824,8 +791,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = pygpu_empty(%(nd_out)s, new_dims, %(z)s = pygpu_empty(%(nd_out)s, new_dims,
%(out_typecode)s, GA_C_ORDER, %(out_typecode)s, GA_C_ORDER,
pygpu_default_context(), pygpu_default_context(), Py_None);
Py_None);
if (NULL == %(z)s) if (NULL == %(z)s)
{ {
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output"); PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
...@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -863,14 +829,16 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code. # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle # TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims. # TODO: if only some dims are ccontiguous, call version with less dims.
print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){'%locals(), file=sio) print('if(%(x)s->ga.flags & GA_C_CONTIGUOUS){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail) self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio) print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' %
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) (''.join(str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio) print("}", file=sio)
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ... # \end bracket the reduction ...
...@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1094,8 +1062,8 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
else: else:
assert isinstance(self.scalar_op, (scal.Maximum, assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum)) scal.Minimum))
if self.pre_scalar_op: # TODO, multi_dtype! if self.pre_scalar_op: # TODO: multiple dtypes
#dtype = node.inputs[0].dtype # dtype = node.inputs[0].dtype
dtype = 'float32' dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)() dummy_var = scal.Scalar(dtype=dtype)()
...@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1943,12 +1911,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [16] # the version corresponding to the c code in this Op version = [17] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs],
[Scalar(dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version()) version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())
...@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1962,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
in_dtype = node.inputs[0].dtype in_dtype = node.inputs[0].dtype
out_dtype = node.outputs[0].dtype out_dtype = node.outputs[0].dtype
acc_dtype = self._acc_dtype(node.inputs[0].dtype) acc_dtype = self._acc_dtype(node.inputs[0].dtype)
flags=Kernel.get_flags(in_dtype, acc_dtype, out_dtype) flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
in_type = gpuarray.dtype_to_ctype(in_dtype) in_type = gpuarray.dtype_to_ctype(in_dtype)
out_type = gpuarray.dtype_to_ctype(out_dtype) out_type = gpuarray.dtype_to_ctype(out_dtype)
acc_type = gpuarray.dtype_to_ctype(acc_dtype) acc_type = gpuarray.dtype_to_ctype(acc_dtype)
...@@ -2106,7 +2071,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2106,7 +2071,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
] ]
kernels.append(Kernel(code=sio.getvalue(), name=kname, kernels.append(Kernel(code=sio.getvalue(), name=kname,
params=params, flags=flags, objvar=k_var)) params=params, flags=flags, objvar=k_var))
#01, 011, 0111 # 01, 011, 0111
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]): nd_in in[2, 3, 4]):
...@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2303,10 +2268,10 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
# this kernel uses one block for multiple column(up to 32TODO), # this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column. # threads per block for each element per column.
# thread.x = dim 2 contiguous # thread.x = dim 2 contiguous
# thread.y = dim 1 # thread.y = dim 1
# block.x = dim 0 # block.x = dim 0
# block.y = dim 1 rest # block.y = dim 1 rest
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner") decl, kname, params, k_var = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
...@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2625,7 +2590,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
{}, True) {}, True)
reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])") reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
kname = "kernel_reduce_1011" kname = "kernel_reduce_1011"
k_var= "kernel_reduce_1011_" + nodename k_var = "kernel_reduce_1011_" + nodename
sio = StringIO() sio = StringIO()
print(""" print("""
KERNEL void %(kname)s( KERNEL void %(kname)s(
...@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2753,7 +2718,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
flags=Kernel.get_flags(node.inputs[0].type.dtype, flags=Kernel.get_flags(node.inputs[0].type.dtype,
acc_dtype, acc_dtype,
node.outputs[0].type.dtype), node.outputs[0].type.dtype),
objvar='k_reduk_'+name)] objvar='k_reduk_' + name)]
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])): if not any(getattr(self, 'redux', [node.inputs[0].ndim != 0])):
...@@ -2768,7 +2733,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2768,7 +2733,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
if (%(sync)d) if (%(sync)d)
GpuArray_sync(&%(out)s->ga); GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'], """ % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync)) sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size, _, src, _, ls = k._get_basic_kernel(k.init_local_size,
...@@ -2816,7 +2781,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2816,7 +2781,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, nd_out=nd_out, fail=sub['fail'], """ % dict(output=output, nd_out=nd_out, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
else: else:
code += """ code += """
...@@ -2828,7 +2793,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2828,7 +2793,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(output=output, fail=sub['fail'], """ % dict(output=output, fail=sub['fail'],
out_type=dtype_to_typecode(node.outputs[0].type.dtype)) out_type=dtype_to_typecode(node.outputs[0].type.dtype))
if acc_dtype != node.outputs[0].type.dtype: if acc_dtype != node.outputs[0].type.dtype:
...@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2837,12 +2802,13 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(acc_type)s, GA_C_ORDER, pygpu_default_context(), %(acc_type)s, GA_C_ORDER, pygpu_default_context(),
Py_None); Py_None);
if (!tmp) %(fail)s if (!tmp) %(fail)s
""" % dict(output=output, fail=sub['fail'], acc_type=dtype_to_typecode(acc_dtype)) """ % dict(output=output, fail=sub['fail'],
acc_type=dtype_to_typecode(acc_dtype))
else: else:
code += """ code += """
tmp = %(output)s; tmp = %(output)s;
Py_INCREF(tmp); Py_INCREF(tmp);
""" % dict(output=output) """ % dict(output=output)
# We need the proxies since we are passing a pointer to the # We need the proxies since we are passing a pointer to the
# data into the call and therefore we need a real copy of the # data into the call and therefore we need a real copy of the
...@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2850,7 +2816,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
code += """ code += """
args[0] = &n; args[0] = &n;
args[1] = tmp->ga.data; args[1] = tmp->ga.data;
""" % dict(output=output) """ % dict(output=output)
p = 2 p = 2
for i in range(node.inputs[0].ndim): for i in range(node.inputs[0].ndim):
...@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2858,7 +2824,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s]; proxy_dim[%(i)s] = %(input)s->ga.dimensions[%(i)s];
args[%(p)s] = &proxy_dim[%(i)s]; args[%(p)s] = &proxy_dim[%(i)s];
n *= %(input)s->ga.dimensions[%(i)s]; n *= %(input)s->ga.dimensions[%(i)s];
""" % dict(i=i, p=p, input=input) """ % dict(i=i, p=p, input=input)
p += 1 p += 1
if not redux[i]: if not redux[i]:
code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i) code += "gs *= %(input)s->ga.dimensions[%(i)s];" % dict(input=input, i=i)
...@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2867,14 +2833,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
args[%(p)s] = %(input)s->ga.data; args[%(p)s] = %(input)s->ga.data;
proxy_off = %(input)s->ga.offset; proxy_off = %(input)s->ga.offset;
args[%(p)s+1] = &proxy_off; args[%(p)s+1] = &proxy_off;
""" % dict(p=p, input=input) """ % dict(p=p, input=input)
p += 2 p += 2
for i in range(node.inputs[0].ndim): for i in range(node.inputs[0].ndim):
code += """ code += """
proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s]; proxy_str[%(i)s] = %(input)s->ga.strides[%(i)s];
args[%(p)s] = &proxy_str[%(i)s]; args[%(p)s] = &proxy_str[%(i)s];
""" % dict(p=p, i=i, input=input) """ % dict(p=p, i=i, input=input)
p += 1 p += 1
code += """ code += """
...@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2911,14 +2877,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
} }
""" % dict(k_var='k_reduk_'+name, sync=bool(config.gpuarray.sync), """ % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, self.GpuKernelBase_version) return (2, self.GpuKernelBase_version)
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
...@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2942,8 +2908,8 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
redux = self.redux redux = self.redux
if any(redux): if any(redux):
output[0] = self.get_kernel_cache(node)(input).astype(copy=False, output[0] = self.get_kernel_cache(node)(input).astype(
dtype=node.outputs[0].type.dtype) copy=False, dtype=node.outputs[0].type.dtype)
else: else:
output[0] = pygpu.gpuarray.array(input, copy=True, output[0] = pygpu.gpuarray.array(input, copy=True,
dtype=node.outputs[0].type.dtype) dtype=node.outputs[0].type.dtype)
......
...@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc. ...@@ -4,11 +4,11 @@ Helper routines for generating gpu kernels for nvcc.
""" """
try: try:
import pygpu
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
def nvcc_kernel(name, params, body): def nvcc_kernel(name, params, body):
""" """
Return the c code of a kernel function. Return the c code of a kernel function.
...@@ -174,9 +174,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"): ...@@ -174,9 +174,8 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
""" """
ctype = gpuarray.dtype_to_ctype(dtype) ctype = gpuarray.dtype_to_ctype(dtype)
return [
# get max of buf (trashing all but buf[0]) # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), return [inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
('%s row_max = ' + buf + '[0]') % ctype, ('%s row_max = ' + buf + '[0]') % ctype,
'__syncthreads()', '__syncthreads()',
......
import os
import numpy import numpy
from theano import Op, Apply, config from theano import Op, Apply, config
from theano.gof import local_optimizer
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T import theano.tensor as T
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, from .basic_ops import as_gpuarray_variable, GpuKernelBase, Kernel
host_from_gpu, gpu_from_host,
GpuKernelBase, Kernel)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
from .type import GpuArrayType from .type import GpuArrayType
from .comp import NVCC_compiler
class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
...@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -45,27 +40,10 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,1) return (11,)
def c_headers(self): def c_headers(self):
if pygpu.get_default_context().kind == 'opencl': return ['<numpy_compat.h>', '<gpuarray/types.h>']
raise MethodNotDefined('cuda only')
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_init_code(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
return ['setup_ext_cuda();']
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_ten4 = node.inputs[0].dtype dtype_ten4 = node.inputs[0].dtype
......
...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node): ...@@ -176,13 +176,13 @@ def local_dot_to_gemm16(node):
@opt.register_opt() @opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4, nd=2) @alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs): def local_gemm16_alpha_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
@opt.register_opt() @opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0, nd=2) @output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs): def local_gemm16_output_merge(node, *inputs):
return [Gemm16(relu=node.op.relu)(*inputs)] return [Gemm16(relu=node.op.relu)(*inputs)]
......
from __future__ import print_function from __future__ import print_function
import numpy import numpy
import os
from theano import Op, Apply, config from theano import Op, Apply, config
from six import StringIO from six import StringIO
try: try:
import pygpu import pygpu
from pygpu import gpuarray, elemwise from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
...@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -41,16 +40,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
am = y_idx.type() am = y_idx.type()
return Apply(self, [x, b, y_idx], [nll, sm, am]) return Apply(self, [x, b, y_idx], [nll, sm, am])
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['<numpy_compat.h>', '<gpuarray/types.h>']
'<gpuarray/types.h>']
def gpu_kernels(self, node, nodename): def gpu_kernels(self, node, nodename):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
...@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -302,7 +293,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
...@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -328,18 +319,10 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (11,)
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['<numpy_compat.h>', '<gpuarray/types.h>']
'<gpuarray/types.h>']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
...@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -541,21 +524,10 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (14,) + inline_softmax.code_version return (15,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['<numpy_compat.h>', '<gpuarray/types.h>']
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
...@@ -665,15 +637,15 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -665,15 +637,15 @@ class GpuSoftmax(GpuKernelBase, Op):
] ]
kernels = [] kernels = []
kname = "kSoftmax" kname = "kSoftmax"
k_var= "kSoftmax_" + nodename k_var = "kSoftmax_" + nodename
code = nvcc_kernel(kname, code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N', params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x', 'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1', 'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=["extern __shared__ %s buf[]" % type_acc,
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc, "%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
...@@ -696,15 +668,15 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -696,15 +668,15 @@ class GpuSoftmax(GpuKernelBase, Op):
kernels.append(Kernel(code=code, name=kname, params=params, kernels.append(Kernel(code=code, name=kname, params=params,
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmax_fixed_shared" kname = "kSoftmax_fixed_shared"
k_var= "kSoftmax_fixed_shared" + nodename k_var = "kSoftmax_fixed_shared" + nodename
code = nvcc_kernel(kname, code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N', params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x', 'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1', 'const ga_ssize sx0', 'const ga_ssize sx1',
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=["extern __shared__ %s buf[]" % type_acc,
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
...@@ -746,23 +718,10 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -746,23 +718,10 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) + inline_softmax.code_version return (14,) + inline_softmax.code_version
def c_header_dirs(self):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>', return ['<numpy_compat.h>', '<gpuarray/types.h>']
'<gpuarray/ext_cuda.h>', '<gpuarray/types.h>']
def c_init_code(self):
return ['setup_ext_cuda();']
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
dtype_x = node.inputs[0].dtype dtype_x = node.inputs[0].dtype
...@@ -892,7 +851,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -892,7 +851,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
kernels = [] kernels = []
kname = "kSoftmaxWithBias" kname = "kSoftmaxWithBias"
k_var = "kSoftmaxWithBias_" + nodename k_var = "kSoftmaxWithBias_" + nodename
code = nvcc_kernel(kname, code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N', params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x', 'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1', 'const ga_ssize sx0', 'const ga_ssize sx1',
...@@ -900,8 +860,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -900,8 +860,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'const ga_ssize sb0', 'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=["extern __shared__ %s buf[]" % type_acc,
"extern __shared__ %s buf[]" % type_acc,
"%s * buf2 = buf + N" % type_acc, "%s * buf2 = buf + N" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
...@@ -926,7 +885,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -926,7 +885,8 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
flags=flags, objvar=k_var)) flags=flags, objvar=k_var))
kname = "kSoftmaxWithBias_fixed_shared" kname = "kSoftmaxWithBias_fixed_shared"
k_var = "kSoftmaxWithBias_fixed_shared" + nodename k_var = "kSoftmaxWithBias_fixed_shared" + nodename
code = nvcc_kernel(kname, code = nvcc_kernel(
kname,
params=['const ga_size M', 'const ga_size N', params=['const ga_size M', 'const ga_size N',
'const %s * x' % type_x, 'const ga_size offset_x', 'const %s * x' % type_x, 'const ga_size offset_x',
'const ga_ssize sx0', 'const ga_ssize sx1', 'const ga_ssize sx0', 'const ga_ssize sx1',
...@@ -934,8 +894,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op): ...@@ -934,8 +894,7 @@ class GpuSoftmaxWithBias (GpuKernelBase, Op):
'const ga_ssize sb0', 'const ga_ssize sb0',
'%s * sm' % type_sm, 'const ga_size offset_sm', '%s * sm' % type_sm, 'const ga_size offset_sm',
'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'],
body=[ body=["extern __shared__ %s buf[]" % type_acc,
"extern __shared__ %s buf[]" % type_acc,
"x = (const %s *)(((char *)x)+offset_x)" % type_x, "x = (const %s *)(((char *)x)+offset_x)" % type_x,
"b = (const %s *)(((char *)b)+offset_b)" % type_b, "b = (const %s *)(((char *)b)+offset_b)" % type_b,
"sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm,
......
...@@ -645,13 +645,13 @@ def local_gpua_hgemm(node): ...@@ -645,13 +645,13 @@ def local_gpua_hgemm(node):
@register_opt() @register_opt()
@alpha_merge(GpuGemm, alpha_in=1, beta_in=4, nd=2) @alpha_merge(GpuGemm, alpha_in=1, beta_in=4)
def local_gpuagemm_alpha_merge(node, *inputs): def local_gpuagemm_alpha_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
@register_opt() @register_opt()
@output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0, nd=2) @output_merge(GpuGemm, alpha_in=1, beta_in=4, out_in=0)
def local_gpuagemm_output_merge(node, *inputs): def local_gpuagemm_output_merge(node, *inputs):
return [gpugemm_no_inplace(*inputs)] return [gpugemm_no_inplace(*inputs)]
......
...@@ -7,23 +7,35 @@ from theano.gof import local_optimizer ...@@ -7,23 +7,35 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float64')) _one = scal.constant(numpy.asarray(1.0, dtype='float64'))
def grab_cpu_scalar(v, nd): def grab_cpu_scalar(v, nd):
"""
Get a scalar variable value from the tree at `v`.
This function will dig through transfers and dimshuffles to get
the constant value. If no such constant is found, it returns None.
Parameters
----------
v : variable
Theano variable to extract the constant value from.
nd : int
Expected number of dimensions for the variable (for
broadcasted constants).
"""
if v.owner is not None: if v.owner is not None:
n = v.owner n = v.owner
if (isinstance(n.op, GpuDimShuffle) and if (isinstance(n.op, (GpuDimShuffle, DimShuffle)) and
n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0])
elif (isinstance(n.op, DimShuffle) and
n.op.new_order == ('x',) * nd): n.op.new_order == ('x',) * nd):
return grab_cpu_scalar(n.inputs[0]) return grab_cpu_scalar(n.inputs[0], n.inputs[0].ndim)
elif isinstance(n.op, GpuFromHost): elif isinstance(n.op, (GpuFromHost, HostFromGpu)):
return grab_cpu_scalar(n.inputs[0], nd=nd) return grab_cpu_scalar(n.inputs[0], nd)
else: else:
return None return None
else: else:
...@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd): ...@@ -33,10 +45,24 @@ def grab_cpu_scalar(v, nd):
def find_node(v, cls, ignore_clients=False): def find_node(v, cls, ignore_clients=False):
# This digs through possibly redundant transfers to look for the node """
# that has the op class specified. If ignore_clients is False (the Find the node that has an op of type `cls` in `v`.
# default) it will only dig through nodes that have a single
# client. This digs through possibly redundant transfers to look for the node
that has the type `cls`. If `ignore_clients` is False (the
default) it will only dig through nodes that have a single client
to avoid duplicating computations.
Parameters
----------
v : variable
The variable to dig through
cls : Op class
The type of the node we are looking for
ignore_clients : bool, optional
Whether to ignore multiple clients or not.
"""
if v.owner is not None and (ignore_clients or len(v.clients) == 1): if v.owner is not None and (ignore_clients or len(v.clients) == 1):
if isinstance(v.owner.op, cls): if isinstance(v.owner.op, cls):
return v.owner return v.owner
...@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False): ...@@ -50,8 +76,20 @@ def find_node(v, cls, ignore_clients=False):
def is_equal(var, val): def is_equal(var, val):
# Returns True if var is always equal to val (python value), False """
# otherwise (including if var is not constant) Returns True if `var` is always equal to `val`.
This will only return True if the variable will always be equal to
the value. If it might not be true in some cases then it returns False.
Parameters
----------
var : variable
Variable to compare
val : value
Python value
"""
try: try:
v = get_scalar_constant_value(var) v = get_scalar_constant_value(var)
return v == val return v == val
...@@ -59,7 +97,57 @@ def is_equal(var, val): ...@@ -59,7 +97,57 @@ def is_equal(var, val):
return False return False
def alpha_merge(cls, alpha_in, beta_in, nd): def alpha_merge(cls, alpha_in, beta_in):
"""
Decorator to merge multiplication by a scalar on the output.
This will find a pattern of scal * <yourop>(some, params, alpha,
beta) and update it so that the scalar multiplication happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
"""
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -70,11 +158,14 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
targ = find_node(node.inputs[0], cls) targ = find_node(node.inputs[0], cls)
if targ is None: if targ is None:
targ = find_node(node.inputs[1], cls) targ = find_node(node.inputs[1], cls)
lr = grab_cpu_scalar(node.inputs[0], nd=nd) if targ is None:
return
lr = grab_cpu_scalar(node.inputs[0],
nd=targ.outputs[0].ndim)
else: else:
lr = grab_cpu_scalar(node.inputs[1], nd=nd) lr = grab_cpu_scalar(node.inputs[1],
if (lr is None or targ is None or nd=targ.outputs[0].ndim)
lr.dtype != targ.outputs[0].dtype): if lr is None or lr.dtype != targ.outputs[0].dtype:
return None return None
inputs = list(targ.inputs) inputs = list(targ.inputs)
try: try:
...@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd): ...@@ -96,7 +187,62 @@ def alpha_merge(cls, alpha_in, beta_in, nd):
return wrapper return wrapper
def output_merge(cls, alpha_in, beta_in, out_in, nd): def output_merge(cls, alpha_in, beta_in, out_in):
"""
Decorator to merge addition by a value on the output.
This will find a pattern of val + <yourop>(some, params, alpha,
beta, out_like) and update it so that the addition happens as
part of your op.
The op needs to accept an alpha and a beta scalar which act this way:
out = Op() * alpha + out_like * beta
Where out_like is a buffer that has the same size as the output
and gets added to the "real" output of the operation. An example
of an operation that respects this pattern is GEMM from blas.
The decorated function must have this signature:
maker(node, *inputs)
The `node` argument you receive is the original apply node that
contains your op. You should use it to grab relevant properties
for your op so that the new version performs the same computation.
The `*inputs` parameter contains the new inputs for your op. You
MUST use those inputs instead of the ones on `node`. Note that
this function can be as simple as:
def maker(node, *inputs):
return node.op(*inputs)
Parameters
----------
cls : op class
The class of the op you want to merge
alpha_in : int
The input index for the alpha scalar for your op (in node.inputs).
beta_in : int
The input index for the beta scalar for your op (in node.inputs).
out_in : int
The input index for the out_like input for your op (in node.inputs).
Returns
-------
This returns an unregistered local optimizer that has the same
name as the decorated function.
Notes
-----
This was factored out since the code to deal with intervening
transfers and correctness in the presence of different values of
alpha and beta scaling factors is not trivial.
This also correctly handles the case where the added value is
broadcasted (by not performing the replacement).
"""
def wrapper(maker): def wrapper(maker):
@local_optimizer([GpuElemwise]) @local_optimizer([GpuElemwise])
@wraps(maker) @wraps(maker)
...@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd): ...@@ -126,3 +272,56 @@ def output_merge(cls, alpha_in, beta_in, out_in, nd):
return maker(targ, *inputs) return maker(targ, *inputs)
return opt return opt
return wrapper return wrapper
def inplace_allocempty(op, idx):
    """
    Wrapper to make an inplace optimization that deals with AllocEmpty

    This will duplicate the alloc input if it has more than one client
    to allow the op to work on it inplace.

    The decorated function must have this signature:

        maker(node, inputs)

    The `node` argument you receive is the original apply node that
    contains your op. You should use it to grab relevant properties
    for your op so that the new version performs the same computation.
    You should also switch the op to work inplace. The `inputs`
    parameter is a list that contains the new inputs for your op. You
    MUST use those inputs instead of the ones on `node`. Note that this
    function can be as simple as:

        def maker(node, inputs):
            return node.op.__class__(inplace=True)(*inputs)

    Parameters
    ----------
    op : op class
        The op class to look for to make inplace
    idx : int
        The index of the (possibly) AllocEmpty input (in node.inputs).

    Returns
    -------
    This returns a decorator. Applying it to `maker` gives an
    unregistered inplace local optimizer that has the same name as
    the decorated function.

    """
    def wrapper(maker):
        @local_optimizer([op], inplace=True)
        @wraps(maker)
        def opt(node):
            # Only nodes whose op is exactly of class `op` (subclasses
            # do not match) and that are not inplace yet are handled.
            if type(node.op) != op or node.op.inplace:
                return
            # Work on a copy so that node.inputs itself is not modified.
            inputs = list(node.inputs)
            alloc = inputs[idx]
            if (alloc.owner and
                    isinstance(alloc.owner.op, GpuAllocEmpty) and
                    len(alloc.clients) > 1):
                # The empty buffer has other clients: build a new
                # GpuAllocEmpty with the same dtype and the same inputs
                # and hand that one to the op instead.
                alloc_op = GpuAllocEmpty(alloc.owner.op.dtype)
                inputs[idx] = alloc_op(*alloc.owner.inputs)
            return maker(node, inputs)
        return opt
    return wrapper
...@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -180,19 +180,9 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
def _f16_ok(self): def _f16_ok(self):
return self.iadd_node.op._f16_ok return self.iadd_node.op._f16_ok
def c_header_dirs(self):
cuda_root = config.cuda.root
if cuda_root:
return [os.path.join(cuda_root, 'include')]
else:
return []
def c_headers(self): def c_headers(self):
return self.iadd_node.op.c_headers() return self.iadd_node.op.c_headers()
def c_compiler(self):
return self.iadd_node.op.c_compiler()
def c_init_code(self): def c_init_code(self):
return self.iadd_node.op.c_init_code() return self.iadd_node.op.c_init_code()
...@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor): ...@@ -404,7 +394,7 @@ class GpuIncSubtensor(GpuKernelBase, IncSubtensor):
elemwise_version = self.iadd_node.c_code_cache_version() elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version: if not parent_version or not elemwise_version:
return return
return parent_version + elemwise_version + (2,) return parent_version + elemwise_version + (3,)
class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(HideC, tensor.AdvancedSubtensor1):
......
import unittest import unittest
from theano.compat import izip from theano.compat import izip
from copy import copy, deepcopy
from six import iteritems from six import iteritems
...@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc ...@@ -13,16 +12,31 @@ from theano.tensor.basic import alloc
# Don't import test classes otherwise they get tested as part of the file # Don't import test classes otherwise they get tested as part of the file
from theano.tensor.tests import test_basic from theano.tensor.tests import test_basic
from theano.tensor.tests.test_basic import rand, safe_make_node from theano.tensor.tests.test_basic import rand, safe_make_node
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
import theano.sandbox.gpuarray import theano.sandbox.gpuarray
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, GpuFromHost, GpuReshape,
gpu_alloc, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
import theano.sandbox.cuda as cuda_ndarray
try:
from pygpu import gpuarray
except:
pass
if theano.sandbox.gpuarray.pygpu is None: if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed") raise SkipTest("pygpu not installed")
# If you are writing a new test file, don't copy this code, but rather # If you are writing a new test file, don't copy this code, but rather
# import stuff from this file (like mode_with_gpu) to reuse it. # import stuff from this file (like mode_with_gpu) to reuse it.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number: if not cuda_ndarray.use.device_number:
# We should not enable all the use like the flag device=gpu, # We should not enable all the use like the flag device=gpu,
...@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated: ...@@ -36,25 +50,9 @@ if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not theano.sandbox.gpuarray.pygpu_activated: if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled") raise SkipTest("pygpu disabled")
from ..type import (GpuArrayType,
gpuarray_shared_constructor)
from ..basic_ops import (
host_from_gpu, gpu_from_host,
gpu_alloc, GpuAlloc,
GpuAllocEmpty,
gpu_from_cuda,
cuda_from_gpu, HostFromGpu,
GpuContiguous,
GpuFromHost, GpuReshape,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
from ..subtensor import GpuSubtensor
from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
rng = numpy.random.RandomState(seed=utt.fetch_seed()) rng = numpy.random.RandomState(seed=utt.fetch_seed())
from pygpu import gpuarray
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
...@@ -63,22 +61,6 @@ else: ...@@ -63,22 +61,6 @@ else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def may_fail(msg, EClass):
"""Mark a test that requires very specific conditions to work to
mask a specific exception class."""
def test_decorator(f):
def wrapper():
try:
f()
except Exception as e:
if isinstance(e, EClass):
raise SkipTest(msg, e)
raise
wrapper.__name__ = f.__name__
return wrapper
return test_decorator
def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False, def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
on_unused_input='raise', name=None): on_unused_input='raise', name=None):
if mode is None: if mode is None:
...@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -183,9 +165,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
else: else:
err_msg = ("Test %s::%s: exception raised during test " err_msg = ("Test %s::%s: exception raised during test "
"call was not the same as the reference " "call was not the same as the reference "
"call (got: %s, expected %s)") % \ "call (got: %s, expected %s)" %
(self.gpu_op, testname, type(exc), (self.gpu_op, testname, type(exc),
type(ref_e)) type(ref_e)))
exc.args += (err_msg,) exc.args += (err_msg,)
raise raise
...@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu, ...@@ -197,9 +179,9 @@ def makeTester(name, op, gpu_op, cases, checks=None, mode_gpu=mode_with_gpu,
expected): expected):
self.fail(("Test %s::%s: Output %s gave the wrong " self.fail(("Test %s::%s: Output %s gave the wrong "
"value. With inputs %s, expected %s " "value. With inputs %s, expected %s "
"(dtype %s), got %s (dtype %s).") % ( "(dtype %s), got %s (dtype %s)." %
self.op, testname, i, inputs, expected, (self.op, testname, i, inputs, expected,
expected.dtype, variable, variable.dtype)) expected.dtype, variable, variable.dtype)))
for description, check in iteritems(self.checks): for description, check in iteritems(self.checks):
if not check(inputs, variables): if not check(inputs, variables):
...@@ -250,36 +232,6 @@ def test_transfer_strided(): ...@@ -250,36 +232,6 @@ def test_transfer_strided():
assert numpy.all(fv == av) assert numpy.all(fv == av)
@may_fail("Op fails if both contexts are not the same and it's rare "
"that the tests will be run this way", ValueError)
def test_transfer_cuda_gpu():
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest("Can't test interaction with cuda if cuda not present")
g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
c = cuda_ndarray.CudaNdarrayType((False, False))('c')
av = theano._asarray(rng.rand(5, 4), dtype='float32')
gv = gpuarray.array(av)
cv = cuda_ndarray.CudaNdarray(av)
gvs = gv[:, ::-2]
cvs = cv[:, ::-2]
f = theano.function([c], gpu_from_cuda(c))
fv = f(cv)
assert GpuArrayType.values_eq_approx(fv, gv)
fvs = f(cvs)
assert GpuArrayType.values_eq_approx(fvs, gvs)
f = theano.function([g], cuda_from_gpu(g))
fv = f(gv)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)
fvs = f(gvs)
assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
def gpu_alloc_expected(x, *shp): def gpu_alloc_expected(x, *shp):
g = gpuarray.empty(shp, dtype=x.dtype) g = gpuarray.empty(shp, dtype=x.dtype)
g[:] = x g[:] = x
...@@ -291,8 +243,8 @@ GpuAllocTester = makeTester( ...@@ -291,8 +243,8 @@ GpuAllocTester = makeTester(
gpu_op=gpu_alloc, gpu_op=gpu_alloc,
cases=dict( cases=dict(
correct01=(rand(), numpy.int32(7)), correct01=(rand(), numpy.int32(7)),
# just gives a DeepCopyOp with possibly wrong results on the CPU # just gives a DeepCopyOp with possibly wrong results on the CPU
# correct01_bcast=(rand(1), numpy.int32(7)), # correct01_bcast=(rand(1), numpy.int32(7)),
correct02=(rand(), numpy.int32(4), numpy.int32(7)), correct02=(rand(), numpy.int32(4), numpy.int32(7)),
correct12=(rand(7), numpy.int32(4), numpy.int32(7)), correct12=(rand(7), numpy.int32(4), numpy.int32(7)),
correct13=(rand(7), numpy.int32(2), numpy.int32(4), correct13=(rand(7), numpy.int32(2), numpy.int32(4),
...@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i(): ...@@ -486,8 +438,6 @@ def test_hostfromgpu_shape_i():
cv = gpuarray.asarray(numpy.random.rand(5, 4), cv = gpuarray.asarray(numpy.random.rand(5, 4),
dtype='float32') dtype='float32')
gpu_from_host = theano.sandbox.gpuarray.basic_ops.gpu_from_host
host_from_gpu = theano.sandbox.gpuarray.basic_ops.host_from_gpu
f = theano.function([a], gpu_from_host(a), mode=m) f = theano.function([a], gpu_from_host(a), mode=m)
assert gpu_from_host in [x.op assert gpu_from_host in [x.op
for x in f.maker.fgraph.toposort()] for x in f.maker.fgraph.toposort()]
......
...@@ -6,8 +6,7 @@ import numpy ...@@ -6,8 +6,7 @@ import numpy
import theano import theano
from theano import tensor from theano import tensor
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from theano.tensor.blas import (gemv_inplace, gemm_inplace, ger_destructive, from theano.tensor.blas import gemv_inplace, gemm_inplace, _dot22
_dot22)
from theano.tensor.tests.test_blas import TestGer, BaseGemv from theano.tensor.tests.test_blas import TestGer, BaseGemv
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
...@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand, ...@@ -15,22 +14,22 @@ from .test_basic_ops import (makeTester, rand,
mode_with_gpu) mode_with_gpu)
from ..blas import (gpugemv_inplace, gpugemv_no_inplace, from ..blas import (gpugemv_inplace, gpugemv_no_inplace,
gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace,
gpuger_inplace, gpuger_no_inplace, gpuger_inplace, gpuger_no_inplace,
GpuGer, gpu_dot22, GpuGemm) GpuGer, gpu_dot22, GpuGemm)
GpuGemvTester = makeTester('GpuGemvTester', GpuGemvTester = makeTester(
'GpuGemvTester',
op=gemv_inplace, gpu_op=gpugemv_inplace, op=gemv_inplace, gpu_op=gpugemv_inplace,
cases=dict( cases=dict(dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0], dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
# test_02=[rand(0), 1, rand(0, 2), rand(2), 0], # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
# test_30=[rand(3), 1, rand(3, 0), rand(0), 0], # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
# test_00=[rand(0), 1, rand(0, 0), rand(0), 0], # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0], test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
) )
) )
class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin): class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
...@@ -48,10 +47,10 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin): ...@@ -48,10 +47,10 @@ class TestGpuSgemv(TestCase, BaseGemv, utt.TestOptimizationMixin):
return theano.shared(val) return theano.shared(val)
GpuGemmTester = makeTester('GpuGemmTester', GpuGemmTester = makeTester(
'GpuGemmTester',
op=gemm_inplace, gpu_op=gpugemm_inplace, op=gemm_inplace, gpu_op=gpugemm_inplace,
cases=dict( cases=dict(test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0], test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0], test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0], test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
...@@ -65,7 +64,7 @@ GpuGemmTester = makeTester('GpuGemmTester', ...@@ -65,7 +64,7 @@ GpuGemmTester = makeTester('GpuGemmTester',
# test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1], # test12=[rand(3, 4), -1.0, rand(3, 0), rand(0, 4), -1.1],
# test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1], # test13=[rand(0, 0), -1.0, rand(0, 0), rand(0, 0), -1.1],
) )
) )
class TestGpuSger(TestGer): class TestGpuSger(TestGer):
...@@ -84,8 +83,10 @@ class TestGpuSger(TestGer): ...@@ -84,8 +83,10 @@ class TestGpuSger(TestGer):
def test_f32_0_0(self): def test_f32_0_0(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
def test_f32_1_0(self): def test_f32_1_0(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
def test_f32_0_1(self): def test_f32_0_1(self):
raise SkipTest('0-sized objects not supported') raise SkipTest('0-sized objects not supported')
...@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin): ...@@ -103,21 +104,22 @@ class TestGpuGer_OpContract(TestCase, utt.T_OpContractMixin):
GpuDot22Tester = makeTester( GpuDot22Tester = makeTester(
'GpuGemmTester', 'GpuDot22Tester',
op=_dot22, gpu_op=gpu_dot22, op=_dot22, gpu_op=gpu_dot22,
cases=dict( cases=dict(
test1=[rand(3, 4), rand(4, 5)], test1=[rand(3, 4), rand(4, 5)],
test2=[rand(1, 4), rand(4, 5)], test2=[rand(1, 4), rand(4, 5)],
test3=[rand(3, 1), rand(1, 5)], test3=[rand(3, 1), rand(1, 5)],
test4=[rand(3, 4), rand(4, 1)], test4=[rand(3, 4), rand(4, 1)],
# test5=[rand(0, 4), rand(4, 5)], # test5=[rand(0, 4), rand(4, 5)],
# test6=[rand(3, 0), rand(0, 5)], # test6=[rand(3, 0), rand(0, 5)],
# test7=[rand(3, 4), rand(4, 0)], # test7=[rand(3, 4), rand(4, 0)],
# test8=[rand(0, 4), rand(4, 0)], # test8=[rand(0, 4), rand(4, 0)],
# test9=[rand(0, 0), rand(0, 0)], # test9=[rand(0, 0), rand(0, 0)],
) )
) )
def test_hgemm_swap(): def test_hgemm_swap():
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
...@@ -149,6 +151,7 @@ def test_hgemm_swap(): ...@@ -149,6 +151,7 @@ def test_hgemm_swap():
utt.assert_allclose(of, on) utt.assert_allclose(of, on)
def test_hgemm_alpha_output_merge(): def test_hgemm_alpha_output_merge():
from theano.sandbox.cuda import nvcc_compiler from theano.sandbox.cuda import nvcc_compiler
if nvcc_compiler.nvcc_version < '7.5': if nvcc_compiler.nvcc_version < '7.5':
......
...@@ -6,32 +6,31 @@ import sys ...@@ -6,32 +6,31 @@ import sys
import time import time
import unittest import unittest
import numpy import numpy
from six.moves import xrange from six.moves import xrange
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
import theano import theano
from theano import tensor from theano import tensor
from theano.tests.unittest_tools import seed_rng from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import mode_with_gpu
mode_without_gpu)
from ..type import GpuArrayType from ..type import GpuArrayType
from ..conv import GpuConv from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn from theano.sandbox.gpuarray import dnn
import pygpu import pygpu
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4) gftensor4 = GpuArrayType('float32', [False] * 4)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
outshp = (img.shape[0], kern.shape[0], outshp = (img.shape[0], kern.shape[0],
...@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -191,15 +190,17 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
cpu_mflops = approx_fp / (t1 - t0) cpu_mflops = approx_fp / (t1 - t0)
gpu_mflops = approx_fp / (t2 - t1) gpu_mflops = approx_fp / (t2 - t1)
if verbose > 0: if verbose > 0:
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ', file=sys.stdout) print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
print('%12.5f %7.2f %7.2f %7.1f' % (approx_fp, file=sys.stdout)
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)), file=sys.stdout) print('%12.5f %7.2f %7.2f %7.1f' %
(approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
file=sys.stdout)
if not rval: if not rval:
print(('test_' + mode + ' id=' + str(id) + print('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' + ' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape, ' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride, kshape, mode, subsample, img_stride, kern_stride,
version), file=sys.stdout) version, file=sys.stdout)
diff = cpuval - gpuval diff = cpuval - gpuval
diffabs = numpy.absolute(diff) diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval) pr_diff = diffabs / numpy.absolute(cpuval)
...@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -210,7 +211,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
nb_close, "/", diff.size)) nb_close, "/", diff.size))
print("max relatif diff:", (pr_diff.max(), "avg rel diff:", print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff))) numpy.average(pr_diff)))
if not rval and print_ != False: if not rval and print_ is not False:
if npy_img.shape[0] > 5: if npy_img.shape[0] > 5:
print("img", npy_img[0]) print("img", npy_img[0])
print("kern", npy_kern[0]) print("kern", npy_kern[0])
...@@ -242,7 +243,8 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -242,7 +243,8 @@ def exec_conv(version, shapes, verbose, random, mode,
istride, kstride) in enumerate(shapes): istride, kstride) in enumerate(shapes):
ret = False ret = False
try: try:
ret = _params_allgood(ishape, ret = _params_allgood(
ishape,
kshape, kshape,
mode, mode,
subsample=subshape, subsample=subshape,
...@@ -297,15 +299,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1), ...@@ -297,15 +299,15 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride), ((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
# nkern only # nkern only
((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride), ((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and nkern # batch and nkern
((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride), ((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and stack # batch and stack
((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride), ((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
#stack and nkern # stack and nkern
((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride), ((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack # batch, nkern and stack
((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride), ((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack # batch, nkern and stack
((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride) ((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
] ]
...@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1), ...@@ -344,7 +346,6 @@ def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
def get_valid_shapes(): def get_valid_shapes():
# img shape, kern shape, subsample shape # img shape, kern shape, subsample shape
shapes = get_basic_shapes() shapes = get_basic_shapes()
...@@ -361,37 +362,34 @@ def get_valid_shapes(): ...@@ -361,37 +362,34 @@ def get_valid_shapes():
shapes += [ shapes += [
# other test # other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1 ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
, ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)) # added a test case that fail from test_nnet.py.test_conv_nnet2 ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)), # added a test case that fail from test_nnet.py.test_conv_nnet2
, ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)) # test precedent but reduced that triger the error ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)), # test precedent but reduced that triger the error
# Test more than maxThreadsDim0 # Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
] ]
shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 1 layers shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 1 layers
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 2 layers ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
, ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
, ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 2 valid ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 1 layers ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 2 layers ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
] ]
return shapes return shapes
...@@ -428,42 +426,34 @@ def test_full(): ...@@ -428,42 +426,34 @@ def test_full():
shapes += [ shapes += [
# other test # other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1 ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
# other test # other test
, ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) # kernel bigger then image ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)) ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) # kernel bigger then image ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)), # kernel bigger then image
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)) ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
] ]
shapes += [ shapes += [
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2
# Test more than maxThreadsDim0 # Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
] ]
version = [-1] version = [-1]
...@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -562,7 +552,6 @@ class TestConv2DGPU(unittest.TestCase):
for mode in ['valid', 'full']: for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)), for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)), ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns. # We use only the number of columns.
]: ]:
...@@ -580,47 +569,45 @@ def benchmark(): ...@@ -580,47 +569,45 @@ def benchmark():
shapes_valid = [ shapes_valid = [
# test_lenet_28 shape # test_lenet_28 shape
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # valid ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)) # valid ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_32 shape # test_lenet_32 shape
, ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)) # valid ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)) # valid ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_64 shape # test_lenet_64 shape
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)) # valid ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)), # valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)) # valid ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_108 shape # test_lenet_108 shape
, ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)) # valid ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)), # valid
, ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)) # valid ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_256 shape # test_lenet_256 shape
, ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)) # valid ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)) # valid ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)), # valid
, ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)) # valid ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)) # valid ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)), # valid
] ]
shapes_full = [ shapes_full = [
# test_lenet_28 shape # test_lenet_28 shape
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full
# test_lenet_32 shape # test_lenet_32 shape
, ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=1 ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape # test_lenet_64 shape
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=3 ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape # test_lenet_108 shape
, ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full 'conv_full_patch_stack_padded' N=9 ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape # test_lenet_256 shape
, ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)) # full conv_reference_full ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)), # full conv_reference_full
] ]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1] version = [-1]
verbose = 1 verbose = 1
random = True random = True
......
import unittest
from theano.tensor.nnet.tests import test_neighbours from theano.tensor.nnet.tests import test_neighbours
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import mode_with_gpu
mode_without_gpu)
from ..neighbours import GpuImages2Neibs from ..neighbours import GpuImages2Neibs
......
from __future__ import print_function from __future__ import print_function
from nose.plugins.skip import SkipTest
import numpy import numpy
import unittest import unittest
...@@ -7,8 +7,6 @@ import theano ...@@ -7,8 +7,6 @@ import theano
import theano.tensor as T import theano.tensor as T
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.sandbox import gpuarray
# We let that import do the init of the back-end if needed. # We let that import do the init of the back-end if needed.
from .test_basic_ops import (mode_with_gpu, from .test_basic_ops import (mode_with_gpu,
mode_without_gpu) mode_without_gpu)
...@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -36,15 +34,13 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
n_in = 4098 n_in = 4098
n_out = 4099 n_out = 4099
x = T.fmatrix('x')
y = T.lvector('y') y = T.lvector('y')
b = T.fvector('b') b = T.fvector('b')
#W = T.fmatrix('W')
# we precompute the dot with big shape before to allow the test of # we precompute the dot with big shape before to allow the test of
# GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error # GpuCrossentropySoftmax1HotWithBiasDx to don't fail with the error
#(the launch timed out and was terminated) on GPU card not # (the launch timed out and was terminated) on GPU card not
# powerful enough. We need the big shape to check for corner # powerful enough. We need the big shape to check for corner
# case. # case.
dot_result = T.fmatrix('dot_result') dot_result = T.fmatrix('dot_result')
...@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -54,7 +50,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
xx = numpy.asarray(numpy.random.rand(batch_size, n_in), xx = numpy.asarray(numpy.random.rand(batch_size, n_in),
dtype=numpy.float32) dtype=numpy.float32)
#?????yy = numpy.ones((batch_size,),dtype='float32')
yy = numpy.ones((batch_size,), dtype='int32') yy = numpy.ones((batch_size,), dtype='int32')
b_values = numpy.zeros((n_out,), dtype='float32') b_values = numpy.zeros((n_out,), dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32') W_values = numpy.asarray(numpy.random.rand(n_in, n_out), dtype='float32')
...@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): ...@@ -71,8 +66,6 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
classify_gpu = theano.function(inputs=[y, b, dot_result], classify_gpu = theano.function(inputs=[y, b, dot_result],
outputs=[loss, y_pred, dW], outputs=[loss, y_pred, dW],
mode=mode_with_gpu) mode=mode_with_gpu)
# theano.printing.debugprint(classify)
# theano.printing.debugprint(classify_gpu)
assert any([isinstance(node.op, assert any([isinstance(node.op,
T.nnet.CrossentropySoftmaxArgmax1HotWithBias) T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
...@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -97,12 +90,10 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
We check that we loop when their is too much threads We check that we loop when their is too much threads
""" """
n_in = 1000
batch_size = 4097 batch_size = 4097
n_out = 1250 n_out = 1250
if not isinstance(mode_with_gpu, theano.compile.DebugMode): if not isinstance(mode_with_gpu, theano.compile.DebugMode):
n_in = 4098
n_out = 4099 n_out = 4099
# Seed numpy.random with config.unittests.rseed # Seed numpy.random with config.unittests.rseed
...@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx(): ...@@ -137,25 +128,7 @@ def test_GpuCrossentropySoftmax1HotWithBiasDx():
rtol = 1e-5 rtol = 1e-5
atol = 1e-6 atol = 1e-6
if not numpy.allclose(cpu_out, gpu_out, rtol=rtol, atol=atol): utt.assert_allclose(cpu_out, gpu_out, rtol=rtol, atol=atol)
abs_err, rel_err = T.numeric_grad.abs_rel_err(cpu_out, gpu_out)
scaled_err = numpy.minimum(abs_err / atol, rel_err / rtol)
max_i = scaled_err.argmax()
print('max err index:', max_i, max_i / batch_size, end=' ')
print(max_i % batch_size, max_i / n_out, max_i & n_out)
print('At that index:')
print('err:', scaled_err.flatten()[max_i])
print('absolute error:', abs_err.flatten()[max_i])
print('relative error:', rel_err.flatten()[max_i])
print('cpu_out:', cpu_out.flatten()[max_i])
print('gpu_out:', gpu_out.flatten()[max_i])
print('softmax_output_value:', softmax_output_value.flatten()[max_i])
print('dnll_value:', dnll_value[max_i / n_out])
print('y_idx_value:', y_idx_value[max_i / n_out])
assert False, "numpy.allclose(cpu_out, gpu_out, rtol=%s, atol=%s)" % (
rtol, atol)
def test_softmax_with_bias_float16(): def test_softmax_with_bias_float16():
...@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16(): ...@@ -166,6 +139,7 @@ def test_softmax_with_bias_float16():
softmax_with_bias_unittest_template(dtypeInput='float32', softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float16') dtypeBias='float16')
def test_softmax_with_bias_float32(): def test_softmax_with_bias_float32():
softmax_with_bias_unittest_template(dtypeInput='float32', softmax_with_bias_unittest_template(dtypeInput='float32',
dtypeBias='float32') dtypeBias='float32')
...@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias): ...@@ -188,6 +162,7 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
TODO: check that we loop when there are too many threads. (THIS IS TODO: check that we loop when there are too many threads. (THIS IS
NOT IMPLEMENTED) NOT IMPLEMENTED)
""" """
x = T.matrix('x', dtype=dtypeInput) x = T.matrix('x', dtype=dtypeInput)
b = T.vector('b', dtype=dtypeBias) b = T.vector('b', dtype=dtypeBias)
...@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias): ...@@ -228,9 +203,11 @@ def softmax_with_bias_unittest_template(dtypeInput, dtypeBias):
def test_softmax_float16(): def test_softmax_float16():
softmax_unittest_template('float16') softmax_unittest_template('float16')
def test_softmax_float32(): def test_softmax_float32():
softmax_unittest_template('float32') softmax_unittest_template('float32')
def test_softmax_float64(): def test_softmax_float64():
softmax_unittest_template('float64') softmax_unittest_template('float64')
......
import operator
import numpy import numpy
import theano import theano
...@@ -25,7 +23,6 @@ def test_deep_copy(): ...@@ -25,7 +23,6 @@ def test_deep_copy():
def test_values_eq_approx(): def test_values_eq_approx():
a = rand_gpuarray(20, dtype='float32') a = rand_gpuarray(20, dtype='float32')
g = GpuArrayType(dtype='float32', broadcastable=(False,))('g')
assert GpuArrayType.values_eq_approx(a, a) assert GpuArrayType.values_eq_approx(a, a)
b = a.copy() b = a.copy()
b[0] = numpy.asarray(b[0]) + 1. b[0] = numpy.asarray(b[0]) + 1.
......
...@@ -200,11 +200,12 @@ class GpuArrayType(Type): ...@@ -200,11 +200,12 @@ class GpuArrayType(Type):
self.broadcastable == other.broadcastable) self.broadcastable == other.broadcastable)
def convert_variable(self, var): def convert_variable(self, var):
if (type(self) == type(var.type) and vt = var.type
self.typecode == var.type.typecode and if (type(self) == type(vt) and
self.ndim == var.type.ndim and self.typecode == vt.typecode and
self.ndim == vt.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable, all(sb == ob or ob for sb, ob in zip(self.broadcastable,
var.type.broadcastable))): vt.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self): def __hash__(self):
......
...@@ -157,24 +157,11 @@ whitelist_flake8 = [ ...@@ -157,24 +157,11 @@ whitelist_flake8 = [
"sandbox/linalg/ops.py", "sandbox/linalg/ops.py",
"sandbox/linalg/__init__.py", "sandbox/linalg/__init__.py",
"sandbox/linalg/tests/test_linalg.py", "sandbox/linalg/tests/test_linalg.py",
"sandbox/gpuarray/basic_ops.py",
"sandbox/gpuarray/nnet.py",
"sandbox/gpuarray/elemwise.py",
"sandbox/gpuarray/type.py",
"sandbox/gpuarray/__init__.py", "sandbox/gpuarray/__init__.py",
"sandbox/gpuarray/kernel_codegen.py",
"sandbox/gpuarray/conv.py",
"sandbox/gpuarray/neighbours.py",
"sandbox/gpuarray/tests/test_subtensor.py", "sandbox/gpuarray/tests/test_subtensor.py",
"sandbox/gpuarray/tests/test_scan.py", "sandbox/gpuarray/tests/test_scan.py",
"sandbox/gpuarray/tests/test_neighbours.py",
"sandbox/gpuarray/tests/test_conv_cuda_ndarray.py",
"sandbox/gpuarray/tests/test_type.py",
"sandbox/gpuarray/tests/test_opt.py", "sandbox/gpuarray/tests/test_opt.py",
"sandbox/gpuarray/tests/test_blas.py",
"sandbox/gpuarray/tests/test_elemwise.py", "sandbox/gpuarray/tests/test_elemwise.py",
"sandbox/gpuarray/tests/test_nnet.py",
"sandbox/gpuarray/tests/test_basic_ops.py",
"scan_module/scan_utils.py", "scan_module/scan_utils.py",
"scan_module/scan_views.py", "scan_module/scan_views.py",
"scan_module/scan.py", "scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论