提交 8aa08ca2 authored 作者: lamblin's avatar lamblin

Merge pull request #450 from nouiz/gpusum

Test nvidia driver
...@@ -99,6 +99,11 @@ import gof ...@@ -99,6 +99,11 @@ import gof
if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
import theano.sandbox.cuda import theano.sandbox.cuda
# We can't test the driver during import of theano.sandbox.cuda as
# this cause circular import dependency. So we also test it manually
# after the import
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
# Use config.numpy to call numpy.seterr # Use config.numpy to call numpy.seterr
import numpy import numpy
......
...@@ -7,9 +7,9 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gp ...@@ -7,9 +7,9 @@ from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable, host_from_gp
from theano.misc import strutil from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(theano.Op): class GpuConv3D(GpuOp):
""" GPU implementation of Conv3D """ """ GPU implementation of Conv3D """
def __eq__(self, other): def __eq__(self, other):
......
...@@ -8,11 +8,12 @@ from theano.misc import strutil ...@@ -8,11 +8,12 @@ from theano.misc import strutil
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvGrad3D(theano.Op): class GpuConvGrad3D(GpuOp):
""" GPU version of gradient of ConvGrad3D with respect to W """ """ GPU version of gradient of ConvGrad3D with respect to W """
def make_node(self, V, d, WShape, dCdH): def make_node(self, V, d, WShape, dCdH):
......
...@@ -9,10 +9,11 @@ from theano.gof import local_optimizer ...@@ -9,10 +9,11 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
from theano.sandbox.cuda import CudaNdarrayType, HostFromGpu, host_from_gpu from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvTransp3D(theano.Op): class GpuConvTransp3D(GpuOp):
""" The gpu version of ConvTransp3D """ """ The gpu version of ConvTransp3D """
def __eq__(self,other): def __eq__(self,other):
return type(self) == type(other) return type(self) == type(other)
......
import atexit, logging, os, shutil, stat, sys import atexit, logging, os, shutil, stat, sys
import numpy
import theano
from theano.compile import optdb from theano.compile import optdb
from theano.gof.cmodule import get_lib_extension from theano.gof.cmodule import get_lib_extension
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
...@@ -23,7 +27,8 @@ if config.cuda.root == "AUTO": ...@@ -23,7 +27,8 @@ if config.cuda.root == "AUTO":
# set nvcc_path correctly and get the version # set nvcc_path correctly and get the version
nvcc_compiler.set_cuda_root() nvcc_compiler.set_cuda_root()
#is_nvcc_available called here to initialize global vars in nvcc_compiler module #is_nvcc_available called here to initialize global vars in
#nvcc_compiler module
nvcc_compiler.is_nvcc_available() nvcc_compiler.is_nvcc_available()
# Compile cuda_ndarray.cu # Compile cuda_ndarray.cu
...@@ -31,8 +36,9 @@ nvcc_compiler.is_nvcc_available() ...@@ -31,8 +36,9 @@ nvcc_compiler.is_nvcc_available()
# printed and this module will not be working properly (we set `cuda_available` # printed and this module will not be working properly (we set `cuda_available`
# to False). # to False).
# This variable is True by default, and set to False if nvcc is not available or # This variable is True by default, and set to False if nvcc is not
# their is no cuda card or something goes wrong when trying to initialize cuda. # available or their is no cuda card or something goes wrong when
# trying to initialize cuda.
cuda_available = True cuda_available = True
# Global variable to avoid displaying the same warning multiple times. # Global variable to avoid displaying the same warning multiple times.
...@@ -41,6 +47,7 @@ cuda_warning_is_displayed = False ...@@ -41,6 +47,7 @@ cuda_warning_is_displayed = False
#This variable is set to True when we enable cuda.(i.e. when use() is called) #This variable is set to True when we enable cuda.(i.e. when use() is called)
cuda_enabled = False cuda_enabled = False
# Code factorized within a function so that it may be called from multiple # Code factorized within a function so that it may be called from multiple
# places (which is not currently the case, but may be useful in the future). # places (which is not currently the case, but may be useful in the future).
def set_cuda_disabled(): def set_cuda_disabled():
...@@ -72,17 +79,18 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc, ...@@ -72,17 +79,18 @@ libcuda_ndarray_so = os.path.join(cuda_ndarray_loc,
'libcuda_ndarray.' + get_lib_extension()) 'libcuda_ndarray.' + get_lib_extension())
# Add the theano cache directory's cuda_ndarray subdirectory to the list of # Add the theano cache directory's cuda_ndarray subdirectory to the
# places that are hard-coded into compiled modules' runtime library search # list of places that are hard-coded into compiled modules' runtime
# list. This works in conjunction with nvcc_compiler.nvcc_module_compile_str # library search list. This works in conjunction with
# which adds this folder during compilation with -L and also adds -lcuda_ndarray # nvcc_compiler.nvcc_module_compile_str which adds this folder during
# when compiling modules. # compilation with -L and also adds -lcuda_ndarray when compiling
# modules.
nvcc_compiler.add_standard_rpath(cuda_ndarray_loc) nvcc_compiler.add_standard_rpath(cuda_ndarray_loc)
compile_cuda_ndarray = True compile_cuda_ndarray = True
if os.path.exists(cuda_ndarray_so): if os.path.exists(cuda_ndarray_so):
compile_cuda_ndarray = date>=os.stat(cuda_ndarray_so)[stat.ST_MTIME] compile_cuda_ndarray = date >= os.stat(cuda_ndarray_so)[stat.ST_MTIME]
if not compile_cuda_ndarray: if not compile_cuda_ndarray:
try: try:
# If we load a previously-compiled version, config.compiledir should # If we load a previously-compiled version, config.compiledir should
...@@ -111,7 +119,7 @@ try: ...@@ -111,7 +119,7 @@ try:
include_dirs=[cuda_path], libs=['cublas']) include_dirs=[cuda_path], libs=['cublas'])
from cuda_ndarray.cuda_ndarray import * from cuda_ndarray.cuda_ndarray import *
except Exception, e: except Exception, e:
_logger.error( "Failed to compile cuda_ndarray.cu: %s", str(e)) _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
set_cuda_disabled() set_cuda_disabled()
if cuda_available: if cuda_available:
...@@ -129,10 +137,13 @@ if cuda_available: ...@@ -129,10 +137,13 @@ if cuda_available:
os.symlink(cuda_ndarray_so, libcuda_ndarray_so) os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
try: try:
# This only test if the cuda driver is available and if there
# is at least one GPU that support cuda. This do not select a
# device.
gpu_init() gpu_init()
cuda_available = True cuda_available = True
cuda_initialization_error_message = "" cuda_initialization_error_message = ""
# actively closing our gpu session presents segfault-on-exit on some systems # actively closing our gpu session presents segfault-on-exit on some systems
atexit.register(gpu_shutdown) atexit.register(gpu_shutdown)
except EnvironmentError, e: except EnvironmentError, e:
cuda_available = False cuda_available = False
...@@ -162,7 +173,7 @@ if cuda_available: ...@@ -162,7 +173,7 @@ if cuda_available:
shared_constructor = float32_shared_constructor shared_constructor = float32_shared_constructor
import basic_ops import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise, from basic_ops import (GpuOp, GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous, GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor, GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1, GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
...@@ -180,18 +191,31 @@ def use(device, ...@@ -180,18 +191,31 @@ def use(device,
force=False, force=False,
default_to_move_computation_to_gpu=True, default_to_move_computation_to_gpu=True,
move_shared_float32_to_gpu=True, move_shared_float32_to_gpu=True,
enable_cuda=True): enable_cuda=True,
test_driver=True):
""" """
Error and warning about CUDA should be displayed only when this function is called. Error and warning about CUDA should be displayed only when this
We need to be able to load this module only to check if it is available! function is called. We need to be able to load this module only
to check if it is available!
:param device: string "cpu", "gpu", "gpuN" N is the device number to use
:param force: Will always raise an exception if we can't use the gpu.
:param default_to_move_computation_to_gpu: If gpu init succeeded, enable by
default optimization to move
computation to the gpu
:param move_shared_float32_to_gpu: If gpu init succeeded, put new shared
variable in float32 on the gpu.
:param enable_cuda: If the gpu is correctly enabled,
set the the variable cuda_enabled to True.
""" """
global cuda_enabled, cuda_initialization_error_message global cuda_enabled, cuda_initialization_error_message
if force and not cuda_available and device.startswith('gpu'): if force and not cuda_available and device.startswith('gpu'):
if not nvcc_compiler.is_nvcc_available(): if not nvcc_compiler.is_nvcc_available():
raise EnvironmentError("You forced the use of gpu device '%s', but " raise EnvironmentError("You forced the use of gpu device '%s', but"
"nvcc was not found. Set it in your PATH " " nvcc was not found. Set it in your PATH "
"environment variable or set the Theano " "environment variable or set the Theano "
"flags 'cuda.root' to its directory" % device) "flags 'cuda.root' to its directory"
"" % device)
else: else:
raise EnvironmentError("You forced the use of gpu device %s, " raise EnvironmentError("You forced the use of gpu device %s, "
"but CUDA initialization failed " "but CUDA initialization failed "
...@@ -206,7 +230,8 @@ def use(device, ...@@ -206,7 +230,8 @@ def use(device,
try: try:
if cuda_initialization_error_message: if cuda_initialization_error_message:
error_addendum = " (error: %s)" % cuda_initialization_error_message error_addendum = " (error: %s)" % cuda_initialization_error_message
except NameError: # cuda_initialization_error_message is not available b/c compilation failed except NameError:
# cuda_initialization_error_message is not available b/c compilation failed
pass pass
_logger.warning('CUDA is installed, but device %s is not available %s', _logger.warning('CUDA is installed, but device %s is not available %s',
device, error_addendum) device, error_addendum)
...@@ -222,28 +247,32 @@ def use(device, ...@@ -222,28 +247,32 @@ def use(device,
raise ValueError("Invalid device identifier", device) raise ValueError("Invalid device identifier", device)
if use.device_number is None: if use.device_number is None:
# No successful call to use() has been made yet # No successful call to use() has been made yet
if device != 'gpu' and device<0: if device != 'gpu' and device < 0:
return return
if device in [None,""]: if device in [None, ""]:
device=0 device = 0
try: try:
if device !='gpu': if device != 'gpu':
gpu_init(device) gpu_init(device)
use.device_number = device
if test_driver:
import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
if move_shared_float32_to_gpu: if move_shared_float32_to_gpu:
handle_shared_float32(True) handle_shared_float32(True)
use.device_number = device
if enable_cuda: if enable_cuda:
cuda_enabled = True cuda_enabled = True
print >> sys.stderr, "Using gpu device %d: %s" % (active_device_number(), active_device_name()) print >> sys.stderr, "Using gpu device %d: %s" % (
active_device_number(), active_device_name())
except (EnvironmentError, ValueError), e: except (EnvironmentError, ValueError), e:
_logger.error(("ERROR: Not using GPU." _logger.error(("ERROR: Not using GPU."
" Initialisation of device %i failed:\n%s"), " Initialisation of device %i failed:\n%s"),
device, e) device, e)
cuda_enabled = False cuda_enabled = False
if force: if force:
e.args+=(("You asked to force this device and it failed." e.args += (("You asked to force this device and it failed."
" No fallback to the cpu or other gpu device."),) " No fallback to the cpu or other gpu device."),)
raise raise
...@@ -264,17 +293,16 @@ def use(device, ...@@ -264,17 +293,16 @@ def use(device,
try: try:
#in case the device if just gpu, #in case the device if just gpu,
# we check that the driver init it correctly. # we check that the driver init it correctly.
cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5,5)) cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5, 5))
except (Exception, NameError), e: except (Exception, NameError), e:
# NameError when no gpu present as cuda_ndarray is not loaded. # NameError when no gpu present as cuda_ndarray is not loaded.
e.args+=("ERROR: GPU forced but failed. ",) e.args += ("ERROR: GPU forced but failed. ",)
raise raise
use.device_number = None use.device_number = None
def handle_shared_float32(tf): def handle_shared_float32(tf):
"""Set the CudaNdarrayType as the default handler for shared float32 arrays. """Set the default shared type for float32 tensor to CudaNdarrayType
This function is intended to be called from use(gpu_index), not directly. This function is intended to be called from use(gpu_index), not directly.
""" """
...@@ -285,10 +313,13 @@ def handle_shared_float32(tf): ...@@ -285,10 +313,13 @@ def handle_shared_float32(tf):
else: else:
raise NotImplementedError('removing our handler') raise NotImplementedError('removing our handler')
# We can't test the driver during import here as this cause circular
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'): if config.device.startswith('gpu'):
use(device=config.device, force=config.force_device) use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device: elif config.init_gpu_device:
assert config.device=="cpu", ("We can use the Theano flag init_gpu_device" assert config.device == "cpu", (
"We can use the Theano flag init_gpu_device"
" only when the Theano flag device=='cpu'") " only when the Theano flag device=='cpu'")
_logger.warning(("GPU device %s will be initialized, and used if a GPU is " _logger.warning(("GPU device %s will be initialized, and used if a GPU is "
"needed. " "needed. "
...@@ -300,4 +331,4 @@ elif config.init_gpu_device: ...@@ -300,4 +331,4 @@ elif config.init_gpu_device:
force=config.force_device, force=config.force_device,
default_to_move_computation_to_gpu=False, default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False, move_shared_float32_to_gpu=False,
enable_cuda=False) enable_cuda=False, test_driver=False)
...@@ -33,7 +33,20 @@ def as_cuda_array(obj): ...@@ -33,7 +33,20 @@ def as_cuda_array(obj):
else: else:
raise TypeError("Don't know how to cast to a CudaNdarray object") raise TypeError("Don't know how to cast to a CudaNdarray object")
class HostFromGpu(Op):
class GpuOp(Op):
def make_thunk(self, node, storage_map, compute_map, no_recycling):
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
return super(GpuOp, self).make_thunk(node, storage_map,
compute_map, no_recycling)
class HostFromGpu(GpuOp):
""" """
Implement the transfer from gpu to the cpu. Implement the transfer from gpu to the cpu.
""" """
...@@ -65,7 +78,7 @@ class HostFromGpu(Op): ...@@ -65,7 +78,7 @@ class HostFromGpu(Op):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
class GpuFromHost(Op): class GpuFromHost(GpuOp):
""" """
Implement the transfer from cpu to the gpu. Implement the transfer from cpu to the gpu.
""" """
...@@ -98,7 +111,8 @@ class GpuFromHost(Op): ...@@ -98,7 +111,8 @@ class GpuFromHost(Op):
return xshp return xshp
gpu_from_host = GpuFromHost() gpu_from_host = GpuFromHost()
class GpuElemwise(Op):
class GpuElemwise(GpuOp):
""" """
Implement a generic elemwise on the gpu. Implement a generic elemwise on the gpu.
""" """
...@@ -208,7 +222,7 @@ class GpuElemwise(Op): ...@@ -208,7 +222,7 @@ class GpuElemwise(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return self.src_generator.cache_version return self.src_generator.cache_version
class GpuDimShuffle(Op): class GpuDimShuffle(GpuOp):
""" """
Implement DimShuffle on the gpu. Implement DimShuffle on the gpu.
""" """
...@@ -397,7 +411,7 @@ class GpuDimShuffle(Op): ...@@ -397,7 +411,7 @@ class GpuDimShuffle(Op):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,0) return (1,0)
class GpuSum(Op): class GpuSum(GpuOp):
"""GpuSum is a Reduction along some dimensions by summation. """GpuSum is a Reduction along some dimensions by summation.
The dimensions along which to sum is specified by the `reduce_mask` that you pass to the The dimensions along which to sum is specified by the `reduce_mask` that you pass to the
...@@ -1717,7 +1731,7 @@ class GpuSum(Op): ...@@ -1717,7 +1731,7 @@ class GpuSum(Op):
""" %locals() """ %locals()
return sio.getvalue() return sio.getvalue()
class GpuReshape(tensor.Reshape): class GpuReshape(tensor.Reshape, GpuOp):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
...@@ -1733,7 +1747,7 @@ class GpuReshape(tensor.Reshape): ...@@ -1733,7 +1747,7 @@ class GpuReshape(tensor.Reshape):
', should be %i' % (len(shp), self.ndim), shp) ', should be %i' % (len(shp), self.ndim), shp)
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuSubtensor(tensor.Subtensor): class GpuSubtensor(tensor.Subtensor, GpuOp):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
""" """
...@@ -1764,7 +1778,7 @@ class GpuSubtensor(tensor.Subtensor): ...@@ -1764,7 +1778,7 @@ class GpuSubtensor(tensor.Subtensor):
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata) out[0] = x.__getitem__(cdata)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
""" """
...@@ -1790,7 +1804,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1): ...@@ -1790,7 +1804,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1):
o[j] = x[i] o[j] = x[i]
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
""" """
...@@ -1818,7 +1832,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1): ...@@ -1818,7 +1832,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1):
# CudaNdarray_Subscript() don't support Advanced slicing. # CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices. # so we use the parent version that loop on each indices.
class GpuIncSubtensor(tensor.IncSubtensor): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
""" """
...@@ -1828,7 +1842,7 @@ class GpuIncSubtensor(tensor.IncSubtensor): ...@@ -1828,7 +1842,7 @@ class GpuIncSubtensor(tensor.IncSubtensor):
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
return Apply(self, [x,y]+rval.inputs[2:], [x.type()]) return Apply(self, [x,y]+rval.inputs[2:], [x.type()])
class GpuFlatten(tensor.Flatten): class GpuFlatten(tensor.Flatten, GpuOp):
""" """
Implement Flatten on the gpu. Implement Flatten on the gpu.
""" """
...@@ -1839,7 +1853,7 @@ class GpuFlatten(tensor.Flatten): ...@@ -1839,7 +1853,7 @@ class GpuFlatten(tensor.Flatten):
out_type = CudaNdarrayType(broadcastable=host_out_broadcastable) out_type = CudaNdarrayType(broadcastable=host_out_broadcastable)
return Apply(self, [x], [out_type()]) return Apply(self, [x], [out_type()])
class GpuShape(tensor.Shape): class GpuShape(tensor.Shape, GpuOp):
""" """
Implement Shape on the gpu. Implement Shape on the gpu.
""" """
...@@ -1847,7 +1861,7 @@ class GpuShape(tensor.Shape): ...@@ -1847,7 +1861,7 @@ class GpuShape(tensor.Shape):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
gpu_shape = GpuShape() gpu_shape = GpuShape()
class GpuJoin(tensor.Join): class GpuJoin(tensor.Join, GpuOp):
""" """
Implement Join on the gpu. Implement Join on the gpu.
""" """
...@@ -1924,7 +1938,7 @@ class GpuJoin(tensor.Join): ...@@ -1924,7 +1938,7 @@ class GpuJoin(tensor.Join):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuAlloc(Op): class GpuAlloc(GpuOp):
""" """
Implement Alloc on the gpu. Implement Alloc on the gpu.
""" """
...@@ -2023,7 +2037,7 @@ class GpuAlloc(Op): ...@@ -2023,7 +2037,7 @@ class GpuAlloc(Op):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class GpuContiguous(Op): class GpuContiguous(GpuOp):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
not already c contiguous. not already c contiguous.
......
...@@ -4,8 +4,9 @@ import StringIO, os ...@@ -4,8 +4,9 @@ import StringIO, os
import cuda_ndarray.cuda_ndarray as cuda import cuda_ndarray.cuda_ndarray as cuda
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
class GpuDot22(Op): class GpuDot22(GpuOp):
""" """
Implement dot(2d, 2d) on the gpu. Implement dot(2d, 2d) on the gpu.
""" """
...@@ -76,7 +77,7 @@ class GpuDot22(Op): ...@@ -76,7 +77,7 @@ class GpuDot22(Op):
""" % locals() """ % locals()
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
class GpuDot22Scalar(Op): class GpuDot22Scalar(GpuOp):
""" """
Implement dot(2d, 2d) * scalar on the gpu. Implement dot(2d, 2d) * scalar on the gpu.
""" """
...@@ -155,7 +156,7 @@ class GpuDot22Scalar(Op): ...@@ -155,7 +156,7 @@ class GpuDot22Scalar(Op):
""" % locals() """ % locals()
gpu_dot22scalar = GpuDot22Scalar() gpu_dot22scalar = GpuDot22Scalar()
class GpuGemm(Op): class GpuGemm(GpuOp):
""" """
implement the gemm on the gpu. implement the gemm on the gpu.
...@@ -257,7 +258,7 @@ class GpuGemm(Op): ...@@ -257,7 +258,7 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op): class GpuGemv(GpuOp):
""" """
implement gemv on the gpu. implement gemv on the gpu.
...@@ -348,7 +349,7 @@ class GpuGemv(Op): ...@@ -348,7 +349,7 @@ class GpuGemv(Op):
gpu_gemv_no_inplace = GpuGemv(inplace=False) gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True) gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op): class GpuGer(GpuOp):
""" """
implement ger on the gpu. implement ger on the gpu.
...@@ -439,7 +440,7 @@ class GpuGer(Op): ...@@ -439,7 +440,7 @@ class GpuGer(Op):
gpu_ger_no_inplace = GpuGer(inplace=False) gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True) gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(GpuOp):
""" Implement outer on the gpu.""" """ Implement outer on the gpu."""
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
...@@ -532,7 +533,7 @@ gpu_outer = GpuOuter() ...@@ -532,7 +533,7 @@ gpu_outer = GpuOuter()
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
# #
class GpuConv(Op): class GpuConv(GpuOp):
""" """
Implement the batched and stacked 2d convolution on the gpu. Implement the batched and stacked 2d convolution on the gpu.
""" """
...@@ -698,7 +699,7 @@ class GpuConv(Op): ...@@ -698,7 +699,7 @@ class GpuConv(Op):
"""%sub """%sub
class GpuDownsampleFactorMax(Op): class GpuDownsampleFactorMax(GpuOp):
""" """
Implement downsample with max on the gpu. Implement downsample with max on the gpu.
""" """
...@@ -858,7 +859,7 @@ class GpuDownsampleFactorMax(Op): ...@@ -858,7 +859,7 @@ class GpuDownsampleFactorMax(Op):
} }
""" % locals() """ % locals()
class GpuDownsampleFactorMaxGrad(Op): class GpuDownsampleFactorMaxGrad(GpuOp):
""" """
Implement the grad of downsample with max on the gpu. Implement the grad of downsample with max on the gpu.
""" """
......
...@@ -3,11 +3,12 @@ from theano import tensor, scalar ...@@ -3,11 +3,12 @@ from theano import tensor, scalar
import StringIO import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax from theano.sandbox.cuda.kernel_codegen import nvcc_kernel, inline_reduce_max, inline_reduce_sum, inline_softmax
class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
""" """
Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu. Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
""" """
...@@ -180,7 +181,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op): ...@@ -180,7 +181,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (Op):
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx (Op): class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
""" """
Implement CrossentropySoftmax1HotWithBiasDx on the gpu. Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
""" """
...@@ -302,7 +303,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op): ...@@ -302,7 +303,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax (Op): class GpuSoftmax (GpuOp):
""" """
Implement Softmax on the gpu. Implement Softmax on the gpu.
""" """
...@@ -400,7 +401,7 @@ class GpuSoftmax (Op): ...@@ -400,7 +401,7 @@ class GpuSoftmax (Op):
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias (Op): class GpuSoftmaxWithBias (GpuOp):
""" """
Implement SoftmaxWithBias on the gpu. Implement SoftmaxWithBias on the gpu.
""" """
......
...@@ -10,7 +10,7 @@ __contact__ = "theano-dev@googlegroups.com" ...@@ -10,7 +10,7 @@ __contact__ = "theano-dev@googlegroups.com"
import sys import sys
import numpy import numpy
import theano.gof import theano.gof
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.tensor import (get_vector_length, cast, opt) from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, Variable from theano.gof import local_optimizer, Variable
...@@ -19,7 +19,7 @@ from theano.gof import local_optimizer, Variable ...@@ -19,7 +19,7 @@ from theano.gof import local_optimizer, Variable
config = theano.config config = theano.config
class CURAND_Base(theano.gof.Op): class CURAND_Base(GpuOp):
""" Base class for a random number generator implemented in CURAND. """ Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by The random number generator itself is an opaque reference managed by
......
...@@ -19,23 +19,28 @@ import theano.sandbox.cuda.basic_ops as B ...@@ -19,23 +19,28 @@ import theano.sandbox.cuda.basic_ops as B
from theano.tensor.basic import _allclose from theano.tensor.basic import _allclose
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
if theano.config.mode=='FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def rand_cuda_ndarray(shape): def rand_cuda_ndarray(shape):
return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) return cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
dtype='float32'))
#intentionally disabled #intentionally disabled
def tes_use(): def tes_use():
tcn.use() tcn.use()
def test_sum(): def test_sum():
""" """
test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111, 0011, 0101, 0111, 1011, 1111 test sum pattern 1, 11, 10, 01, 001, 010, 100, 110, 011, 111,
0011, 0101, 0111, 1011, 1111
test sum pattern implemented with reshape: test sum pattern implemented with reshape:
1000, 0100, 0010, 0001, 11111 1000, 0100, 0010, 0001, 11111
...@@ -91,18 +96,18 @@ def test_sum(): ...@@ -91,18 +96,18 @@ def test_sum():
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111 ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]: ]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val,dtype='float32') val = theano._asarray(val, dtype='float32')
f = theano.function([a],b, mode=mode_with_gpu) f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a],b, mode=mode_without_gpu) f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()] assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()] assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
if val.size==0: if val.size == 0:
assert f2(val)==f(val), ('shape', shape, 'pattern', pattern) assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
else: else:
try: try:
#We raise the error threashold as we sum big matrix #We raise the error threashold as we sum big matrix
...@@ -110,7 +115,9 @@ def test_sum(): ...@@ -110,7 +115,9 @@ def test_sum():
#example in debug mode with unittests.rseed=9275 #example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5 theano.tensor.basic.float32_rtol = 2e-5
assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern])) assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
finally: finally:
theano.tensor.basic.float32_rtol = orig_rtol theano.tensor.basic.float32_rtol = orig_rtol
...@@ -121,21 +128,23 @@ def test_sum(): ...@@ -121,21 +128,23 @@ def test_sum():
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape)) dim_pattern = range(len(shape))
dim_pattern[0]=1 dim_pattern[0] = 1
dim_pattern[1]=0 dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern) a = a.dimshuffle(dim_pattern)
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val,dtype='float32') val = theano._asarray(val, dtype='float32')
f = theano.function([a],b, mode=mode_with_gpu) f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a],b, mode=mode_without_gpu) f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()] assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()] assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern])) assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast #test with broadcast
...@@ -143,116 +152,135 @@ def test_sum(): ...@@ -143,116 +152,135 @@ def test_sum():
((5,4),[0,1]),((5,4),[0]), ((5,4),[0,1]),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]), ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]: ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
shape = numpy.asarray(shape)*2 shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32',(False,)*len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,)*len(shape))() a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.Sum(pattern)(a) b = T.Sum(pattern)(a)
b2 = T.Sum(pattern)(a2) b2 = T.Sum(pattern)(a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val,dtype='float32') val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val) val2 = cuda.CudaNdarray(val)
if len(shape)==1: if len(shape) == 1:
val = val[::2] val = val[::2]
val2 = val2[::2] val2 = val2[::2]
elif len(shape)==2: elif len(shape) == 2:
val = val[::2,::2] val = val[::2, ::2]
val2 = val2[::2,::2] val2 = val2[::2, ::2]
elif len(shape)==3: elif len(shape) == 3:
val = val[::2,::2,::2] val = val[::2, ::2, ::2]
val2 = val2[::2,::2,::2] val2 = val2[::2, ::2, ::2]
elif len(shape)==4: elif len(shape) == 4:
val = val[::2,::2,::2,::2] val = val[::2, ::2, ::2, ::2]
val2 = val2[::2,::2,::2,::2] val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a],b, mode=mode_without_gpu) f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2],b2, mode=mode_with_gpu) f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()] assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()] assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
assert _allclose(f2(val2),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern])) assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_flatten(): def test_flatten():
x = cuda.fmatrix('x') x = cuda.fmatrix('x')
f = theano.function([x], x.flatten()) f = theano.function([x], x.flatten())
assert len(f( [[0.,0.],[0.,0.]] ).shape)==1 assert len(f([[0., 0.], [0., 0.]]).shape) == 1
def test_reshape(): def test_reshape():
a = tcn.CudaNdarrayType((False,))() a = tcn.CudaNdarrayType((False,))()
b = tcn.CudaNdarrayType((False,False))() b = tcn.CudaNdarrayType((False, False))()
c = T.reshape(a, [2,3]) c = T.reshape(a, [2, 3])
#basic #basic
f = theano.function([a], c, mode=mode_without_gpu) f = theano.function([a], c, mode=mode_with_gpu)
fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32'))) fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
assert numpy.all(fv == numpy.asarray([[0,1,2], [3,4,5]])) dtype='float32')))
topo = f.maker.env.toposort()
assert any([isinstance(node.op, B.GpuReshape) for node in topo])
assert numpy.all(fv == numpy.asarray([[0, 1, 2], [3, 4, 5]]))
#test that it works without inplace operations #test that it works without inplace operations
a_val = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32')) a_val = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32')) dtype='float32'))
b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0,1,2],[3,4,5]],dtype='float32')) a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0, 1, 2, 3, 4, 5],
dtype='float32'))
f_sub = theano.function([a,b], c-b, mode=mode_without_gpu) b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0, 1, 2], [3, 4, 5]],
dtype='float32'))
f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
topo = f_sub.maker.env.toposort()
assert any([isinstance(node.op, B.GpuReshape) for node in topo])
assert numpy.all(f_sub(a_val, b_val) == 0.0) assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy)) assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))
#test that it works with inplace operations #test that it works with inplace operations
a_val = theano._asarray([0,1,2,3,4,5], dtype='float32') a_val = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
a_val_copy = theano._asarray([0,1,2,3,4,5], dtype='float32') a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float32')
b_val = theano._asarray([[0,1,2],[3,4,5]], dtype='float32') b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float32')
f_sub = theano.function([a,b], c-b, mode=mode_without_gpu) f_sub = theano.function([a, b], c - b, mode=mode_with_gpu)
topo = f_sub.maker.env.toposort()
assert any([isinstance(node.op, B.GpuReshape) for node in topo])
assert numpy.all(f_sub(a_val, b_val) == 0.0) assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy)) assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))
# verify gradient # verify gradient
def just_vals(v): def just_vals(v):
return T.Reshape(2)(v, theano._asarray([2,3], dtype='int32')) return T.Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
utt.verify_grad(just_vals, [a_val]) utt.verify_grad(just_vals, [a_val])
def test_elemwise_empty(): def test_elemwise_empty():
#test with 0 element #test with 0 element
a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0,0), dtype='float32'), 'a') a = tcn.shared_constructor(theano._asarray(numpy.random.rand(0, 0),
dtype='float32'), 'a')
b = tensor.fmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
f2 = pfunc([b], [], updates=[(a, a+b)], mode=mode_without_gpu) f2 = pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)
a0 = a.get_value() * 1.0 a0 = a.get_value() * 1.0
f(numpy.ones((0,0), dtype='float32')) f(numpy.ones((0, 0), dtype='float32'))
assert numpy.all(a0 + 1.0 == a.get_value()) assert numpy.all(a0 + 1.0 == a.get_value())
def test_elemwise0(): def test_elemwise0():
a = tcn.shared_constructor(theano._asarray(numpy.random.rand(4,4), dtype='float32'), 'a') a = tcn.shared_constructor(theano._asarray(numpy.random.rand(4, 4),
dtype='float32'), 'a')
b = tensor.fmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a+b)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
#check that we work inplace. #check that we work inplace.
assert f.maker.env.toposort()[1].op.destroy_map.items()==[(0,[0])] assert f.maker.env.toposort()[1].op.destroy_map.items() == [(0, [0])]
a0 = a.get_value() * 1.0 a0 = a.get_value() * 1.0
print 'BEFORE ADD', a.get_value() print 'BEFORE ADD', a.get_value()
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print i, node print i, node
f(numpy.ones((4,4), dtype='float32')) f(numpy.ones((4, 4), dtype='float32'))
print 'AFTER ADD', a.get_value() print 'AFTER ADD', a.get_value()
assert numpy.all(a0 + 1.0 == a.get_value()) assert numpy.all(a0 + 1.0 == a.get_value())
def test_elemwise_bad_broadcast(): def test_elemwise_bad_broadcast():
x = cuda.fmatrix('x') x = cuda.fmatrix('x')
y = cuda.fmatrix('y') y = cuda.fmatrix('y')
f = theano.function([x, y], x * y, mode=mode_with_gpu) f = theano.function([x, y], x * y, mode=mode_with_gpu)
print f.maker.env.toposort() print f.maker.env.toposort()
assert len(f.maker.env.toposort())==2 assert len(f.maker.env.toposort()) == 2
assert isinstance(f.maker.env.toposort()[0].op, cuda.GpuElemwise) assert isinstance(f.maker.env.toposort()[0].op, cuda.GpuElemwise)
assert f.maker.env.toposort()[1].op==cuda.host_from_gpu assert f.maker.env.toposort()[1].op == cuda.host_from_gpu
try: try:
f(rand_cuda_ndarray((10, 3)), rand_cuda_ndarray((10, 1))) f(rand_cuda_ndarray((10, 3)), rand_cuda_ndarray((10, 1)))
...@@ -261,41 +289,48 @@ def test_elemwise_bad_broadcast(): ...@@ -261,41 +289,48 @@ def test_elemwise_bad_broadcast():
else: else:
raise Exception("Theano should have raised an error") raise Exception("Theano should have raised an error")
def test_elemwise1(): def test_elemwise1():
""" Several kinds of elemwise expressions with no broadcasting, non power-of-two shape """ """ Several kinds of elemwise expressions with no broadcasting,
non power-of-two shape """
shape = (3,4) shape = (3, 4)
a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.5, 'a') a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
dtype='float32') + 0.5, 'a')
b = tensor.fmatrix() b = tensor.fmatrix()
#let debugmode catch any mistakes #let debugmode catch any mistakes
print >> sys.stdout, "STARTING FUNCTION 1" print >> sys.stdout, "STARTING FUNCTION 1"
f = pfunc([b], [], updates=[(a, b**a)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, b ** a)], mode=mode_with_gpu)
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print i, node print i, node
f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3) f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
print >> sys.stdout, "STARTING FUNCTION 2" print >> sys.stdout, "STARTING FUNCTION 2"
#let debugmode catch any mistakes #let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, tensor.exp(b**a))], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, tensor.exp(b ** a))], mode=mode_with_gpu)
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print i, node print i, node
f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3) f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
print >> sys.stdout, "STARTING FUNCTION 3" print >> sys.stdout, "STARTING FUNCTION 3"
#let debugmode catch any mistakes #let debugmode catch any mistakes
f = pfunc([b], [], updates=[(a, a+b * tensor.exp(b**a))], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a + b * tensor.exp(b ** a))],
f(theano._asarray(numpy.random.rand(*shape), dtype='float32')+0.3) mode=mode_with_gpu)
f(theano._asarray(numpy.random.rand(*shape), dtype='float32') + 0.3)
def test_elemwise2(): def test_elemwise2():
""" Several kinds of elemwise expressions with dimension permutations """ """ Several kinds of elemwise expressions with dimension permutations """
rng = numpy.random.RandomState(int(time.time())) rng = numpy.random.RandomState(int(time.time()))
print 'random?', rng.rand(3) print 'random?', rng.rand(3)
shape = (3,5) shape = (3, 5)
for pattern in [(0,1), (1,0)]: for pattern in [(0, 1), (1, 0)]:
a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),dtype='float32'), name=None) a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))() dtype='float32'), name=None)
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle(pattern))], mode=mode_with_gpu) b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
f = pfunc([b], [], updates=[(a, (a + b).dimshuffle(pattern))],
mode=mode_with_gpu)
has_elemwise = False has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stdout, i, node print >> sys.stdout, i, node
...@@ -303,34 +338,39 @@ def test_elemwise2(): ...@@ -303,34 +338,39 @@ def test_elemwise2():
assert not has_elemwise assert not has_elemwise
#let debugmode catch errors #let debugmode catch errors
print >> sys.stdout, 'pattern', pattern print >> sys.stdout, 'pattern', pattern
f(theano._asarray(rng.rand(*shape),dtype='float32')*.3) f(theano._asarray(rng.rand(*shape), dtype='float32') * .3)
shape = (3,4,5,6) shape = (3, 4, 5, 6)
a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),dtype='float32'), 'a') a = tcn.shared_constructor(theano._asarray(rng.rand(*shape),
b = tensor.Tensor(dtype='float32', broadcastable=[0]*len(shape))() dtype='float32'), 'a')
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
tensor.exp(b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu)
has_elemwise = False has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print i, node print i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise assert not has_elemwise
#let debugmode catch errors #let debugmode catch errors
f(theano._asarray(rng.rand(*shape),dtype='float32')) f(theano._asarray(rng.rand(*shape), dtype='float32'))
def test_elemwise3(): def test_elemwise3():
""" Several kinds of elemwise expressions with dimension permutations and broadcasting""" """ Several kinds of elemwise expressions with dimension
permutations and broadcasting"""
shape = (3,4,5,6) shape = (3, 4, 5, 6)
a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a') a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
dtype='float32'), 'a')
b = tensor.fvector() b = tensor.fvector()
print b.type print b.type
print tensor.constant(1).type print tensor.constant(1).type
print (1 + b).type print (1 + b).type
print (1 + b**a).type print (1 + b ** a).type
print tensor.exp((1 + b**a)).type print tensor.exp((1 + b ** a)).type
f = pfunc([b], [], updates=[(a, (a+b).dimshuffle([2,0,3,1]) * tensor.exp(1 + new_val = (a + b).dimshuffle([2, 0, 3, 1])
b**a).dimshuffle([2,0,3,1]))], mode=mode_with_gpu) new_val *= tensor.exp(1 + b ** a).dimshuffle([2, 0, 3, 1])
f = pfunc([b], [], updates=[(a, new_val)], mode=mode_with_gpu)
has_elemwise = False has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stdout, i, node print >> sys.stdout, i, node
...@@ -339,75 +379,86 @@ def test_elemwise3(): ...@@ -339,75 +379,86 @@ def test_elemwise3():
#let debugmode catch errors #let debugmode catch errors
f(theano._asarray(numpy.random.rand(6), dtype='float32')) f(theano._asarray(numpy.random.rand(6), dtype='float32'))
def test_elemwise4(): def test_elemwise4():
""" Test that two vectors can be broadcast to form an outer product (by performing rank-1 matrix update""" """ Test that two vectors can be broadcast to form an outer
product (by performing rank-1 matrix update"""
shape = (3,4) shape = (3, 4)
a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape), dtype='float32'), 'a') a = tcn.shared_constructor(theano._asarray(numpy.random.rand(*shape),
dtype='float32'), 'a')
b = tensor.fvector() b = tensor.fvector()
c = tensor.fvector() c = tensor.fvector()
f = pfunc([b,c], [], updates=[(a, (a+b.dimshuffle('x', 0)*c.dimshuffle(0, 'x')))], mode=mode_with_gpu) f = pfunc([b, c], [],
updates=[(a, (a + b.dimshuffle('x', 0) * c.dimshuffle(0, 'x')))],
mode=mode_with_gpu)
has_elemwise = False has_elemwise = False
for i, node in enumerate(f.maker.env.toposort()): for i, node in enumerate(f.maker.env.toposort()):
print >> sys.stdout, i, node print >> sys.stdout, i, node
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
assert not has_elemwise assert not has_elemwise
#let debugmode catch errors #let debugmode catch errors
f(theano._asarray(numpy.random.rand(4), dtype='float32'), theano._asarray(numpy.random.rand(3), dtype='float32')) f(theano._asarray(numpy.random.rand(4), dtype='float32'),
theano._asarray(numpy.random.rand(3), dtype='float32'))
def test_elemwise_comparaison_cast(): def test_elemwise_comparaison_cast():
""" """
test if an elemwise comparaison followed by a cast to float32 are pushed to gpu. test if an elemwise comparaison followed by a cast to float32 are
pushed to gpu.
""" """
a = tensor.fmatrix() a = tensor.fmatrix()
b = tensor.fmatrix() b = tensor.fmatrix()
av = theano._asarray(numpy.random.rand(4,4), dtype='float32') av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
bv = numpy.ones((4,4), dtype='float32') bv = numpy.ones((4, 4), dtype='float32')
for g,ans in [(tensor.lt, av<bv), (tensor.gt, av>bv), for g, ans in [(tensor.lt, av < bv), (tensor.gt, av > bv),
(tensor.le, av<=bv), (tensor.ge, av>=bv)]: (tensor.le, av <= bv), (tensor.ge, av >= bv)]:
f = pfunc([a,b], tensor.cast(g(a,b),'float32'), mode=mode_with_gpu) f = pfunc([a, b], tensor.cast(g(a, b), 'float32'), mode=mode_with_gpu)
#theano.printing.debugprint(f) #theano.printing.debugprint(f)
out = f(av,bv) out = f(av, bv)
assert numpy.all(out == ans) assert numpy.all(out == ans)
assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.toposort()]) assert any([isinstance(node.op, cuda.GpuElemwise)
#assert any([isinstance(node.op, tensor.Elemwise) for node in f.maker.env.toposort()]) for node in f.maker.env.toposort()])
def test_elemwise_composite_float64(): def test_elemwise_composite_float64():
# test that we don't fuse composite elemwise with float64 somewhere inside # test that we don't fuse composite elemwise with float64 somewhere inside
# nvcc by default downcast them to float32. We would need to tell him not to # nvcc by default downcast them to float32. We would need to tell him not
# do so, but that possible only on some device. # to do so, but that possible only on some device.
a = tensor.fmatrix() a = tensor.fmatrix()
b = tensor.fmatrix() b = tensor.fmatrix()
av = theano._asarray(numpy.random.rand(4,4), dtype='float32') av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
bv = numpy.ones((4,4), dtype='float32') bv = numpy.ones((4, 4), dtype='float32')
def get_all_basic_scalar(composite_op): def get_all_basic_scalar(composite_op):
l=[] l = []
for i in composite_op.env.toposort(): for i in composite_op.env.toposort():
if isinstance(i, theano.scalar.Composite): if isinstance(i, theano.scalar.Composite):
l += get_all_basic_scalar(i) l += get_all_basic_scalar(i)
else: else:
l.append(i) l.append(i)
return l return l
for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'), mode_with_gpu.excluding('elemwise_fusion')]: for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
f = pfunc([a,b], tensor.cast(tensor.lt(tensor.cast(a,'float64')**2,#*numpy.asarray(2, 'float32'), mode_with_gpu.excluding('elemwise_fusion')]:
f = pfunc([a, b],
tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
b), b),
'float32'), mode=mode) 'float32'), mode=mode)
#theano.printing.debugprint(f, print_type=True) #theano.printing.debugprint(f, print_type=True)
out = f(av,bv) out = f(av, bv)
assert numpy.all(out == ((av**2)<bv)) assert numpy.all(out == ((av ** 2) < bv))
for node in f.maker.env.toposort(): for node in f.maker.env.toposort():
if isinstance(node.op, cuda.GpuElemwise): if isinstance(node.op, cuda.GpuElemwise):
if isinstance(node.op.scalar_op, theano.scalar.Composite): if isinstance(node.op.scalar_op, theano.scalar.Composite):
scals = get_all_basic_scalar(node.op.scalar_op) scals = get_all_basic_scalar(node.op.scalar_op)
for s in scals: for s in scals:
assert not any([i.type.dtype=='float64' for i in s.inputs+s.outputs]) assert not any([i.type.dtype == 'float64'
for i in s.inputs + s.outputs])
def test_elemwise_composite_support_code(): def test_elemwise_composite_support_code():
...@@ -443,205 +494,226 @@ def test_elemwise_composite_support_code(): ...@@ -443,205 +494,226 @@ def test_elemwise_composite_support_code():
def speed_elemwise_collapse(): def speed_elemwise_collapse():
""" used to time if the collapse of ccontiguous dims are useful """ """ used to time if the collapse of ccontiguous dims are useful """
shape = (30,40,50,600) shape = (30, 40, 50, 600)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2[:,::2,:,:] a3 = a2[:, ::2, :, :]
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = a3+b * tensor.exp(1 + b**a3) c = a3 + b * tensor.exp(1 + b ** a3)
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
v = theano._asarray(numpy.random.rand(*shape),dtype='float32') v = v[:, ::2, :, :]
v = v[:,::2,:,:] v = cuda_ndarray.CudaNdarray(v)
v=cuda_ndarray.CudaNdarray(v) for id, n in enumerate(f.maker.env.toposort()):
for id,n in enumerate(f.maker.env.toposort()):
print id, n print id, n
t1=time.time() t1 = time.time()
for i in range(100): for i in range(100):
#let debugmode catch errors #let debugmode catch errors
f(v) f(v)
t2=time.time() t2 = time.time()
def speed_elemwise_collapse2(): def speed_elemwise_collapse2():
""" used to test the speed up of the generalised collapse of ccontiguous dims""" """ used to test the speed up of the generalised collapse of
ccontiguous dims"""
shape = (30,40,50,600) shape = (30, 40, 50, 600)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2[:,:,:,::2] a3 = a2[:, :, :, ::2]
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = a3+b * tensor.exp(1 + b**a3) c = a3 + b * tensor.exp(1 + b ** a3)
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
v = theano._asarray(numpy.random.rand(*shape),dtype='float32') v = v[:, :, :, ::2]
v = v[:,:,:,::2] v = cuda_ndarray.CudaNdarray(v)
v=cuda_ndarray.CudaNdarray(v) for id, n in enumerate(f.maker.env.toposort()):
for id,n in enumerate(f.maker.env.toposort()):
print id, n print id, n
t1=time.time() t1 = time.time()
for i in range(100): for i in range(100):
#let debugmode catch errors #let debugmode catch errors
f(v) f(v)
t2=time.time() t2 = time.time()
def test_elemwise_collapse(): def test_elemwise_collapse():
""" Test when all inputs have one(and the same) broadcastable dimension """ """ Test when all inputs have one(and the same) broadcastable dimension """
shape = (4,5,60) shape = (4, 5, 60)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle(0,'x',1,2) a3 = a2.dimshuffle(0, 'x', 1, 2)
b = tcn.CudaNdarrayType((False, True, False, False))() b = tcn.CudaNdarrayType((False, True, False, False))()
c = a3+b c = a3 + b
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(shape[0], 1, *shape[1:]),
v = theano._asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32') dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v) assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
print "Expected collapse of all dimensions" print "Expected collapse of all dimensions"
def test_elemwise_collapse2(): def test_elemwise_collapse2():
""" Test when only one inputs have one broadcastable dimension """ """ Test when only one inputs have one broadcastable dimension """
shape = (4,5,9) shape = (4, 5, 9)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle(0,'x',1,2) a3 = a2.dimshuffle(0, 'x', 1, 2)
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = a3+b c = a3 + b
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
v = theano._asarray(numpy.random.rand(shape[0],5,*shape[1:]),dtype='float32') dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v) assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse3(): def test_elemwise_collapse3():
""" Test when only one inputs have two broadcastable dimension at each ends """ """ Test when only one inputs have two broadcastable dimension at each ends """
shape = (4,5) shape = (4, 5)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape),
dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x',0,1,'x') a3 = a2.dimshuffle('x', 0, 1, 'x')
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b) c = (a3 + b)
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
v = theano._asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32') dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v) assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse4(): def test_elemwise_collapse4():
""" Test when only one inputs have two broadcastable dimension at each ends and we add a scalar""" """ Test when only one inputs have two broadcastable dimension at
each ends and we add a scalar"""
shape = (4,5) shape = (4, 5)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x',0,1,'x') a3 = a2.dimshuffle('x', 0, 1, 'x')
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b+2) c = (a3 + b + 2)
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
v = theano._asarray(numpy.random.rand(5,shape[0],shape[1],4),dtype='float32') dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(1,shape[0],shape[1],1)+v+2) assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
print "Expected collapse to 3 dimensions" print "Expected collapse to 3 dimensions"
def test_elemwise_collapse5(): def test_elemwise_collapse5():
""" Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar""" """ Test when only one inputs have two broadcastable dimension at
the beginning and we add a scalar"""
shape = (4,5) shape = (4, 5)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1) a3 = a2.dimshuffle('x', 'x', 0, 1)
b = tcn.CudaNdarrayType((False, False, False, False))() b = tcn.CudaNdarrayType((False, False, False, False))()
c = (a3+b+2) c = (a3 + b + 2)
f = pfunc([b], [c], mode=mode_with_gpu) f = pfunc([b], [c], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(5, 4, shape[0], shape[1]),
v = theano._asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32') dtype='float32')
v=cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2) assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v + 2)
print "Expected collapse to 2 dimensions" print "Expected collapse to 2 dimensions"
def test_elemwise_collapse6(): def test_elemwise_collapse6():
""" Test when all inputs have two broadcastable dimension at the beginning""" """ Test when all inputs have two broadcastable dimension at the
beginning"""
shape = (4,5) shape = (4, 5)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a, 'a') a2 = tcn.shared_constructor(a, 'a')
a3 = a2.dimshuffle('x','x',0,1) a3 = a2.dimshuffle('x', 'x', 0, 1)
b = tcn.CudaNdarrayType((True, True, False, False))() b = tcn.CudaNdarrayType((True, True, False, False))()
f = pfunc([b], [a3+b], mode=mode_with_gpu) f = pfunc([b], [a3 + b], mode=mode_with_gpu)
v = theano._asarray(numpy.random.rand(1,1,shape[0],shape[1]),dtype='float32') v = theano._asarray(numpy.random.rand(1, 1, shape[0], shape[1]),
v=cuda_ndarray.CudaNdarray(v) dtype='float32')
v = cuda_ndarray.CudaNdarray(v)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f(v)[0] out = f(v)[0]
assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v) assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v)
print "Expected collapse to c contiguous" print "Expected collapse to c contiguous"
def test_elemwise_collapse7(atol=1e-6): def test_elemwise_collapse7(atol=1e-6):
""" Test when one input have one broadcastable dimension and the other is a scalar""" """ Test when one input have one broadcastable dimension and the
other is a scalar"""
shape = (5,4,1) shape = (5, 4, 1)
a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32')) a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
a = theano._asarray(numpy.random.rand(*shape),dtype='float32') dtype='float32'))
a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a2 = tcn.shared_constructor(a.copy(), 'a') a2 = tcn.shared_constructor(a.copy(), 'a')
a3 = a2.dimshuffle(0, 'x', 1, 2) a3 = a2.dimshuffle(0, 'x', 1, 2)
f = pfunc([], [a3+2], mode=mode_with_gpu) f = pfunc([], [a3 + 2], mode=mode_with_gpu)
if False: if False:
for id,n in enumerate(f.maker.env.toposort()): for id, n in enumerate(f.maker.env.toposort()):
print id, n print id, n
#let debugmode catch errors #let debugmode catch errors
out=f()[0] out = f()[0]
ans=(a+2).reshape(shape[0],1,shape[1],shape[2]) ans = (a + 2).reshape(shape[0], 1, shape[1], shape[2])
assert numpy.allclose(out,ans, atol=atol) assert numpy.allclose(out, ans, atol=atol)
print "Expected collapse to c contiguous" print "Expected collapse to c contiguous"
...@@ -651,40 +723,45 @@ def test_hostfromgpu_shape_i(): ...@@ -651,40 +723,45 @@ def test_hostfromgpu_shape_i():
""" """
pass pass
m = mode_with_gpu.including('local_dot_to_dot22','local_dot22_to_dot22scalar','specialize') m = mode_with_gpu.including('local_dot_to_dot22',
a=T.fmatrix('a') 'local_dot22_to_dot22scalar','specialize')
ca=theano.sandbox.cuda.var.CudaNdarrayType((False,False))() a = T.fmatrix('a')
ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()
av=numpy.asarray(numpy.random.rand(5,4),dtype='float32') av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
cv=cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5,4),dtype='float32')) cv = cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5, 4),
dtype='float32'))
f = theano.function([a],cuda.basic_ops.gpu_from_host(a), mode=m) f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
assert cuda.basic_ops.gpu_from_host in [x.op for x in f.maker.env.toposort()] assert cuda.basic_ops.gpu_from_host in [x.op
f = theano.function([a],cuda.basic_ops.gpu_from_host(a).shape, mode=m) for x in f.maker.env.toposort()]
f = theano.function([a], cuda.basic_ops.gpu_from_host(a).shape, mode=m)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
assert isinstance(topo[0].op,T.opt.Shape_i) assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op,T.opt.Shape_i) assert isinstance(topo[1].op, T.opt.Shape_i)
assert isinstance(topo[2].op,T.opt.MakeVector) assert isinstance(topo[2].op, T.opt.MakeVector)
assert tuple(f(av))==(5,4) assert tuple(f(av)) == (5, 4)
f = theano.function([ca],cuda.basic_ops.host_from_gpu(ca), mode=m) f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca), mode=m)
assert cuda.basic_ops.host_from_gpu in [x.op for x in f.maker.env.toposort()] assert cuda.basic_ops.host_from_gpu in [x.op
f = theano.function([ca],cuda.basic_ops.host_from_gpu(ca).shape, mode=m) for x in f.maker.env.toposort()]
f = theano.function([ca], cuda.basic_ops.host_from_gpu(ca).shape, mode=m)
topo = f.maker.env.toposort() topo = f.maker.env.toposort()
assert isinstance(topo[0].op,T.opt.Shape_i) assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op,T.opt.Shape_i) assert isinstance(topo[1].op, T.opt.Shape_i)
assert isinstance(topo[2].op,T.opt.MakeVector) assert isinstance(topo[2].op, T.opt.MakeVector)
assert tuple(f(cv))==(5,4) assert tuple(f(cv)) == (5, 4)
# ----------------------------------------------------------------------- # -----------------------------------------------------------------------
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
def test_gpujoin_assert_cndas(): def test_gpujoin_assert_cndas():
# this will end up being an ndarray, as it's float64 # this will end up being an ndarray, as it's float64
_a = numpy.asarray([[1,2],[3,4]],dtype='float64') _a = numpy.asarray([[1, 2], [3, 4]], dtype='float64')
a = theano.shared(_a) a = theano.shared(_a)
try: try:
...@@ -697,64 +774,80 @@ def test_gpujoin_assert_cndas(): ...@@ -697,64 +774,80 @@ def test_gpujoin_assert_cndas():
assert False assert False
def test_gpujoin_no_rebroadcast(): def test_gpujoin_no_rebroadcast():
_a = numpy.asarray([[1,2],[3,4]],dtype='float32') _a = numpy.asarray([[1, 2], [3, 4]], dtype='float32')
a = tcn.shared_constructor(_a) a = tcn.shared_constructor(_a)
f = theano.function([],T.join(1,a)) f = theano.function([], T.join(1, a))
l = f.maker.env.toposort() l = f.maker.env.toposort()
assert not any([isinstance(x.op,T.Rebroadcast) for x in l]) assert not any([isinstance(x.op, T.Rebroadcast) for x in l])
def test_gpualloc_input_on_gpu(): def test_gpualloc_input_on_gpu():
a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32') a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
a = tcn.shared_constructor(a_val) a = tcn.shared_constructor(a_val)
b = T.fscalar() b = T.fscalar()
f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu) f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
f_gpu = theano.function([b], T.ones_like(a)+b, mode=mode_with_gpu) f_gpu = theano.function([b], T.ones_like(a) + b, mode=mode_with_gpu)
assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
for node in f_gpu.maker.env.toposort()]) == 1
assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1 assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1 f_gpu(9))
assert numpy.allclose(f(5), f_gpu(5))
assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
assert numpy.allclose(f(5),f_gpu(5))
def test_gpujoin_gpualloc(): def test_gpujoin_gpualloc():
a = T.fmatrix('a') a = T.fmatrix('a')
a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32') a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
b = T.fmatrix('b') b = T.fmatrix('b')
b_val = numpy.asarray(numpy.random.rand(3,5),dtype='float32') b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')
f = theano.function([a, b], T.join(0, T.zeros_like(a),T.ones_like(b)) + 4,
mode=mode_without_gpu)
f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
mode=mode_with_gpu)
f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a),
T.ones_like(b)) + 4,
mode=mode_with_gpu)
assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
for node in f_gpu.maker.env.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu.maker.env.toposort()]) == 1
assert sum([node.op == B.gpu_alloc
for node in f_gpu2.maker.env.toposort()]) == 2
assert sum([node.op == B.gpu_join
for node in f_gpu2.maker.env.toposort()]) == 1
assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
f = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b))+4, mode=mode_without_gpu)
f_gpu = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b)), mode=mode_with_gpu)
f_gpu2 = theano.function([a,b], T.join(0,T.zeros_like(a),T.ones_like(b))+4, mode=mode_with_gpu)
assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==2
assert sum([node.op == T.join for node in f.maker.env.toposort()])==1
assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==2
assert sum([node.op == B.gpu_join for node in f_gpu.maker.env.toposort()])==1
assert sum([node.op == B.gpu_alloc for node in f_gpu2.maker.env.toposort()])==2
assert sum([node.op == B.gpu_join for node in f_gpu2.maker.env.toposort()])==1
assert numpy.allclose(f(a_val,b_val),f_gpu2(a_val,b_val))
def test_gpualloc_output_to_gpu(): def test_gpualloc_output_to_gpu():
a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32') a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
a = tcn.shared_constructor(a_val) a = tcn.shared_constructor(a_val)
b = T.fscalar() b = T.fscalar()
f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu) f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a))+b, mode=mode_with_gpu) f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a)) + b,
mode=mode_with_gpu)
print f.maker.env.toposort() print f.maker.env.toposort()
print f_gpu.maker.env.toposort() print f_gpu.maker.env.toposort()
print f(2) print f(2)
print f_gpu(2) print f_gpu(2)
assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1 assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 1
assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1 assert sum([node.op == B.gpu_alloc
for node in f_gpu.maker.env.toposort()]) == 1
assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9)) assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape) + 9,
assert numpy.allclose(f(5),f_gpu(5)) f_gpu(9))
assert numpy.allclose(f(5), f_gpu(5))
import theano.tensor.tests.test_basic import theano.tensor.tests.test_basic
...@@ -766,6 +859,7 @@ class TestAlloc(theano.tensor.tests.test_basic.TestAlloc): ...@@ -766,6 +859,7 @@ class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
allocs = [B.GpuAlloc, B.GpuAlloc, tensor.Alloc] allocs = [B.GpuAlloc, B.GpuAlloc, tensor.Alloc]
class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
def setUp(self): def setUp(self):
utt.seed_rng() utt.seed_rng()
...@@ -783,128 +877,152 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): ...@@ -783,128 +877,152 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
# This is to don't duplicate test. # This is to don't duplicate test.
class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
shared=staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
sub=cuda.GpuSubtensor sub = cuda.GpuSubtensor
inc_sub=cuda.GpuIncSubtensor inc_sub = cuda.GpuIncSubtensor
adv_sub1=cuda.GpuAdvancedSubtensor1 adv_sub1 = cuda.GpuAdvancedSubtensor1
adv_incsub1=cuda.GpuAdvancedIncSubtensor1 adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
mode=mode_with_gpu mode = mode_with_gpu
dtype='float32' dtype = 'float32'
ignore_topo=(B.HostFromGpu, B.GpuFromHost) ignore_topo = (B.HostFromGpu, B.GpuFromHost)
fast_compile = theano.config.mode == 'FAST_COMPILE' fast_compile = theano.config.mode == 'FAST_COMPILE'
def __init__(self, name): def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name) return super(theano.tensor.tests.test_basic.T_subtensor,
self).__init__(name)
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor shared = cuda.shared_constructor
#shared = tensor.shared #shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]], xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype='float32') dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]], yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
dtype='float32') dtype='float32')
x = shared(xval, name = 'x') x = shared(xval, name='x')
y = T.fmatrices('y') y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2]) expr = T.advanced_inc_subtensor1(x, y, [0, 2])
f=theano.function([y], expr, mode=mode_with_gpu) f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1 assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]]) for node in f.maker.env.toposort()]) == 1
assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.],
[17., 18., 19.]])
def test_inc_subtensor(): def test_inc_subtensor():
shared = cuda.shared_constructor shared = cuda.shared_constructor
#shared = tensor.shared #shared = tensor.shared
x,y = T.fmatrices('x','y') x, y = T.fmatrices('x', 'y')
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]], xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype='float32') dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10], [10,10,10]], yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
dtype='float32') dtype='float32')
expr = T.inc_subtensor(x[:,1:3], y[:,1:3]) expr = T.inc_subtensor(x[:, 1:3], y[:, 1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu) f = theano.function([x, y], expr, mode=mode_with_gpu)
print f.maker.env.toposort() print f.maker.env.toposort()
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1 assert sum([isinstance(node.op, cuda.GpuSubtensor)
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==False for node in f.maker.env.toposort() ])==1 for node in f.maker.env.toposort()]) == 1
assert numpy.allclose(f(xval,yval),[[1.,12.,13.], [4.,15.,16.], [7.,18.,19.]]) assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
node.op.set_instead_of_inc==False
for node in f.maker.env.toposort()]) == 1
assert numpy.allclose(f(xval, yval), [[1., 12., 13.],
[4., 15., 16.], [7., 18., 19.]])
def test_set_subtensor(): def test_set_subtensor():
shared = cuda.shared_constructor shared = cuda.shared_constructor
#shared = tensor.shared #shared = tensor.shared
x,y = T.fmatrices('x','y') x, y = T.fmatrices('x', 'y')
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]], xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
dtype='float32') dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10], [10,10,10]], yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]],
dtype='float32') dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3]) expr = T.set_subtensor(x[:, 1:3], y[:, 1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu) f = theano.function([x, y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1 assert sum([isinstance(node.op, cuda.GpuSubtensor)
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1 for node in f.maker.env.toposort()]) == 1
print f(xval,yval) assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
node.op.set_instead_of_inc == True
for node in f.maker.env.toposort()]) == 1
print f(xval, yval)
def test_many_arg_elemwise(): def test_many_arg_elemwise():
"""this test checks whether the + and * elemwise ops can handle extremely large numbers of """this test checks whether the + and * elemwise ops can handle extremely large numbers of
arguments on gpu arguments on gpu
i.e., it is a test of the optimization theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul """ i.e., it is a test of the optimization theano/sandbox/cuda/opt.py:local_gpu_huge_add_or_mul """
rng = numpy.random.RandomState( [1,2,3]) rng = numpy.random.RandomState([1, 2, 3])
for num_args in [25]: for num_args in [25]:
for op_to_test in [ theano.tensor.add, theano.tensor.mul ]: for op_to_test in [theano.tensor.add, theano.tensor.mul]:
for nb_dim in [2,3,4,5]: for nb_dim in [2, 3, 4, 5]:
shapes = [rng.randint(1,5) for i in range(nb_dim)] shapes = [rng.randint(1, 5) for i in range(nb_dim)]
args = [ numpy.cast['float32'](rng.randn(*shapes)) for arg in xrange(0,num_args) ] args = [numpy.cast['float32'](rng.randn(*shapes))
for arg in xrange(0, num_args)]
symb_args = [ theano.tensor.TensorType('float32', (False,)*nb_dim)() for arg in xrange(0,num_args) ] symb_args = [theano.tensor.TensorType('float32',
(False,)*nb_dim)()
for arg in xrange(0, num_args)]
outputs = [] outputs = []
for mode in [ mode_with_gpu, mode_without_gpu ]: for mode in [mode_with_gpu, mode_without_gpu]:
#test the optijmization local_gpu_elemwise_0 #test the optijmization local_gpu_elemwise_0
f = theano.function( symb_args, op_to_test(*symb_args), mode = mode.excluding("local_gpu_elemwise_1") ) f = theano.function(
outputs.append( f( * args) ) symb_args, op_to_test(*symb_args),
mode=mode.excluding("local_gpu_elemwise_1"))
outputs.append(f(*args))
#assert that the test was done on the gpu. #assert that the test was done on the gpu.
if mode is mode_with_gpu: if mode is mode_with_gpu:
assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes]) assert any([isinstance(node.op, cuda.GpuElemwise)
for node in f.maker.env.nodes])
#test the optijmization local_gpu_elemwise_1 #test the optijmization local_gpu_elemwise_1
f = theano.function( symb_args, f = theano.function(
symb_args,
cuda.gpu_from_host(op_to_test(*symb_args)), cuda.gpu_from_host(op_to_test(*symb_args)),
mode = mode.excluding("local_gpu_elemwise_0") ) mode=mode.excluding("local_gpu_elemwise_0"))
out = f( * args) out = f(*args)
#assert that the test was done on the gpu. #assert that the test was done on the gpu.
if mode is mode_with_gpu: if mode is mode_with_gpu:
assert any([isinstance(node.op, cuda.GpuElemwise) for node in f.maker.env.nodes]) assert any([isinstance(node.op, cuda.GpuElemwise)
for node in f.maker.env.nodes])
assert numpy.allclose(out, outputs[-1]) assert numpy.allclose(out, outputs[-1])
results_gpu, results_cpu = outputs results_gpu, results_cpu = outputs
assert numpy.allclose(results_gpu, results_cpu) assert numpy.allclose(results_gpu, results_cpu)
def test_duplicate_arg_elemwise(): def test_duplicate_arg_elemwise():
A = theano.tensor.fmatrix() A = theano.tensor.fmatrix()
B = A + A B = A + A
f = theano.function([A],B, mode = mode_with_gpu) f = theano.function([A], B, mode=mode_with_gpu)
Aval = numpy.random.RandomState([1,2,3]).randn(5,5).astype('float32') Aval = numpy.random.RandomState([1, 2, 3]).randn(5, 5).astype('float32')
Bval = Aval + Aval Bval = Aval + Aval
assert numpy.allclose(Bval,f(Aval)) assert numpy.allclose(Bval, f(Aval))
def test_shared_float32(): def test_shared_float32():
'''Test use of cuda.shared_constructor through theano.shared''' '''Test use of cuda.shared_constructor through theano.shared'''
# Register cuda.shared_constructor in theano.shared # Register cuda.shared_constructor in theano.shared
theano.shared.constructors.append(cuda.shared_constructor) theano.shared.constructors.append(cuda.shared_constructor)
a = theano.shared(numpy.ones((2,3), dtype='float32')) a = theano.shared(numpy.ones((2, 3), dtype='float32'))
assert isinstance(a.type, tcn.CudaNdarrayType) assert isinstance(a.type, tcn.CudaNdarrayType)
# Unregister # Unregister
del theano.shared.constructors[-1] del theano.shared.constructors[-1]
def test_shared_cudandarray(): def test_shared_cudandarray():
'''Test that we can create a CudaNdarraySharedVariable from a CudaNdarray''' '''Test that we can create a CudaNdarraySharedVariable from a
a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2,3))) CudaNdarray'''
a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2, 3)))
assert isinstance(a.type, tcn.CudaNdarrayType) assert isinstance(a.type, tcn.CudaNdarrayType)
...@@ -987,38 +1105,38 @@ class test_size(unittest.TestCase): ...@@ -987,38 +1105,38 @@ class test_size(unittest.TestCase):
import theano.tensor.tests.test_sharedvar import theano.tensor.tests.test_sharedvar
#This test the case when the shared constructor view an CudaNdarray as input #This test the case when the shared constructor view an CudaNdarray as input
test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester( test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
shared_constructor_ = tcn.shared_constructor, shared_constructor_=tcn.shared_constructor,
dtype_ = 'float32', dtype_='float32',
get_value_borrow_true_alias_ = True, get_value_borrow_true_alias_=True,
shared_borrow_true_alias_ = True,#True when the original value is already a CudaNdarray! shared_borrow_true_alias_=True,#True when the original value is already a CudaNdarray!
set_value_borrow_true_alias_ = True, set_value_borrow_true_alias_=True,
set_value_inplace_ = True, set_value_inplace_=True,
set_cast_value_inplace_ = False, set_cast_value_inplace_=False,
shared_constructor_accept_ndarray_ = True, shared_constructor_accept_ndarray_=True,
internal_type_ = cuda_ndarray.CudaNdarray, internal_type_=cuda_ndarray.CudaNdarray,
test_internal_type_ = lambda a: isinstance(a,cuda_ndarray.CudaNdarray), test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
theano_fct_ = theano.tensor.exp, theano_fct_=theano.tensor.exp,
ref_fct_ = numpy.exp, ref_fct_=numpy.exp,
cast_value_ = cuda.as_cuda_array, cast_value_=cuda.as_cuda_array,
op_by_matrix_ = True, op_by_matrix_=True,
name='test_shared_options') name='test_shared_options')
#This test the case when the shared constructor view an ndarray as input #This test the case when the shared constructor view an ndarray as input
test_shared_options2 = theano.tensor.tests.test_sharedvar.makeSharedTester( test_shared_options2 = theano.tensor.tests.test_sharedvar.makeSharedTester(
shared_constructor_ = tcn.shared_constructor, shared_constructor_=tcn.shared_constructor,
dtype_ = 'float32', dtype_='float32',
get_value_borrow_true_alias_ = False, get_value_borrow_true_alias_=False,
shared_borrow_true_alias_ = False, shared_borrow_true_alias_=False,
set_value_borrow_true_alias_ = False, set_value_borrow_true_alias_=False,
set_value_inplace_ = True, set_value_inplace_=True,
set_cast_value_inplace_ = True, set_cast_value_inplace_=True,
shared_constructor_accept_ndarray_ = True, shared_constructor_accept_ndarray_=True,
internal_type_ = cuda_ndarray.CudaNdarray, internal_type_=cuda_ndarray.CudaNdarray,
test_internal_type_ = lambda a: isinstance(a,cuda_ndarray.CudaNdarray), test_internal_type_=lambda a: isinstance(a, cuda_ndarray.CudaNdarray),
theano_fct_ = theano.tensor.exp, theano_fct_=theano.tensor.exp,
ref_fct_ = numpy.exp, ref_fct_=numpy.exp,
cast_value_ = numpy.asarray, cast_value_=numpy.asarray,
op_by_matrix_ = True, op_by_matrix_=True,
name='test_shared_options') name='test_shared_options')
if __name__ == '__main__': if __name__ == '__main__':
......
import numpy
import theano
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B
# Pick a mode that actually applies the gpu optimizations.  Under
# FAST_COMPILE the gpu transfers are not inserted, so fall back to
# FAST_RUN in that case; otherwise just extend the default mode.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def test_nvidia_driver1():
    """Detect nvidia drivers that compute GPU reductions incorrectly.

    Some nvidia driver versions (typically the one shipped with the OS)
    return wrong results for GPU reductions.  Run a small sum on the GPU
    and compare against the numpy result so the broken driver is caught
    early, with an actionable error message.

    :raises Exception: if the GPU sum disagrees with the CPU sum.
    """
    a = numpy.random.rand(10000).astype("float32")
    A = cuda.shared_constructor(a)
    f = theano.function(inputs=[], outputs=A.sum(), mode=mode_with_gpu)
    topo = f.maker.env.toposort()
    # The graph must have been moved to the GPU: one GpuSum plus the
    # host transfer back.
    assert len(topo) == 2
    assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
    if not numpy.allclose(f(), a.sum()):
        # Bug fix: the original message concatenated to
        # "...for reduction.Installing..." (missing space) and had a
        # subject-verb disagreement ("version ... don't give").
        raise Exception("The nvidia driver version installed with the OS "
                        "does not give correct results for reduction. "
                        "Installing the nvidia driver available on the same "
                        "download page as the cuda package will fix the "
                        "problem: http://developer.nvidia.com/cuda-downloads")
def test_nvidia_driver2():
    """Creating a shared variable on the gpu must initialize the device.

    The driver check is expected to run whenever theano initializes the
    gpu device, so after manually constructing a gpu shared variable the
    device number must be set.
    """
    data = numpy.random.rand(10000).astype("float32")
    cuda.shared_constructor(data)
    assert theano.sandbox.cuda.use.device_number is not None
def test_nvidia_driver3():
    """Building a function with a gpu op must initialize the device.

    The driver check is expected to run whenever theano initializes the
    gpu device, so after compiling a function whose graph contains a gpu
    elemwise op the device number must be set.
    """
    var = cuda.fvector()
    f = theano.function([var], var + 1, mode=mode_with_gpu)
    nodes = f.maker.env.toposort()
    assert any(isinstance(node.op, cuda.GpuElemwise) for node in nodes)
    assert theano.sandbox.cuda.use.device_number is not None
# TODO: make sure the test_nvidia_driver tests are executed when we manually
# create a CudaNdarray like this: cuda.CudaNdarray.zeros((5,4))
...@@ -169,6 +169,12 @@ def cuda_shared_constructor(value, name=None, strict=False, ...@@ -169,6 +169,12 @@ def cuda_shared_constructor(value, name=None, strict=False,
def float32_shared_constructor(value, name=None, strict=False, def float32_shared_constructor(value, name=None, strict=False,
allow_downcast=None, borrow=False, broadcastable=None): allow_downcast=None, borrow=False, broadcastable=None):
"""SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray""" """SharedVariable Constructor for CudaNdarrayType from numpy.ndarray or CudaNdarray"""
if theano.sandbox.cuda.use.device_number is None:
theano.sandbox.cuda.use("gpu",
force=True,
default_to_move_computation_to_gpu=False,
move_shared_float32_to_gpu=False,
enable_cuda=False)
# if value isn't a float32 ndarray, or a CudaNdarray then raise # if value isn't a float32 ndarray, or a CudaNdarray then raise
......
...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer ...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available from theano.sandbox.cuda import cuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt from theano.sandbox.cuda.opt import register_opt
...@@ -120,7 +120,7 @@ class MultinomialFromUniform(Op): ...@@ -120,7 +120,7 @@ class MultinomialFromUniform(Op):
""" % locals() """ % locals()
class GpuMultinomialFromUniform(MultinomialFromUniform): class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
""" """
The output is transposed compared to MultinomialFromUniform. The output is transposed compared to MultinomialFromUniform.
We must insert a Transpose op after it. We must insert a Transpose op after it.
......
...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer ...@@ -5,7 +5,7 @@ from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available from theano.sandbox.cuda import cuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
...@@ -292,7 +292,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'): ...@@ -292,7 +292,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode='valid'):
# This is work in progress # This is work in progress
class GpuImages2Neibs(Images2Neibs): class GpuImages2Neibs(Images2Neibs, GpuOp):
def __init__(self, mode='valid'): def __init__(self, mode='valid'):
if mode not in ['valid', 'wrap_centered']: if mode not in ['valid', 'wrap_centered']:
raise NotImplementedError("Only the mode valid and wrap_centered" raise NotImplementedError("Only the mode valid and wrap_centered"
......
...@@ -20,7 +20,10 @@ import multinomial ...@@ -20,7 +20,10 @@ import multinomial
from theano.sandbox.cuda import cuda_available, cuda_enabled from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor,
GpuOp)
def mulmod(a, b, c, m): def mulmod(a, b, c, m):
r = numpy.int32((numpy.int64(a)*b + c) % m) r = numpy.int32((numpy.int64(a)*b + c) % m)
...@@ -372,7 +375,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -372,7 +375,7 @@ class mrg_uniform(mrg_uniform_base):
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (1,)
class GPU_mrg_uniform(mrg_uniform_base): class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
#GPU VERSION #GPU VERSION
@classmethod @classmethod
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论