提交 9cde027a authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Remove tentacles in misc.

上级 80a1e8e0
from .config import test_ctx_name
from ..type import get_context, GpuArrayType, GpuArraySharedVariable
import pygpu
import numpy as np
from theano.misc.tests.test_may_share_memory import may_share_memory_core
from theano.misc.pkl_utils import dump, load
def test_may_share_memory():
    # Two distinct, freshly-allocated GpuArrays on the test context must be
    # accepted by the shared memory-overlap checker without error.
    ctx = get_context(test_ctx_name)
    first = pygpu.empty((5, 4), context=ctx)
    second = pygpu.empty((5, 4), context=ctx)
    may_share_memory_core(first, second)
def test_dump_load():
    # Round-trip a GpuArraySharedVariable through dump()/load() and check
    # that both the variable name and its value survive the trip.
    gpu_type = GpuArrayType('float32', (1, 1), name='x',
                            context_name=test_ctx_name)
    x = GpuArraySharedVariable('x', gpu_type, [[1]], False)

    with open('test', 'wb') as f:
        dump(x, f)
    with open('test', 'rb') as f:
        x = load(f)

    assert x.name == 'x'
    np.testing.assert_allclose(x.get_value(), [[1]])
......@@ -168,8 +168,8 @@ class IfElse(Op):
)
c = theano.tensor.as_tensor_variable(c)
if not self.gpu:
# When gpu is true, we are given only cuda ndarrays, and we want
# to keep them be cuda ndarrays
# When gpu is true, we are given only gpuarrays, and we want
# to keep them as gpuarrays
nw_args = []
for x in args:
if hasattr(x, '_as_TensorVariable'):
......
......@@ -11,7 +11,6 @@ import os
import sys
import time
from optparse import OptionParser
import subprocess
import numpy as np
import theano
......@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print('Numpy dot module:', np.dot.__module__)
print('Numpy location:', np.__file__)
print('Numpy version:', np.__version__)
if (theano.config.device.startswith("gpu") or
theano.config.init_gpu_device.startswith("gpu")):
print('nvcc version:')
subprocess.call((theano.sandbox.cuda.nvcc_compiler.nvcc_path,
"--version"))
print()
a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
order=order))
......@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
f() # Ignore first function call to get representative time.
if execute:
sync = (hasattr(theano, "sandbox") and
hasattr(theano.sandbox, "cuda") and
isinstance(c, theano.sandbox.cuda.CudaNdarraySharedVariable))
sync2 = (hasattr(theano, "gpuarray") and
isinstance(c, theano.gpuarray.GpuArraySharedVariable))
sync = (hasattr(theano, "gpuarray") and
isinstance(c, theano.gpuarray.GpuArraySharedVariable))
if sync:
# Make sure we don't include the time from the first call
c.get_value(borrow=True, return_internal_type=True).sync()
t0 = time.time()
for i in range(iters):
f()
if sync:
theano.sandbox.cuda.synchronize()
if sync2:
c.get_value(borrow=True, return_internal_type=True).sync()
t1 = time.time()
return t1 - t0, impl
......@@ -199,87 +190,32 @@ if __name__ == "__main__":
goto2 1.13/8 1.94s
goto2 1.13/16 3.16s
Test time in float32
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s 0.06s
K40 0.07s
K20m/ECC 0.08s 0.08s 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 980 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GRID K520 0.14s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GT 610 2.38s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
GT 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
Results for larger matrices.
There were 10 executions of gemm in float32
with matrices of shape 5000x5000 (M=N=K=5000).
Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
All memory layout was in C order.
cuda version 7.5 7.0 6.5
cuda version 8.0 7.5 7.0
gpu
M40 0.47s
k80 0.96s
K6000/NOECC 0.69s
K40 0.88s
K20m/ECC
K20/NOECC
M2090
C2075
M2075
M2070
M2070-Q
M2050(Amazon)
C1060
K600
GTX Titan X 0.45s 0.47s
GTX Titan Black 0.64s 0.64s
GTX Titan(D15U-50)
GTX 780
GTX 980 Ti 0.41s
GTX 980
GTX 970 0.66s
GTX 680 1.57s
GRID K520
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580 2.42s 2.47s
GTX 480 2.87s 2.88s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
M40 0.45s 0.47s
k80 0.92s 0.96s
K6000/NOECC 0.71s 0.69s
P6000/NOECC 0.25s
Titan X (Pascal) 0.28s
GTX Titan X 0.45s 0.45s 0.47s
GTX Titan Black 0.66s 0.64s 0.64s
GTX 1080 0.35s
GTX 980 Ti 0.41s
GTX 970 0.66s
GTX 680 1.57s
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580 2.42s
GTX 480 2.87s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
""")
if options.M == 0:
......
"""
This code can only work if cudamat and theano are initialized on the
same gpu as theano.
WARNING: In the test of this file there is a transpose that is used...
So there can be problem with shape and stride order...
"""
from __future__ import absolute_import, print_function, division
import six
try:
import cudamat
cudamat_available = True
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_cudamat(x, copyif=False):
    """Take a CudaNdarray and return a cudamat.CUDAMatrix object.

    :type x: CudaNdarray
    :param x: The array to transform to cudamat.CUDAMatrix.
    :type copyif: bool
    :param copyif: If False, raise an error if x is not c contiguous.
        If it is c contiguous, the returned CUDAMatrix shares the same
        memory region as x.  If True, copy x when it is not c contiguous,
        so the result won't share memory with x; when c contiguous, the
        result shares x's memory region.  This is needed because cudamat
        does not fully support strided memory.
    :return type: cudamat.CUDAMatrix
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
    if x.ndim != 2:
        raise TypeError("cudandarray_to_cudamat: input must be 2-d (has %s dims). That's "
                        "because cudamat arrays are always 2-dimensional")

    # Check C contiguity: walking dims from the last, each non-broadcast
    # (extent > 1) dim must carry the stride implied by the dims to its right.
    expected_stride = 1
    is_c_contig = True
    for dim in reversed(range(x.ndim)):
        if x.shape[dim] == 1:
            continue
        if x._strides[dim] != expected_stride:
            is_c_contig = False
            break
        expected_stride *= x.shape[dim]

    if not is_c_contig:
        if not copyif:
            raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
        x = x.copy()

    # x is now guaranteed c contiguous.  Build a raw cudamat struct that
    # aliases x's device memory, with no host buffer attached.
    import ctypes

    cm_mat = cudamat.cudamat()
    cm_mat.size[0] = x.shape[0]
    cm_mat.size[1] = x.shape[1]
    cm_mat.on_host = 0
    cm_mat.on_device = 1
    cm_mat.is_trans = 0
    # cm_mat doesn't own the data; x does, so x will delete it.
    cm_mat.owns_data = 0
    # x.gpudata is a long; cast it to a float pointer for cudamat.
    cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))

    px = cudamat.CUDAMatrix(cm_mat)
    px._base = x  # x won't be __del__'ed as long as px is around.
    # Let cudamat know that no numpy array is attached on the host side.
    px.mat_on_host = False
    return px
def cudamat_to_cudandarray(x):
    """Take a cudamat.CUDAMatrix and make a CudaNdarray that points to its memory."""
    if not isinstance(x, cudamat.CUDAMatrix):
        raise ValueError("We can transfer only cudamat.CUDAMatrix to CudaNdarray")
    # No dtype check is needed: cudamat matrices are always float32.
    import ctypes

    # C-contiguous strides (counted in elements) for x.shape.
    strides = [1]
    for extent in x.shape[:0:-1]:
        strides.append(strides[-1] * extent)
    strides = tuple(reversed(strides))

    # Extract the raw device pointer as an integer for from_gpu_pointer.
    ptr_long = long(ctypes.cast(x.mat.data_device, ctypes.c_void_p).value)
    return cuda.from_gpu_pointer(ptr_long, x.shape, strides, x)
except (ImportError, OSError):
cudamat_available = False
"""
This code can only work if gnumpy and theano are initialized on the
same gpu as theano.
"""
from __future__ import absolute_import, print_function, division
import six
from six.moves import reduce
try:
import gnumpy
import cudamat
gnumpy_available = True
___const_garray = gnumpy.rand(1)
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_garray(x, copyif=False):
    """Take a CudaNdarray and return a gnumpy.garray object.

    :type x: CudaNdarray
    :param x: The array to transform to gnumpy.garray.
    :type copyif: bool
    :param copyif: If False, raise an error if x is not c contiguous.
        If it is c contiguous, the result shares x's memory region.
        If True, copy x when it is not c contiguous so the result does
        not alias x; when c contiguous, the result aliases x.  This is
        needed because cudamat does not fully support strided memory.
    :return type: gnumpy.garray
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")

    # Check C contiguity (broadcastable dims of extent 1 are skipped).
    expected_stride = 1
    is_c_contig = True
    for dim in reversed(range(x.ndim)):
        if x.shape[dim] == 1:
            continue
        if x._strides[dim] != expected_stride:
            is_c_contig = False
            break
        expected_stride *= x.shape[dim]

    if not is_c_contig:
        if not copyif:
            raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
        x = x.copy()

    # x is now c contiguous.  Build a flat (N, 1) cudamat struct aliasing
    # x's device memory, with no host buffer attached.
    import ctypes

    cm_mat = cudamat.cudamat()
    cm_mat.size[0] = reduce(lambda a, b: a * b, x.shape, 1)
    cm_mat.size[1] = 1
    cm_mat.on_host = 0
    cm_mat.on_device = 1
    cm_mat.is_trans = 0
    # cm_mat doesn't own the data; x does, so x will delete it.
    cm_mat.owns_data = 0
    # x.gpudata is a long; cast it to a float pointer for cudamat.
    cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))

    px = cudamat.CUDAMatrix(cm_mat)
    px._base = x  # x won't be freed while the cudamat object is alive.
    # Let cudamat know that no numpy array is attached on the host side.
    px.mat_on_host = False

    # gnumpy moves cudamat objects to its _cmsReuseCache when a garray is
    # deleted, so arrays returned by theano would never be freed.  If the
    # garray thinks it is a view (alias), the cudamat object is deleted
    # normally instead, and we don't flood the world with memory.
    _is_alias_of = ___const_garray
    return gnumpy.garray(px, x.shape, _is_alias_of)
def garray_to_cudandarray(x):
    """Take a gnumpy.garray and make a CudaNdarray that points to its memory."""
    if not isinstance(x, gnumpy.garray):
        raise ValueError("We can transfer only gnumpy.garray to CudaNdarray")
    # No dtype check is needed: cudamat (hence gnumpy) is always float32.
    import ctypes

    # C-contiguous strides (counted in elements) for x.shape.
    strides = [1]
    for extent in x.shape[:0:-1]:
        strides.append(strides[-1] * extent)
    strides = strides[::-1]
    # CudaNdarray uses stride 0 for broadcastable (extent-1) dimensions.
    for dim, extent in enumerate(x.shape):
        if extent == 1:
            strides[dim] = 0
    strides = tuple(strides)

    # The device pointer lives on the garray's cudamat base object.
    ptr_long = long(ctypes.cast(x._base.mat.data_device, ctypes.c_void_p).value)
    return cuda.from_gpu_pointer(ptr_long, x.shape, strides, x._base)
except (ImportError, OSError):
gnumpy_available = False
"""
Function to detect memory sharing for ndarray AND sparse type AND CudaNdarray.
Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
numpy version support only ndarray.
"""
from __future__ import absolute_import, print_function, division
......@@ -14,25 +14,12 @@ try:
def _is_sparse(a):
return scipy.sparse.issparse(a)
except ImportError:
# scipy not imported, their can be only ndarray and cudandarray
# scipy not imported, their can be only ndarray and gpuarray
def _is_sparse(a):
return False
from theano.sandbox import cuda
from theano import gpuarray
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
def _is_cuda(a):
return False
__docformat__ = "restructuredtext en"
if gpuarray.pygpu:
def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
......@@ -40,16 +27,14 @@ else:
def _is_gpua(a):
return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b)
if a_gpua and b_gpua:
......@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True):
a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)):
if (not(a_ndarray or a_sparse or a_gpua) or
not(b_ndarray or b_sparse or b_gpua)):
if raise_other_type:
raise TypeError("may_share_memory support only ndarray"
" and scipy.sparse, CudaNdarray or GpuArray type")
" and scipy.sparse or GpuArray type")
return False
if a_cuda or b_cuda or a_gpua or b_gpua:
if a_gpua or b_gpua:
return False
return SparseType.may_share_memory(a, b)
......@@ -26,11 +26,11 @@ from theano import config
from theano.compat import PY3
from six import string_types
from theano.compile.sharedvalue import SharedVariable
try:
from theano.sandbox.cuda import cuda_ndarray
import pygpu
except ImportError:
cuda_ndarray = None
pygpu = None
__docformat__ = "restructuredtext en"
__authors__ = "Pascal Lamblin"
......@@ -202,21 +202,21 @@ class PersistentNdarrayID(object):
return self.seen[id(obj)]
class PersistentCudaNdarrayID(PersistentNdarrayID):
class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj):
if (cuda_ndarray is not None and
type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray):
if (pygpu and
isinstance(obj, pygpu.gpuarray.GpuArray)):
if id(obj) not in self.seen:
def write_array(f):
np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name)
self.seen[id(obj)] = 'gpuarray.{0}'.format(name)
return self.seen[id(obj)]
return super(PersistentCudaNdarrayID, self).__call__(obj)
return super(PersistentGpuArrayID, self).__call__(obj)
class PersistentSharedVariableID(PersistentCudaNdarrayID):
class PersistentSharedVariableID(PersistentGpuArrayID):
"""Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the
......@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object):
return self.cache[name]
ret = None
array = np.lib.format.read_array(self.zip_file.open(name))
if array_type == 'cuda_ndarray':
if array_type == 'gpuarray':
if config.experimental.unpickle_gpu_on_cpu:
# directly return numpy array
warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
"to True. Unpickling CudaNdarray as "
"numpy.ndarray")
"to True. Unpickling GpuArray as numpy.ndarray")
ret = array
elif cuda_ndarray:
ret = cuda_ndarray.cuda_ndarray.CudaNdarray(array)
elif pygpu:
ret = pygpu.array(array)
else:
raise ImportError("Cuda not found. Cannot unpickle "
"CudaNdarray")
raise ImportError("pygpu not found. Cannot unpickle GpuArray")
else:
ret = array
self.cache[name] = ret
......
差异被折叠。
from __future__ import absolute_import, print_function, division
import os
import warnings
import theano
import theano.sandbox.cuda
from theano import config
def set_gpu_from_theano():
    """
    Set the GPU used by PyCUDA to the same one used by Theano.
    """
    # Mirror the theano gpu binding into pycuda's CUDA_DEVICE environment
    # variable ("gpuN" -> "N").  Leave it untouched when no device number
    # is given.
    device = None
    if config.device.startswith("gpu") and len(config.device) > 3:
        device = theano.config.device[3:]
    elif (config.init_gpu_device.startswith("gpu") and
          len(config.init_gpu_device) > 3):
        device = theano.config.init_gpu_device[3:]
    if device is not None:
        os.environ["CUDA_DEVICE"] = device
set_gpu_from_theano()

# Whether a usable pycuda was imported/initialized below.
pycuda_available = False
# If theano.sandbox.cuda doesn't exist yet, it is because we are importing
# it and it tried to import this file!  That means we must init the device.
if (not hasattr(theano.sandbox, 'cuda') or
        theano.sandbox.cuda.use.device_number is None):
    try:
        import pycuda
        import pycuda.autoinit
        pycuda_available = True
    except (ImportError, RuntimeError):
        # Presumably the user wanted to use pycuda, else they wouldn't have
        # imported this module, so issue a warning that the import failed.
        warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
    except pycuda._driver.LogicError:
        # autoinit failed on the requested device (CUDA_DEVICE set above).
        if theano.config.force_device:
            raise
        else:
            # Retry autoinit without forcing a specific device.
            if "CUDA_DEVICE" in os.environ:
                del os.environ["CUDA_DEVICE"]
            import pycuda.autoinit
            pycuda_available = True
else:
    # The GPU is already initialized by Theano; just import the driver
    # without re-initializing a context.
    try:
        import pycuda.driver
        pycuda_available = True
    except ImportError:
        pass

if pycuda_available:
    if hasattr(pycuda.driver.Context, "attach"):
        # Attach to the existing CUDA context and pop it at exit so pycuda
        # and Theano share the same context.
        pycuda.driver.Context.attach()
        import atexit
        atexit.register(pycuda.driver.Context.pop)
    else:
        # Now we always import this file when we call
        # theano.sandbox.cuda.use.  So this should not happen normally.
        # TODO: make this an error.
        warnings.warn("For some unknow reason, theano.misc.pycuda_init was"
                      " not imported before Theano initialized the GPU and"
                      " your PyCUDA version is 2011.2.2 or earlier."
                      " To fix the problem, import theano.misc.pycuda_init"
                      " manually before using/initializing the GPU, use the"
                      " Theano flag pycuda.init=True or use a"
                      " more recent version of PyCUDA.")
from __future__ import absolute_import, print_function, division
import pycuda.gpuarray
from theano.sandbox import cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
def to_gpuarray(x, copyif=False):
    """Take a CudaNdarray and return a pycuda.gpuarray.GPUArray.

    :type x: CudaNdarray
    :param x: The array to transform to pycuda.gpuarray.GPUArray.
    :type copyif: bool
    :param copyif: If False, raise an error if x is not c contiguous.
        If it is c contiguous, the returned GPUArray shares the same
        memory region as x.  If True, copy x when it is not c contiguous,
        so the result won't share memory with x; when c contiguous, the
        result shares x's memory region.  This is needed because GPUArray
        does not fully support strided memory.
    :return type: pycuda.gpuarray.GPUArray
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to pycuda.gpuarray.GPUArray")

    # Check C contiguity: walking dims from the last, each non-broadcast
    # (extent > 1) dim must carry the stride implied by the dims to its right.
    expected_stride = 1
    is_c_contig = True
    for dim in reversed(range(x.ndim)):
        if x.shape[dim] == 1:
            continue
        if x._strides[dim] != expected_stride:
            is_c_contig = False
            break
        expected_stride *= x.shape[dim]

    if not is_c_contig:
        if not copyif:
            raise ValueError("We were asked to not copy memory, but the memory is not c contiguous.")
        x = x.copy()

    # x is now c contiguous; wrap its device pointer without copying.
    # base=x keeps x alive for the lifetime of the GPUArray.
    return pycuda.gpuarray.GPUArray(x.shape, x.dtype, base=x, gpudata=x.gpudata)
def to_cudandarray(x):
    """Take a pycuda.gpuarray.GPUArray and make a CudaNdarray that points to its memory.

    :note: CudaNdarray supports only float32, so only float32 GPUArray are accepted.
    """
    if not isinstance(x, pycuda.gpuarray.GPUArray):
        raise ValueError("We can transfer only pycuda.gpuarray.GPUArray to CudaNdarray")
    if x.dtype != "float32":
        raise ValueError("CudaNdarray support only float32")

    # C-contiguous strides (counted in elements) for x.shape.
    strides = [1]
    for extent in x.shape[:0:-1]:
        strides.append(strides[-1] * extent)
    strides = tuple(reversed(strides))

    # In pycuda trunk, x.ptr also works, which is a little cleaner.
    ptr = int(x.gpudata)
    return cuda.from_gpu_pointer(ptr, x.shape, strides, x)
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.cudamat_utils import cudamat_available
if not cudamat_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("gnumpy not installed. Skip test of theano op with pycuda "
"code.")
from theano.misc.cudamat_utils import (cudandarray_to_cudamat,
cudamat_to_cudandarray)
def test(shape=(3, 4)):
    """
    Make sure that the cudamat conversion is exact.
    """
    # Theano graph that adds 1 to a float32 matrix on the GPU.
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.fmatrix('U'))
    ii = theano.function([U], gpu(U + 1))

    A_cpu = np.asarray(np.random.rand(*shape), dtype="float32")
    A_cnd = theano.sandbox.cuda.CudaNdarray(A_cpu)
    A_cmat = cudandarray_to_cudamat(A_cnd)

    # Exercise the reverse conversion; the value is then overwritten by
    # the Theano computation on the original CudaNdarray.
    B_cnd = cudamat_to_cudandarray(A_cmat)
    B_cnd = ii(A_cnd)

    # u: reference result computed with CudaNdarray arithmetic on the host side.
    u = A_cnd.copy()
    u += theano.sandbox.cuda.CudaNdarray(np.asarray([[1]], dtype='float32'))
    u = np.asarray(u)
    v = np.asarray(B_cnd)
    # w: the same computation done through cudamat.  The transpose/reshape
    # below compensates for cudamat's 2-d layout — presumably column-major;
    # TODO confirm against cudamat docs.
    w = A_cmat.add(1).asarray()
    assert abs(u - v).max() == 0
    assert abs(u - w.T.reshape(u.shape)).max() == 0
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.gnumpy_utils import gnumpy_available
if not gnumpy_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("gnumpy not installed. Skip test related to it.")
from theano.misc.gnumpy_utils import (garray_to_cudandarray,
cudandarray_to_garray)
import gnumpy
def test(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from garray to
    CudaNdarray back to garray.
    """
    # Theano graph that adds 1 to a float32 3-tensor on the GPU.
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    ii = theano.function([U], gpu(U + 1))

    A = gnumpy.rand(*shape)
    A_cnd = garray_to_cudandarray(A)
    assert A_cnd.shape == A.shape
    # dtype always float32
    # garrays don't have strides
    B_cnd = ii(A_cnd)
    B = cudandarray_to_garray(B_cnd)
    assert A_cnd.shape == A.shape

    # Compare gnumpy arithmetic (u), the converted result (v) and the raw
    # CudaNdarray result (w) — all three must agree exactly.
    u = (A + 1).asarray()
    v = B.asarray()
    w = np.array(B_cnd)
    assert (u == v).all()
    assert (u == w).all()
def test2(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from CudaNdarray to
    garray back to CudaNdarray.
    """
    # Build (but don't use) a GPU graph so the device is initialized.
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    theano.function([U], gpu(U + 1))

    A = np.random.rand(*shape).astype('float32')
    A_cnd = theano.sandbox.cuda.CudaNdarray(A)
    A_gar = cudandarray_to_garray(A_cnd)
    assert A_cnd.shape == A_gar.shape
    # dtype always float32
    # garrays don't have strides

    B = garray_to_cudandarray(A_gar)
    assert A_cnd.shape == B.shape
    # dtype always float32
    # Round trip must preserve strides and alias the same device memory.
    assert A_cnd._strides == B._strides
    assert A_cnd.gpudata == B.gpudata
    v = np.asarray(B)
    assert (v == A).all()
def test_broadcast_dims():
    """
    Test with some dimensions being 1.
    CudaNdarray use 0 for strides for those dimensions.
    """
    # Run both round-trip tests on every placement of the unit dimension.
    shapes = [(1, 2, 3), (2, 1, 3), (2, 3, 1)]
    for shape in shapes:
        test(shape)
    for shape in shapes:
        test2(shape)
"""
test the tensor and sparse type. The CudaNdarray type is tested in
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
test the tensor and sparse type. (gpuarray is tested in the gpuarray folder).
"""
from __future__ import absolute_import, print_function, division
import numpy as np
......@@ -15,9 +14,7 @@ except ImportError:
from theano.misc.may_share_memory import may_share_memory
def test_may_share_memory():
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
def may_share_memory_core(a, b):
va = a.view()
vb = b.view()
ra = a.reshape((4, 5))
......@@ -51,6 +48,13 @@ def test_may_share_memory():
except TypeError:
pass
def test_may_share_memory():
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
may_share_memory_core(a, b)
if scipy_imported:
def test_may_share_memory_scipy():
a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3))
......
......@@ -5,13 +5,9 @@ import unittest
from tempfile import mkdtemp
import numpy as np
from nose.plugins.skip import SkipTest
import theano
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.var import CudaNdarraySharedVariable
from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.misc.pkl_utils import dump, load, StripPickler
......@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase):
if self.tmpdir is not None:
shutil.rmtree(self.tmpdir)
def test_dump_load(self):
if not cuda_ndarray.cuda_enabled:
raise SkipTest('Optional package cuda disabled')
x = CudaNdarraySharedVariable('x', CudaNdarrayType((1, 1), name='x'),
[[1]], False)
with open('test', 'wb') as f:
dump(x, f)
with open('test', 'rb') as f:
x = load(f)
assert x.name == 'x'
np.testing.assert_allclose(x.get_value(), [[1]])
def test_dump_load_mrg(self):
rng = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled)
rng = MRG_RandomStreams()
with open('test', 'wb') as f:
dump(rng, f)
......
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op"
" with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest('Optional package cuda disabled')
import theano.tensor as T
from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
PycudaElemwiseSourceModuleMakeThunkOp)
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
    # Compare the stock GPU elemwise op against the pycuda-backed
    # implementations on several shapes and two scalar ops.
    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
            x = T.fmatrix('x')
            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(op)
            pycuda_op = PycudaElemwiseSourceModuleOp(op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(op)
            # f: reference Theano GPU elemwise.
            f = theano.function([x, y], elemwise_op(x, y), mode=mode_with_gpu)
            # f2: pycuda op used explicitly in the graph.
            f2 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op(x, y)),
                                 mode=mode_with_gpu)
            # f3: pycuda op introduced by the local_pycuda_gpu_elemwise
            # optimization (checked via the assert on its graph below).
            mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
            f3 = theano.function([x, y], elemwise_op(x, y),
                                 mode=mode_pycuda)
            # f4: make_thunk-based pycuda op used explicitly.
            f4 = theano.function([x, y],
                                 theano.sandbox.cuda.host_from_gpu(
                                     pycuda_op_thunk(x, y)),
                                 mode=mode_with_gpu)

            # Each compiled graph must contain the expected op.
            assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
                        for node in f.maker.fgraph.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f2.maker.fgraph.toposort()])
            assert any([isinstance(node.op, PycudaElemwiseSourceModuleOp)
                        for node in f3.maker.fgraph.toposort()])
            assert any([isinstance(node.op,
                                   PycudaElemwiseSourceModuleMakeThunkOp)
                        for node in f4.maker.fgraph.toposort()])

            # All four implementations must agree numerically.
            val1 = np.asarray(np.random.rand(*shape), dtype='float32')
            val2 = np.asarray(np.random.rand(*shape), dtype='float32')
            assert np.allclose(f(val1, val2), f2(val1, val2))
            assert np.allclose(f(val1, val2), f3(val1, val2))
            assert np.allclose(f(val1, val2), f4(val1, val2))
            # print f(val1,val2)
            # print f2(val1,val2)
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print(f.maker.fgraph.toposort())
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print(f2.maker.fgraph.toposort())
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print(f4.maker.fgraph.toposort())
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
from __future__ import absolute_import, print_function, division
import sys
import numpy as np
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed."
" We skip tests of Theano Ops with pycuda code.")
if cuda_ndarray.cuda_available is False: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest('Optional theano package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    # Element-wise product kernel; one thread per element.
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    # Same element-wise product kernel as test_pycuda_only.
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)

    # Test with Theano object: the CudaNdarrays are passed directly to the
    # pycuda kernel instead of drv.In/drv.Out numpy wrappers.
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb,
                  block=(400, 1, 1), grid=(1, 1))
    assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
    # Test that we can use the GPUArray memory space of pycuda in a CudaNdarray
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    print(sys.getrefcount(y))
    # This increases the ref count with newer pycuda.  Does pycuda also
    # cache ndarrays?
    # print y.get()
    initial_refcount = sys.getrefcount(y)
    print("gpuarray ref count before creating a CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
    rand = np.random.randn(*y.shape).astype(np.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)

    # C-contiguous strides (in elements) for y.shape.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1] * i)
    strides = tuple(strides[::-1])
    print('strides', strides)
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)

    # in pycuda trunk, y.ptr also works, which is a little cleaner
    y_ptr = int(y.gpudata)
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    # Creating the view must take exactly one reference on y.
    print("gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount + 1
    assert (np.asarray(z) == 0).all()
    assert z.base is y

    # Test that we can take a view from this cuda view on pycuda memory
    zz = z.view()
    assert sys.getrefcount(y) == initial_refcount + 2
    assert zz.base is y
    del zz
    assert sys.getrefcount(y) == initial_refcount + 1

    # In-place arithmetic on the view must be visible through z.
    cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
                                                    dtype='float32'))
    z += cuda_ones
    assert (np.asarray(z) == np.ones(y.shape)).all()
    assert (np.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (np.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (np.asarray(z) == (rand + 1)).all()

    # Check that the ref count to the gpuarray is right.
    del z
    print("gpuarray ref count after deleting the CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
from __future__ import absolute_import, print_function, division
import numpy as np
import theano.sandbox.cuda as cuda
import theano.misc.pycuda_init
if not theano.misc.pycuda_init.pycuda_available: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda "
"code.")
if cuda.cuda_available is False: # noqa
from nose.plugins.skip import SkipTest
raise SkipTest('Optional theano package cuda disabled')
from theano.misc.pycuda_utils import to_gpuarray, to_cudandarray
import pycuda.gpuarray
def test_to_gpuarray():
    """Check to_gpuarray: aliasing for contiguous input, copying for
    strided input, and the error path when copying is disallowed."""
    cx = cuda.CudaNdarray.zeros((5, 4))
    px = to_gpuarray(cx)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = np.asarray(1, dtype="float32")
    # Check that they share the same memory space
    assert px.gpudata == cx.gpudata
    assert np.asarray(cx[0, 0]) == 1
    assert np.allclose(np.asarray(cx), px.get())
    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    # cx._strides are in elements; px.strides presumably in bytes
    # (4 == float32 itemsize) — TODO confirm against pycuda docs.
    assert all(np.asarray(cx._strides) * 4 == px.strides)

    # Test when the CudaNdarray is strided
    cx = cx[::2, ::]
    px = to_gpuarray(cx, copyif=True)
    assert isinstance(px, pycuda.gpuarray.GPUArray)
    cx[0, 0] = np.asarray(2, dtype="float32")
    # Check that they do not share the same memory space
    assert px.gpudata != cx.gpudata
    assert np.asarray(cx[0, 0]) == 2
    assert not np.allclose(np.asarray(cx), px.get())
    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    assert not all(np.asarray(cx._strides) * 4 == px.strides)

    # Test that we return an error when asked not to copy strided input
    try:
        px = to_gpuarray(cx)
        assert False
    except ValueError:
        pass
def test_to_cudandarray():
    """Check to_cudandarray: aliasing conversion for float32 GPUArray and
    rejection of wrong dtypes and non-GPUArray inputs."""
    px = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    cx = to_cudandarray(px)
    assert isinstance(cx, cuda.CudaNdarray)
    assert np.allclose(px.get(),
                       np.asarray(cx))
    assert px.dtype == cx.dtype
    assert px.shape == cx.shape
    # cx._strides are in elements; px.strides presumably in bytes
    # (4 == float32 itemsize) — TODO confirm against pycuda docs.
    assert all(np.asarray(cx._strides) * 4 == px.strides)

    # float64 GPUArray must be rejected (CudaNdarray is float32 only).
    try:
        px = pycuda.gpuarray.zeros((3, 4, 5), 'float64')
        to_cudandarray(px)
        assert False
    except ValueError:
        pass

    # Non-GPUArray input must be rejected.
    try:
        to_cudandarray(np.zeros(4))
        assert False
    except ValueError:
        pass
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论