提交 9cde027a authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Remove tentacles in misc.

上级 80a1e8e0
from .config import test_ctx_name
from ..type import get_context, GpuArrayType, GpuArraySharedVariable
import pygpu
import numpy as np
from theano.misc.tests.test_may_share_memory import may_share_memory_core
from theano.misc.pkl_utils import dump, load
def test_may_share_memory():
    """Smoke-test the shared memory-overlap core check on two GPU arrays."""
    context = get_context(test_ctx_name)
    first = pygpu.empty((5, 4), context=context)
    second = pygpu.empty((5, 4), context=context)
    # may_share_memory_core raises/asserts internally on failure.
    may_share_memory_core(first, second)
def test_dump_load():
    """Round-trip a GpuArraySharedVariable through dump()/load().

    Verifies that the variable's name and value survive pickling.
    Fix: the original left the pickle file ``test`` behind in the working
    directory; it is now removed in a ``finally`` block.
    """
    import os

    x = GpuArraySharedVariable('x',
                               GpuArrayType('float32', (1, 1), name='x',
                                            context_name=test_ctx_name),
                               [[1]], False)
    fname = 'test'
    try:
        with open(fname, 'wb') as f:
            dump(x, f)
        with open(fname, 'rb') as f:
            x = load(f)
        assert x.name == 'x'
        np.testing.assert_allclose(x.get_value(), [[1]])
    finally:
        # Don't leave the pickle artifact behind.
        if os.path.exists(fname):
            os.remove(fname)
...@@ -168,8 +168,8 @@ class IfElse(Op): ...@@ -168,8 +168,8 @@ class IfElse(Op):
) )
c = theano.tensor.as_tensor_variable(c) c = theano.tensor.as_tensor_variable(c)
if not self.gpu: if not self.gpu:
# When gpu is true, we are given only cuda ndarrays, and we want # When gpu is true, we are given only gpuarrays, and we want
# to keep them be cuda ndarrays # to keep them as gpuarrays
nw_args = [] nw_args = []
for x in args: for x in args:
if hasattr(x, '_as_TensorVariable'): if hasattr(x, '_as_TensorVariable'):
......
...@@ -11,7 +11,6 @@ import os ...@@ -11,7 +11,6 @@ import os
import sys import sys
import time import time
from optparse import OptionParser from optparse import OptionParser
import subprocess
import numpy as np import numpy as np
import theano import theano
...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -51,12 +50,6 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
print('Numpy dot module:', np.dot.__module__) print('Numpy dot module:', np.dot.__module__)
print('Numpy location:', np.__file__) print('Numpy location:', np.__file__)
print('Numpy version:', np.__version__) print('Numpy version:', np.__version__)
if (theano.config.device.startswith("gpu") or
theano.config.init_gpu_device.startswith("gpu")):
print('nvcc version:')
subprocess.call((theano.sandbox.cuda.nvcc_compiler.nvcc_path,
"--version"))
print()
a = theano.shared(np.ones((M, N), dtype=theano.config.floatX, a = theano.shared(np.ones((M, N), dtype=theano.config.floatX,
order=order)) order=order))
...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000, ...@@ -88,17 +81,15 @@ def execute(execute=True, verbose=True, M=2000, N=2000, K=2000,
f() # Ignore first function call to get representative time. f() # Ignore first function call to get representative time.
if execute: if execute:
sync = (hasattr(theano, "sandbox") and sync = (hasattr(theano, "gpuarray") and
hasattr(theano.sandbox, "cuda") and isinstance(c, theano.gpuarray.GpuArraySharedVariable))
isinstance(c, theano.sandbox.cuda.CudaNdarraySharedVariable)) if sync:
sync2 = (hasattr(theano, "gpuarray") and # Make sure we don't include the time from the first call
isinstance(c, theano.gpuarray.GpuArraySharedVariable)) c.get_value(borrow=True, return_internal_type=True).sync()
t0 = time.time() t0 = time.time()
for i in range(iters): for i in range(iters):
f() f()
if sync: if sync:
theano.sandbox.cuda.synchronize()
if sync2:
c.get_value(borrow=True, return_internal_type=True).sync() c.get_value(borrow=True, return_internal_type=True).sync()
t1 = time.time() t1 = time.time()
return t1 - t0, impl return t1 - t0, impl
...@@ -199,87 +190,32 @@ if __name__ == "__main__": ...@@ -199,87 +190,32 @@ if __name__ == "__main__":
goto2 1.13/8 1.94s goto2 1.13/8 1.94s
goto2 1.13/16 3.16s goto2 1.13/16 3.16s
Test time in float32 Test time in float32. There were 10 executions of gemm in
float32 with matrices of shape 5000x5000 (M=N=K=5000)
cuda version 6.5 6.0 5.5 5.0 4.2 4.1 4.0 3.2 3.0 # note
gpu
K6000/NOECC 0.06s 0.06s
K40 0.07s
K20m/ECC 0.08s 0.08s 0.07s
K20/NOECC 0.07s
M2090 0.19s
C2075 0.25s
M2075 0.25s
M2070 0.25s 0.27s 0.32s
M2070-Q 0.48s 0.27s 0.32s
M2050(Amazon) 0.25s
C1060 0.46s
K600 1.04s
GTX Titan Black 0.05s
GTX Titan(D15U-50) 0.06s 0.06s don't work
GTX 780 0.06s
GTX 980 0.06s
GTX 970 0.08s
GTX 680 0.11s 0.12s 0.154s 0.218s
GRID K520 0.14s
GTX 580 0.16s 0.16s 0.164s 0.203s
GTX 480 0.19s 0.19s 0.192s 0.237s 0.27s
GTX 750 Ti 0.20s
GTX 470 0.23s 0.23s 0.238s 0.297s 0.34s
GTX 660 0.18s 0.20s 0.23s
GTX 560 0.30s
GTX 650 Ti 0.27s
GTX 765M 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GT 610 2.38s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
GT 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
GT 220 3.80s
GT 210 6.35s
8500 GT 10.68s
Results for larger matrices.
There were 10 executions of gemm in float32
with matrices of shape 5000x5000 (M=N=K=5000).
All memory layout was in C order. All memory layout was in C order.
cuda version 7.5 7.0 6.5
cuda version 8.0 7.5 7.0
gpu gpu
M40 0.47s M40 0.45s 0.47s
k80 0.96s k80 0.92s 0.96s
K6000/NOECC 0.69s K6000/NOECC 0.71s 0.69s
K40 0.88s P6000/NOECC 0.25s
K20m/ECC
K20/NOECC Titan X (Pascal) 0.28s
M2090 GTX Titan X 0.45s 0.45s 0.47s
C2075 GTX Titan Black 0.66s 0.64s 0.64s
M2075 GTX 1080 0.35s
M2070 GTX 980 Ti 0.41s
M2070-Q GTX 970 0.66s
M2050(Amazon) GTX 680 1.57s
C1060 GTX 750 Ti 2.01s 2.01s
K600 GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX Titan X 0.45s 0.47s GTX 580 2.42s
GTX Titan Black 0.64s 0.64s GTX 480 2.87s
GTX Titan(D15U-50) TX1 7.6s (float32 storage and computation)
GTX 780 GT 610 33.5s
GTX 980 Ti 0.41s
GTX 980
GTX 970 0.66s
GTX 680 1.57s
GRID K520
GTX 750 Ti 2.01s 2.01s
GTX 750 2.46s 2.37s
GTX 660 2.32s 2.32s
GTX 580 2.42s 2.47s
GTX 480 2.87s 2.88s
TX1 7.6s (float32 storage and computation)
GT 610 33.5s
""") """)
if options.M == 0: if options.M == 0:
......
"""
This code can only work if cudamat and theano are initialized on the
same gpu as theano.
WARNING: In the test of this file there is a transpose that is used...
So there can be problem with shape and stride order...
"""
from __future__ import absolute_import, print_function, division
import six
try:
import cudamat
cudamat_available = True
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_cudamat(x, copyif=False):
""" take a CudaNdarray and return a cudamat.CUDAMatrix object.
:type x: CudaNdarray
:param x: The array to transform to cudamat.CUDAMatrix.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
elif x.ndim != 2:
raise TypeError("cudandarray_to_cudamat: input must be 2-d (has %s dims). That's "
"because cudamat arrays are always 2-dimensional")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = x.shape[0]
cm_mat.size[1] = x.shape[1]
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be __del__'ed as long as px is around.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
return px
def cudamat_to_cudandarray(x):
    """Take a cudamat.CUDAMatrix and make a CudaNdarray that points to its memory."""
    import ctypes

    if not isinstance(x, cudamat.CUDAMatrix):
        raise ValueError("We can transfer only cudamat.CUDAMatrix to CudaNdarray")
    # No dtype check needed: cudamat matrices are always float32.

    # Build c-contiguous strides (in elements) from the shape.
    dim_strides = [1]
    for dim in x.shape[:0:-1]:
        dim_strides.append(dim_strides[-1] * dim)
    dim_strides = tuple(reversed(dim_strides))

    # data_device is a ctypes float pointer; recover the raw address.
    raw_ptr = long(ctypes.cast(x.mat.data_device, ctypes.c_void_p).value)
    # seems legit.
    return cuda.from_gpu_pointer(raw_ptr, x.shape, dim_strides, x)
except (ImportError, OSError):
cudamat_available = False
"""
This code can only work if gnumpy and theano are initialized on the
same gpu as theano.
"""
from __future__ import absolute_import, print_function, division
import six
from six.moves import reduce
try:
import gnumpy
import cudamat
gnumpy_available = True
___const_garray = gnumpy.rand(1)
import theano.sandbox.cuda as cuda
if cuda.cuda_available is False:
raise ImportError('Optional theano package cuda disabled')
if six.PY3:
long = int
def cudandarray_to_garray(x, copyif=False):
""" take a CudaNdarray and return a gnumpy.garray object.
:type x: CudaNdarray
:param x: The array to transform to gnumpy.garray.
:type copyif: bool
:param copyif: If False, raise an error if x is not c contiguous.
If it is c contiguous, we return a GPUArray that share
the same memory region as x.
If True, copy x if it is no c contiguous, so the return won't
shape the same memory region. If c contiguous, the return
will share the same memory region.
We need to do this as GPUArray don't fully support strided memory.
:return type: cudamat.CUDAMatrix
"""
if not isinstance(x, cuda.CudaNdarray):
raise ValueError("We can transfer only CudaNdarray to cudamat.CUDAMatrix")
else:
# Check if it is c contiguous
size = 1
c_contiguous = True
for i in range(x.ndim - 1, -1, -1):
if x.shape[i] == 1:
continue
if x._strides[i] != size:
c_contiguous = False
break
size *= x.shape[i]
if not c_contiguous:
if copyif:
x = x.copy()
else:
raise ValueError("We where asked to don't copy memory, but the memory is not c contiguous.")
# Now x is always c contiguous.
# the next step is to create a CUDAMatrix object. We do so by first creating
# a cudamat object with no data_host.
cm_mat = cudamat.cudamat()
cm_mat.size[0] = reduce(lambda x, y: x * y, x.shape, 1)
cm_mat.size[1] = 1
cm_mat.on_host = 0
cm_mat.on_device = 1
cm_mat.is_trans = 0
cm_mat.owns_data = 0 # <-- note: cm_mat dosen't owe the data; x does. So x will delete it.
# x.gpudata is a long. We need a pointer to a float. cast.
import ctypes
cm_mat.data_device = ctypes.cast(x.gpudata, ctypes.POINTER(ctypes.c_float))
px = cudamat.CUDAMatrix(cm_mat)
px._base = x # x won't be freed if the cudamat object isn't freed.
# let cudamat know that we don't have a numpy array attached.
px.mat_on_host = False
# Note how gnumpy tracks its cudamat objects: it moves things to the
# _cmsReuseCache when the gnumpy array is deleted, thus the arrays
# returned by theano will never be deleted.
# However, if the garray thinks that the object is a view, then it won't
# move the _base to the _cmsResueCache; so the cudamat object will be deleted,
# and we won't overpump the world with memory.
_is_alias_of = ___const_garray
ans = gnumpy.garray(px,
x.shape,
_is_alias_of)
return ans
def garray_to_cudandarray(x):
    """Take a gnumpy.garray and make a CudaNdarray that points to its memory."""
    import ctypes

    if not isinstance(x, gnumpy.garray):
        raise ValueError("We can transfer only gnumpy.garray to CudaNdarray")
    # No dtype check needed: cudamat (gnumpy's backend) is always float32.

    # Build c-contiguous strides (in elements) from the shape.
    dim_strides = [1]
    for dim in x.shape[:0:-1]:
        dim_strides.append(dim_strides[-1] * dim)
    dim_strides = list(reversed(dim_strides))
    # Length-1 axes are broadcastable: give them stride 0.
    for axis, dim in enumerate(x.shape):
        if dim == 1:
            dim_strides[axis] = 0
    dim_strides = tuple(dim_strides)

    # The garray's storage lives in its _base cudamat matrix.
    raw_ptr = long(ctypes.cast(x._base.mat.data_device, ctypes.c_void_p).value)
    # seems legit.
    return cuda.from_gpu_pointer(raw_ptr, x.shape, dim_strides, x._base)
except (ImportError, OSError):
gnumpy_available = False
""" """
Function to detect memory sharing for ndarray AND sparse type AND CudaNdarray. Function to detect memory sharing for ndarray AND sparse type AND GpuArray.
numpy version support only ndarray. numpy version support only ndarray.
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
...@@ -14,25 +14,12 @@ try: ...@@ -14,25 +14,12 @@ try:
def _is_sparse(a): def _is_sparse(a):
return scipy.sparse.issparse(a) return scipy.sparse.issparse(a)
except ImportError: except ImportError:
# scipy not imported, their can be only ndarray and cudandarray # scipy not imported, their can be only ndarray and gpuarray
def _is_sparse(a): def _is_sparse(a):
return False return False
from theano.sandbox import cuda
from theano import gpuarray from theano import gpuarray
if cuda.cuda_available:
from theano.sandbox.cuda.type import CudaNdarrayType
def _is_cuda(a):
return isinstance(a, cuda.CudaNdarray)
else:
def _is_cuda(a):
return False
__docformat__ = "restructuredtext en"
if gpuarray.pygpu: if gpuarray.pygpu:
def _is_gpua(a): def _is_gpua(a):
return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray) return isinstance(a, gpuarray.pygpu.gpuarray.GpuArray)
...@@ -40,16 +27,14 @@ else: ...@@ -40,16 +27,14 @@ else:
def _is_gpua(a): def _is_gpua(a):
return False return False
__docformat__ = "restructuredtext en"
def may_share_memory(a, b, raise_other_type=True): def may_share_memory(a, b, raise_other_type=True):
a_ndarray = isinstance(a, np.ndarray) a_ndarray = isinstance(a, np.ndarray)
b_ndarray = isinstance(b, np.ndarray) b_ndarray = isinstance(b, np.ndarray)
if a_ndarray and b_ndarray: if a_ndarray and b_ndarray:
return TensorType.may_share_memory(a, b) return TensorType.may_share_memory(a, b)
a_cuda = _is_cuda(a)
b_cuda = _is_cuda(b)
if a_cuda and b_cuda:
return CudaNdarrayType.may_share_memory(a, b)
a_gpua = _is_gpua(a) a_gpua = _is_gpua(a)
b_gpua = _is_gpua(b) b_gpua = _is_gpua(b)
if a_gpua and b_gpua: if a_gpua and b_gpua:
...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True): ...@@ -57,13 +42,13 @@ def may_share_memory(a, b, raise_other_type=True):
a_sparse = _is_sparse(a) a_sparse = _is_sparse(a)
b_sparse = _is_sparse(b) b_sparse = _is_sparse(b)
if (not(a_ndarray or a_sparse or a_cuda or a_gpua) or if (not(a_ndarray or a_sparse or a_gpua) or
not(b_ndarray or b_sparse or b_cuda or b_gpua)): not(b_ndarray or b_sparse or b_gpua)):
if raise_other_type: if raise_other_type:
raise TypeError("may_share_memory support only ndarray" raise TypeError("may_share_memory support only ndarray"
" and scipy.sparse, CudaNdarray or GpuArray type") " and scipy.sparse or GpuArray type")
return False return False
if a_cuda or b_cuda or a_gpua or b_gpua: if a_gpua or b_gpua:
return False return False
return SparseType.may_share_memory(a, b) return SparseType.may_share_memory(a, b)
...@@ -26,11 +26,11 @@ from theano import config ...@@ -26,11 +26,11 @@ from theano import config
from theano.compat import PY3 from theano.compat import PY3
from six import string_types from six import string_types
from theano.compile.sharedvalue import SharedVariable from theano.compile.sharedvalue import SharedVariable
try: try:
from theano.sandbox.cuda import cuda_ndarray import pygpu
except ImportError: except ImportError:
cuda_ndarray = None pygpu = None
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
__authors__ = "Pascal Lamblin" __authors__ = "Pascal Lamblin"
...@@ -202,21 +202,21 @@ class PersistentNdarrayID(object): ...@@ -202,21 +202,21 @@ class PersistentNdarrayID(object):
return self.seen[id(obj)] return self.seen[id(obj)]
class PersistentCudaNdarrayID(PersistentNdarrayID): class PersistentGpuArrayID(PersistentNdarrayID):
def __call__(self, obj): def __call__(self, obj):
if (cuda_ndarray is not None and if (pygpu and
type(obj) is cuda_ndarray.cuda_ndarray.CudaNdarray): isinstance(obj, pygpu.gpuarray.GpuArray)):
if id(obj) not in self.seen: if id(obj) not in self.seen:
def write_array(f): def write_array(f):
np.lib.format.write_array(f, np.asarray(obj)) np.lib.format.write_array(f, np.asarray(obj))
name = self._resolve_name(obj) name = self._resolve_name(obj)
zipadd(write_array, self.zip_file, name) zipadd(write_array, self.zip_file, name)
self.seen[id(obj)] = 'cuda_ndarray.{0}'.format(name) self.seen[id(obj)] = 'gpuarray.{0}'.format(name)
return self.seen[id(obj)] return self.seen[id(obj)]
return super(PersistentCudaNdarrayID, self).__call__(obj) return super(PersistentGpuArrayID, self).__call__(obj)
class PersistentSharedVariableID(PersistentCudaNdarrayID): class PersistentSharedVariableID(PersistentGpuArrayID):
"""Uses shared variable names when persisting to zip file. """Uses shared variable names when persisting to zip file.
If a shared variable has a name, this name is used as the name of the If a shared variable has a name, this name is used as the name of the
...@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object): ...@@ -288,18 +288,16 @@ class PersistentNdarrayLoad(object):
return self.cache[name] return self.cache[name]
ret = None ret = None
array = np.lib.format.read_array(self.zip_file.open(name)) array = np.lib.format.read_array(self.zip_file.open(name))
if array_type == 'cuda_ndarray': if array_type == 'gpuarray':
if config.experimental.unpickle_gpu_on_cpu: if config.experimental.unpickle_gpu_on_cpu:
# directly return numpy array # directly return numpy array
warnings.warn("config.experimental.unpickle_gpu_on_cpu is set " warnings.warn("config.experimental.unpickle_gpu_on_cpu is set "
"to True. Unpickling CudaNdarray as " "to True. Unpickling GpuArray as numpy.ndarray")
"numpy.ndarray")
ret = array ret = array
elif cuda_ndarray: elif pygpu:
ret = cuda_ndarray.cuda_ndarray.CudaNdarray(array) ret = pygpu.array(array)
else: else:
raise ImportError("Cuda not found. Cannot unpickle " raise ImportError("pygpu not found. Cannot unpickle GpuArray")
"CudaNdarray")
else: else:
ret = array ret = array
self.cache[name] = ret self.cache[name] = ret
......
"""This file show how we can use Pycuda compiled fct in a Theano
Op. Do no use those op in production code. See the TODO.
You can use them as a guide to use your pycuda code into a Theano op.
The PycudaElemwiseSourceModuleOp is a Theano op use pycuda code
generated with pycuda.compiler.SourceModule
Their is a test in test_pycuda.py.
This don't work with broadcast and non-contiguous memory as pycuda
don't support that, but we make sure we don't introduce problem.
If the memory is non-contiguous, we create a new copy that is contiguous.
If their is broadcasted dimensions, we raise an error.
#The following is commented as it work only with old pycuda version
The PycudaElemwiseKernelOp op use pycuda code generated with
pycuda.elementwise.ElementwiseKernel. It must be wrapper by
TheanoElementwiseKernel.
"""
from __future__ import absolute_import, print_function, division
from itertools import chain
import numpy as np
import theano
from six.moves import xrange
from theano.compat import izip
from theano.gof import Op, Apply, local_optimizer, EquilibriumDB
from theano.sandbox.cuda import GpuElemwise, CudaNdarrayType, GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.opt import gpu_seqopt
from theano.misc.frozendict import frozendict
import pycuda
from pycuda.compiler import SourceModule
import pycuda.gpuarray
from . import pycuda_init
# Guard at import time: every op in this module needs a working PyCUDA,
# so fail loudly if pycuda_init could not set one up.
if not pycuda_init.pycuda_available:
raise Exception("No pycuda available. You can't load pycuda_example.py")
def _replace_npy_types(c_arg):
c_arg = c_arg.replace('npy_float32', 'float')
c_arg = c_arg.replace('npy_float64', 'double')
c_arg = c_arg.replace('npy_int32', 'int')
c_arg = c_arg.replace('npy_int8', 'char')
c_arg = c_arg.replace('npy_ucs4', 'unsigned int')
c_arg = c_arg.replace('npy_uint32', 'unsigned int')
c_arg = c_arg.replace('npy_uint16', 'unsigned short')
c_arg = c_arg.replace('npy_uint8', 'unsigned char')
return c_arg
def theano_parse_c_arg(c_arg):
    """Parse a C argument declaration with pycuda, after mapping numpy
    typedef names to plain C type names."""
    return pycuda.tools.parse_c_arg(_replace_npy_types(c_arg))
"""
class TheanoElementwiseKernel(pycuda.elementwise.ElementwiseKernel):
def __init__(self, arguments, operation,
name="kernel", keep=False, options=None, **kwargs):
if options is None:
options = []
if isinstance(arguments, string_types):
arguments = [theano_parse_c_arg(arg)
for arg in arguments.split(",")]
pycuda.elementwise.ElementwiseKernel.__init__(self, arguments,
operation, name, keep,
options, **kwargs)
def __call__(self, *args):
vectors = []
invocation_args = []
for arg, arg_descr in zip(args, self.gen_kwargs["arguments"]):
if isinstance(arg_descr, VectorArg):
vectors.append(arg)
invocation_args.append(arg.gpudata)
else:
invocation_args.append(arg)
repr_vec = vectors[0]
invocation_args.append(repr_vec.mem_size)
if hasattr(repr_vec, "_block") and hasattr(repr_vec, "_grid"):
self.func.set_block_shape(*repr_vec._block)
self.func.prepared_call(repr_vec._grid, *invocation_args)
else:
_grid, _block = pycuda.gpuarray.splay(repr_vec.mem_size)
self.func.set_block_shape(*_block)
self.func.prepared_call(_grid, *invocation_args)
class PycudaElemwiseKernelOp(GpuOp):
nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern=None, name=None):
if inplace_pattern is None:
inplace_pattern = {}
self.name = name
self.scalar_op = scalar_op
self.inplace_pattern = inplace_pattern
def __str__(self):
if self.name is None:
if self.inplace_pattern:
items = self.inplace_pattern.items()
items.sort()
return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
str(items))
else:
return self.__class__.__name__ + "{%s}" % (self.scalar_op)
else:
return self.name
def __eq__(self, other):
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern)
def __hash__(self):
return (hash(type(self)) ^ hash(self.scalar_op) ^
hash_from_dict(self.inplace_pattern))
def make_node(self, *inputs):
_inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
if self.nin > 0 and len(_inputs) != self.nin:
raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
for i in _inputs[1:]:
if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs')
if any([any(i.type.broadcastable) for i in inputs]):
raise Exception("pycuda don't support broadcasted dimensions")
assert len(inputs) == 2 # TODO remove
# output is broadcastable only along dimensions where all inputs are
# broadcastable
broadcastable = []
for d in xrange(_inputs[0].type.ndim):
bcast_d = True
for i in _inputs:
if not i.type.broadcastable[d]:
bcast_d = False
break
broadcastable.append(bcast_d)
assert len(broadcastable) == _inputs[0].type.ndim
otype = CudaNdarrayType(broadcastable=broadcastable)
assert self.nout == 1
out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
in_name = ["i" + str(id) for id in range(len(inputs))]
out_name = ["o" + str(id) for id in range(self.nout)]
c_code = self.scalar_op.c_code(out_node, "some_name",
tuple([n + "[i]"for n in in_name]),
tuple(n + "[i]"for n in out_name), {})
self.pycuda_fct = TheanoElementwiseKernel(
", ".join([var.type.dtype_specs()[1] + " *" + name
for var, name in (zip(inputs, in_name) +
zip(out_node.outputs, out_name))]),
c_code,
"pycuda_elemwise_kernel_%s" % str(self.scalar_op),
preamble=("#include<Python.h>\n"
"#include <numpy/arrayobject.h>"))
return out_node
def perform(self, node, inputs, out):
#TODO assert all input have the same shape
z, = out
if z[0] is None or z[0].shape != inputs[0].shape:
z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
i = inputs + z
self.pycuda_fct(*i)
"""
class PycudaElemwiseSourceModuleOp(GpuOp):
    """Elemwise GPU op whose CUDA kernel is built with pycuda SourceModule.

    NOTE(review): indentation was reconstructed from a whitespace-mangled
    source; statement grouping follows the obvious Python reading.
    """

    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)
    __props__ = ("scalar_op", "inplace_pattern")

    def __init__(self, scalar_op, inplace_pattern=None, name=None):
        if inplace_pattern is None:
            inplace_pattern = frozendict({})
        self.name = name
        self.scalar_op = scalar_op
        self.inplace_pattern = frozendict(inplace_pattern)

    def __str__(self):
        # An explicit name wins over the generated description.
        if self.name is not None:
            return self.name
        if not self.inplace_pattern:
            return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        pattern_items = list(self.inplace_pattern.items())
        pattern_items.sort()
        return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
                                                     str(pattern_items))

    def make_node(self, *inputs):
        # Inputs must be contiguous CudaNdarray variables of equal rank,
        # with no broadcastable dimensions (pycuda limitation).
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')
        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        assert len(inputs) == 2  # TODO remove

        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        assert self.nout == 1
        kernel_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])

        # Generate the scalar C code and wrap it in a flat elemwise kernel.
        input_names = ["i" + str(id) for id in range(len(inputs))]
        output_names = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(out_node, "some_name",
                                       tuple([n + "[i]" for n in input_names]),
                                       tuple(n + "[i]" for n in output_names), {})
        params = [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
                  for var, name in chain(izip(inputs, input_names),
                                         izip(out_node.outputs, output_names))]
        c_code_param = ", ".join(params + ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (kernel_name, c_code_param, c_code))
        self.pycuda_fct = mod.get_function(kernel_name)
        return out_node

    def perform(self, node, inputs, out):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        z, = out
        # (Re)allocate the output when missing, mis-shaped or strided.
        if (z[0] is None or
                z[0].shape != inputs[0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
        if inputs[0].shape != inputs[1].shape:
            raise TypeError("PycudaElemwiseSourceModuleOp:"
                            " inputs don't have the same shape!")
        # Launch configuration: 512-thread blocks for big inputs, otherwise
        # one block shaped like the (2-d) input.
        if inputs[0].size > 512:
            grid = (int(np.ceil(inputs[0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0].shape[0], inputs[0].shape[1], 1)
        self.pycuda_fct(inputs[0], inputs[1], z[0],
                        np.intc(inputs[1].size), block=block, grid=grid)
class PycudaElemwiseSourceModuleMakeThunkOp(Op):
    """Elemwise op that compiles its pycuda kernel in make_thunk.

    Unlike PycudaElemwiseSourceModuleOp, the kernel is built once per thunk
    and captured in a closure instead of being stored on the op instance.
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """

    nin = property(lambda self: self.scalar_op.nin)
    nout = property(lambda self: self.scalar_op.nout)
    __props__ = ("scalar_op", "inplace_pattern")

    def __init__(self, scalar_op, inplace_pattern=None, name=None):
        if inplace_pattern is None:
            inplace_pattern = {}
        self.name = name
        self.scalar_op = scalar_op
        self.inplace_pattern = frozendict(inplace_pattern)

    def __str__(self):
        # An explicit name wins over the generated description.
        if self.name is not None:
            return self.name
        if not self.inplace_pattern:
            return self.__class__.__name__ + "{%s}" % (self.scalar_op)
        pattern_items = list(self.inplace_pattern.items())
        pattern_items.sort()
        return self.__class__.__name__ + "{%s}%s" % (self.scalar_op,
                                                     str(pattern_items))

    def make_node(self, *inputs):
        assert self.nout == 1
        assert len(inputs) == 2  # TODO remove
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')
        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        return Apply(self, _inputs, [otype() for o in xrange(self.nout)])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        kernel_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        input_names = ["i" + str(id) for id in range(len(node.inputs))]
        output_names = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(node, "some_name",
                                       tuple([n + "[i]" for n in input_names]),
                                       tuple(n + "[i]" for n in output_names), {})
        params = [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
                  for var, name in chain(izip(node.inputs, input_names),
                                         izip(node.outputs, output_names))]
        c_code_param = ", ".join(params + ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (kernel_name, c_code_param, c_code))
        pycuda_fct = mod.get_function(kernel_name)
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            # (Re)allocate the output when missing, mis-shaped or strided.
            if (z[0] is None or
                    z[0].shape != inputs[0][0].shape or
                    not z[0].is_c_contiguous()):
                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
                    inputs[0][0].shape)
            if inputs[0][0].shape != inputs[1][0].shape:
                raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                                " inputs don't have the same shape!")
            if inputs[0][0].size > 512:
                grid = (int(np.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       np.intc(inputs[1][0].size), block=block,
                       grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        return thunk
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp
    """
    # Only rewrite non-broadcasted GpuElemwise nodes of rank <= 2;
    # anything else is left untouched (implicit None).
    if not isinstance(node.op, GpuElemwise):
        return
    if any(any(i.type.broadcastable) for i in node.inputs):
        return
    if not all(i.ndim <= 2 for i in node.inputs):
        return
    replacement = PycudaElemwiseSourceModuleOp(
        node.op.scalar_op, node.op.inplace_pattern)(*node.inputs)
    return [replacement]


pycuda_optimizer.register("local_pycuda_gpu_elemwise",
                          local_pycuda_gpu_elemwise)
"""
@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise_kernel(node):
""
GpuElemwise -> PycudaElemwiseKernelOp
""
if isinstance(node.op, GpuElemwise):
if not any([any(i.type.broadcastable) for i in node.inputs]):
new_op = PycudaElemwiseKernelOp(node.op.scalar_op,
node.op.inplace_pattern)(
*node.inputs)
return [new_op]
pycuda_optimizer.register("local_pycuda_gpu_elemwise_kernel",
local_pycuda_gpu_elemwise_kernel, 1.5)
"""
from __future__ import absolute_import, print_function, division
import os
import warnings
import theano
import theano.sandbox.cuda
from theano import config
def set_gpu_from_theano():
    """
    This set the GPU used by PyCUDA to the same as the one used by Theano.
    """
    # PyCUDA's autoinit reads CUDA_DEVICE; mirror Theano's device choice
    # there so both libraries bind the same GPU.
    if config.device.startswith("gpu") and len(config.device) > 3:
        os.environ["CUDA_DEVICE"] = theano.config.device[3:]
    elif (config.init_gpu_device.startswith("gpu") and
            len(config.init_gpu_device) > 3):
        os.environ["CUDA_DEVICE"] = theano.config.init_gpu_device[3:]
set_gpu_from_theano()

pycuda_available = False

# If theano.sandbox.cuda don't exist, it is because we are importing
# it and it try to import this file! This mean we must init the device.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# grouping of the retry-after-LogicError branch should be confirmed
# against upstream theano/misc/pycuda_init.py.
if (not hasattr(theano.sandbox, 'cuda') or
        theano.sandbox.cuda.use.device_number is None):
    try:
        import pycuda
        import pycuda.autoinit
        pycuda_available = True
    except (ImportError, RuntimeError):
        # presumably, the user wanted to use pycuda, else they wouldn't have
        # imported this module, so issue a warning that the import failed.
        warnings.warn("PyCUDA import failed in theano.misc.pycuda_init")
    except pycuda._driver.LogicError:
        if theano.config.force_device:
            raise
        else:
            # Drop the stale device binding and retry autoinit.
            if "CUDA_DEVICE" in os.environ:
                del os.environ["CUDA_DEVICE"]
            import pycuda.autoinit
            pycuda_available = True
else:
    # The GPU is already initialized by Theano: attach to the existing
    # context instead of creating a new one.
    try:
        import pycuda.driver
        pycuda_available = True
    except ImportError:
        pass
    if pycuda_available:
        if hasattr(pycuda.driver.Context, "attach"):
            pycuda.driver.Context.attach()
            import atexit
            atexit.register(pycuda.driver.Context.pop)
        else:
            # Now we always import this file when we call
            # theano.sandbox.cuda.use. So this should not happen normally.
            # TODO: make this an error.
            warnings.warn("For some unknow reason, theano.misc.pycuda_init was"
                          " not imported before Theano initialized the GPU and"
                          " your PyCUDA version is 2011.2.2 or earlier."
                          " To fix the problem, import theano.misc.pycuda_init"
                          " manually before using/initializing the GPU, use the"
                          " Theano flag pycuda.init=True or use a"
                          " more recent version of PyCUDA.")
from __future__ import absolute_import, print_function, division
import pycuda.gpuarray
from theano.sandbox import cuda
# Fail fast at import time: every helper in this module needs the
# (optional) Theano CUDA backend.
if cuda.cuda_available is False:
    raise ImportError('Optional theano package cuda disabled')
def to_gpuarray(x, copyif=False):
    """Convert a CudaNdarray into a ``pycuda.gpuarray.GPUArray``.

    :type x: CudaNdarray
    :param x: The array to convert.
    :type copyif: bool
    :param copyif: When False, a non-c-contiguous `x` raises ValueError;
        a c-contiguous `x` yields a GPUArray sharing its memory.
        When True, a non-c-contiguous `x` is copied first (so the result
        owns separate memory); a c-contiguous `x` still shares memory.
        This is needed because GPUArray does not fully support strided
        memory.
    :return type: pycuda.gpuarray.GPUArray

    :raises ValueError: if `x` is not a CudaNdarray, or is not
        c contiguous while ``copyif`` is False.
    """
    if not isinstance(x, cuda.CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray to pycuda.gpuarray.GPUArray")
    # Decide whether x is c contiguous: walk the dims from the fastest
    # (last) axis, skipping extent-1 dims, and compare each stride with
    # the expected element count.
    expected = 1
    contiguous = True
    for axis in reversed(range(x.ndim)):
        if x.shape[axis] == 1:
            continue
        if x._strides[axis] != expected:
            contiguous = False
            break
        expected *= x.shape[axis]
    if not contiguous:
        if not copyif:
            raise ValueError("We were asked to not copy memory, but the memory is not c contiguous.")
        x = x.copy()  # the copy is c contiguous
    # x is guaranteed c contiguous here; wrap its memory without copying.
    return pycuda.gpuarray.GPUArray(x.shape, x.dtype, base=x,
                                    gpudata=x.gpudata)
def to_cudandarray(x):
    """Wrap a ``pycuda.gpuarray.GPUArray`` in a CudaNdarray view.

    The returned CudaNdarray points at the same device memory as `x`.

    :note: CudaNdarray supports only float32, so only float32 GPUArray
        inputs are accepted.
    :raises ValueError: if `x` is not a GPUArray or not float32.
    """
    if not isinstance(x, pycuda.gpuarray.GPUArray):
        raise ValueError("We can transfer only pycuda.gpuarray.GPUArray to CudaNdarray")
    elif x.dtype != "float32":
        raise ValueError("CudaNdarray support only float32")
    else:
        # Build C-contiguous strides (counted in elements) for x's shape.
        strides = [1]
        for extent in reversed(x.shape[1:]):
            strides.insert(0, strides[0] * extent)
        strides = tuple(strides)
        # In pycuda trunk, x.ptr also works, which is a little cleaner.
        ptr = int(x.gpudata)
        return cuda.from_gpu_pointer(ptr, x.shape, strides, x)
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.cudamat_utils import cudamat_available
# Skip the whole module when cudamat is absent.  The message now names
# cudamat: it previously said "gnumpy"/"pycuda", which was copy-pasted
# from a different test module and misleading.
if not cudamat_available:  # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("cudamat not installed. Skip test of theano op with "
                   "cudamat code.")

from theano.misc.cudamat_utils import (cudandarray_to_cudamat,
                                       cudamat_to_cudandarray)
def test(shape=(3, 4)):
    """
    Make sure that the cudamat conversion is exact.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    sym_in = gpu(theano.tensor.fmatrix('U'))
    add_one = theano.function([sym_in], gpu(sym_in + 1))

    host_a = np.asarray(np.random.rand(*shape), dtype="float32")
    cnd_a = theano.sandbox.cuda.CudaNdarray(host_a)
    cmat_a = cudandarray_to_cudamat(cnd_a)

    # The first binding is immediately overwritten, but the call still
    # exercises the cudamat -> CudaNdarray conversion path.
    cnd_b = cudamat_to_cudandarray(cmat_a)
    cnd_b = add_one(cnd_a)

    u = cnd_a.copy()
    u += theano.sandbox.cuda.CudaNdarray(np.asarray([[1]], dtype='float32'))
    u = np.asarray(u)
    v = np.asarray(cnd_b)
    w = cmat_a.add(1).asarray()

    # All three "+1" paths (CudaNdarray, theano function, cudamat) must
    # agree exactly; cudamat returns a transposed layout, hence the .T.
    assert abs(u - v).max() == 0
    assert abs(u - w.T.reshape(u.shape)).max() == 0
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
from theano.misc.gnumpy_utils import gnumpy_available
# Skip the whole module when gnumpy is absent.
if not gnumpy_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("gnumpy not installed. Skip test related to it.")
from theano.misc.gnumpy_utils import (garray_to_cudandarray,
cudandarray_to_garray)
import gnumpy
def test(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from garray to
    CudaNdarray back to garray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    U = gpu(theano.tensor.ftensor3('U'))
    ii = theano.function([U], gpu(U + 1))

    A = gnumpy.rand(*shape)
    A_cnd = garray_to_cudandarray(A)
    assert A_cnd.shape == A.shape
    # dtype always float32
    # garray don't have strides
    B_cnd = ii(A_cnd)
    B = cudandarray_to_garray(B_cnd)
    # Check the round-tripped result's shape.  This previously
    # re-asserted `A_cnd.shape == A.shape` (a copy-paste of the line
    # above), so B's shape was never verified.
    assert B.shape == A.shape

    u = (A + 1).asarray()
    v = B.asarray()
    w = np.array(B_cnd)
    # The gnumpy result and the raw CudaNdarray result must both match
    # the host-side computation exactly.
    assert (u == v).all()
    assert (u == w).all()
def test2(shape=(3, 4, 5)):
    """
    Make sure that the gnumpy conversion is exact from CudaNdarray to
    garray back to CudaNdarray.
    """
    gpu = theano.sandbox.cuda.basic_ops.gpu_from_host
    sym = gpu(theano.tensor.ftensor3('U'))
    theano.function([sym], gpu(sym + 1))

    host = np.random.rand(*shape).astype('float32')
    cnd = theano.sandbox.cuda.CudaNdarray(host)
    gar = cudandarray_to_garray(cnd)
    assert cnd.shape == gar.shape
    # dtype is always float32 and garray carries no stride information.
    back = garray_to_cudandarray(gar)
    assert cnd.shape == back.shape
    # dtype always float32.  The round trip must preserve strides and
    # point at the very same device memory.
    assert cnd._strides == back._strides
    assert cnd.gpudata == back.gpudata
    v = np.asarray(back)
    assert (v == host).all()
def test_broadcast_dims():
    """
    Exercise shapes where some dimension is 1.
    CudaNdarray uses a stride of 0 for those dimensions.
    """
    shapes = [(1, 2, 3), (2, 1, 3), (2, 3, 1)]
    for shp in shapes:
        test(shp)
    for shp in shapes:
        test2(shp)
""" """
test the tensor and sparse type. The CudaNdarray type is tested in test the tensor and sparse type. (gpuarray is tested in the gpuarray folder).
sandbox/cuda/tests/test_tensor_op.py.test_may_share_memory_cuda
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy as np import numpy as np
...@@ -15,9 +14,7 @@ except ImportError: ...@@ -15,9 +14,7 @@ except ImportError:
from theano.misc.may_share_memory import may_share_memory from theano.misc.may_share_memory import may_share_memory
def test_may_share_memory(): def may_share_memory_core(a, b):
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
va = a.view() va = a.view()
vb = b.view() vb = b.view()
ra = a.reshape((4, 5)) ra = a.reshape((4, 5))
...@@ -51,6 +48,13 @@ def test_may_share_memory(): ...@@ -51,6 +48,13 @@ def test_may_share_memory():
except TypeError: except TypeError:
pass pass
def test_may_share_memory():
a = np.random.rand(5, 4)
b = np.random.rand(5, 4)
may_share_memory_core(a, b)
if scipy_imported: if scipy_imported:
def test_may_share_memory_scipy(): def test_may_share_memory_scipy():
a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3)) a = scipy.sparse.csc_matrix(scipy.sparse.eye(5, 3))
......
...@@ -5,13 +5,9 @@ import unittest ...@@ -5,13 +5,9 @@ import unittest
from tempfile import mkdtemp from tempfile import mkdtemp
import numpy as np import numpy as np
from nose.plugins.skip import SkipTest
import theano import theano
import theano.sandbox.cuda as cuda_ndarray
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.var import CudaNdarraySharedVariable
from theano.sandbox.rng_mrg import MRG_RandomStreams from theano.sandbox.rng_mrg import MRG_RandomStreams
from theano.misc.pkl_utils import dump, load, StripPickler from theano.misc.pkl_utils import dump, load, StripPickler
...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase): ...@@ -29,24 +25,8 @@ class T_dump_load(unittest.TestCase):
if self.tmpdir is not None: if self.tmpdir is not None:
shutil.rmtree(self.tmpdir) shutil.rmtree(self.tmpdir)
def test_dump_load(self):
if not cuda_ndarray.cuda_enabled:
raise SkipTest('Optional package cuda disabled')
x = CudaNdarraySharedVariable('x', CudaNdarrayType((1, 1), name='x'),
[[1]], False)
with open('test', 'wb') as f:
dump(x, f)
with open('test', 'rb') as f:
x = load(f)
assert x.name == 'x'
np.testing.assert_allclose(x.get_value(), [[1]])
def test_dump_load_mrg(self): def test_dump_load_mrg(self):
rng = MRG_RandomStreams(use_cuda=cuda_ndarray.cuda_enabled) rng = MRG_RandomStreams()
with open('test', 'wb') as f: with open('test', 'wb') as f:
dump(rng, f) dump(rng, f)
......
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed. Skip test of theano op"
                   " with pycuda code.")
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional package cuda disabled')
import theano.tensor as T
from theano.misc.pycuda_example import (PycudaElemwiseSourceModuleOp,
PycudaElemwiseSourceModuleMakeThunkOp)
# Build the compilation modes used by the tests below.  FAST_COMPILE is
# upgraded to FAST_RUN first -- presumably because FAST_COMPILE skips
# the graph optimizations these tests rely on (verify if changing).
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode(
        'FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_pycuda_elemwise_source_module():
    """Compare the pycuda elemwise ops against the stock GpuElemwise.

    For each shape/scalar-op pair, four equivalent functions are
    compiled (reference GPU, direct pycuda op, optimizer-rewritten,
    and make_thunk variant); their graphs must contain the expected op
    and their outputs must agree numerically.
    """
    def uses_op(fn, op_class):
        # True when the compiled graph contains a node of op_class.
        return any(isinstance(node.op, op_class)
                   for node in fn.maker.fgraph.toposort())

    for shape in [(5, 5), (10, 49), (50, 49), (500, 501)]:
        for scalar_op in [theano.scalar.basic.mul, theano.scalar.basic.add]:
            x = T.fmatrix('x')
            y = T.fmatrix('y')
            elemwise_op = theano.tensor.Elemwise(scalar_op)
            pycuda_op = PycudaElemwiseSourceModuleOp(scalar_op)
            pycuda_op_thunk = PycudaElemwiseSourceModuleMakeThunkOp(scalar_op)

            f_ref = theano.function([x, y], elemwise_op(x, y),
                                    mode=mode_with_gpu)
            f_direct = theano.function(
                [x, y],
                theano.sandbox.cuda.host_from_gpu(pycuda_op(x, y)),
                mode=mode_with_gpu)
            mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise")
            f_opt = theano.function([x, y], elemwise_op(x, y),
                                    mode=mode_pycuda)
            f_thunk = theano.function(
                [x, y],
                theano.sandbox.cuda.host_from_gpu(pycuda_op_thunk(x, y)),
                mode=mode_with_gpu)

            assert uses_op(f_ref, theano.sandbox.cuda.GpuElemwise)
            assert uses_op(f_direct, PycudaElemwiseSourceModuleOp)
            assert uses_op(f_opt, PycudaElemwiseSourceModuleOp)
            assert uses_op(f_thunk, PycudaElemwiseSourceModuleMakeThunkOp)

            a = np.asarray(np.random.rand(*shape), dtype='float32')
            b = np.asarray(np.random.rand(*shape), dtype='float32')
            assert np.allclose(f_ref(a, b), f_direct(a, b))
            assert np.allclose(f_ref(a, b), f_opt(a, b))
            assert np.allclose(f_ref(a, b), f_thunk(a, b))
"""
#commented as it work only with old pycuda version.
def test_pycuda_elemwise_kernel():
x = T.fmatrix('x')
y = T.fmatrix('y')
f = theano.function([x, y], x + y, mode=mode_with_gpu)
print(f.maker.fgraph.toposort())
mode_pycuda = mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel")
f2 = theano.function([x, y], x + y, mode=mode_pycuda)
print(f2.maker.fgraph.toposort())
assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f2.maker.fgraph.toposort()])
val1 = np.asarray(np.random.rand(5, 5), dtype='float32')
val2 = np.asarray(np.random.rand(5, 5), dtype='float32')
#val1 = np.ones((5,5))
#val2 = np.arange(25).reshape(5,5)
assert (f(val1, val2) == f2(val1, val2)).all()
print(f(val1, val2))
print(f2(val1, val2))
x3 = T.ftensor3('x')
y3 = T.ftensor3('y')
z3 = T.ftensor3('y')
f4 = theano.function([x3, y3, z3], x3 * y3 + z3, mode=mode_pycuda)
print(f4.maker.fgraph.toposort())
assert any([isinstance(node.op, PycudaElemwiseKernelOp)
for node in f4.maker.fgraph.toposort()])
val1 = np.random.rand(2, 2, 2)
print(val1)
print(f4(val1, val1, val1))
assert np.allclose(f4(val1, val1, val1), val1 * val1 + val1)
"""
"""
This file is an example of view the memory allocated by pycuda in a GpuArray
in a CudaNdarray to be able to use it in Theano.
This also serve as a test for the function: cuda_ndarray.from_gpu_pointer
"""
from __future__ import absolute_import, print_function, division
import sys
import numpy as np
import theano
import theano.sandbox.cuda as cuda_ndarray
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed."
                   " We skip tests of Theano Ops with pycuda code.")
if cuda_ndarray.cuda_available is False: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional theano package cuda disabled')
import pycuda
import pycuda.driver as drv
import pycuda.gpuarray
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    dest = np.zeros_like(a)
    # Launch exactly one thread per element.  The previous hard-coded
    # block=(400, 1, 1) made threads 100..399 read and write past the
    # end of the 100-element arrays (out-of-bounds device access).
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(len(a), 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = np.random.randn(100).astype(np.float32)
    b = np.random.randn(100).astype(np.float32)
    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    # One thread per element.  The previous hard-coded block=(400, 1, 1)
    # ran 400 threads over 100-element arrays, accessing device memory
    # out of bounds.
    multiply_them(dest, ga, gb,
                  block=(len(a), 1, 1), grid=(1, 1))
    assert (np.asarray(dest) == a * b).all()
def test_pycuda_memory_to_theano():
    # Wrap memory allocated by pycuda (a GPUArray) in a CudaNdarray view
    # via cuda_ndarray.from_gpu_pointer, then check refcounts, strides
    # and in-place updates.  Statement order matters: the asserts track
    # sys.getrefcount(y) after each view creation/deletion.
    y = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    print(sys.getrefcount(y))
    # NOTE: y.get() increases the ref count with newer pycuda versions
    # (caching?), so it is left commented out.
    # print y.get()
    initial_refcount = sys.getrefcount(y)
    print("gpuarray ref count before creating a CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
    rand = np.random.randn(*y.shape).astype(np.float32)
    cuda_rand = cuda_ndarray.CudaNdarray(rand)
    # Expected C-contiguous strides (counted in elements) for y's shape.
    strides = [1]
    for i in y.shape[::-1][:-1]:
        strides.append(strides[-1] * i)
    strides = tuple(strides[::-1])
    print('strides', strides)
    assert cuda_rand._strides == strides, (cuda_rand._strides, strides)
    # in pycuda trunk, y.ptr also works, which is a little cleaner
    y_ptr = int(y.gpudata)
    z = cuda_ndarray.from_gpu_pointer(y_ptr, y.shape, strides, y)
    # The CudaNdarray view must keep the gpuarray alive: one extra ref.
    print("gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount + 1
    assert (np.asarray(z) == 0).all()
    assert z.base is y
    # Test that we can take a view from this cuda view on pycuda memory;
    # each view adds one more reference to y.
    zz = z.view()
    assert sys.getrefcount(y) == initial_refcount + 2
    assert zz.base is y
    del zz
    assert sys.getrefcount(y) == initial_refcount + 1
    # In-place updates through the view must hit the pycuda memory.
    cuda_ones = cuda_ndarray.CudaNdarray(np.asarray([[[1]]],
                                                    dtype='float32'))
    z += cuda_ones
    assert (np.asarray(z) == np.ones(y.shape)).all()
    assert (np.asarray(z) == 1).all()
    assert cuda_rand.shape == z.shape
    assert cuda_rand._strides == z._strides, (cuda_rand._strides, z._strides)
    assert (np.asarray(cuda_rand) == rand).all()
    z += cuda_rand
    assert (np.asarray(z) == (rand + 1)).all()
    # Check that the ref count to the gpuarray is right.
    del z
    print("gpuarray ref count after deleting the CudaNdarray", end=' ')
    print(sys.getrefcount(y))
    assert sys.getrefcount(y) == initial_refcount
from __future__ import absolute_import, print_function, division
import numpy as np
import theano.sandbox.cuda as cuda
import theano.misc.pycuda_init
# Skip this whole test module unless both PyCUDA and the Theano CUDA
# backend are usable.
if not theano.misc.pycuda_init.pycuda_available: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest("Pycuda not installed. Skip test of theano op with pycuda "
                   "code.")
if cuda.cuda_available is False: # noqa
    from nose.plugins.skip import SkipTest
    raise SkipTest('Optional theano package cuda disabled')
from theano.misc.pycuda_utils import to_gpuarray, to_cudandarray
import pycuda.gpuarray
def test_to_gpuarray():
    """Check to_gpuarray for contiguous and strided CudaNdarrays."""
    cnd = cuda.CudaNdarray.zeros((5, 4))
    gar = to_gpuarray(cnd)
    assert isinstance(gar, pycuda.gpuarray.GPUArray)

    # Contiguous case: both objects alias the same device memory, so a
    # write through one must be visible through the other.
    cnd[0, 0] = np.asarray(1, dtype="float32")
    assert gar.gpudata == cnd.gpudata
    assert np.asarray(cnd[0, 0]) == 1
    assert np.allclose(np.asarray(cnd), gar.get())
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    # CudaNdarray strides are in elements; GPUArray strides in bytes.
    assert all(np.asarray(cnd._strides) * 4 == gar.strides)

    # Strided case with copyif=True: a copy is made, so the two no
    # longer share memory and later writes diverge.
    cnd = cnd[::2, ::]
    gar = to_gpuarray(cnd, copyif=True)
    assert isinstance(gar, pycuda.gpuarray.GPUArray)
    cnd[0, 0] = np.asarray(2, dtype="float32")
    assert gar.gpudata != cnd.gpudata
    assert np.asarray(cnd[0, 0]) == 2
    assert not np.allclose(np.asarray(cnd), gar.get())
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    assert not all(np.asarray(cnd._strides) * 4 == gar.strides)

    # Without copyif, a non-contiguous input must be rejected.
    try:
        gar = to_gpuarray(cnd)
        assert False
    except ValueError:
        pass
def test_to_cudandarray():
    """Check to_cudandarray's round trip and its input validation."""
    gar = pycuda.gpuarray.zeros((3, 4, 5), 'float32')
    cnd = to_cudandarray(gar)
    assert isinstance(cnd, cuda.CudaNdarray)
    # Same contents, dtype and shape; strides match once converted from
    # elements (CudaNdarray) to bytes (GPUArray, float32 => *4).
    assert np.allclose(gar.get(),
                       np.asarray(cnd))
    assert gar.dtype == cnd.dtype
    assert gar.shape == cnd.shape
    assert all(np.asarray(cnd._strides) * 4 == gar.strides)

    # float64 input must be rejected: CudaNdarray is float32-only.
    try:
        gar = pycuda.gpuarray.zeros((3, 4, 5), 'float64')
        to_cudandarray(gar)
        assert False
    except ValueError:
        pass

    # Anything that is not a GPUArray must be rejected too.
    try:
        to_cudandarray(np.zeros(4))
        assert False
    except ValueError:
        pass
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论