提交 568bd526 authored 作者: lamblin's avatar lamblin

Merge pull request #560 from nouiz/crash_fix_gpu_reshape

Crash fix gpu reshape
import StringIO, sys import copy
import logging
import StringIO
import sys
import numpy import numpy
import theano import theano
...@@ -13,14 +17,14 @@ from theano.sandbox.cuda import filter as type_support_filter ...@@ -13,14 +17,14 @@ from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.elemwise import NaiveAlgo from theano.sandbox.cuda.elemwise import NaiveAlgo
import logging, copy
import cuda_ndarray import cuda_ndarray
_logger_name = 'theano.sandbox.cuda.basic_ops' _logger_name = 'theano.sandbox.cuda.basic_ops'
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.INFO) _logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler()) #TO REMOVE _logger.addHandler(logging.StreamHandler()) # TO REMOVE
def as_cuda_ndarray_variable(x): def as_cuda_ndarray_variable(x):
if hasattr(x, '_as_CudaNdarrayVariable'): if hasattr(x, '_as_CudaNdarrayVariable'):
...@@ -28,6 +32,7 @@ def as_cuda_ndarray_variable(x): ...@@ -28,6 +32,7 @@ def as_cuda_ndarray_variable(x):
tensor_x = tensor.as_tensor_variable(x) tensor_x = tensor.as_tensor_variable(x)
return gpu_from_host(tensor_x) return gpu_from_host(tensor_x)
def as_cuda_array(obj): def as_cuda_array(obj):
if isinstance(obj, numpy.ndarray): if isinstance(obj, numpy.ndarray):
return cuda_ndarray.cuda_ndarray.CudaNdarray(obj) return cuda_ndarray.cuda_ndarray.CudaNdarray(obj)
...@@ -43,18 +48,24 @@ class HostFromGpu(GpuOp): ...@@ -43,18 +48,24 @@ class HostFromGpu(GpuOp):
""" """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return 'HostFromGpu' return 'HostFromGpu'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType): if not isinstance(x.type, CudaNdarrayType):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], [tensor.TensorType(dtype=x.dtype, broadcastable=x.broadcastable)()]) return Apply(self, [x], [tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
z, = out z, = out
z[0] = numpy.asarray(x) z[0] = numpy.asarray(x)
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [gpu_from_host(gz)] return [gpu_from_host(gz)]
...@@ -65,28 +76,37 @@ class HostFromGpu(GpuOp): ...@@ -65,28 +76,37 @@ class HostFromGpu(GpuOp):
return [gpu_from_host(ev)] return [gpu_from_host(ev)]
else: else:
return [ev] return [ev]
def infer_shape(self, node, xshp): def infer_shape(self, node, xshp):
return xshp return xshp
host_from_gpu = HostFromGpu() host_from_gpu = HostFromGpu()
class GpuFromHost(GpuOp): class GpuFromHost(GpuOp):
""" """
Implement the transfer from cpu to the gpu. Implement the transfer from cpu to the gpu.
""" """
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return 'GpuFromHost' return 'GpuFromHost'
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, tensor.TensorType): if not isinstance(x.type, tensor.TensorType):
raise TypeError(x) raise TypeError(x)
return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable, dtype=x.dtype)()]) return Apply(self, [x], [CudaNdarrayType(broadcastable=x.broadcastable,
dtype=x.dtype)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
z, = out z, = out
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0]) z[0] = type_support_filter(theano._asarray(x, dtype='float32'),
tuple([0] * x.ndim), 0, z[0])
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [host_from_gpu(gz)] return [host_from_gpu(gz)]
...@@ -110,7 +130,7 @@ class GpuElemwise(GpuOp): ...@@ -110,7 +130,7 @@ class GpuElemwise(GpuOp):
nin = property(lambda self: self.scalar_op.nin) nin = property(lambda self: self.scalar_op.nin)
nout = property(lambda self: self.scalar_op.nout) nout = property(lambda self: self.scalar_op.nout)
def __init__(self, scalar_op, inplace_pattern = {}, sync=None): def __init__(self, scalar_op, inplace_pattern={}, sync=None):
#TODO-- this looks like a bug-- either we should use the sync argument #TODO-- this looks like a bug-- either we should use the sync argument
# or get rid of it, we shouldn't let the client think they can control # or get rid of it, we shouldn't let the client think they can control
#sync when they can't #sync when they can't
...@@ -126,7 +146,7 @@ class GpuElemwise(GpuOp): ...@@ -126,7 +146,7 @@ class GpuElemwise(GpuOp):
self._rehash() self._rehash()
self.src_generator = NaiveAlgo(self.scalar_op, sync=sync, self.src_generator = NaiveAlgo(self.scalar_op, sync=sync,
inplace_pattern = self.inplace_pattern) inplace_pattern=self.inplace_pattern)
def __getstate__(self): def __getstate__(self):
d = copy.copy(self.__dict__) d = copy.copy(self.__dict__)
...@@ -136,26 +156,30 @@ class GpuElemwise(GpuOp): ...@@ -136,26 +156,30 @@ class GpuElemwise(GpuOp):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
self.sync = d.get('sync', True) #old objects defaulted to sync behaviour #old objects defaulted to sync behaviour
self.sync = d.get('sync', True)
self._rehash() self._rehash()
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and (self.scalar_op == other.scalar_op) \ return (type(self) == type(other) and
and self.inplace_pattern == other.inplace_pattern \ self.scalar_op == other.scalar_op and
and self.sync == other.sync self.inplace_pattern == other.inplace_pattern and
self.sync == other.sync)
def _rehash(self): def _rehash(self):
items = self.inplace_pattern.items() items = self.inplace_pattern.items()
items.sort() items.sort()
tuple_items=[k for k,v in items] tuple_items = [k for k, v in items]
for k,v in items: for k, v in items:
if isinstance(v, (tuple, list)): if isinstance(v, (tuple, list)):
tuple_items+=[tuple(v)] tuple_items += [tuple(v)]
else: tuple_items+=[v] else:
tuple_items += [v]
tuple_items = tuple(tuple_items) tuple_items = tuple(tuple_items)
h = hash(type(self)) ^ hash(self.scalar_op) ^ hash(tuple_items) ^ hash(self.sync) h = (hash(type(self)) ^ hash(self.scalar_op) ^
hash(tuple_items) ^ hash(self.sync))
# don't change a code that has already been computed for this object # don't change a code that has already been computed for this object
assert h == getattr(self,'_hashval', h) assert h == getattr(self, '_hashval', h)
self._hashval = h self._hashval = h
def __hash__(self): def __hash__(self):
...@@ -181,7 +205,8 @@ class GpuElemwise(GpuOp): ...@@ -181,7 +205,8 @@ class GpuElemwise(GpuOp):
if i.type.ndim != inputs[0].type.ndim: if i.type.ndim != inputs[0].type.ndim:
raise TypeError('different ranks among inputs') raise TypeError('different ranks among inputs')
# output is broadcastable only along dimensions where all inputs are broadcastable # output is broadcastable only along dimensions where all
# inputs are broadcastable
broadcastable = [] broadcastable = []
for d in xrange(_inputs[0].type.ndim): for d in xrange(_inputs[0].type.ndim):
bcast_d = True bcast_d = True
...@@ -213,6 +238,7 @@ class GpuElemwise(GpuOp): ...@@ -213,6 +238,7 @@ class GpuElemwise(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
return self.src_generator.cache_version return self.src_generator.cache_version
class GpuDimShuffle(GpuOp): class GpuDimShuffle(GpuOp):
""" """
Implement DimShuffle on the gpu. Implement DimShuffle on the gpu.
...@@ -225,26 +251,32 @@ class GpuDimShuffle(GpuOp): ...@@ -225,26 +251,32 @@ class GpuDimShuffle(GpuOp):
# list of dimensions of the input to drop # list of dimensions of the input to drop
self.drop = [] self.drop = []
i2j = {} # this maps i before dropping dimensions to j after dropping dimensions so self.shuffle can be set properly later on # this maps i before dropping dimensions to j after dropping
# dimensions so self.shuffle can be set properly later on
i2j = {}
j = 0 j = 0
for i, b in enumerate(input_broadcastable): for i, b in enumerate(input_broadcastable):
if i not in new_order: if i not in new_order:
# we want to drop this dimension because it's not a value in new_order # we want to drop this dimension because it's not a
# value in new_order
if b == 1: # 1 aka True if b == 1: # 1 aka True
self.drop.append(i) self.drop.append(i)
else: else:
# we cannot drop non-broadcastable dimensions # we cannot drop non-broadcastable dimensions
raise ValueError("You cannot drop a non-broadcastable dimension.", (input_broadcastable, new_order)) raise ValueError("You cannot drop a non-broadcastable"
" dimension.",
(input_broadcastable, new_order))
else: else:
i2j[i] = j i2j[i] = j
j += 1 j += 1
# transposition of non-broadcastable dimensions # transposition of non-broadcastable dimensions This is how
# This is how the dimensions will be permuted, without accounting for the extra # the dimensions will be permuted, without accounting for the
# 'x' broadcastable dimensions to insert. # extra 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x'] self.shuffle = [i2j[x] for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not in the original input # list of dimensions of the output that are broadcastable and
# were not in the original input
self.augment = [i for i, x in enumerate(new_order) if x == 'x'] self.augment = [i for i, x in enumerate(new_order) if x == 'x']
self.view_map = {0: [0]} self.view_map = {0: [0]}
...@@ -255,6 +287,7 @@ class GpuDimShuffle(GpuOp): ...@@ -255,6 +287,7 @@ class GpuDimShuffle(GpuOp):
d = dict(self.__dict__) d = dict(self.__dict__)
del d['_hashval'] del d['_hashval']
return d return d
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
self._rehash() self._rehash()
...@@ -262,10 +295,14 @@ class GpuDimShuffle(GpuOp): ...@@ -262,10 +295,14 @@ class GpuDimShuffle(GpuOp):
def make_node(self, input): def make_node(self, input):
ib = tuple(input.type.broadcastable) ib = tuple(input.type.broadcastable)
if not ib == self.input_broadcastable: if not ib == self.input_broadcastable:
raise TypeError("The number of dimensions and/or broadcastable pattern of the input is incorrect for this op. Expected %s, got %s." % (self.input_broadcastable, ib)) raise TypeError(
"The number of dimensions and/or broadcastable pattern of the"
" input is incorrect for this op. Expected %s, got %s." %
(self.input_broadcastable, ib))
ob = [] ob = []
if not isinstance(input.type, CudaNdarrayType): if not isinstance(input.type, CudaNdarrayType):
raise TypeError("The input of a GpuDimshuffle must be a CudaNdarray") raise TypeError("The input of a GpuDimshuffle must"
" be a CudaNdarray")
for value in self.new_order: for value in self.new_order:
if value == 'x': if value == 'x':
ob.append(True) ob.append(True)
...@@ -280,8 +317,10 @@ class GpuDimShuffle(GpuOp): ...@@ -280,8 +317,10 @@ class GpuDimShuffle(GpuOp):
and self.input_broadcastable == other.input_broadcastable and self.input_broadcastable == other.input_broadcastable
def _rehash(self): def _rehash(self):
self._hashval = hash(type(self).__name__) ^ hash(type(self).__module__) \ self._hashval = (hash(type(self).__name__) ^
^ hash(self.new_order) ^ hash(self.input_broadcastable) hash(type(self).__module__) ^
hash(self.new_order) ^
hash(self.input_broadcastable))
def __hash__(self): def __hash__(self):
return self._hashval return self._hashval
...@@ -303,10 +342,11 @@ class GpuDimShuffle(GpuOp): ...@@ -303,10 +342,11 @@ class GpuDimShuffle(GpuOp):
print >> sio, """ print >> sio, """
if (%(input)s->nd != %(nd_in)s) if (%(input)s->nd != %(nd_in)s)
{ {
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(input)s->nd); PyErr_Format(PyExc_TypeError,
"required nd=%(nd_in)s, got nd=%%i", %(input)s->nd);
%(fail)s; %(fail)s;
} }
""" %locals() """ % locals()
#alloc an output #alloc an output
print >> sio, """ print >> sio, """
...@@ -334,17 +374,19 @@ class GpuDimShuffle(GpuOp): ...@@ -334,17 +374,19 @@ class GpuDimShuffle(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
if (CudaNdarray_set_device_data(%(res)s, CudaNdarray_DEV_DATA(%(input)s), %(input)s)) if (CudaNdarray_set_device_data(%(res)s,
CudaNdarray_DEV_DATA(%(input)s),
%(input)s))
{ {
// err message set // err message set
Py_DECREF(%(res)s); Py_DECREF(%(res)s);
%(res)s = NULL; %(res)s = NULL;
%(fail)s; %(fail)s;
} }
""" %locals() """ % locals()
#reassign the dimension and strides in the host pointers #reassign the dimension and strides in the host pointers
for i, o in enumerate(self.new_order): for i, o in enumerate(self.new_order):
...@@ -356,17 +398,19 @@ class GpuDimShuffle(GpuOp): ...@@ -356,17 +398,19 @@ class GpuDimShuffle(GpuOp):
print >> sio, """ print >> sio, """
CudaNdarray_set_dim(%(res)s, %(i)s, 1); CudaNdarray_set_dim(%(res)s, %(i)s, 1);
CudaNdarray_set_stride(%(res)s, %(i)s, 0); CudaNdarray_set_stride(%(res)s, %(i)s, 0);
""" %locals() """ % locals()
else: else:
print >> sio, """ print >> sio, """
CudaNdarray_set_dim(%(res)s, %(i)s, CudaNdarray_HOST_DIMS(%(input)s)[%(o)s]); CudaNdarray_set_dim(%(res)s, %(i)s,
CudaNdarray_set_stride(%(res)s, %(i)s, CudaNdarray_HOST_STRIDES(%(input)s)[%(o)s]); CudaNdarray_HOST_DIMS(%(input)s)[%(o)s]);
""" %locals() CudaNdarray_set_stride(%(res)s, %(i)s,
CudaNdarray_HOST_STRIDES(%(input)s)[%(o)s]);
""" % locals()
for i, o in enumerate(self.new_order): for i, o in enumerate(self.new_order):
print >> sio, """ print >> sio, """
//std::cerr << "GpuDimShuffle " << %(res)s << " str[%(i)s] = " << %(res)s->str[%(i)s] << "\\n"; //std::cerr << "GpuDimShuffle " << %(res)s << " str[%(i)s] = " << %(res)s->str[%(i)s] << "\\n";
""" %locals() """ % locals()
# copy the host dims and stride -> device # copy the host dims and stride -> device
if 0: if 0:
...@@ -378,7 +422,7 @@ class GpuDimShuffle(GpuOp): ...@@ -378,7 +422,7 @@ class GpuDimShuffle(GpuOp):
%(res)s = NULL; %(res)s = NULL;
%(fail)s; %(fail)s;
} }
""" %locals() """ % locals()
if 0: # print full code to stdout if 0: # print full code to stdout
print '--------------------------------------' print '--------------------------------------'
...@@ -400,14 +444,16 @@ class GpuDimShuffle(GpuOp): ...@@ -400,14 +444,16 @@ class GpuDimShuffle(GpuOp):
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,0) return (1, 0)
class GpuSum(GpuOp): class GpuSum(GpuOp):
"""GpuSum is a Reduction along some dimensions by summation. """GpuSum is a Reduction along some dimensions by summation.
The dimensions along which to sum is specified by the `reduce_mask` that you pass to the The dimensions along which to sum is specified by the
constructor. The `reduce_mask` is a tuple of booleans (actually integers 0 or 1) that `reduce_mask` that you pass to the constructor. The `reduce_mask`
specify for each input dimension, whether to reduce it (1) or not (0). is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example: For example:
...@@ -419,15 +465,16 @@ class GpuSum(GpuOp): ...@@ -419,15 +465,16 @@ class GpuSum(GpuOp):
- reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor. - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
:note: any reduce_mask of all zeros is a sort of 'copy', and may be removed during graph :note: any reduce_mask of all zeros is a sort of 'copy', and may
optimization be removed during graph optimization
""" """
def __init__(self, reduce_mask): def __init__(self, reduce_mask):
self.reduce_mask = tuple(reduce_mask) self.reduce_mask = tuple(reduce_mask)
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.reduce_mask == other.reduce_mask return (type(self) == type(other) and
self.reduce_mask == other.reduce_mask)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.reduce_mask) return hash(type(self)) ^ hash(self.reduce_mask)
...@@ -437,8 +484,9 @@ class GpuSum(GpuOp): ...@@ -437,8 +484,9 @@ class GpuSum(GpuOp):
def make_node(self, x): def make_node(self, x):
if (x.type.ndim != len(self.reduce_mask)): if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i"%len(self.reduce_mask)) raise TypeError("x must have rank %i" % len(self.reduce_mask))
o_broadcast = [x.type.broadcastable[i] for i in xrange(x.type.ndim) if not self.reduce_mask[i]] o_broadcast = [x.type.broadcastable[i] for i
in xrange(x.type.ndim) if not self.reduce_mask[i]]
return Apply(self, [x], [CudaNdarrayType(o_broadcast)()]) return Apply(self, [x], [CudaNdarrayType(o_broadcast)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
...@@ -462,10 +510,11 @@ class GpuSum(GpuOp): ...@@ -462,10 +510,11 @@ class GpuSum(GpuOp):
print >> sio, """ print >> sio, """
if (%(x)s->nd != %(nd_in)s) if (%(x)s->nd != %(nd_in)s)
{ {
PyErr_Format(PyExc_TypeError, "required nd=%(nd_in)s, got nd=%%i", %(x)s->nd); PyErr_Format(PyExc_TypeError,
"required nd=%(nd_in)s, got nd=%%i", %(x)s->nd);
%(fail)s; %(fail)s;
} }
""" %locals() """ % locals()
# #
# alloc an output if we need one # alloc an output if we need one
...@@ -487,7 +536,7 @@ class GpuSum(GpuOp): ...@@ -487,7 +536,7 @@ class GpuSum(GpuOp):
print >> sio, """ print >> sio, """
) )
{ {
""" %locals() """ % locals()
if nd_out > 0: if nd_out > 0:
print >> sio, "int new_dims[%(nd_out)s]; " % locals() print >> sio, "int new_dims[%(nd_out)s]; " % locals()
else: else:
...@@ -508,9 +557,10 @@ class GpuSum(GpuOp): ...@@ -508,9 +557,10 @@ class GpuSum(GpuOp):
%(fail)s; %(fail)s;
} }
} }
""" %locals() """ % locals()
# \begin bracket the reduction in a check that there is actually work to do # \begin bracket the reduction in a check that there is
# actually work to do
print >> sio, """ print >> sio, """
if (CudaNdarray_SIZE(%(z)s)) if (CudaNdarray_SIZE(%(z)s))
{ {
...@@ -520,7 +570,7 @@ class GpuSum(GpuOp): ...@@ -520,7 +570,7 @@ class GpuSum(GpuOp):
# Now perform the reduction # Now perform the reduction
# #
if all(i==1 for i in self.reduce_mask): if all(i == 1 for i in self.reduce_mask):
#check if the tensor is ccontiguous, if true, use the c_c0de_reduce_ccontig code. #check if the tensor is ccontiguous, if true, use the c_c0de_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle #TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims. #TODO: if only some dims are ccontiguous, call version with less dims.
...@@ -532,7 +582,6 @@ class GpuSum(GpuOp): ...@@ -532,7 +582,6 @@ class GpuSum(GpuOp):
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) getattr(self, 'c_code_reduce_%s'%(''.join(str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail)
# \end bracket the reduction ... # \end bracket the reduction ...
print >> sio, """ print >> sio, """
} }
...@@ -547,9 +596,11 @@ class GpuSum(GpuOp): ...@@ -547,9 +596,11 @@ class GpuSum(GpuOp):
.. code-block:: c .. code-block:: c
if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n"); if (verbose)
printf("running kernel_reduce_sum_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x; int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
...@@ -571,40 +622,43 @@ class GpuSum(GpuOp): ...@@ -571,40 +622,43 @@ class GpuSum(GpuOp):
ndim = len(self.reduce_mask) ndim = len(self.reduce_mask)
nd_out = ndim - sum(self.reduce_mask) nd_out = ndim - sum(self.reduce_mask)
print >> sio, """ print >> sio, """
if (verbose) printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n"); if (verbose)
printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z; int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1) printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d, nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d, nb_block=%%d, n_shared=%%d\\n", if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d, nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d, nb_block=%%d, n_shared=%%d\\n",
n_threads.x,n_threads.y,n_threads.z, n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z, n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y, n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared); n_blocks.x*n_blocks.y, n_shared);
kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>( kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" %locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
CudaNdarray_HOST_DIMS(%(x)s)[%(i)s], CudaNdarray_HOST_DIMS(%(x)s)[%(i)s],
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
CudaNdarray_DEV_DATA(%(x)s) CudaNdarray_DEV_DATA(%(x)s)
""" %locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
,CudaNdarray_DEV_DATA(%(z)s) ,CudaNdarray_DEV_DATA(%(z)s)
""" %locals() """ % locals()
for i in xrange(nd_out): for i in xrange(nd_out):
print >> sio, """ print >> sio, """
,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s] ,CudaNdarray_HOST_STRIDES(%(z)s)[%(i)s]
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
); );
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts) if (cudaSuccess != sts)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s", "kernel_reduce_sum_%(pattern)s_%(name)s",
cudaGetErrorString(sts), cudaGetErrorString(sts),
n_blocks.x, n_blocks.x,
...@@ -614,10 +668,11 @@ class GpuSum(GpuOp): ...@@ -614,10 +668,11 @@ class GpuSum(GpuOp):
n_threads.z); n_threads.z);
%(fail)s; %(fail)s;
} }
""" %locals() """ % locals()
return sio.getvalue() return sio.getvalue()
def _k_decl(self, node, nodename, pattern = None, ndim = None, reduce_mask = None): def _k_decl(self, node, nodename, pattern=None,
ndim=None, reduce_mask=None):
"""Return a string to declare a kernel function """Return a string to declare a kernel function
.. code-block:: c .. code-block:: c
...@@ -633,7 +688,7 @@ class GpuSum(GpuOp): ...@@ -633,7 +688,7 @@ class GpuSum(GpuOp):
float * Z, float * Z,
const int sZ0) const int sZ0)
""" %locals() """ % locals()
if reduce_mask is None: if reduce_mask is None:
reduce_mask = self.reduce_mask reduce_mask = self.reduce_mask
if ndim is None: if ndim is None:
...@@ -644,25 +699,25 @@ class GpuSum(GpuOp): ...@@ -644,25 +699,25 @@ class GpuSum(GpuOp):
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s( static __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s(
""" %locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
const int d%(i)s, const int d%(i)s,
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
const float *A, const float *A,
""" %locals() """ % locals()
for i in xrange(ndim): for i in xrange(ndim):
print >> sio, """ print >> sio, """
const int sA%(i)s, const int sA%(i)s,
""" %locals() """ % locals()
print >> sio, """ print >> sio, """
float * Z float * Z
""" %locals() """ % locals()
for i in xrange(ndim - sum(reduce_mask)): for i in xrange(ndim - sum(reduce_mask)):
print >> sio, """ print >> sio, """
, const int sZ%(i)s , const int sZ%(i)s
""" %locals() """ % locals()
print >> sio, ")" print >> sio, ")"
return sio.getvalue() return sio.getvalue()
...@@ -688,6 +743,49 @@ class GpuSum(GpuOp): ...@@ -688,6 +743,49 @@ class GpuSum(GpuOp):
buf[threadNum] = mysum; buf[threadNum] = mysum;
__syncthreads(); __syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;
buf[idx] += buf[threadNum];
// buf[0] = 998;
} else {
// buf[threadNum] = 0;-999;
}
__syncthreads();
//Work for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
buf[threadNum] += temp;
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)s = buf[0];
}
__syncthreads();
""" % locals()
return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp // rest of function is handled by one warp
if (threadNum < warpSize) if (threadNum < warpSize)
{ {
...@@ -728,7 +826,7 @@ class GpuSum(GpuOp): ...@@ -728,7 +826,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize #nb_reduce<=warpSize
...@@ -748,7 +846,7 @@ class GpuSum(GpuOp): ...@@ -748,7 +846,7 @@ class GpuSum(GpuOp):
} }
%(z_pos)s = mysum; %(z_pos)s = mysum;
} }
""" %locals() """ % locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail): def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
print >> sio, """ print >> sio, """
...@@ -784,7 +882,7 @@ class GpuSum(GpuOp): ...@@ -784,7 +882,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
def c_code_reduce_1(self, sio, node, name, x, z, fail): def c_code_reduce_1(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -797,7 +895,7 @@ class GpuSum(GpuOp): ...@@ -797,7 +895,7 @@ class GpuSum(GpuOp):
dim3 n_blocks(1); dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_reduce_11(self, sio, node, name, x, z, fail): def c_code_reduce_11(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -815,7 +913,7 @@ class GpuSum(GpuOp): ...@@ -815,7 +913,7 @@ class GpuSum(GpuOp):
dim3 n_blocks(1); dim3 n_blocks(1);
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
""" """
...@@ -825,8 +923,8 @@ class GpuSum(GpuOp): ...@@ -825,8 +923,8 @@ class GpuSum(GpuOp):
assert N in [1,2,3] assert N in [1,2,3]
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
N_pattern = ''.join(['1']*N) N_pattern = ''.join(['1']*N)
param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]"%locals() for i in xrange(N+1)]) param_dim = ",".join(["CudaNdarray_HOST_DIMS(%(x)s)[%(i)s]" % locals() for i in xrange(N+1)])
strides_dim = ",".join(["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]"%locals() for i in xrange(N+1)]) strides_dim = ",".join(["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]" % locals() for i in xrange(N+1)])
threads_y = """ threads_y = """
//get as many y threads as we can fit //get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
...@@ -836,7 +934,7 @@ class GpuSum(GpuOp): ...@@ -836,7 +934,7 @@ class GpuSum(GpuOp):
else else
break; break;
} }
"""%locals() """ % locals()
threads_z = """ threads_z = """
//get as many z threads as we can fit //get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK) while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
...@@ -846,7 +944,7 @@ class GpuSum(GpuOp): ...@@ -846,7 +944,7 @@ class GpuSum(GpuOp):
else else
break; break;
} }
"""%locals() """ % locals()
if len(self.reduce_mask)==2: if len(self.reduce_mask)==2:
threads_y = '' threads_y = ''
threads_z = '' threads_z = ''
...@@ -863,7 +961,7 @@ class GpuSum(GpuOp): ...@@ -863,7 +961,7 @@ class GpuSum(GpuOp):
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS)); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],NUM_VECTOR_OP_BLOCKS));
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_reduce_01(self, sio, node, name, x, z, fail): def c_code_reduce_01(self, sio, node, name, x, z, fail):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 1) self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)
...@@ -917,7 +1015,7 @@ class GpuSum(GpuOp): ...@@ -917,7 +1015,7 @@ class GpuSum(GpuOp):
%(fail)s; %(fail)s;
} }
} }
""" %locals() """ % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail): def c_code_reduce_010(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail, pattern="010_inner") makecall_inner = self._makecall(node, name, x, z, fail, pattern="010_inner")
...@@ -1035,7 +1133,7 @@ class GpuSum(GpuOp): ...@@ -1035,7 +1133,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail): def c_code_reduce_0101(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -1054,7 +1152,7 @@ class GpuSum(GpuOp): ...@@ -1054,7 +1152,7 @@ class GpuSum(GpuOp):
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[2]);
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_reduce_100(self, sio, node, name, x, z, fail): def c_code_reduce_100(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -1074,7 +1172,7 @@ class GpuSum(GpuOp): ...@@ -1074,7 +1172,7 @@ class GpuSum(GpuOp):
} }
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_reduce_110(self, sio, node, name, x, z, fail): def c_code_reduce_110(self, sio, node, name, x, z, fail):
makecall = self._makecall(node, name, x, z, fail) makecall = self._makecall(node, name, x, z, fail)
...@@ -1233,9 +1331,10 @@ class GpuSum(GpuOp): ...@@ -1233,9 +1331,10 @@ class GpuSum(GpuOp):
dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]); dim3 n_blocks(CudaNdarray_HOST_DIMS(%(x)s)[1]);
%(makecall)s %(makecall)s
} }
""" %locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return
return (22,) return (22,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1267,7 +1366,7 @@ class GpuSum(GpuOp): ...@@ -1267,7 +1366,7 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -1295,7 +1394,7 @@ class GpuSum(GpuOp): ...@@ -1295,7 +1394,7 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,1): if self.reduce_mask == (1,1):
#this kernel is ok for up to a few thousand elements, but #this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor # it only runs on ONE multiprocessor
...@@ -1327,7 +1426,7 @@ class GpuSum(GpuOp): ...@@ -1327,7 +1426,7 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
#01, 011, 0111 #01, 011, 0111
if 0 == self.reduce_mask[0] and all(self.reduce_mask[1:]) and nd_in in[2,3,4]: if 0 == self.reduce_mask[0] and all(self.reduce_mask[1:]) and nd_in in[2,3,4]:
# this kernel uses one block for each row. # this kernel uses one block for each row.
...@@ -1348,8 +1447,8 @@ class GpuSum(GpuOp): ...@@ -1348,8 +1447,8 @@ class GpuSum(GpuOp):
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)" for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]') reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
param_dim = ",".join(["const int d%(i)s"%locals() for i in xrange(nd_in)]) param_dim = ",".join(["const int d%(i)s" % locals() for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%(i)s"%locals() for i in xrange(nd_in)]) param_strides = ",".join(["const int sA%(i)s" % locals() for i in xrange(nd_in)])
decl = self._k_decl(node,nodename) decl = self._k_decl(node,nodename)
init = self._k_init(node,nodename) init = self._k_init(node,nodename)
print >> sio, """ print >> sio, """
...@@ -1368,7 +1467,7 @@ class GpuSum(GpuOp): ...@@ -1368,7 +1467,7 @@ class GpuSum(GpuOp):
%(reducebuf)s %(reducebuf)s
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,1,0) or self.reduce_mask == (1,0): if self.reduce_mask == (0,1,0) or self.reduce_mask == (1,0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -1409,7 +1508,7 @@ class GpuSum(GpuOp): ...@@ -1409,7 +1508,7 @@ class GpuSum(GpuOp):
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,1,0): if self.reduce_mask == (0,1,0):
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_010_AD_%(nodename)s( static __global__ void kernel_reduce_sum_010_AD_%(nodename)s(
...@@ -1448,7 +1547,7 @@ class GpuSum(GpuOp): ...@@ -1448,7 +1547,7 @@ class GpuSum(GpuOp):
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,1,0): if self.reduce_mask == (0,1,0):
# #
# This kernel is optimized when the inner most dimensions have the smallest stride. # This kernel is optimized when the inner most dimensions have the smallest stride.
...@@ -1486,7 +1585,7 @@ class GpuSum(GpuOp): ...@@ -1486,7 +1585,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,1,0): if self.reduce_mask == (1,1,0):
# this kernel uses one block for each column, # this kernel uses one block for each column,
# threads per block for each element per column. # threads per block for each element per column.
...@@ -1526,7 +1625,7 @@ class GpuSum(GpuOp): ...@@ -1526,7 +1625,7 @@ class GpuSum(GpuOp):
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,0,0): if self.reduce_mask == (1,0,0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]') reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
...@@ -1548,7 +1647,7 @@ class GpuSum(GpuOp): ...@@ -1548,7 +1647,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,1,1): if self.reduce_mask == (1,1,1):
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
...@@ -1570,7 +1669,7 @@ class GpuSum(GpuOp): ...@@ -1570,7 +1669,7 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,0,1): if self.reduce_mask == (0,0,1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
...@@ -1605,7 +1704,7 @@ class GpuSum(GpuOp): ...@@ -1605,7 +1704,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,0,1,1): if self.reduce_mask == (0,0,1,1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
...@@ -1633,7 +1732,7 @@ class GpuSum(GpuOp): ...@@ -1633,7 +1732,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (0,1,0,1): if self.reduce_mask == (0,1,0,1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
...@@ -1661,7 +1760,7 @@ class GpuSum(GpuOp): ...@@ -1661,7 +1760,7 @@ class GpuSum(GpuOp):
} }
} }
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,1,1,1): if self.reduce_mask == (1,1,1,1):
reducebuf = self._k_reduce_buf('Z[0]') reducebuf = self._k_reduce_buf('Z[0]')
decl = self._k_decl(node, nodename) decl = self._k_decl(node, nodename)
...@@ -1684,7 +1783,7 @@ class GpuSum(GpuOp): ...@@ -1684,7 +1783,7 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
if self.reduce_mask == (1,0,1,1): if self.reduce_mask == (1,0,1,1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]') reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
print >> sio, """ print >> sio, """
...@@ -1719,25 +1818,30 @@ class GpuSum(GpuOp): ...@@ -1719,25 +1818,30 @@ class GpuSum(GpuOp):
} }
%(reducebuf)s %(reducebuf)s
} }
""" %locals() """ % locals()
return sio.getvalue() return sio.getvalue()
class GpuReshape(tensor.Reshape, GpuOp): class GpuReshape(tensor.Reshape, GpuOp):
""" """
Implement Reshape on the gpu. Implement Reshape on the gpu.
""" """
# __hash__, __eq__, __str__ come from tensor.Subtensor # __hash__, __eq__, __str__ come from tensor.Subtensor
def make_node(self, x, shp): def make_node(self, x, shp):
host_reshaped = host_from_gpu(x).reshape(shp,ndim=self.ndim) host_reshaped = host_from_gpu(x).reshape(shp, ndim=self.ndim)
return Apply(self, [x, shp], [CudaNdarrayType(host_reshaped.broadcastable)()]) return Apply(self, [x, shp],
[CudaNdarrayType(host_reshaped.broadcastable)()])
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
x, shp = inp x, shp = inp
out, = out_ out, = out_
if (len(shp) != self.ndim): if (len(shp) != self.ndim):
raise ValueError('shape argument to Reshape.perform has incorrect length %i' raise ValueError('shape argument to Reshape.perform'
' has incorrect length %i'
', should be %i' % (len(shp), self.ndim), shp) ', should be %i' % (len(shp), self.ndim), shp)
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuSubtensor(tensor.Subtensor, GpuOp): class GpuSubtensor(tensor.Subtensor, GpuOp):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
...@@ -1779,6 +1883,7 @@ class GpuSubtensor(tensor.Subtensor, GpuOp): ...@@ -1779,6 +1883,7 @@ class GpuSubtensor(tensor.Subtensor, GpuOp):
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata) out[0] = x.__getitem__(cdata)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
...@@ -1800,11 +1905,13 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -1800,11 +1905,13 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_) #super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp x, idx = inp
out, = out_ out, = out_
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),)+x.shape[1:]) o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),) +
for (j,i) in enumerate(idx): x.shape[1:])
for (j, i) in enumerate(idx):
o[j] = x[i] o[j] = x[i]
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
...@@ -1816,6 +1923,10 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1816,6 +1923,10 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim == y_.type.ndim assert x_.type.ndim == y_.type.ndim
# if (x_.type.ndim - 1) > y_.type.ndim:
# y_ = tensor.shape_padleft(y_, x_.type.ndim - y_.type.ndim)
# assert x_.type.ndim == y_.type.ndim
assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'): if ilist_.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers') raise TypeError('index must be integers')
...@@ -1833,6 +1944,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -1833,6 +1944,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
# CudaNdarray_Subscript() don't support Advanced slicing. # CudaNdarray_Subscript() don't support Advanced slicing.
# so we use the parent version that loop on each indices. # so we use the parent version that loop on each indices.
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
...@@ -1843,6 +1955,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -1843,6 +1955,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs) rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
return Apply(self, [x,y]+rval.inputs[2:], [x.type()]) return Apply(self, [x,y]+rval.inputs[2:], [x.type()])
class GpuFlatten(tensor.Flatten, GpuOp): class GpuFlatten(tensor.Flatten, GpuOp):
""" """
Implement Flatten on the gpu. Implement Flatten on the gpu.
...@@ -1854,6 +1967,7 @@ class GpuFlatten(tensor.Flatten, GpuOp): ...@@ -1854,6 +1967,7 @@ class GpuFlatten(tensor.Flatten, GpuOp):
out_type = CudaNdarrayType(broadcastable=host_out_broadcastable) out_type = CudaNdarrayType(broadcastable=host_out_broadcastable)
return Apply(self, [x], [out_type()]) return Apply(self, [x], [out_type()])
class GpuShape(tensor.Shape, GpuOp): class GpuShape(tensor.Shape, GpuOp):
""" """
Implement Shape on the gpu. Implement Shape on the gpu.
...@@ -1862,6 +1976,7 @@ class GpuShape(tensor.Shape, GpuOp): ...@@ -1862,6 +1976,7 @@ class GpuShape(tensor.Shape, GpuOp):
return Apply(self, [x], [tensor.lvector()]) return Apply(self, [x], [tensor.lvector()])
gpu_shape = GpuShape() gpu_shape = GpuShape()
class GpuJoin(tensor.Join, GpuOp): class GpuJoin(tensor.Join, GpuOp):
""" """
Implement Join on the gpu. Implement Join on the gpu.
...@@ -1939,6 +2054,7 @@ class GpuJoin(tensor.Join, GpuOp): ...@@ -1939,6 +2054,7 @@ class GpuJoin(tensor.Join, GpuOp):
gpu_join = GpuJoin() gpu_join = GpuJoin()
class GpuAlloc(GpuOp): class GpuAlloc(GpuOp):
""" """
Implement Alloc on the gpu. Implement Alloc on the gpu.
...@@ -1990,13 +2106,13 @@ class GpuAlloc(GpuOp): ...@@ -1990,13 +2106,13 @@ class GpuAlloc(GpuOp):
value = inputs[0] value = inputs[0]
shps = inputs[1:] shps = inputs[1:]
nd = len(shps) nd = len(shps)
str = "int dims[%(nd)s];\n"%locals() str = "int dims[%(nd)s];\n" % locals()
for idx,sh in enumerate(shps): for idx,sh in enumerate(shps):
str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n"%locals() str += "dims[%(idx)s] = PyInt_AsLong((PyObject*)%(sh)s);\n" % locals()
str += "if(%(out)s==NULL\n"%locals() str += "if(%(out)s==NULL\n" % locals()
for idx,sh in enumerate(shps): for idx,sh in enumerate(shps):
str += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]"%locals() str += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
str+="""){ str+="""){
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s= (CudaNdarray*)CudaNdarray_New(); %(out)s= (CudaNdarray*)CudaNdarray_New();
...@@ -2006,7 +2122,7 @@ class GpuAlloc(GpuOp): ...@@ -2006,7 +2122,7 @@ class GpuAlloc(GpuOp):
{ {
%(fail)s; %(fail)s;
} }
"""%locals() """ % locals()
return str return str
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
...@@ -2047,8 +2163,10 @@ class GpuContiguous(GpuOp): ...@@ -2047,8 +2163,10 @@ class GpuContiguous(GpuOp):
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
...@@ -2067,9 +2185,9 @@ class GpuContiguous(GpuOp): ...@@ -2067,9 +2185,9 @@ class GpuContiguous(GpuOp):
%(z)s = %(input)s; %(z)s = %(input)s;
Py_INCREF(%(z)s); Py_INCREF(%(z)s);
} else if ((NULL == %(z)s)"""%locals() } else if ((NULL == %(z)s)""" % locals()
for i in xrange(len(node.inputs[0].type.broadcastable)): for i in xrange(len(node.inputs[0].type.broadcastable)):
str += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])"%locals() str += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % locals()
str += """) str += """)
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
...@@ -2082,7 +2200,7 @@ class GpuContiguous(GpuOp): ...@@ -2082,7 +2200,7 @@ class GpuContiguous(GpuOp):
%(fail)s; %(fail)s;
} }
} }
"""%locals() """ % locals()
return str return str
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -2142,7 +2260,7 @@ def tensordot(a, b, axes=2): ...@@ -2142,7 +2260,7 @@ def tensordot(a, b, axes=2):
# Useful mostly for test as the gpu op are inserted automatically... # Useful mostly for test as the gpu op are inserted automatically...
fscalar = CudaNdarrayType(dtype='float32', broadcastable=()) fscalar = CudaNdarrayType(dtype='float32', broadcastable=())
def scalar(name = None, dtype = None): def scalar(name=None, dtype=None):
"""Return a symbolic scalar variable. """Return a symbolic scalar variable.
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
:param name: a name to attach to this variable :param name: a name to attach to this variable
...@@ -2153,7 +2271,7 @@ def scalar(name = None, dtype = None): ...@@ -2153,7 +2271,7 @@ def scalar(name = None, dtype = None):
return type(name) return type(name)
fvector = CudaNdarrayType(dtype='float32', broadcastable=(False, )) fvector = CudaNdarrayType(dtype='float32', broadcastable=(False, ))
def vector(name = None, dtype = None): def vector(name=None, dtype=None):
"""Return a symbolic vector variable. """Return a symbolic vector variable.
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
:param name: a name to attach to this variable :param name: a name to attach to this variable
...@@ -2164,7 +2282,7 @@ def vector(name = None, dtype = None): ...@@ -2164,7 +2282,7 @@ def vector(name = None, dtype = None):
return type(name) return type(name)
fmatrix = CudaNdarrayType(dtype='float32', broadcastable=(False, False)) fmatrix = CudaNdarrayType(dtype='float32', broadcastable=(False, False))
def matrix(name = None, dtype = None): def matrix(name=None, dtype=None):
"""Return a symbolic matrix variable. """Return a symbolic matrix variable.
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
:param name: a name to attach to this variable :param name: a name to attach to this variable
...@@ -2175,7 +2293,7 @@ def matrix(name = None, dtype = None): ...@@ -2175,7 +2293,7 @@ def matrix(name = None, dtype = None):
return type(name) return type(name)
frow = CudaNdarrayType(dtype='float32', broadcastable=(True, False)) frow = CudaNdarrayType(dtype='float32', broadcastable=(True, False))
def row(name = None, dtype = None): def row(name=None, dtype=None):
"""Return a symbolic row variable (ndim=2, broadcastable=[True,False]). """Return a symbolic row variable (ndim=2, broadcastable=[True,False]).
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
:param name: a name to attach to this variable :param name: a name to attach to this variable
...@@ -2186,7 +2304,7 @@ def row(name = None, dtype = None): ...@@ -2186,7 +2304,7 @@ def row(name = None, dtype = None):
return type(name) return type(name)
fcol = CudaNdarrayType(dtype='float32', broadcastable=(False, True)) fcol = CudaNdarrayType(dtype='float32', broadcastable=(False, True))
def col(name = None, dtype = None): def col(name=None, dtype=None):
"""Return a symbolic column variable (ndim=2, broadcastable=[False,True]). """Return a symbolic column variable (ndim=2, broadcastable=[False,True]).
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
:param name: a name to attach to this variable :param name: a name to attach to this variable
...@@ -2207,7 +2325,7 @@ def tensor3(name=None, dtype=None): ...@@ -2207,7 +2325,7 @@ def tensor3(name=None, dtype=None):
type = CudaNdarrayType(dtype=dtype, broadcastable=(False, False, False)) type = CudaNdarrayType(dtype=dtype, broadcastable=(False, False, False))
return type(name) return type(name)
ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,)*4) ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
def tensor4(name=None, dtype=None): def tensor4(name=None, dtype=None):
"""Return a symbolic 4-D variable. """Return a symbolic 4-D variable.
:param dtype: numeric type (None means to use theano.config.floatX) :param dtype: numeric type (None means to use theano.config.floatX)
...@@ -2215,7 +2333,8 @@ def tensor4(name=None, dtype=None): ...@@ -2215,7 +2333,8 @@ def tensor4(name=None, dtype=None):
""" """
if dtype is None: if dtype is None:
dtype = config.floatX dtype = config.floatX
type = CudaNdarrayType(dtype=dtype, broadcastable=(False, False, False, False)) type = CudaNdarrayType(dtype=dtype,
broadcastable=(False, False, False, False))
return type(name) return type(name)
...@@ -2223,16 +2342,17 @@ def tensor4(name=None, dtype=None): ...@@ -2223,16 +2342,17 @@ def tensor4(name=None, dtype=None):
def profile_printer(fct_name, compile_time, fct_call_time, fct_call, def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size, apply_time, apply_cimpl, message, outputs_size,
other_time): other_time):
if any([x[1].op.__class__.__name__.lower().startswith("gpu") for x in apply_time.keys()]): if any([x[1].op.__class__.__name__.lower().startswith("gpu")
for x in apply_time.keys()]):
local_time = sum(apply_time.values()) local_time = sum(apply_time.values())
print print
print 'Some info useful for gpu:' print 'Some info useful for gpu:'
cpu=0 cpu = 0
gpu=0 gpu = 0
trans=0 trans = 0
for (_,node),t in apply_time.items(): for (_, node), t in apply_time.items():
if isinstance(node.op.__class__.__name__,(HostFromGpu, GpuFromHost)): if isinstance(node.op.__class__.__name__, (HostFromGpu, GpuFromHost)):
trans += t trans += t
elif node.op.__class__.__name__.lower().startswith("gpu"): elif node.op.__class__.__name__.lower().startswith("gpu"):
gpu += t gpu += t
......
...@@ -848,6 +848,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -848,6 +848,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
if (verbose)
fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)", " CudaNdarray_conv_valid! (%s)",
...@@ -1418,6 +1420,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1418,6 +1420,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
{ {
rval = out; rval = out;
Py_INCREF(rval); Py_INCREF(rval);
if (verbose)
fprintf(stderr,
"INFO: Conv is reusing the 'out' argument"
" structure.\n");
} }
else else
{ {
...@@ -1425,6 +1431,11 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1425,6 +1431,11 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
fprintf(stderr, fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong" "INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n"); " structure.\n");
else if(verbose)
fprintf(stderr,
"INFO: Conv don't have an 'out' argument"
" structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null //rval might be null
} }
......
...@@ -107,8 +107,8 @@ int device_free(void *ptr) ...@@ -107,8 +107,8 @@ int device_free(void *ptr)
break; break;
} }
if(i==TABLE_SIZE) //if(i==TABLE_SIZE)
printf("Unallocated unknow size!\n"); // printf("Unallocated unknow size!\n");
//fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr); //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
#endif #endif
return 0; return 0;
...@@ -567,38 +567,6 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask) ...@@ -567,38 +567,6 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
return (PyObject*)self_sum; return (PyObject*)self_sum;
} }
__global__ void k_copy_reshape_rowmajor(unsigned int numEls,
unsigned int a_nd, const float * a_data, const int * a_dim, const int * a_str,
unsigned int z_nd, float * z_data, const int * z_dim, const int * z_str)
{
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int numThreads = blockDim.x * gridDim.x;
for (unsigned int i = idx; i < numEls; i += numThreads)
{
const float * a_i = a_data;
unsigned int a_ii = i;
for (unsigned int _d = 0; _d < a_nd; ++_d) //make the rightmost coords change fastest
{
unsigned int d = a_nd - _d-1;
unsigned int a_i_d = a_ii % a_dim[d];
a_ii = a_ii / a_dim[d];
a_i += a_i_d * a_str[d];
}
unsigned int z_ii = i;
float * z_i = z_data;
for (unsigned int _d = 0; _d < z_nd; ++_d) //make the rightmost coords change fastest
{
unsigned int d = z_nd - _d-1;
//i tried to make the for loop count down, but it didn't work!?
unsigned int z_i_d = z_ii % z_dim[d];
z_i += z_i_d * z_str[d];
z_ii = z_ii / z_dim[d];
}
z_i[0] = a_i[0]; //copy one lousy float!
}
}
// Reshape self to the new shape gived by the tuple shape. // Reshape self to the new shape gived by the tuple shape.
// //
// If self is c contiguous, it return a view. Otherwise it always do a copy. // If self is c contiguous, it return a view. Otherwise it always do a copy.
...@@ -606,6 +574,22 @@ __global__ void k_copy_reshape_rowmajor(unsigned int numEls, ...@@ -606,6 +574,22 @@ __global__ void k_copy_reshape_rowmajor(unsigned int numEls,
// c contiguous // c contiguous
PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
{ {
if(!CudaNdarray_is_c_contiguous(self))
{
// allocate new space
//TODO: test to see if we can re-use old one and take a new param to
// use this
CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
if (!rval)
{
return NULL;
}
CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
Py_XDECREF(rval);
return (PyObject*)ret;
}
// check shape tuple // check shape tuple
unsigned int rval_nd; unsigned int rval_nd;
unsigned int * rval_dims; unsigned int * rval_dims;
...@@ -656,9 +640,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -656,9 +640,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
return rval; return rval;
} }
if(CudaNdarray_is_c_contiguous(self))
{
//return a view, not a copy //return a view, not a copy
//we can do this as we checked self is c_contiguous
CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd); CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
if (!rval || 0 != rval->data_allocated if (!rval || 0 != rval->data_allocated
...@@ -678,53 +661,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape) ...@@ -678,53 +661,8 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
} }
free(rval_dims); free(rval_dims);
return (PyObject*)rval; return (PyObject*)rval;
}
// allocate new space (TODO: test to see if we can re-use old one)
CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New();
if (!rval || CudaNdarray_alloc_contiguous(rval, rval_nd, rval_dims)){
Py_XDECREF(rval);
free(rval_dims);
return NULL;
}
// call worker routine
unsigned int threads_per_block = std::min(rval_size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(rval_size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
k_copy_reshape_rowmajor<<<n_blocks,threads_per_block>>>(
rval_size,
self->nd,
CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_DIMS(self), CudaNdarray_DEV_STRIDES(self),
rval->nd,
CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_DIMS(rval), CudaNdarray_DEV_STRIDES(rval));
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
Py_DECREF(rval);
PyObject * shape_inp = CudaNdarray_get_shape(self, NULL);
PyObject * shape_inp2 = PyObject_Str(shape_inp);
PyObject * shape_dest = PyObject_Str(shape);
PyErr_Format(PyExc_RuntimeError,
"Cuda error in CudaNdarray_Reshape"
"()n_blocks=%d, n_threads=%d, input_shape=%s,"
" dest_shape=%s): %s: %s.\n",
n_blocks, threads_per_block,
PyString_AsString(shape_inp2),
PyString_AsString(shape_dest),
"k_copy_reshape_rowmajor",
cudaGetErrorString(err)
);
Py_DECREF(shape_dest);
Py_DECREF(shape_inp);
Py_DECREF(shape_inp2);
free(rval_dims);
return NULL;
}
free(rval_dims);
return (PyObject*)rval;
} }
PyObject * CudaNdarray_View(CudaNdarray * self) PyObject * CudaNdarray_View(CudaNdarray * self)
{ {
CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd); CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
...@@ -2837,6 +2775,9 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2837,6 +2775,9 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
} }
if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source) if (CudaNdarray_is_c_contiguous(self) && CudaNdarray_is_c_contiguous(other) && size == size_source)
{ {
if (verbose)
fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1); cublasScopy(size, CudaNdarray_DEV_DATA(other), 1, CudaNdarray_DEV_DATA(self), 1);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
...@@ -2877,8 +2818,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2877,8 +2818,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
assert (cudaSuccess == cudaGetLastError()); assert (cudaSuccess == cudaGetLastError());
if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast); if (verbose) fprintf(stderr, "Copying with default version unbroadcast=%d\n", unbroadcast);
// call worker routine // call worker routine
unsigned int n_blocks = std::min(size, (unsigned int)NUM_VECTOR_OP_BLOCKS); unsigned int threads_per_block = std::min(size,
unsigned int threads_per_block = std::min(ceil_intdiv(size, n_blocks), (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
(unsigned int)NUM_VECTOR_OP_BLOCKS);
const CudaNdarray * cuda_dims = other; const CudaNdarray * cuda_dims = other;
if(unbroadcast) if(unbroadcast)
cuda_dims = self; cuda_dims = self;
...@@ -2891,6 +2834,9 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe ...@@ -2891,6 +2834,9 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, const CudaNdarray * othe
CudaNdarray_DEV_DATA(self), (const int *)CudaNdarray_DEV_STRIDES(self)); CudaNdarray_DEV_DATA(self), (const int *)CudaNdarray_DEV_STRIDES(self));
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if(verbose>1)
fprintf(stderr, "INFO k_elemwise_unary_rowmaj (n_blocks=%i, n_threads_per_block=%i)\n",
n_blocks, threads_per_block);
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
//fprint_CudaNdarray(stderr, self); //fprint_CudaNdarray(stderr, self);
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论