Commit 91c8fcf3 authored by nouiz

Merge pull request #934 from goodfeli/gpu_max

Gpu max
......@@ -33,6 +33,11 @@ There are less methods to define for an Op than for a Type:
This must return C code that carries the computation we want to do.
sub is a dictionary of strings for you to substitute into your code.
It's not clear if it ever contains anything other than 'fail'.
sub['fail'] is a string of code that you should execute (after calling
PyErr_Format) if your C code needs to raise an exception.
.. method:: c_code_cleanup(node, name, input_names, output_names, sub)
This must return C code that cleans up whatever c_code allocated and
......
......@@ -270,7 +270,7 @@ if cuda_available:
import basic_ops
from basic_ops import (GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuSum, GpuReshape, GpuContiguous,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc,
......
......@@ -8,6 +8,8 @@ import numpy
import theano
from theano import Op, Type, Apply, Variable, Constant
from theano import tensor, scalar, config
from theano.scalar import Scalar
scal = scalar # somewhere scalar gets reassigned to be a function
from theano.gof.python25 import all, any
......@@ -484,15 +486,15 @@ class GpuDimShuffle(GpuOp):
return (1, 0)
class GpuSum(GpuOp):
"""GpuSum is a Reduction along some dimensions by summation.
class GpuCAReduce(GpuOp):
"""GpuCAReduce is a Reduction along some dimensions by a scalar op.
The dimensions along which to sum is specified by the
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask`
is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example:
For example, when scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar
......@@ -505,19 +507,39 @@ class GpuSum(GpuOp):
:note: any reduce_mask of all zeros is a sort of 'copy', and may
be removed during graph optimization
This Op is a work in progress.
This op was recently upgraded from just GpuSum to a general CAReduce. Not
many code cases are supported for scalar_op being anything other than
scal.Add instances yet.
Important note: if you implement new cases for this op, be sure to
benchmark them and make sure that they actually result in a speedup.
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
"""
def __init__(self, reduce_mask):
def __init__(self, reduce_mask, scalar_op):
self.reduce_mask = tuple(reduce_mask)
self.scalar_op = scalar_op
# used to make sure that calls to scalar op
# have unique name arguments
self._n_scalar_op_calls = 0
def __eq__(self, other):
return (type(self) == type(other) and
self.reduce_mask == other.reduce_mask)
self.reduce_mask == other.reduce_mask and
self.scalar_op == other.scalar_op)
def __hash__(self):
return hash(type(self)) ^ hash(self.reduce_mask)
return hash(type(self)) ^ hash(self.reduce_mask) ^ hash(type(self.scalar_op))
def __str__(self):
return "GpuSum{%s}" % ','.join(str(i) for i in self.reduce_mask)
return "GpuCAReduce{%s}{%s}" % (
str(self.scalar_op),
','.join(str(i) for i in self.reduce_mask)
)
def make_node(self, x):
if (x.type.ndim != len(self.reduce_mask)):
......@@ -526,10 +548,55 @@ class GpuSum(GpuOp):
in xrange(x.type.ndim) if not self.reduce_mask[i]]
return Apply(self, [x], [CudaNdarrayType(o_broadcast)()])
"""
This method must be commented, because there's no way
to communicate that it's OK to call for + but not for
max
def perform(self, node, inp, out):
x, = inp
z, = out
self._op_guard()
# reduce_max is declared but does nothing but
# raise NotImplementedError.
# We can't call it here anyway because it hasn't
# been added to the python bindings yet
z[0] = x.reduce_sum(self.reduce_mask)
"""
def supports_c_code(self, inputs):
    """Return True if this op and reduce pattern have working C code.

    inputs: the list of variables that would be passed to make_node.
    """
    # Cheap check first: if there is no c_code_reduce_<pattern> method
    # at all, C code is certainly unsupported.  (This mirrors the test
    # that used to be implemented by local_gpu_sum.)
    mask_pattern = ''.join(str(bit) for bit in self.reduce_mask)
    if not hasattr(self, 'c_code_reduce_%s' % mask_pattern):
        return False

    # Now that this is a general reduction op, a pattern method may
    # exist yet still be unimplemented for the current scalar op.  To
    # detect that, build placeholder arguments, attempt to generate the
    # C code, and treat NotImplementedError as "unsupported".
    fake_node = self.make_node(*inputs)
    fake_name = 'fake_name'
    fake_inp = ['fake_input_name_%d' % i for i in xrange(len(inputs))]
    fake_out = ['fake_output_name_%d' % i
                for i in xrange(len(fake_node.outputs))]
    fake_sub = {'fail': 'fake failure code'}
    try:
        self.c_code(fake_node, fake_name, fake_inp, fake_out, fake_sub)
        self.c_support_code_apply(fake_node, fake_name)
    except NotImplementedError:
        return False
    return True
def c_code(self, node, name, inp, out, sub):
x, = inp
......@@ -553,6 +620,24 @@ class GpuSum(GpuOp):
}
""" % locals()
# It might be nice to use a property of the op class to do this,
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if self.scalar_op in [scal.minimum, scal.maximum]:
for i in xrange(nd_in):
conds = []
if self.reduce_mask[i]:
conds.append("(CudaNdarray_HOST_DIMS(%(x)s)[%(i)s] == 0)" % locals())
assert len(conds) > 0
cond = "(" + " || ".join(conds) + ")"
print >> sio, """
if %(cond)s
{
PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
%(fail)s;
}
""" %locals()
#
# alloc an output if we need one
#
......@@ -608,7 +693,7 @@ class GpuSum(GpuOp):
#
if all(i == 1 for i in self.reduce_mask):
#check if the tensor is ccontiguous, if true, use the c_c0de_reduce_ccontig code.
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
print >> sio, 'if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals()
......@@ -634,9 +719,9 @@ class GpuSum(GpuOp):
.. code-block:: c
if (verbose)
printf("running kernel_reduce_sum_10_%(name)s\\n");
printf("running kernel_reduce_10_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads,
kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
n_shared>>>(
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -660,7 +745,7 @@ class GpuSum(GpuOp):
nd_out = ndim - sum(self.reduce_mask)
print >> sio, """
if (verbose)
printf("running kernel_reduce_sum_%(pattern)s_%(name)s\\n");
printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
......@@ -670,7 +755,7 @@ class GpuSum(GpuOp):
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared);
kernel_reduce_sum_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_%(pattern)s_%(name)s<<<n_blocks, n_threads, n_shared>>>(
""" % locals()
for i in xrange(ndim):
print >> sio, """
......@@ -699,7 +784,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -715,9 +800,11 @@ class GpuSum(GpuOp):
ndim=None, reduce_mask=None):
"""Return a string to declare a kernel function
The result will look something like this:
.. code-block:: c
static __global__ void kernel_reduce_sum_110_%(nodename)s(
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -728,7 +815,10 @@ class GpuSum(GpuOp):
float * Z,
const int sZ0)
""" % locals()
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
if reduce_mask is None:
reduce_mask = self.reduce_mask
if ndim is None:
......@@ -738,7 +828,7 @@ class GpuSum(GpuOp):
sio = StringIO.StringIO()
print >> sio, """
static __global__ void kernel_reduce_sum_%(pattern)s_%(nodename)s(
static __global__ void kernel_reduce_%(pattern)s_%(nodename)s(
""" % locals()
for i in xrange(ndim):
print >> sio, """
......@@ -764,39 +854,77 @@ class GpuSum(GpuOp):
def _k_init(self, *args):
return """
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
//TODO: set error code
// TODO: set error code
// 2012-09-20 IG: as of today, Fred says he will check
// this elsewhere, in a different PR
Z[0] = -666;
return;
}
"""
def _k_reduce_buf(self, z_pos):
# Work with all nvidia driver
# But only for power or multiple of 2!
def _assign_reduce(self, node, name, left, right, sub):
    """Return C code that reduces `right` into `left` using scalar_op.

    node: the node argument to this op's c_code
    name: the name argument to this op's c_code
    left: a C code string identifying an lvalue
    right: a C code string identifying an expression
    sub: the sub argument to this op's c_code

    returns C code to reduce left and right, assigning the
    result to left.
    """
    x, = node.inputs
    dtype = x.dtype
    # Build a dummy scalar Apply so that scalar_op.c_code sees inputs
    # of the correct scalar dtype rather than the reduction's own
    # CudaNdarray node.
    dummy_left = scal.Scalar(dtype=dtype)()
    dummy_right = scal.Scalar(dtype=dtype)()
    dummy_node = self.scalar_op.make_node(dummy_left, dummy_right)
    # Unique name per call so that repeated expansions inside one
    # kernel do not collide on C identifiers emitted by the scalar op.
    dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
    self._n_scalar_op_calls += 1
    # Bug fix: previously this passed the reduction's own `node` and
    # `name`, leaving dummy_node/dummy_name unused and handing the
    # scalar op an Apply with the wrong (non-scalar) input types.
    return self.scalar_op.c_code(dummy_node, dummy_name,
                                 (left, right), (left,), sub)
def _k_reduce_buf(self, z_pos, node, name, sub):
"""
WRITEME
node, name, sub: these should be passed through from the original
call to c_code
"""
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;
buf[idx] += buf[threadNum];
// buf[0] = 998;
} else {
// buf[threadNum] = 0;-999;
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version += self._assign_reduce(node, name, 'buf[idx]','buf[threadNum]', sub)
new_version += """
}
__syncthreads();
//Work for power of 2 only.
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
......@@ -807,8 +935,11 @@ class GpuSum(GpuOp):
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
"""
new_version += self._assign_reduce(node, name, 'buf[threadNum]', 'temp', sub)
buf[threadNum] += temp;
new_version += """
}
__syncthreads();
......@@ -820,12 +951,13 @@ class GpuSum(GpuOp):
{
%(z_pos)s = buf[0];
}
__syncthreads();
__syncthreads();"""
""" % locals()
return """
new_version = new_version % locals()
current_version = """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
......@@ -834,19 +966,19 @@ class GpuSum(GpuOp):
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
"""
current_version += self._assign_reduce(node, name, 'myresult', 'buf[i]', sub) + """
}
buf[threadNum] = mysum;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{
buf[threadNum] += buf[threadNum+16];
buf[threadNum] += buf[threadNum+8];
buf[threadNum] += buf[threadNum+4];
buf[threadNum] += buf[threadNum+2];
buf[threadNum] += buf[threadNum+1];
{"""
for num in [16,8,4,2,1]:
current_version += self._assign_reduce(node, name, 'buf[threadNum]',
'buf[threadNum+%d]' % num, sub)
current_version += """
if (threadNum == 0)
{
%(z_pos)s = buf[0];
......@@ -856,26 +988,32 @@ class GpuSum(GpuOp):
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
//reduce so that threadNum 0 has the reduction of everything
"""
for num in [16,8,4,2,1]:
this_if = "if (threadNum + %d < threadCount) " % num + \
self._assign_reduce(node, name, 'buf[threadNum]','buf[threadNum+%d]' % num, sub)
current_version += this_if
current_version += """
if (threadNum == 0)
{
%(z_pos)s = buf[0];
}
}
}
""" % locals()
"""
current_version = current_version % locals()
return current_version
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize
def _k_reduce_buf_multiple(self, z_pos, nb_reduce):
self._op_guard()
return """
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = mysum;
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
......@@ -884,13 +1022,20 @@ class GpuSum(GpuOp):
//round up all the partial sums into the first `nb_reduce` elements
for (int i = threadNum + %(nb_reduce)s; i < threadCount; i += %(nb_reduce)s)
{
mysum += buf[i];
myresult += buf[i];
}
%(z_pos)s = mysum;
%(z_pos)s = myresult;
}
""" % locals()
def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
"""
WRITEME
IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is
C contiguous.
"""
self._op_guard()
print >> sio, """
{
if(CudaNdarray_SIZE(%(x)s)==0){
......@@ -901,11 +1046,11 @@ class GpuSum(GpuOp):
std::min(CudaNdarray_SIZE(%(x)s),
NUM_VECTOR_OP_THREADS_PER_BLOCK));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_sum_ccontig_%(name)s"
if (verbose) printf("running kernel_reduce_ccontig_%(name)s"
" n_threads.x=%%d, size=%%d, ndim=%%d\\n",
n_threads.x,CudaNdarray_SIZE(%(x)s),%(x)s->nd);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
CudaNdarray_SIZE(%(x)s),
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_DEV_DATA(%(z)s));
......@@ -916,7 +1061,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_ccontig_%(name)s",
"kernel_reduce_ccontig_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -930,6 +1075,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -943,6 +1089,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_11(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -965,6 +1112,7 @@ class GpuSum(GpuOp):
:param N: the number of 1 in the pattern N=1 -> 01, N=2 -> 011 N=3 ->0111
Work for N=1,2,3
"""
assert N in [1, 2, 3]
makecall = self._makecall(node, name, x, z, fail)
N_pattern = ''.join(['1'] * N)
......@@ -972,6 +1120,7 @@ class GpuSum(GpuOp):
for i in xrange(N + 1)])
strides_dim = ",".join(["CudaNdarray_HOST_STRIDES(%(x)s)[%(i)s]"
% locals() for i in xrange(N + 1)])
threads_y = """
//get as many y threads as we can fit
while (n_threads.x * (n_threads.y+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
......@@ -980,8 +1129,8 @@ class GpuSum(GpuOp):
n_threads.y += 1;
else
break;
}
""" % locals()
}""" % locals()
threads_z = """
//get as many z threads as we can fit
while (n_threads.x * n_threads.y * (n_threads.z+1) <= NUM_VECTOR_OP_THREADS_PER_BLOCK)
......@@ -990,13 +1139,15 @@ class GpuSum(GpuOp):
n_threads.z += 1;
else
break;
}
""" % locals()
}""" % locals()
if len(self.reduce_mask) == 2:
threads_y = ''
threads_z = ''
if len(self.reduce_mask) == 3:
threads_z = ''
print >> sio, """
{
int verbose = 0;
......@@ -1021,6 +1172,7 @@ class GpuSum(GpuOp):
self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
self._op_guard()
print >> sio, """
{
int verbose = 0;
......@@ -1032,13 +1184,13 @@ class GpuSum(GpuOp):
NUM_VECTOR_OP_BLOCKS));
if (verbose) {
fprintf(stderr,
"running kernel_reduce_sum_10_%(name)s n_blocks=(%%i,%%i)\\n",
"running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
n_blocks.x,
n_blocks.y);
}
assert( CudaNdarray_HOST_DIMS(%(x)s)[1] == CudaNdarray_HOST_DIMS(%(z)s)[0]);
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_sum_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
1,
CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
......@@ -1057,7 +1209,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1070,6 +1222,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
makecall_inner = self._makecall(node, name, x, z, fail,
pattern="010_inner")
......@@ -1108,7 +1261,7 @@ class GpuSum(GpuOp):
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0;
kernel_reduce_sum_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
......@@ -1125,7 +1278,7 @@ class GpuSum(GpuOp):
PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s."
" (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
"kernel_reduce_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1178,7 +1331,7 @@ class GpuSum(GpuOp):
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_%(pattern)s_%(name)s",
"kernel_reduce_%(pattern)s_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
......@@ -1192,6 +1345,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1211,6 +1365,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_100(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
# use threadIdx.x for i0
# use blockIdx.x for i1
......@@ -1231,6 +1386,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_110(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1252,6 +1408,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_001(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1274,6 +1431,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_111(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1306,6 +1464,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1336,6 +1495,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1368,6 +1528,7 @@ class GpuSum(GpuOp):
""" % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
self._op_guard()
makecall = self._makecall(node, name, x, z, fail)
print >> sio, """
{
......@@ -1391,18 +1552,36 @@ class GpuSum(GpuOp):
}
""" % locals()
def c_code_cache_version(self):
return (22,)
def c_code_cache_version_apply(self, node):
    """Return a per-apply cache version tuple for the generated C code.

    The version combines this Op's own C code version with the versions
    of the scalar op and of the Scalar types of every input and output,
    so that a change in any dependency invalidates the cached module.
    Returns () (meaning "never cache") if any component reports an
    empty/falsy version.
    """
    version = [5]  # the version corresponding to the c code in this Op

    # now we insert versions for the ops on which we depend...
    scalar_node = Apply(self.scalar_op,
        [Scalar(dtype=input.type.dtype)() for input in node.inputs],
        [Scalar(dtype=output.type.dtype)() for output in node.outputs])
    # Bug fix: c_code_cache_version() takes no node argument; the
    # per-apply variant c_code_cache_version_apply(node) is the method
    # that accepts the dummy scalar Apply built above.
    version.extend(self.scalar_op.c_code_cache_version_apply(scalar_node))

    for i in node.inputs + node.outputs:
        version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version())

    if all(version):
        return tuple(version)
    else:
        return ()
def _op_guard(self):
    """Raise NotImplementedError unless the scalar op is Add.

    Kernels that are still sum-specific call this guard; the resulting
    NotImplementedError is caught by supports_c_code, which then
    reports the pattern/op combination as unsupported.
    """
    op_is_add = isinstance(self.scalar_op, theano.scalar.basic.Add)
    if not op_is_add:
        raise NotImplementedError()
def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO()
nd_in = len(self.reduce_mask)
if all(i == 1 for i in self.reduce_mask):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_ccontig_%(nodename)s(
static __global__ void kernel_reduce_ccontig_%(nodename)s(
const unsigned int d0,
const float *A,
float * Z)
......@@ -1410,7 +1589,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1419,17 +1598,18 @@ class GpuSum(GpuOp):
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
mysum += A[i0];
myresult += A[i0];
}
%(reducebuf)s
}
""" % locals()
if self.reduce_mask == (1,):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s(
static __global__ void kernel_reduce_1_%(nodename)s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
......@@ -1437,7 +1617,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1447,17 +1627,18 @@ class GpuSum(GpuOp):
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
float Ai = A[i0 * sA0];
mysum += Ai;
myresult += Ai;
}
%(reducebuf)s
}
""" % locals()
if self.reduce_mask == (1, 1):
self._op_guard()
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf = self._k_reduce_buf('Z[0]')
reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s(
static __global__ void kernel_reduce_11_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
......@@ -1466,7 +1647,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1478,7 +1659,7 @@ class GpuSum(GpuOp):
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
float Ai = A[i0 * sA0 + i1 * sA1];
mysum += Ai;
myresult += Ai;
}
}
%(reducebuf)s
......@@ -1492,53 +1673,117 @@ class GpuSum(GpuOp):
# threads per block for each element per row.
N_pattern = ''.join(['1'] * (nd_in - 1))
# TODO: is it faster to hardcode sA3, etc. in the later code, rather
# than have the for_* variables declare them and the later code use
# their names?
if nd_in == 2:
for_i1 = "for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
first_i1 = 'threadIdx.x'
sA1 = 'sA1'
for_i2 = "int i2=0, sA2=0;"
sA2 = '0'
first_i2 = '0'
for_i3 = "int i3=0, sA3=0;"
sA3 = '0'
first_i3 = '0'
if nd_in == 3:
for_i1 = "for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
first_i1 = 'threadIdx.y'
sA1 = 'sA1'
for_i2 = "for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
first_i2 = 'threadIdx.x'
sA2 = 'sA2'
for_i3 = "int i3=0, sA3=0;"
first_i3 = 0
sA3 = '0'
if nd_in == 4:
for_i1 = "for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
first_i1 = 'threadIdx.z'
sA1 = 'sA1'
for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
first_i2 = 'threadIdx.y'
sA2 = 'sA2'
for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
first_i3 = 'threadIdx.x'
sA3 = 'sA3'
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0]', node, nodename, sub = {})
param_dim = ",".join(["const int d%(i)s" % locals()
for i in xrange(nd_in)])
param_strides = ",".join(["const int sA%(i)s" % locals()
for i in xrange(nd_in)])
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
mysum = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
mysum += Ai;
# TODO: ideally this would all be some clean function of scalar_op,
# but since sum is a special case where it's OK to reduce with an
# extra 0, I would need to change the behavior of the sum reduction
# code to do that. I don't want to benchmark and test changes to the
# sum code so I will leave that for later.
# max reduction is also a special case that is simple to implement.
# this is the special case where reduction is idempotent so it doesn't
# matter if we reduce with the first element multiple times.
if isinstance(self.scalar_op, scal.Add):
# special cased sum code (special case because starts the
# reduction with 0)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult = 0;
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
myresult += Ai;
}
}
}
%(reducebuf)s
}
}
%(reducebuf)s
}
}
""" % locals()
""" % locals()
elif isinstance(self.scalar_op, scal.Maximum):
# special cased max code (special case because visits first
# member of each row twice)
print >> sio, """
%(decl)s{
%(init)s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult = A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0];
%(for_i1)s{
%(for_i2)s{
%(for_i3)s{
float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
myresult = max(myresult, Ai);
}
}
}
%(reducebuf)s
}
}
""" % locals()
else:
# TODO: implement general case and get rid of the two special
# cases above
# it should initialize myresult to element 0,
# and the for loop should begin traversing from element 1
# raise an error if asked to reduce an empty dimension
# (maybe special-case sum to return 0 instead of returning an
# error)
# in both cases, benchmark the general case against the existing
# code to make sure it does not cause a slowdown
raise NotImplementedError()
if self.reduce_mask == (0, 1, 0) or self.reduce_mask == (1, 0):
self._op_guard()
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2*sZ1]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_010_%(nodename)s(
static __global__ void kernel_reduce_010_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1560,10 +1805,10 @@ class GpuSum(GpuOp):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1572,8 +1817,9 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0):
self._op_guard()
print >> sio, """
static __global__ void kernel_reduce_sum_010_AD_%(nodename)s(
static __global__ void kernel_reduce_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
......@@ -1585,7 +1831,7 @@ class GpuSum(GpuOp):
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1599,12 +1845,12 @@ class GpuSum(GpuOp):
int c = i2_D * 32 + threadIdx.x;
if (c < C)
{
mysum = 0;
myresult = 0;
for (int b = 0; b < B; ++b)
{
mysum += X[a * sX0 + b * sX1 + c * sX2];
myresult += X[a * sX0 + b * sX1 + c * sX2];
}
Z[a * sZ0 + c * sZ1] = mysum;
Z[a * sZ0 + c * sZ1] = myresult;
}
}
}
......@@ -1612,6 +1858,7 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0):
self._op_guard()
#
# This kernel is optimized when the inner most dimensions
# have the smallest stride.
......@@ -1645,7 +1892,7 @@ class GpuSum(GpuOp):
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1653,15 +1900,16 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 0):
self._op_guard()
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_110_%(nodename)s(
static __global__ void kernel_reduce_110_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1672,7 +1920,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1686,7 +1934,7 @@ class GpuSum(GpuOp):
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
float Ai = A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2];
mysum += Ai;
myresult += Ai;
}
}
......@@ -1694,7 +1942,9 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 0, 0):
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[i1 * sZ0 + i2 * sZ1]',
node, nodename, sub={})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1705,10 +1955,10 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
{
mysum = 0;
myresult = 0;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1716,21 +1966,23 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]', node,
nodename, sub={})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{
%(init)s
mysum = 0;
myresult = 0;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
}
}
......@@ -1738,11 +1990,13 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 0, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_001_%(nodename)s(
static __global__ void kernel_reduce_001_%(nodename)s(
const int d0,
const int d1,
const int d2,
......@@ -1763,10 +2017,10 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
}
%(reducebuf)s
}
......@@ -1774,9 +2028,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 0, 1, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1788,12 +2044,12 @@ class GpuSum(GpuOp):
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
%(reducebuf)s
......@@ -1802,9 +2058,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (0, 1, 0, 1):
self._op_guard()
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]')
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i2 * sZ1]',
node, nodename, sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
......@@ -1816,12 +2074,12 @@ class GpuSum(GpuOp):
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float mysum = 0.0f;
float myresult = 0.0f;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
%(reducebuf)s
......@@ -1830,14 +2088,16 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 1, 1, 1):
reducebuf = self._k_reduce_buf('Z[0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[0]', node, nodename,
sub = {})
decl = self._k_decl(node, nodename)
init = self._k_init(node, nodename)
print >> sio, """
%(decl)s
{
%(init)s
mysum = 0;
myresult = 0;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{
......@@ -1845,7 +2105,7 @@ class GpuSum(GpuOp):
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
myresult += A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3];
}
}
}
......@@ -1853,9 +2113,11 @@ class GpuSum(GpuOp):
}
""" % locals()
if self.reduce_mask == (1, 0, 1, 1):
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
self._op_guard()
reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]',
node, nodename, sub = {})
print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s(
static __global__ void kernel_reduce_1011_%(nodename)s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
......@@ -1867,7 +2129,7 @@ class GpuSum(GpuOp):
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
float myresult = 0.0f;
if (warpSize != 32)
{
......@@ -1881,7 +2143,7 @@ class GpuSum(GpuOp):
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
mysum += Ai;
myresult += Ai;
}
}
}
......
......@@ -582,9 +582,12 @@ def local_gpu_gemm(node):
@register_opt()
@local_optimizer([])
def local_gpu_sum(node):
def local_gpu_careduce(node):
if isinstance(node.op, tensor.elemwise.CAReduce):
if node.op.scalar_op == scal.add:
scalar_op = node.op.scalar_op
# currently, only these two ops are supported at all,
# and max does not support all combinations of axes
if node.op.scalar_op in [scal.add, scal.maximum]:
x, = node.inputs
if x.owner and x.owner.op == host_from_gpu:
if node.op.axis is None:
......@@ -594,22 +597,21 @@ def local_gpu_sum(node):
for a in node.op.axis:
assert reduce_mask[a] == 0
reduce_mask[a] = 1
gsum = GpuSum(reduce_mask)
pattern = (''.join(str(i) for i in reduce_mask))
if hasattr(gsum, 'c_code_reduce_%s' % pattern):
rval = host_from_gpu(gsum(gpu_from_host(x)))
greduce = GpuCAReduce(reduce_mask, scalar_op)
if greduce.supports_c_code([gpu_from_host(x)]):
rval = host_from_gpu(greduce(gpu_from_host(x)))
if rval.type == node.outputs[0].type:
return [rval]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
else:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the sum, and then
# to make them a single dimension, do the reduction, and then
# reshape to get them back.
shape_of = node.fgraph.shape_feature.shape_of
......@@ -625,27 +627,28 @@ def local_gpu_sum(node):
new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i])
pattern = (''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s' % pattern):
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(
new_gsum(gpu_from_host(reshaped_x)))
new_greduce = GpuCAReduce(new_mask, scalar_op)
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
gpu_reshaped_x = gpu_from_host(reshaped_x)
reshaped_gpu_inputs = [ gpu_reshaped_x ]
if new_greduce.supports_c_code(reshaped_gpu_inputs):
reduce_reshaped_x = host_from_gpu(
new_greduce(gpu_reshaped_x))
if sum_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_sum = sum_reshaped_x.reshape(
if reduce_reshaped_x.ndim != node.outputs[0].ndim:
unreshaped_reduce = reduce_reshaped_x.reshape(
tensor.stack(*shape_of[node.outputs[0]]))
else:
unreshaped_sum = sum_reshaped_x
if unreshaped_sum.type == node.outputs[0].type:
return [unreshaped_sum]
unreshaped_reduce = reduce_reshaped_x
if unreshaped_reduce.type == node.outputs[0].type:
return [unreshaped_reduce]
else:
print >> sys.stderr, \
"WARNING: local_gpu_sum got type wrong"
"WARNING: local_gpu_careduce got type wrong"
return None
raise Exception(
"GpuSum don't have implemented the pattern",
"GpuCAReduce does not yet implement this pattern:",
pattern)
return False
......
......@@ -9,6 +9,7 @@ from theano import tensor
import numpy
import theano
import theano.tensor as T
from numpy.testing.noseclasses import KnownFailureTest
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
......@@ -108,7 +109,7 @@ def test_sum():
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
if val.size == 0:
assert f2(val) == f(val), ('shape', shape, 'pattern', pattern)
......@@ -145,7 +146,7 @@ def test_sum():
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
......@@ -181,12 +182,244 @@ def test_sum():
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.Sum in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
def test_max():
"""
test GpuMax pattern 01, 011, 0111 (tensor.max pattern (1,), (1,2), (1,2,3) )
TODO: are others currently implemented by reshape?
"""
def tensor_pattern_to_gpu_pattern(shape, pattern):
gpu_pattern = [ 0 for elem in shape ]
for idx in pattern:
gpu_pattern[idx] = 1
gpu_pattern = tuple(gpu_pattern)
return gpu_pattern
known_fail = False
for shape, pattern in [((1,1),(1,)),
((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3)),
((2,1),(1,)),
((1,2),(1,)),
((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((4100,3),[0]),((3,4101),[0]),#10
((1024,33),[0]),((33,1024),[0]),#10
((1025,33),[0]),((33,1025),[0]),#10
((4100,3),[1]),((3,4101),[1]),#01
((1024,33),[1]),((33,1024),[1]),#01
((1025,33),[1]),((33,1025),[1]),#01
((4100,3),[0,1]),((3,4101),[0,1]),#11
((1024,33),[0,1]),((33,1024),[0,1]),#01
((1025,33),[0,1]),((33,1025),[0,1]),#01
((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
#test pattern implemented by reshape
((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f_caused_value_error = False
try:
f_out = f(val)
except ValueError, e:
exc = e
f_caused_value_error = True
except RuntimeError:
if (shape, pattern) in [((1,0),(1,)),
((0,1),(1,)),
((0,0),(1,)),
((0,0,0),(1,2)),
((0,0,0,0),(1,2,3))]:
known_fail = True
continue
else:
raise
f2 = theano.function([a], b, mode=mode_without_gpu)
try:
f2_out = f2(val)
f2_caused_value_error = False
except ValueError, e:
exc2 = e
f2_caused_value_error = True
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
# Check that 0 shape matrices are invalid in the same cases
if f_caused_value_error != f2_caused_value_error:
if f_caused_value_error:
print 'f caused this value error:'
print exc
else:
print 'f did not raise a value error, but should have'
if f2_caused_value_error:
print 'f2 caused this value error:'
print exc2
else:
print 'f should not have raised a value error'
print 'shape was: ',shape
print 'pattern was: ',pattern
assert False
if f_caused_value_error:
continue
if val.size == 0:
assert f2(val).size == f(val).size
assert f2(val).shape == f(val).shape
else:
try:
#We raise the error threashold as we sum big matrix
#and this cause small rounding difference with some seed
#example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5
f2_val = f2(val)
f_val = f(val)
if not _allclose(f2_val, f_val):
print 'failed for the following arguments: '
print 'shape:',shape
print 'pattern: ',pattern
print 'input:'
print val
print 'correct output: '
print f2_val
print 'actual output: '
print f_val
assert False
finally:
theano.tensor.basic.float32_rtol = orig_rtol
#test with dimshuffle
#we shuffle the 2 outer dims.
for shape, pattern in [#((5,),[0]),
((5,4),(0,1)),((5,4),[0]),
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = range(len(shape))
dim_pattern[0] = 1
dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern)
b = T.max(a, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
#test with broadcast
for shape, pattern in [((5,),(0,)),
((5,4),(0,1)),
((5,4),(0,)),
((5,4,3),(0,)),
((5,4,3),(0,1)),
((5,4,3),(2,)),
((5,4,3),(0,1,2)),
((5,4,3,2),(0,1,2,3)),
((5,4,3,2),(0,2,3))]:
# Don't test patterns that aren't implemented for max yet
if tensor_pattern_to_gpu_pattern(shape, pattern) not in \
[ (0,1), (0,1,1), (0,1,1) ]:
continue
shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = T.max(a, pattern)
b2 = T.max(a2, pattern)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32')
val2 = cuda.CudaNdarray(val)
if len(shape) == 1:
val = val[::2]
val2 = val2[::2]
elif len(shape) == 2:
val = val[::2, ::2]
val2 = val2[::2, ::2]
elif len(shape) == 3:
val = val[::2, ::2, ::2]
val2 = val2[::2, ::2, ::2]
elif len(shape) == 4:
val = val[::2, ::2, ::2, ::2]
val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ for x in f2.maker.fgraph.toposort()]
assert T.CAReduce in [x.op.__class__ for x in f.maker.fgraph.toposort()]
assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern,
sum([shape[i] for i in pattern]))
if known_fail:
raise KnownFailureTest("GpuCAReduce does not handle some shapes"
" with 0s in them correctly.")
def test_flatten():
x = cuda.fmatrix('x')
......
......@@ -28,7 +28,10 @@ def test_nvidia_driver1():
profile=False)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
assert sum(isinstance(node.op, B.GpuSum) for node in topo) == 1
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS "
"does not give good results for reduction."
......
......@@ -44,11 +44,11 @@ def test_int_pow():
f = theano.function([a], (a*4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuSum', 'GpuElemwise', 'HostFromGpu']
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
f = theano.function([a], tensor.pow(a,4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuElemwise', 'GpuSum', 'HostFromGpu']
assert op_names == ['GpuElemwise', 'GpuCAReduce', 'HostFromGpu']
#theano.printing.debugprint(f)
......
......@@ -1159,6 +1159,7 @@ class Maximum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
maximum = Maximum(upcast_out, name='maximum')
......@@ -1187,7 +1188,6 @@ class Minimum(BinaryScalarOp):
gx = eq(output, x) * gz
gy = eq(output, y) * gz
return (gx, gy)
minimum = Minimum(upcast_out, name='minimum')
......@@ -1222,6 +1222,8 @@ class Add(ScalarOp):
for i in inputs:
retval += [gz]
return retval
add = Add(upcast_out, name='add')
......
......@@ -1082,14 +1082,16 @@ class Elemwise(Op):
class CAReduce(Op):
"""
CAReduce = Commutative Associative Reduce
Reduces a scalar operation along the specified axis(es).
(The scalar op should be both commutative and assocative)
The output will have the same shape as the input minus the reduced
dimensions. It will contain the variable of accumulating all values
over the reduced dimensions using the specified scalar op.
Examples:
CAReduce(add) -> sum
CAReduce(add) -> sum (ie, acts like the numpy sum operation)
CAReduce(mul) -> product
CAReduce(maximum) -> max
CAReduce(minimum) -> min
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论