Commit 9dcf3f4c authored by Arnaud Bergeron

Delete sandbox.cuda.

Parent bea31470
from __future__ import absolute_import, print_function, division
import numpy
import theano
import theano.tensor as T
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, HostFromGpu)
from theano.misc import strutil
from theano.tensor.nnet.Conv3D import Conv3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import CudaNdarrayType, GpuOp
class GpuConv3D(GpuOp):
    """
    GPU implementation of Conv3D.

    Data layout is (batch, row, column, time, channel) for V and the
    output H, and (out_channel, row, column, time, in_channel) for W.
    """

    def __eq__(self, other):
        # The op has no parameters, so all instances of the same type
        # are interchangeable (lets Theano merge duplicate nodes).
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return '%s' % (self.__class__.__name__)

    def make_node(self, V, W, b, d):
        """
        Build the Apply node; V, W and b are moved to the GPU.

        Parameters
        ----------
        V
            Visible unit, input.
        W
            Weights, filter.
        b
            Bias.
        d
            Strides when moving the filter over the input.
        """
        V_ = as_cuda_ndarray_variable(V)
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
        # 5D output; only batch / out-channel broadcastability is inherited.
        broad = (V_.broadcastable[0], W_.broadcastable[0], False, False, False)
        return theano.Apply(self, inputs=[V_, W_, b_, d_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])

    def c_code_cache_version(self):
        # Empty tuple: the compiled code is never cached across sessions.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Generates the host-side C code: input validation, output
        # allocation, then a single launch of the conv_rows_stack kernel
        # (declared in c_support_code_apply below).
        V, W, b, d = inputs
        fail = sub['fail']
        H = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConv3D >
        //printf("\t\t\t\tConv3DGPU c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(W)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(V)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
            %(fail)s
        }
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { //extra scope so fail doesn't jump over declarations
        //Read and check sizes of inputs
        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
        const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
        if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
            %(fail)s
        }
        { //extra scope so error handler jumps don't cause errors
        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
        if (vidHeight < filterHeight)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
            %(fail)s
        }
        { // extra scope so fail works
        if (vidWidth < filterWidth)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
            %(fail)s
        }
        { // extra scope so fail works
        if (vidDur < filterDur)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
            %(fail)s
        }
        { // extra scope so fail works
        //Read and check stride arguments
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
            %(fail)s
        }
        { // extra scope so fail works
        //Make correctly sized output
        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
        const int outputDur = int( (vidDur - filterDur) / dt ) +1;
        npy_intp dims[5];
        dims[0] = batchSize;
        dims[4] = outputChannels;
        dims[1] = outputHeight;
        dims[2] = outputWidth;
        dims[3] = outputDur;
        if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
            Py_XDECREF(%(H)s);
            %(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(H)s)) {
                PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                %(fail)s
            }
        }
        { // extra scope so fail will not cross declarations
        //#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
        // Compute H
        //H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
        bool work_complete = false;
        if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
            //conv_rows_stack
            dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
            dim3 threads(outputDur);
            int shared_size=0;
            conv_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
            vidHeight, vidWidth, vidDur,
            filterHeight, filterWidth, filterDur,
            outputChannels, inputChannels,
            dr,dc,dt,
            vs3,vs2,vs1,vs4,vs0,
            ws3,ws2,ws1,ws4,ws0);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
        }
        }}}}}}} //extra scope so error handler jumps don't cross declarations
        ///////////// < /code generated by GpuConv3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = out_dur
        //grid block size =(out_len*out_wid, nb kern *nb batch)
        //
        conv_rows_stack( float* img, float* kern, float* bias, float* out,
        int img_len, int img_wid, int img_dur,
        int kern_height, int kern_wid, int kern_dur,
        int nkern, int input_channels,
        int dr, int dc, int dt,
        int img_stride_frame, int img_stride_col, int img_stride_row,
        int img_stride_ochannel, int img_stride_batch,
        int kern_stride_frame, int kern_stride_col, int kern_stride_row,
        int kern_stride_stack, int kern_stride_okern)
        {
            int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
            float __shared__ *d_img, *d_kern;
            out_len = int( (img_len - kern_height) / dr )+1;
            out_wid = int( (img_wid - kern_wid) / dc )+1;
            out_dur = int( (img_dur - kern_dur) / dt )+1;
            batch_id= blockIdx.y/nkern;
            kern_id = blockIdx.y - batch_id*nkern;
            const int out_row = blockIdx.x%out_len;
            const int out_col = blockIdx.x/out_len;
            const int out_frame=threadIdx.x;
            img += batch_id*img_stride_batch + out_row*dr*img_stride_row + out_col*dc*img_stride_col+out_frame*dt*img_stride_frame;
            kern += kern_id*kern_stride_okern;
            float sum = 0.0f;
            for (int z = 0; z < input_channels; z++) {//1 for first layer
                for (int k =0; k < kern_height; k++) {
                    for (int l = 0; l < kern_wid; l++) {
                        for (int m = 0; m < kern_dur; m++) {
                            sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
                            kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
                        }
                    }
                }
            }
            out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
            out_frame*nkern+//the output frame
            out_row*out_wid*out_dur*nkern+//the output row
            out_col*out_dur*nkern + //the output_col
            kern_id //the output image (channel)
            ] = sum + bias[kern_id];
        }
        """
        return codeSource
gpu_convd = GpuConv3D()


@local_optimizer([Conv3D])
def local_gpu_conv3d(node):
    """Replace a float32 Conv3D that consumes GPU data by GpuConv3D."""
    if not isinstance(node.op, Conv3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        V, W, b, d = node.inputs
        gpu_out = gpu_convd(as_cuda_ndarray_variable(V),
                            as_cuda_ndarray_variable(W),
                            as_cuda_ndarray_variable(b),
                            d)
        return [host_from_gpu(gpu_out)]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv3d", local_gpu_conv3d)
from __future__ import absolute_import, print_function, division
import numpy
import theano
import theano.tensor as T
from six.moves import xrange
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.misc import strutil
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvGrad3D(GpuOp):
    """
    GPU version of gradient of ConvGrad3D with respect to W.

    Inputs follow the (batch, row, column, time, channel) layout used by
    GpuConv3D; the output dCdW has shape WShape.
    """

    def make_node(self, V, d, WShape, dCdH):
        """
        Build the Apply node; V and dCdH are moved to the GPU.

        Parameters
        ----------
        V
            Visible.
        d
            Strides.
        WShape
            Shapes of the weights -> shape of this op output.
        dCdH
            Other input with what V will be convolved.
        """
        V_ = as_cuda_ndarray_variable(V)
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)
        broad = (False,) * 5
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[CudaNdarrayType(dtype=V_.dtype,
                                                     broadcastable=broad)()])

    def perform_(self, node, inputs, output_storage):
        # Pure-python reference implementation.  The trailing underscore
        # keeps Theano from calling it; the C code below is what runs.
        V, d, WShape, dCdH = inputs
        print("GpuConvGrad3D python code (warning not updated to new format)")
        # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
        outputHeight = dCdH.shape[2]
        outputWidth = dCdH.shape[3]
        outputDur = dCdH.shape[4]
        assert V.shape[0] == batchSize
        dr, dc, dt = d
        dCdW = numpy.zeros(WShape, dtype=V.dtype)
        # block
        for j in xrange(0, WShape[0]):
            for z in xrange(0, WShape[1]):
                for k in xrange(0, WShape[2]):
                    for l in xrange(0, WShape[3]):
                        # threads
                        for m in xrange(0, WShape[4]):
                            # thread
                            for i in xrange(0, batchSize):
                                for p in xrange(0, outputHeight):
                                    for q in xrange(0, outputWidth):
                                        for r in xrange(0, outputDur):
                                            dCdW[j, z, k, l, m] += dCdH[
                                                i, j, p, q, r] * \
                                                V[i, z, dr * p + k,
                                                  dc * q + l,
                                                  dt * r + m]
        output_storage[0][0] = dCdW

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Host-side C code: validate inputs, allocate dCdW, launch the
        # convgrad_rows_stack kernel (declared in c_support_code_apply).
        V, d, WShape, dCdH = inputs
        fail = sub['fail']
        dCdW = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConvGrad3D >
        //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(dCdH)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: dCdH must be a 5-d CudaNdArray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(V)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: V must be a 5-d CudaNdArray");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(WShape)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be a 1-d CudaNdArray");
            %(fail)s
        }
        if (PyArray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: d must be a 1-d CudaNdArray");
            %(fail)s
        }
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: 3 stride lengths arguments expected(for row, col, and time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { // for fail
        //Read and check sizes of inputs
        const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
        if (PyArray_DIMS(%(WShape)s)[0] != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must specify a 5-d shape");
            %(fail)s
        }
        if (!PyArray_ISCONTIGUOUS(%(WShape)s))
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be contiguous");
            %(fail)s
        }
        { //for fail
        dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s);
        const int outputChannels = WShape[0];
        const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
        if (WShape[4] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
            %(fail)s
        }
        { //extra scope so fail works
        const int filterHeight = WShape[1];
        const int filterWidth = WShape[2];
        const int filterDur = WShape[3];
        const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
        const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
        const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
        if (vidHeight < filterHeight)
        {
            PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
            %(fail)s
        }
        if (vidWidth < filterWidth)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a width of %%i but V is only %%i pixels wide", filterWidth, vidWidth);
            %(fail)s
        }
        if (vidDur < filterDur)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
            %(fail)s
        }
        { // extra scope so fail works
        //Read and check stride arguments
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
            %(fail)s
        }
        //Compute correctl sized of output
        const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
        const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
        const int outputDur = int( (vidDur - filterDur) / dt ) +1;
        if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
            CudaNdarray_HOST_DIMS(%(dCdH)s)[3] != outputDur)
        {
            PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%i,%%i,%%i,%%i,%%i)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, CudaNdarray_HOST_DIMS(%(dCdH)s)[0], CudaNdarray_HOST_DIMS(%(dCdH)s)[1], CudaNdarray_HOST_DIMS(%(dCdH)s)[2] ,CudaNdarray_HOST_DIMS(%(dCdH)s)[3], CudaNdarray_HOST_DIMS(%(dCdH)s)[4] );
            %(fail)s
        }
        { // extra scope for fail
        npy_intp dims[5];
        dims[0] = outputChannels;
        dims[4] = inputChannels;
        dims[1] = filterHeight;
        dims[2] = filterWidth;
        dims[3] = filterDur;
        if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
            Py_XDECREF(%(dCdW)s);
            %(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(dCdW)s)) {
                PyErr_Format(PyExc_MemoryError, "GpuConvGrad3D: Could not allocated dCdW");
                %(fail)s
            }
        }
        { //for fail
        const int dcdhs4 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[4];
        const int dcdhs3 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[3];
        const int dcdhs1 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[1];
        const int dcdhs2 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[2];
        const int dcdhs0 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[0];
        const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
        const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
        const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
        const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
        const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(dCdW)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool work_complete = false;
        // The thread block size below is WShape[3] (one thread per filter
        // frame), so that is the dimension that must respect the 512
        // threads-per-block limit.  (The original guard tested WShape[4].)
        if(out_contiguous && (version==0||version==-1) && WShape[3]<=512 && !work_complete){
            //conv_rows_stack
            dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
            dim3 threads(WShape[3]);
            int shared_size=0;
            convgrad_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(dCdH)s), CudaNdarray_DEV_DATA(%(dCdW)s),
            vidHeight, vidWidth, vidDur,
            filterHeight, filterWidth, filterDur,
            WShape[0], WShape[1], WShape[2], WShape[3], WShape[4],
            outputHeight,outputWidth,outputDur,
            batchSize, outputChannels, inputChannels,
            dr,dc,dt,
            vs3,vs2,vs1,vs4,vs0,
            dcdhs3,dcdhs2,dcdhs1,dcdhs4,dcdhs0);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
        }
        }}}}} // extra scope for fail
        ///////////// < /code generated by GpuConvGrad3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        # Bug fix: the kernel body previously referenced the undefined
        # identifier `ws1`; the parameter is named `wsh1`.
        codeSource = """
        __global__ void
        //thread block size = WShape[3]
        //grid block size = (WShape[0]*WShape[4],WShape[1]*WShape[2])
        //
        convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
        int img_len, int img_wid, int img_dur,
        int dCdW_len, int dCdW_wid, int dCdW_dur,
        int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
        int out_len, int out_wid, int out_dur,
        int batchSize, int nkern, int nstack,
        int dr, int dc, int dt,
        int img_stride_frame, int img_stride_col, int img_stride_row,
        int img_stride_stack, int img_stride_batch,
        int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
        int dCdW_stride_stack, int dCdW_stride_nkern)
        {
            int __shared__ kern_id, stack_id;
            float __shared__ *d_img, *d_kern;
            kern_id= blockIdx.x%nkern;
            stack_id = blockIdx.x/nkern;
            const int dCdW_row = blockIdx.y%wsh1;
            const int dCdW_col = blockIdx.y/wsh1;
            const int dCdW_frame=threadIdx.x;
            img +=stack_id*img_stride_stack;
            dCdH +=kern_id*dCdW_stride_stack;
            float sum = 0.0f;
            for(int i=0;i<batchSize;i++){
                for(int p=0;p<out_len;p++){
                    for(int q=0;q<out_wid;q++){
                        for(int r=0;r<out_dur;r++){
                            sum += dCdH[i*dCdW_stride_nkern+p*dCdW_stride_row+q*dCdW_stride_col+r*dCdW_stride_frame] *
                            img[i*img_stride_batch+(dr*p+dCdW_row)*img_stride_row+(dc*q+dCdW_col)*img_stride_col+(dt*r+dCdW_frame)*img_stride_frame];
                        }
                    }
                }
            }
            dCdW[kern_id*wsh1*wsh2*wsh3*wsh4+//the good batch
            stack_id+//the output image
            dCdW_row*wsh2*wsh3*wsh4+//the output row
            dCdW_col*wsh3*wsh4 + //the output_col
            dCdW_frame*wsh4] = sum;
        }
        /*
        #block
        for j in xrange(0,WShape[0]):
            for z in xrange(0,WShape[1]):
                for k in xrange(0,WShape[2]):
                    for l in xrange(0,WShape[3]):
                        #threads
                        for m in xrange(0,WShape[4]):
                            #thread
                            for i in xrange(0,batchSize):
                                for p in xrange(0,outputHeight):
                                    for q in xrange(0,outputWidth):
                                        for r in xrange(0,outputDur):
                                            dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        */
        """
        return codeSource
gpu_conv_grad3d = GpuConvGrad3D()


@local_optimizer([ConvGrad3D])
def local_gpu_conv_grad3d(node):
    """Replace a float32 ConvGrad3D fed by GPU data with GpuConvGrad3D."""
    if not isinstance(node.op, ConvGrad3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        V, d, WShape, dCdH = node.inputs
        gpu_out = gpu_conv_grad3d(as_cuda_ndarray_variable(V),
                                  d,
                                  WShape,
                                  as_cuda_ndarray_variable(dCdH))
        return [host_from_gpu(gpu_out)]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_grad3d", local_gpu_conv_grad3d)
from __future__ import absolute_import, print_function, division
import numpy
import theano.tensor as T
from theano.misc import strutil
import theano
from six.moves import xrange
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
from theano.gof import local_optimizer
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.opt import gpu_optimizer
from theano.sandbox.cuda import (CudaNdarrayType, HostFromGpu,
host_from_gpu, GpuOp)
class GpuConvTransp3D(GpuOp):
    """
    The gpu version of ConvTransp3D.
    """

    def __eq__(self, other):
        # Parameterless op: instances are interchangeable by type.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, W, b, d, H, RShape=None):
        """
        Build the Apply node.

        Parameters
        ----------
        W
            Weights (filters).
        b
            Bias, one value per reconstructed channel.
        d
            Strides (row, col, time).
        H
            Hidden representation to transpose-convolve.
        RShape
            Optional requested (height, width, duration) of the
            reconstruction; [-1, -1, -1] means "use the minimal size".
        """
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)
        H_ = as_cuda_ndarray_variable(H)
        if RShape:
            RShape_ = T.as_tensor_variable(RShape)
        else:
            # Placeholder meaning "no explicit shape requested".
            RShape_ = T.as_tensor_variable([-1, -1, -1])
        return theano.Apply(
            self, inputs=[W_, b_, d_, H_, RShape_],
            outputs=[CudaNdarrayType(
                dtype=H_.dtype, broadcastable=(False,) * 5)()])

    def infer_shape(self, node, input_shapes):
        W, b, d, H, RShape = node.inputs
        W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
        # The c_code allocates the output as
        # (batchSize, videoHeight, videoWidth, videoDur, inputChannels)
        # with inputChannels == W.shape[4]; report the same layout here.
        # (The previous version returned W_shape[1] on axis 1, which did
        # not match the allocated output.)
        # NOTE(review): only exact when an explicit RShape was given; with
        # the default [-1, -1, -1] placeholder the spatial entries are -1.
        return [(H_shape[0], RShape[0], RShape[1], RShape[2], W_shape[4])]

    def perform_(self, node, inputs, output_storage):
        # Pure-python reference implementation; trailing underscore keeps
        # Theano from using it in place of the C code.
        W, b, d, H, RShape = inputs
        print("\t\t\t\tGpuConvTransp3D python code still uses old format")
        output_storage[0][0] = computeR(W, b, d, H, RShape)

    def c_code_cache_version(self):
        # Empty tuple: never cache the compiled code.
        return ()

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Host-side C code: validate inputs, allocate and zero R, launch
        # the conv_transp_rows_stack kernel (c_support_code_apply below).
        # Fixes vs. the previous revision: removed a stray `HERE` token
        # that broke compilation, repaired the split identifier
        # `RShape 1` -> `RShape1`, and made the b-stride printf
        # conditional on `verbose` instead of unconditional.
        W, b, d, H, RShape = inputs
        fail = sub['fail']
        R = outputs[0]
        codeSource = """
        ///////////// < code generated by GpuConvTransp3D >
        //printf("\t\t\t\tGpuConvTransp c code\\n");
        //Check dimensionality of inputs
        if (CudaNdarray_NDIM(%(H)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D", CudaNdarray_NDIM(%(H)s));
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(W)s) != 5)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
            %(fail)s
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
            %(fail)s
        }
        if (PyArray_NDIM(%(d)s) != 1)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
            %(fail)s
        }
        //Read and check stride arguments
        if (PyArray_DIMS(%(d)s)[0] != 3)
        {
            PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", PyArray_DIMS(%(d)s)[0]);
            %(fail)s
        }
        { // for fail
        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
        if (dr <= 0 || dc <= 0 || dt <= 0)
        {
            PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
            %(fail)s
        }
        //Read and check sizes of inputs
        { // for fail
        const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
        const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
        if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
        {
            PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
            %(fail)s
        }
        { // for fail
        const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
        if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
        {
            PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
            %(fail)s
        }
        { // for fail
        const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
        const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
        const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
        const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
        const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
        const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
        int videoHeight = (outputHeight-1) * dr + filterHeight;
        int videoWidth = (outputWidth-1) * dc + filterWidth;
        int videoDur = (outputDur-1) * dt + filterDur;
        if (%(RShape)s)
        {
            if (PyArray_NDIM(%(RShape)s) != 1)
            {
                PyErr_Format(PyExc_ValueError, "RShape must be a vector");
                %(fail)s
            }
            if (PyArray_DIMS(%(RShape)s)[0] != 3)
            {
                PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                %(fail)s
            }
            { // for fail
            dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
            dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
            dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
            if (RShape0 != -1)
            {
                if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
                {
                    PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape1, RShape2 );
                    %(fail)s
                }
                videoHeight = RShape0;
                videoWidth = RShape1;
                videoDur = RShape2;
            }
        }
        //Allocate the reconstruction
        npy_intp dims[5];
        dims[0] = batchSize;
        dims[4] = inputChannels;
        dims[1] = videoHeight;
        dims[2] = videoWidth;
        dims[3] = videoDur;
        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
        CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
            Py_XDECREF(%(R)s);
            %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
            if (!(%(R)s)) {
                PyErr_Format(PyExc_MemoryError,"Could not allocate R");
                %(fail)s
            }
        }
        cudaMemset(CudaNdarray_DEV_DATA(%(R)s), 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
        { // for fail
        bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
        int version = -1;
        int verbose = 0;
        bool subsample =(dr>1)||(dc>1)||(dt>1);
        bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
        if (verbose) printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
        bool work_complete = false;
        const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
        const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
        const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
        const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
        const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
        const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
        const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
        const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
        const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
        const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];
        if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
            //conv_transp_rows_stack
            dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
            dim3 threads(videoDur);
            int shared_size=0;
            conv_transp_rows_stack<<<grid, threads, shared_size>>>(
            CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
            videoHeight, videoWidth, videoDur,
            filterHeight, filterWidth, filterDur,
            outputHeight, outputWidth, outputDur,
            outputChannels, inputChannels,
            dr,dc,dt,
            hs3,hs2,hs1,hs4,hs0,
            ws3,ws2,ws1,ws4,ws0,
            CudaNdarray_HOST_STRIDES(%(b)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess == sts)
            {
                work_complete = true;
                if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
            }
            else
            {
                if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
                if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
                PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
                cudaGetErrorString(sts));
                %(fail)s
            }
        }
        if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
            out_contiguous,b_strided,outputDur);
            %(fail)s
        }
        }}}}}} // for fail
        ///////////// < /code generated by GpuConvTransp3D >
        """
        return strutil.render_string(codeSource, locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
        # It runs for every position in the output z, and then computes the gradient for the
        # input pixels that were downsampled to that z-position.
        codeSource = """
        __global__ void
        //thread block size = videoDur
        //grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
        //
        conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
        int img_len, int img_wid, int img_dur,
        int kern_len, int kern_wid, int kern_dur,
        int H_len, int H_wid, int H_dur,
        int nkern, int nstack,
        int dr, int dc, int dt,
        int H_stride_frame, int H_stride_col, int H_stride_row,
        int H_stride_stack, int H_stride_batch,
        int kern_stride_frame, int kern_stride_col, int kern_stride_row,
        int kern_stride_stack, int kern_stride_nkern,
        int bias_stride)
        {
            int __shared__ batch_id, stack_id;
            float __shared__ *d_img, *d_kern;
            batch_id= blockIdx.x/nstack;
            stack_id = blockIdx.x - batch_id*nstack;
            const int R_row = blockIdx.y/img_wid;
            const int R_col = blockIdx.y%img_wid;
            const int R_frame=threadIdx.x;
            const int r = R_row;
            const int c = R_col;
            const int t = R_frame;
            const int ftc = max(0, int(ceil(float(t-kern_dur +1 )/float(dt))));
            const int fcc = max(0, int(ceil(float(c-kern_wid +1)/float(dc))));
            int rc = max(0, int(ceil(float(r-kern_len+1)/float(dr))));
            float sum = 0;
            while(rc < H_len){
                int rk = r - rc * dr;
                if(rk < 0)
                    break;
                int cc = fcc;
                while( cc < H_wid){
                    int ck = c - cc * dc;
                    if(ck < 0)
                        break;
                    int tc = ftc;
                    while(tc < H_dur){
                        int tk = t - tc * dt;
                        if(tk < 0)
                            break;
                        //R[i,j,r,c,t] += numpy.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
                        for(int q=0;q<nkern;q++){
                            sum += kern[q*kern_stride_nkern+stack_id*kern_stride_stack+rk*kern_stride_row+ck*kern_stride_col+tk*kern_stride_frame]*
                            H[batch_id*H_stride_batch+q*H_stride_stack+rc*H_stride_row+cc*H_stride_col+tc*H_stride_frame];
                        }
                        tc += 1;
                    }
                    cc += 1;
                }
                rc += 1;
            }
            R[batch_id*nstack*img_len*img_wid*img_dur+//the good batch
            stack_id+//the output image
            R_row*img_wid*img_dur*nstack+//the output row
            R_col*img_dur*nstack + //the output_col
            R_frame*nstack] = sum + bias[stack_id*bias_stride];
        }
        """
        return codeSource
gpu_conv_transpd = GpuConvTransp3D()


@local_optimizer([ConvTransp3D])
def local_gpu_conv_transp3d(node):
    """Replace a float32 ConvTransp3D fed by GPU data with GpuConvTransp3D."""
    if not isinstance(node.op, ConvTransp3D):
        return
    on_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                 for inp in node.inputs)
    f32_out = all(out.type.dtype == 'float32' for out in node.outputs)
    if on_gpu and f32_out:
        W, b, d, H, RShape = node.inputs
        return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]

# Not enabled by default as we don't want people to use it.
gpu_optimizer.register("local_gpu_conv_transp3d", local_gpu_conv_transp3d)
# If the input size wasn't a multiple of the strides we may need some
# automatic padding to get the right size of reconstruction.
def computeR(W, b, d, H, Rshape=None):
    """Numpy reference implementation of the 3D transposed convolution.

    Computes R[i, j, r, c, t] = b[j] + sum over all (rc, cc, tc, k) with
    d * (rc, cc, tc) + (rk, ck, tk) == (r, c, t) of
    W[k, j, rk, ck, tk] * H[i, k, rc, cc, tc].

    Parameters
    ----------
    W : 5d ndarray
        Filters, shaped (outputChannels, inputChannels, filterHeight,
        filterWidth, filterDur).
    b : 1d ndarray
        Bias, length inputChannels.
    d : sequence of 3 positive ints
        Strides (dr, dc, dt) used by the forward convolution.
    H : 5d ndarray
        Forward-pass outputs, shaped (batchSize, outputChannels,
        outputHeight, outputWidth, outputDur).
    Rshape : sequence of 3 ints, optional
        Requested (height, width, duration) of the reconstruction; each
        must be >= the minimal size. Ignored when None or when
        Rshape[0] == -1.

    Returns
    -------
    R : 5d ndarray
        Shaped (batchSize, inputChannels, videoHeight, videoWidth,
        videoDur), dtype of H.
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal video size the forward pass could have produced H from.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    if Rshape is not None and Rshape[0] != -1:
        # A caller-provided shape may only pad, never crop.
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = numpy.zeros((batchSize, inputChannels, videoHeight,
                     videoWidth, videoDur),
                    dtype=H.dtype)

    # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...}
    #                      sum_{tc,tk | ...} sum_k W[k,j,rk,ck,tk] * H[i,k,rc,cc,tc]
    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                for c in range(videoWidth):
                    # First output column whose filter footprint reaches c
                    # (depends only on c, so hoisted out of the t loop).
                    fcc = max(0, int(numpy.ceil(
                        float(c - filterWidth + 1) / float(dc))))
                    for t in range(videoDur):
                        R[i, j, r, c, t] = b[j]
                        # First output frame/row whose footprint reaches t/r.
                        ftc = max(0, int(numpy.ceil(
                            float(t - filterDur + 1) / float(dt))))
                        rc = max(0, int(numpy.ceil(
                            float(r - filterHeight + 1) / float(dr))))
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    R[i, j, r, c, t] += numpy.dot(
                                        W[:, j, rk, ck, tk],
                                        H[i, :, rc, cc, tc])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from __future__ import absolute_import, print_function, division
import atexit
import errno
import logging
import os
import shutil
import stat
import sys
import textwrap
import warnings
import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano import config
from . import nvcc_compiler
from theano.tensor.basic import register_transfer
# Databases that collect the GPU graph optimizations.
# ignore_newtrees is to speed the optimization as this is the pattern
# we use for optimization. Otherwise, we can iterate 100s of time on
# the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
# Ordered database used to sequence GPU optimization passes.
gpu_seqopt = SequenceDB()
def register_opt(*tags, **kwargs):
    """Return a decorator registering a local optimizer in gpu_optimizer.

    Parameters
    ----------
    *tags : str
        Extra tags under which to register the optimizer (in addition to
        'fast_run', 'fast_compile' and 'gpu').
    **kwargs
        Forwarded to ``gpu_optimizer.register``. A ``name`` keyword, when
        given, overrides the optimizer function's ``__name__``.

    Raises
    ------
    RuntimeError
        If any tag is not a string.
    """
    if any(not isinstance(t, str) for t in tags):
        raise RuntimeError("Bad call to register_opt."
                           " All tags must be strings.", tags)

    def f(local_opt):
        # BUG FIX: was `(kwargs and kwargs.pop('name'))`, which raised
        # KeyError whenever kwargs were given without a 'name' key.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
                               'gpu', *tags, **kwargs)
        return local_opt
    return f
def register_inplace(*tags, **kwargs):
    """Return a decorator registering an inplace local optimizer in optdb.

    The optimizer is wrapped in a TopoOptimizer (with the standard
    inplace-failure warning callback) and registered at position 60 with
    the 'fast_run', 'inplace' and 'gpu' tags plus ``*tags``.
    A ``name`` keyword overrides the function's ``__name__``.
    """
    def f(local_opt):
        # BUG FIX: was `(kwargs and kwargs.pop('name'))`, which raised
        # KeyError whenever kwargs were given without a 'name' key.
        name = kwargs.pop('name', None) or local_opt.__name__
        optdb.register(
            name, TopoOptimizer(
                local_opt, failure_callback=TopoOptimizer.warn_inplace),
            60, 'fast_run', 'inplace', 'gpu', *tags)
        return local_opt
    return f
# Module logger for the old cuda back-end.
_logger_name = 'theano.sandbox.cuda'
_logger = logging.getLogger(_logger_name)

# is_nvcc_available called here to initialize global vars in
# nvcc_compiler module
nvcc_compiler.is_nvcc_available()

# Compile cuda_ndarray.cu
# This need that nvcc (part of cuda) is installed. If it is not, a warning is
# printed and this module will not be working properly (we set `cuda_available`
# to False).

# This variable is True by default, and set to False if nvcc is not
# available or their is no cuda card or something goes wrong when
# trying to initialize cuda.
cuda_available = True

# Global variable to avoid displaying the same warning multiple times.
cuda_warning_is_displayed = False

# This variable is set to True when we enable cuda.(i.e. when use() is called)
cuda_enabled = False
# Code factorized within a function so that it may be called from multiple
# places (which is not currently the case, but may be useful in the future).
def set_cuda_disabled():
    """Mark the CUDA back-end as unusable.

    Flips the module-level `cuda_available` flag to False so the rest of
    this module knows cuda-based code is not going to work.

    There is no point calling this from outside of `cuda.__init__`: it
    has no effect once the module has finished loading.
    """
    global cuda_available, cuda_warning_is_displayed
    cuda_available = False
# cuda_ndarray compile and import
# Directory holding the CUDA sources shipped with this package.
cuda_path = os.path.abspath(os.path.split(__file__)[0])
# Where the compiled extension module lives, inside the Theano compiledir.
cuda_ndarray_loc = os.path.join(config.compiledir, 'cuda_ndarray')
cuda_ndarray_so = os.path.join(
    cuda_ndarray_loc, 'cuda_ndarray.' + get_lib_extension())
# Name of the symlink/copy other compiled modules link against.
libcuda_ndarray_so = os.path.join(
    cuda_ndarray_loc, 'libcuda_ndarray.' + get_lib_extension())
def try_import():
    """
    Load the cuda_ndarray module if present and up to date.
    Return True if loaded correctly, otherwise return False.
    """
    # The compiled .so is considered stale if any of these CUDA sources
    # is at least as new as it.
    cuda_files = (
        'cuda_ndarray.cu',
        'cuda_ndarray.cuh',
        'conv_full_kernel.cu',
        'cnmem.h',
        'cnmem.cpp',
        'conv_kernel.cu')
    stat_times = [os.stat(os.path.join(cuda_path, cuda_file))[stat.ST_MTIME]
                  for cuda_file in cuda_files]
    date = max(stat_times)
    if os.path.exists(cuda_ndarray_so):
        if date >= os.stat(cuda_ndarray_so)[stat.ST_MTIME]:
            return False
    try:
        # If we load a previously-compiled version, config.compiledir should
        # be in sys.path.
        sys.path[0:0] = [config.compiledir]
        import cuda_ndarray.cuda_ndarray
        del sys.path[0]
    except ImportError:
        # NOTE(review): on ImportError the compiledir entry is left in
        # sys.path. Later module-level `from cuda_ndarray.cuda_ndarray
        # import *` statements appear to rely on it still being there
        # after a fresh compilation — confirm before "fixing" this.
        return False
    return True
# Decide whether the cuda_ndarray extension must be (re)compiled.
if not nvcc_compiler.is_nvcc_available() or not theano.config.cxx:
    # It can happen that the file cuda_ndarray.so is already compiled
    # but nvcc is not available. In that case we need to disable the CUDA
    # back-end as we won't be able to compile any new op and we can't only
    # use already compiled GPU op and not the others.
    # Also, if cxx is not available, we need to disable all GPU code.
    set_cuda_disabled()
    compile_cuda_ndarray = False
elif not config.device.startswith('gpu') and config.force_device:
    # We where asked to NEVER use the GPU
    set_cuda_disabled()
    compile_cuda_ndarray = False
else:
    # Add the theano cache directory's cuda_ndarray subdirectory to the
    # list of places that are hard-coded into compiled modules' runtime
    # library search list. This works in conjunction with
    # nvcc_compiler.NVCC_compiler.compile_str which adds this folder during
    # compilation with -L and also adds -lcuda_ndarray when compiling
    # modules.
    nvcc_compiler.add_standard_rpath(cuda_ndarray_loc)
    compile_cuda_ndarray = not try_import()

if compile_cuda_ndarray and cuda_available:
    # Serialize compilation across processes sharing the compiledir.
    get_lock()
    try:
        # Retry to load again in case someone else compiled it
        # while we waited for the lock
        if not try_import():
            try:
                if not nvcc_compiler.is_nvcc_available():
                    set_cuda_disabled()
                if cuda_available:
                    code = open(os.path.join(cuda_path,
                                             "cuda_ndarray.cu")).read()
                    if not os.path.exists(cuda_ndarray_loc):
                        os.makedirs(cuda_ndarray_loc)
                    # If $TMPDIR is defined, nvopencc wants it to exist
                    if 'TMPDIR' in os.environ:
                        tmpdir = os.environ['TMPDIR']
                        if not os.path.exists(tmpdir):
                            os.makedirs(tmpdir)
                    compiler = nvcc_compiler.NVCC_compiler()
                    preargs = ['-O3'] + compiler.compile_args()
                    compiler.compile_str(
                        'cuda_ndarray',
                        code,
                        location=cuda_ndarray_loc,
                        include_dirs=[cuda_path],
                        libs=[config.cublas.lib],
                        preargs=preargs,
                    )
                    from cuda_ndarray.cuda_ndarray import *
            except Exception as e:
                # Any compilation failure disables the back-end instead of
                # breaking the import of theano.
                _logger.error("Failed to compile cuda_ndarray.cu: %s", str(e))
                set_cuda_disabled()
    finally:
        release_lock()

del compile_cuda_ndarray
if cuda_available:
    # NOTE(review): `global` at module scope is a no-op; kept from the
    # original source.
    global cuda_initialization_error_message
    # The module should be compiled.
    from cuda_ndarray.cuda_ndarray import *

    # If necessary,
    # create a symlink called libcuda_ndarray.so
    # which nvcc_compiler.NVCC_compiler uses when linking
    # any module except "cuda_ndarray" itself.
    def ok():
        """
        Check if an existing library exists and can be read.
        """
        try:
            open(libcuda_ndarray_so).close()
            return True
        except IOError:
            return False

    if not ok():
        if sys.platform == "win32":
            # The Python `os` module does not support symlinks on win32.
            shutil.copyfile(cuda_ndarray_so, libcuda_ndarray_so)
        else:
            try:
                os.symlink(cuda_ndarray_so, libcuda_ndarray_so)
            except OSError as e:
                # This may happen for instance when running multiple
                # concurrent jobs, if two of them try to create the
                # symlink simultaneously.
                # If that happens, we verify that the existing symlink is
                # indeed working.
                if getattr(e, 'errno', None) != errno.EEXIST or not ok():
                    raise

    try:
        # This only test if the cuda driver is available and if there
        # is at least one GPU that support cuda. This do not select a
        # device.
        gpu_init()
        cuda_available = True
        cuda_initialization_error_message = ""
        # actively closing our gpu session presents segfault-on-exit on some systems
        atexit.register(gpu_shutdown)
    except EnvironmentError as e:
        cuda_available = False
        cuda_initialization_error_message = " ".join(e.args)
else:
    cuda_initialization_error_message = 'cuda unavailable'
class GpuOp(theano.gof.Op):
    """
    Parent class for all GPU Ops.

    Its only job is to make sure the GPU is initialised and verified the
    first time a GPU Op is prepared for execution.  It is defined in
    __init__.py so that it exists even when `cuda_available` is False
    (this is necessary to avoid breaking the test suite).
    """

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Skip when a successful use() call already selected a device.
        if use.device_number is not None:
            return
        use("gpu",
            force=True,
            default_to_move_computation_to_gpu=False,
            move_shared_float32_to_gpu=False,
            enable_cuda=False)
# We must do those import to be able to create the full doc when
# nvcc is not available
from theano.sandbox.cuda.var import (CudaNdarrayVariable,
CudaNdarrayConstant,
CudaNdarraySharedVariable,
float32_shared_constructor)
from theano.sandbox.cuda.type import CudaNdarrayType
def dnn_available():
    """Check (and cache) whether cuDNN can be used.

    The result and the human-readable reason are cached on the function
    object itself (``dnn_available.avail`` / ``dnn_available.msg``).
    Honors the Theano flag ``dnn.enabled``.

    Returns
    -------
    bool
        True if cuDNN compiles and its header/runtime versions match.

    Raises
    ------
    RuntimeError
        If ``dnn.enabled`` requested cuDNN but it is unusable, if the
        unsupported ``no_check`` mode is requested, or if the detected
        version is mixed or too old.
    """
    if config.dnn.enabled == "False":
        dnn_available.avail = False
        dnn_available.msg = "Disabled by dnn.enabled flag"
    if dnn_available.avail is None and not cuda_available:
        dnn_available.msg = "CUDA not available"
        dnn_available.avail = False
    elif config.dnn.enabled == "no_check":
        # BUG FIX: was `raise RuntimeException(...)`; RuntimeException does
        # not exist in Python, so this line raised NameError instead.
        raise RuntimeError("The old gpu back-end do not support the flag dnn.enabled=no_check")
    elif dnn_available.avail is None:
        dev = active_device_number()
        if device_properties(dev)['major'] < 3:
            dnn_available.msg = "Device not supported"
            dnn_available.avail = False
        else:
            # Try to *compile* (not run) a minimal cuDNN program.
            preambule = textwrap.dedent(
                """
                #include <stdio.h>
                #include <cuda.h>
                #include <cudnn.h>
                #include <cudnn_helper.h>
                """)
            body = textwrap.dedent(
                """
                cudnnHandle_t _handle = NULL;
                cudnnStatus_t err;
                if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
                  fprintf(stderr, "could not create cuDNN handle: %s",
                          cudnnGetErrorString(err));
                  return 1;
                }
                """)
            # to support path that includes spaces, we need to wrap it with double quotes on Windows
            path_wrapper = "\"" if os.name == 'nt' else ""
            params = ["-l", "cudnn"]
            params.extend(['-I%s%s%s' % (path_wrapper, os.path.dirname(__file__), path_wrapper)])
            if config.dnn.include_path:
                params.extend(['-I%s%s%s' % (path_wrapper, config.dnn.include_path, path_wrapper)])
            if config.dnn.library_path:
                params.extend(['-L%s%s%s' % (path_wrapper, config.dnn.library_path, path_wrapper)])
            if config.nvcc.compiler_bindir:
                params.extend(['--compiler-bindir',
                               '%s%s%s' % (path_wrapper, config.nvcc.compiler_bindir, path_wrapper)])
            params.extend([flag for flag in config.nvcc.flags.split(' ') if flag])
            # Do not run here the test program. It would run on the
            # default gpu, not the one selected by the user. If mixed
            # GPU are installed or if the GPUs are configured in
            # exclusive mode, this cause bad detection.
            comp, out, err = nvcc_compiler.NVCC_compiler.try_flags(
                flag_list=params, preambule=preambule, body=body,
                try_run=False, output=True)
            dnn_available.avail = comp
            if not dnn_available.avail:
                dnn_available.msg = (
                    "Can not compile with cuDNN. We got this error:\n" +
                    str(err))
            else:
                # If we can compile, check that we can import and run.
                v = dnn_version()
                if isinstance(v, tuple) and v[0] != v[1]:
                    dnn_available.avail = False
                    dnn_available.msg = ("Mixed dnn version. The header is"
                                         " from one version, but we link with"
                                         " a different version %s" % str(v))
                    raise RuntimeError(dnn_available.msg)
                if v == -1 or v[0] < 4007:
                    # 4007 is the final release of cudnn v4
                    dnn_available.avail = False
                    # BUG FIX: `v` may be the scalar -1 here, in which case
                    # the original `v[0]` raised TypeError.
                    dnn_available.msg = ("Version is too old. Update to v5, was %d." %
                                         (v if v == -1 else v[0]))
                    raise RuntimeError(dnn_available.msg)
                else:
                    dnn_available.avail = comp
    if config.dnn.enabled == "True":
        if not dnn_available.avail:
            raise RuntimeError(
                "You enabled cuDNN, but we aren't able to use it: %s" %
                dnn_available.msg)
    return dnn_available.avail


# Lazily-filled cache: availability flag and explanatory message.
dnn_available.avail = None
dnn_available.msg = None
class DnnVersion(GpuOp):
    """Op whose C code reports the cuDNN version Theano compiled and
    linked against (or -1 when no version information is available)."""

    def c_compiler(self):
        return nvcc_compiler.NVCC_compiler

    def c_headers(self):
        return ['cudnn.h']

    def c_header_dirs(self):
        return [config.dnn.include_path]

    def c_libraries(self):
        return ['cudnn']

    def c_lib_dirs(self):
        return [config.dnn.library_path]

    def c_compile_args(self):
        # Bake the cuDNN library directory into the runtime search path.
        return ['-Wl,-rpath,' + config.dnn.library_path]

    def c_support_code(self):
        return textwrap.dedent(
            """
            #if PY_MAJOR_VERSION >= 3
            #define PyInt_FromLong PyLong_FromLong
            #endif
            """)

    def make_node(self):
        return theano.gof.Apply(self, [], [theano.gof.Generic()()])

    def c_code(self, node, name, inputs, outputs, sub):
        # Substitute the output variable name into the C template.
        return textwrap.dedent(
            """
            #if defined(CUDNN_VERSION)
            %(o)s = PyTuple_Pack(2, PyInt_FromLong(CUDNN_VERSION), PyInt_FromLong(cudnnGetVersion()));
            #else
            %(o)s = PyInt_FromLong(-1);
            #endif
            """) % {'o': outputs[0]}

    def do_constant_folding(self, node):
        # Needed as we do not want to cache this information.
        return False

    def c_code_cache_version(self):
        # Not needed, but make it clear that we do not want to cache this.
        return None
def dnn_version():
    """Return the current cuDNN version we compile with.

    This returns a tuple with the header version and the library
    version we link with. For older cudnn version without version
    information, we return -1.
    """
    if not dnn_available():
        raise Exception(
            "We can't determine the cudnn version as it is not available",
            dnn_available.msg)
    if dnn_version.v is None:
        # Build and run a tiny, unoptimized Theano function evaluating
        # the DnnVersion op; cache its result on the function object.
        compute_version = theano.function([], DnnVersion()(),
                                          theano.Mode(optimizer=None),
                                          profile=False)
        dnn_version.v = compute_version()
    return dnn_version.v


# Cache for the computed version; filled on first successful call.
dnn_version.v = None
if cuda_available:
    # check if their is an old cuda_ndarray that was loading instead of the one
    # we compiled!
    import cuda_ndarray.cuda_ndarray
    if cuda_ndarray_so != cuda_ndarray.cuda_ndarray.__file__:
        _logger.warning("cuda_ndarray was loaded from %s, but Theano expected "
                        "to load it from %s. This is not expected as theano "
                        "should compile it automatically for you. Do you have "
                        "a directory called cuda_ndarray in your "
                        "LD_LIBRARY_PATH environment variable? If so, please "
                        "remove it as it is outdated.",
                        cuda_ndarray.cuda_ndarray.__file__,
                        cuda_ndarray_so)

# Constructor used for float32 shared variables on this back-end.
shared_constructor = float32_shared_constructor
from . import basic_ops
from .basic_ops import (
GpuFromHost, HostFromGpu, GpuElemwise,
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
gpu_flatten, GpuFlatten, GpuShape, GpuAlloc, GpuAllocEmpty, GpuSplit,
GpuJoin, fscalar, fvector, fmatrix, frow, fcol,
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
tensor3, tensor4)
from .basic_ops import (host_from_gpu, gpu_from_host, as_cuda_array,
as_cuda_ndarray_variable)
import cuda_ndarray
from . import opt, dnn
from .rng_curand import CURAND_RandomStreams
def transfer(x, target):
    """Transfer handler for the old cuda back-end.

    Converts `x` to a CudaNdarray variable when `target` is 'gpu'.
    For any other target it falls through and returns None
    (presumably telling register_transfer this handler does not
    apply — confirm against register_transfer's contract).
    """
    if target != 'gpu':
        return None
    return as_cuda_ndarray_variable(x)


register_transfer(transfer)
def use(device,
        force=False,
        default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
        enable_cuda=True,
        test_driver=True):
    """
    Error and warning about CUDA should be displayed only when this
    function is called. We need to be able to load this module only
    to check if it is available!

    Parameters
    ----------
    device : string
        "cpu", "gpu", "gpuN" (N is the device number to use).
    force
        Will always raise an exception if we can't use the gpu.
    default_to_move_computation_to_gpu
        If gpu init succeeded, enable by default optimizations to move
        computations to the gpu.
    move_shared_float32_to_gpu
        If gpu init succeeded, put new shared variables in float32 on the gpu.
    enable_cuda
        If the gpu is correctly enabled, set the variable cuda_enabled to True.
    test_driver
        If True, run a small driver sanity test right after initialization.
    """
    global cuda_enabled, cuda_initialization_error_message
    _logger.warn("The cuda backend is deprecated and will be removed in "
                 "the next release (v0.10). Please switch to the gpuarray backend. "
                 "You can get more information about how to switch at this "
                 "URL:\n https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29\n")
    # The user insisted on a GPU but CUDA is unusable: raise with the most
    # precise cause we know (missing nvcc vs. failed initialization).
    if force and not cuda_available and device.startswith('gpu'):
        if not nvcc_compiler.is_nvcc_available():
            raise EnvironmentError("You forced the use of gpu device '%s', but"
                                   " nvcc was not found. Set it in your PATH "
                                   "environment variable or set the Theano "
                                   "flags 'cuda.root' to its directory"
                                   "" % device)
        else:
            raise EnvironmentError("You forced the use of gpu device %s, "
                                   "but CUDA initialization failed "
                                   "with error:\n%s" % (
                                       device,
                                       cuda_initialization_error_message))
    elif not nvcc_compiler.is_nvcc_available():
        _logger.error("nvcc compiler not found on $PATH. "
                      "Check your nvcc installation and try again.")
        return
    elif not cuda_available:
        error_addendum = ""
        try:
            if cuda_initialization_error_message:
                error_addendum = (" (error: %s)" %
                                  cuda_initialization_error_message)
        except NameError:
            # cuda_initialization_error_message is not available b/c compilation failed
            pass
        _logger.warning("CUDA is installed, but device %s is not available %s",
                        device, error_addendum)
        return

    # Normalize the device spec: 'gpuN' -> int N, 'cpu' -> -1, 'gpu' kept
    # as the string to mean "let the driver pick".
    if device == 'gpu':
        pass
    elif device.startswith('gpu'):
        device = int(device[3:])
    elif device == 'cpu':
        device = -1
    else:
        raise ValueError("Invalid device identifier", device)

    if use.device_number is None:
        # No successful call to use() has been made yet
        if device != 'gpu' and device < 0:
            return
        msg = ("Theano flag device=gpu* (old gpu back-end) only support"
               " floatX=float32. You have floatX=%s. Use the new gpu"
               " back-end with device=cuda* for that value of floatX." %
               config.floatX)
        if config.floatX == 'float16':
            raise RuntimeError(msg)
        elif config.floatX == 'float64':
            warnings.warn(msg)
        # Has PyCUDA already initialized the GPU context
        pycuda_init_dev = False
        if config.pycuda.init:
            import theano.misc.pycuda_init
            pycuda_init_dev = theano.misc.pycuda_init.pycuda_available
        try:
            if pycuda_init_dev:
                use.device_number = active_device_number()
                # This is needed to initialize the cublas handle.
                gpu_init(use.device_number, config.lib.cnmem)
            elif(device != 'gpu'):
                assert isinstance(device, int)
                gpu_init(device, config.lib.cnmem)
                use.device_number = device
                active_device = active_device_number()
                assert active_device == device, (active_device, device)
            else:
                # This mean the driver should select the GPU. As we
                # need to get the device number now, we force the
                # selection of the GPU by the driver now and then we
                # query the active GPU. If we check the active GPU before
                # the device is initialized we will always receive 0
                # event if another device is selected later.
                if not hasattr(cuda_ndarray.cuda_ndarray, 'select_a_gpu'):
                    raise Exception(
                        "Delete your Theano cache. The automatic"
                        " recompilation did not work.")
                cuda_ndarray.cuda_ndarray.select_a_gpu()
                use.device_number = active_device_number()
                # This is needed to initialize the cublas handle.
                gpu_init(use.device_number, config.lib.cnmem)
            if test_driver:
                import theano.sandbox.cuda.tests.test_driver
                theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
            if device_properties(use.device_number)["warpSize"] != 32:
                raise ValueError("Your GPU has a warpSize != 32. Currently"
                                 " we have code that depends on this. Email"
                                 " the Theano mailing list to tell us about"
                                 " this new GPU as we don't know any with"
                                 " this property")

            if config.print_active_device:
                # Build the human-readable CNMeM / cuDNN status strings.
                if config.lib.cnmem:
                    if config.lib.cnmem > 1:
                        cnmem_enabled = "enabled with initial size: %d MB" % config.lib.cnmem
                    else:
                        cnmem = min(config.lib.cnmem, 0.95) * 100
                        cnmem_enabled = "enabled with initial size: %.1f%% of memory" % cnmem
                else:
                    cnmem_enabled = "disabled"
                cudnn_version = "not available"
                warn = None
                try:
                    if dnn_available():
                        (hdr_v, runtime_v) = dnn_version()
                        cudnn_version = runtime_v
                        # 5200 should not print warning with cudnn 5 final.
                        if cudnn_version >= 5200:
                            warn = ("Your cuDNN version is more recent than the one"
                                    " Theano officially supports."
                                    " If you see any problems, try updating Theano or"
                                    " downgrading cuDNN to version 5.1.")
                except Exception:
                    cudnn_version = dnn_available.msg
                print("Using gpu device %d: %s (CNMeM is %s, cuDNN %s)" % (
                    active_device_number(),
                    active_device_name(),
                    cnmem_enabled,
                    cudnn_version,),
                    file=sys.stderr)
                if warn:
                    warnings.warn(warn)

            if device_properties(use.device_number)['regsPerBlock'] < 16384:
                # We will try to use too much register per bloc at many places
                # when there is only 8k register per multi-processor.
                _logger.warning(
                    "You are probably using an old GPU, that Theano"
                    " does not support."
                    " This means GPU code will most likely be slow AND may"
                    " crash when we try to use features"
                    " that your GPU does not support.")
        except (EnvironmentError, ValueError, RuntimeError) as e:
            _logger.error(("ERROR: Not using GPU."
                           " Initialisation of device %s failed:\n%s"),
                          str(device), e)
            cuda_enabled = False
            if force:
                e.args += (("You asked to force this device and it failed."
                            " No fallback to the cpu or other gpu device."),)
                raise

    elif use.device_number != device and device != 'gpu':
        _logger.warning(("Ignoring call to use(%s), GPU number %i "
                         "is already in use."),
                        str(device), use.device_number)

    if move_shared_float32_to_gpu:
        handle_shared_float32(True)

    if enable_cuda:
        cuda_enabled = True

    if default_to_move_computation_to_gpu:
        # Do not add inplace tag here. We do not want to
        # enable/disable gpu opt based on the inplace tag.
        optdb.add_tags('gpu_opt',
                       'fast_compile',
                       'fast_run')
        optdb.add_tags('gpu_after_fusion',
                       'fast_run')
        optdb.add_tags('gpu_scanOp_make_inplace',
                       'fast_run')
    if force:
        try:
            # in case the device if just gpu,
            # we check that the driver init it correctly.
            cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((5, 5))
        except (Exception, NameError) as e:
            # NameError when no gpu present as cuda_ndarray is not loaded.
            e.args += ("ERROR: GPU forced but failed. ",)
            raise


# Device picked by the last successful use() call; None until then.
use.device_number = None
def unuse():
    """
    Undo what was done by a call to
    use('gpu[0-9]', default_to_move_computation_to_gpu=True,
        move_shared_float32_to_gpu=True,
        enable_cuda=True).

    This is used in Pylearn2 tests to enable/disable the GPU when needed.
    After this call, the rest of Theano think the GPU shouldn't be used by
    default.
    """
    global cuda_enabled
    cuda_enabled = False
    handle_shared_float32(False)
    optdb.remove_tags('gpu_opt',
                      'fast_compile',
                      'fast_run')
    optdb.remove_tags('gpu_after_fusion',
                      'fast_run')
    # BUG FIX: use() also tags 'gpu_scanOp_make_inplace'; remove it as well
    # so that unuse() fully reverses use().
    optdb.remove_tags('gpu_scanOp_make_inplace',
                      'fast_run')
def handle_shared_float32(tf):
    """
    Set the default shared type for float32 tensor to CudaNdarrayType.
    This function is intended to be called from use(gpu_index), not directly.
    """
    if not tf:
        # Unregister the GPU constructor and verify it is really gone.
        theano.compile.shared_constructor(float32_shared_constructor, True)
        assert (float32_shared_constructor not in
                theano.compile.shared.constructors)
    else:
        theano.compile.shared_constructor(float32_shared_constructor)
# We can't test the driver during import here as this cause circular
# import dependency. So we also test it in the file theano/__init__.py
if config.device.startswith('gpu'):
    use(device=config.device, force=config.force_device, test_driver=False)
elif config.init_gpu_device.startswith('gpu'):
    # init_gpu_device only initializes a device; it must not redirect
    # computation or shared variables there (hence the False flags below).
    assert config.device == "cpu", (
        "We can use the Theano flag init_gpu_device"
        " only when the Theano flag device=='cpu'")
    _logger.warning(("GPU device %s will be initialized, and used if a GPU is "
                     "needed. However, no computation, nor shared variables, "
                     "will be implicitly moved to that device. If you want "
                     "that behavior, use the 'device' flag instead."),
                    config.init_gpu_device)
    use(device=config.init_gpu_device,
        force=config.force_device,
        default_to_move_computation_to_gpu=False,
        move_shared_float32_to_gpu=False,
        enable_cuda=False, test_driver=False)
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
from __future__ import absolute_import, print_function, division
import logging
import numpy
from theano import Apply, tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp
# Module logger for the blocksparse GPU ops.
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')

# basic_ops can only be imported when the CUDA back-end initialized.
if cuda_available:
    from theano.sandbox.cuda import basic_ops
class GpuSparseBlockGemv(GpuOp):
    """
    GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
    information.

    This should not be directly called since the interface is subject
    to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
    function for a stable interface.
    """
    # `inplace` is the only prop: it takes part in Op equality/hashing.
    __props__ = ('inplace',)
def __init__(self, inplace=False):
    """Create the op; with inplace=True output 0 overwrites input 0."""
    self.inplace = inplace
    if self.inplace:
        # Declare to Theano that output 0 destroys input 0.
        self.destroy_map = {0: [0]}
def make_node(self, o, W, h, inputIdx, outputIdx):
    """Build the Apply node; the single output has the type of `o`."""
    # Move the float inputs to the GPU representation.
    o, W, h = [basic_ops.as_cuda_ndarray_variable(v) for v in (o, W, h)]
    # Rank checks (same order as the original asserts).
    for var, rank in ((o, 3), (W, 4), (h, 3), (inputIdx, 2), (outputIdx, 2)):
        assert var.ndim == rank
    # Both index arrays must hold discrete (integer-like) dtypes.
    assert inputIdx.type.dtype in discrete_dtypes
    assert outputIdx.type.dtype in discrete_dtypes
    return Apply(self, [o, W, h, inputIdx, outputIdx],
                 [o.type()])
def infer_shape(self, node, input_shapes):
    """The output has exactly the shape of the first input (`o`)."""
    o_shape = input_shapes[0]
    return [o_shape]
def c_support_code(self):
    """Return C/CUDA helper code shared by this op's generated code:
    a kernel filling the per-(i, j, batch) pointer lists, two batched
    sgemv kernels (non-transposed and transposed; alpha == beta == 1
    only, per the checks in SgemvBatched), the SgemvBatched host
    dispatcher, and a host helper copying an index array to the GPU
    with cudaMemcpyAsync.
    """
    return """
    __global__ void
    SparseBlockGemv_fill_lists(
      int maxi, int maxj,
      const float **inp_list,
      float **out_list,
      const float **W_list,
      const float *W, int W_str_0, int W_str_1,
      const float *h, int h_str_0, int h_str_1,
      float *out, int o_str_0, int o_str_1,
      const npy_intp *iIdx, int iI_str_0,
      const npy_intp *oIdx, int oI_str_0
    ) {
      int i = threadIdx.x + blockDim.x * blockIdx.x;
      int j = threadIdx.y + blockDim.y * blockIdx.y;
      int b = blockIdx.z;
      if (i >= maxi || j >= maxj) return;
      int p = i + j * maxi + b * maxi * maxj;
      inp_list[p] = &h[b * h_str_0 + i * h_str_1];
      out_list[p] = &out[b * o_str_0 + j * o_str_1];
      W_list[p] = &W[iIdx[b*iI_str_0+i] * W_str_0 +
                     oIdx[b*oI_str_0+j] * W_str_1];
    }

    __global__ void _sgemvBH_N_a1_b1_small(const float *A[], int lda,
                                           const float *x[], int incx,
                                           float *y[], int incy,
                                           int b, int m, int n) {
      for (int p = blockIdx.y * blockDim.y + threadIdx.y; p < b;
           p += gridDim.y * blockDim.y) {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m;
             i += gridDim.x * blockDim.x) {
          float yi = 0.0f;
          const float *Ap = A[p] + i;
          const float *xp = x[p];
          #pragma unroll 32
          for (int j = 0; j < n; j++) {
            yi += Ap[0] * xp[0];
            Ap += lda;
            xp += incx;
          }
          atomicAdd(&y[p][i*incy], yi);
        }
      }
    }

    __global__ void _sgemvBH_T_a1_b1_small(const float *A[], int lda,
                                           const float *x[], int incx,
                                           float *y[], int incy,
                                           int b, int m, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      int p = blockIdx.y * blockDim.y + threadIdx.y;
      if (i >= m || p >= b) return;
      float yi = 0.0f;
      const float *Ap = A[p] + i * lda;
      const float *xp = x[p];
      # pragma unroll 32
      for (int j = 0; j < n; j++) {
        yi += Ap[j] * xp[0];
        xp += incx;
      }
      atomicAdd(&y[p][i*incy], yi);
    }

    static cublasStatus_t SgemvBatched(cublasHandle_t handle,
                                       cublasOperation_t trans,
                                       int m, int n,
                                       const float *alpha,
                                       const float *A[], int lda,
                                       const float *x[], int incx,
                                       const float *beta,
                                       float *y[], int incy, int batchCount) {
      dim3 block(m, batchCount, 1);
      dim3 grid(1, 1, 1);
      cublasPointerMode_t mode;
      cudaError_t err;

      if (m < 512) {
        block.x = 32;
        if (batchCount > 16)
          block.y = 16;
        else
          block.y = batchCount;
      } else {
        block.x = 512;
        block.y = 1;
      }
      grid.x = (m + block.x - 1) / block.x;
      grid.y = (batchCount + block.y - 1) / block.y;
      if (grid.x * grid.y > 65535) {
        grid.y = (65535 / grid.x);
      }
      cublasGetPointerMode(handle, &mode);
      if (mode != CUBLAS_POINTER_MODE_HOST)
        return CUBLAS_STATUS_INVALID_VALUE;
      if (*alpha != 1.0 || *beta != 1.0)
        return CUBLAS_STATUS_INVALID_VALUE;
      if (trans == CUBLAS_OP_N)
        _sgemvBH_N_a1_b1_small<<<grid, block>>>(A, lda, x, incx,
                                                y, incy,
                                                batchCount, m, n);
      else if (trans == CUBLAS_OP_T)
        _sgemvBH_T_a1_b1_small<<<grid, block>>>(A, lda, x, incx,
                                                y, incy,
                                                batchCount, m, n);
      else
        return CUBLAS_STATUS_INVALID_VALUE;
      err = cudaGetLastError();
      if (err != cudaSuccess)
        return CUBLAS_STATUS_EXECUTION_FAILED;
      return CUBLAS_STATUS_SUCCESS;
    }

    static int SparseBlockGemv_copy(PyArrayObject *a, npy_intp *b) {
      cudaError_t err;
      PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
      if (aa == NULL) { return -1; }
      err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
                            cudaMemcpyHostToDevice);
      Py_DECREF(aa);
      if (err != cudaSuccess) {
        PyErr_Format(PyExc_RuntimeError, "Cannot copy index data to GPU (%s)",
                     cudaGetErrorString(err));
        return -1;
      }
      return 0;
    }
    """
def c_support_code_apply(self, node, nodename):
    """Return per-apply C code: static buffers (allocated with
    device_malloc) for the pointer lists and the two index arrays,
    plus a prep helper that grows them when the required size exceeds
    the cached length. Names are made unique via the %(n)s template.
    """
    return """
    /* Statics are initialized with 0 */
    static const float **%(n)s_inp_list;
    static float **%(n)s_out_list;
    static const float **%(n)s_W_list;
    static size_t %(n)s_list_len;
    static npy_intp *%(n)s_iIdx;
    static size_t %(n)s_iIdx_len;
    static npy_intp *%(n)s_oIdx;
    static size_t %(n)s_oIdx_len;

    static int %(n)s_prep(int b, int i, int j, int outsize) {
      int s = b*i*j;
      if (%(n)s_list_len < s) {
        device_free(%(n)s_inp_list);
        device_free(%(n)s_out_list);
        device_free(%(n)s_W_list);
        %(n)s_inp_list = (const float **) device_malloc(s*sizeof(float *));
        if (%(n)s_inp_list == NULL) return -1;
        %(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
        if (%(n)s_out_list == NULL) return -1;
        %(n)s_W_list = (const float **) device_malloc(s*sizeof(float *));
        if (%(n)s_W_list == NULL) return -1;
        %(n)s_list_len = s;
      }
      if (%(n)s_iIdx_len < b*i) {
        device_free(%(n)s_iIdx);
        %(n)s_iIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
        if (%(n)s_iIdx == NULL) return -1;
        %(n)s_iIdx_len = b*i;
      }
      if (%(n)s_oIdx_len < b*j) {
        device_free(%(n)s_oIdx);
        %(n)s_oIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
        if (%(n)s_oIdx == NULL) return -1;
        %(n)s_oIdx_len = b*j;
      }
      return 0;
    }
    """ % dict(n=nodename)
def c_code(self, node, nodename, inputs, outputs, sub):
    """Generate the C code for one apply of GpuSparseBlockGemv.

    Pipeline: (1) alias (inplace) or copy `o` into the output, (2) grow
    the per-apply scratch buffers, (3) upload the index arrays to the
    device, (4) launch a small kernel that builds the per-(batch, iBlk,
    oBlk) pointer lists, and (5) run one batched sgemv over all of them,
    accumulating (beta = 1) into the output.
    """
    o, W, h, inputIdx, outputIdx = inputs
    out = outputs[0]
    if self.inplace:
        # Inplace: output 0 is input 0 itself (declared via destroy_map).
        res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
    else:
        # Non-inplace: allocate a fresh 3d output and copy `o` into it.
        res = """
if (CudaNdarray_prep_output(&%(out)s, 3, CudaNdarray_HOST_DIMS(%(o)s)))
{
// Error already set
%(fail)s
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
// Error already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
    return res + """
if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(o)s)[0],
CudaNdarray_HOST_DIMS(%(h)s)[1],
CudaNdarray_HOST_DIMS(%(o)s)[1],
CudaNdarray_HOST_DIMS(%(o)s)[2]) == -1) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory.");
%(fail)s
}
if (SparseBlockGemv_copy(%(inputIdx)s, %(name)s_iIdx) == -1)
{ %(fail)s }
if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
{ %(fail)s }
{ /* Prepare lists for the batch */
dim3 block;
dim3 grid;
block.x = CudaNdarray_HOST_DIMS(%(h)s)[1];
block.y = CudaNdarray_HOST_DIMS(%(o)s)[1];
grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
if (block.x > 32) {
grid.x = (block.x + 31) / 32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
SparseBlockGemv_fill_lists<<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(h)s)[1], CudaNdarray_HOST_DIMS(%(o)s)[1],
%(name)s_inp_list,
%(name)s_out_list,
%(name)s_W_list,
CudaNdarray_DEV_DATA(%(W)s),
CudaNdarray_HOST_STRIDES(%(W)s)[0], CudaNdarray_HOST_STRIDES(%(W)s)[1],
CudaNdarray_DEV_DATA(%(h)s),
CudaNdarray_HOST_STRIDES(%(h)s)[0], CudaNdarray_HOST_STRIDES(%(h)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_iIdx, PyArray_DIM(%(inputIdx)s, 1),
%(name)s_oIdx, PyArray_DIM(%(outputIdx)s, 1));
}
{ /* Run SgemvBatched */
float alpha = 1.0f;
float beta = 1.0f;
cublasStatus_t err;
cublasOperation_t transA = CUBLAS_OP_N;
int lda = CudaNdarray_HOST_STRIDES(%(W)s)[2];
if (lda == 1) {
transA = CUBLAS_OP_T;
lda = CudaNdarray_HOST_STRIDES(%(W)s)[3];
}
if (lda == 0) lda = 1;
err = SgemvBatched(handle, transA,
CudaNdarray_HOST_DIMS(%(o)s)[2],
CudaNdarray_HOST_DIMS(%(h)s)[2], &alpha,
%(name)s_W_list, lda, %(name)s_inp_list,
CudaNdarray_HOST_STRIDES(%(h)s)[2],
&beta, %(name)s_out_list,
CudaNdarray_HOST_STRIDES(%(o)s)[2],
CudaNdarray_HOST_DIMS(%(o)s)[1] *
CudaNdarray_HOST_DIMS(%(h)s)[1] *
CudaNdarray_HOST_DIMS(%(o)s)[0]);
if (err != CUBLAS_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "SgemvBatched failed(%%s)",
cublasGetErrorString(err));
%(fail)s
}
}
// And we're done!
""" % dict(out=out, h=h, o=o, inputIdx=inputIdx, outputIdx=outputIdx,
           W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self):
    """Version tag for Theano's compiled-C-code cache."""
    version = (12,)
    return version
def grad(self, inputs, grads):
    """Gradients w.r.t. (o, W, h); the index inputs get undefined grads."""
    o, W, h, inputIdx, outputIdx = inputs
    grad_out = grads[0]
    # dL/dW: accumulate block outer products of h and the output gradient.
    w_grad = gpu_sparse_block_outer(W.zeros_like(), h, grad_out,
                                    inputIdx, outputIdx)
    # dL/dh: block gemv with W transposed and the index roles swapped.
    h_grad = gpu_sparse_block_gemv(h.zeros_like(),
                                   W.dimshuffle((1, 0, 3, 2)),
                                   grad_out, outputIdx, inputIdx)
    undef_inp = grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense")
    undef_out = grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")
    return [grad_out, w_grad, h_grad, undef_inp, undef_out]
# Preconstructed op instances: non-inplace and inplace variants.
gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class GpuSparseBlockOuter(GpuOp):
    """
    GPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
    information.

    This op should not be called directly since its interface is
    subject to change without notice. It is involved in the gradient
    of GpuSparseBlockGemv. The gradient is not implemented.
    """

    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # Output 0 reuses (destroys) input 0.
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """Build the Apply node; `alpha` defaults to a float32 constant 1.0."""
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = basic_ops.as_cuda_ndarray_variable(o)
        x = basic_ops.as_cuda_ndarray_variable(x)
        y = basic_ops.as_cuda_ndarray_variable(y)
        if alpha is None:
            alpha = one
        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [o.type()])

    def infer_shape(self, node, input_shapes):
        # Output has the shape of the accumulator input `o`.
        return [input_shapes[0]]

    def c_support_code(self):
        """CUDA kernels and host helpers shared by every apply of this op."""
        return """
__global__ void
SparseBlockOuter_fill_lists(
int maxi, int maxj,
const float **x_list,
const float **y_list,
float **out_list,
const float *x, int x_str_0, int x_str_1,
const float *y, int y_str_0, int y_str_1,
float *out, int o_str_0, int o_str_1,
const npy_intp *xIdx, int xI_str_0,
const npy_intp *yIdx, int yI_str_0
) {
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;
int b = blockIdx.z;
if (i >= maxi || j >= maxj) return;
int p = i + j * maxi + b * maxi * maxj;
x_list[p] = &x[b * x_str_0 + i * x_str_1];
y_list[p] = &y[b * y_str_0 + j * y_str_1];
out_list[p] = &out[xIdx[b * xI_str_0 + i] * o_str_0 +
yIdx[b * yI_str_0 + j] * o_str_1];
}
/* This is tuned for smaller sizes (< 512) since it's what we get normally */
__global__ void _sgerBH_gen_small(const float *x[], int incx,
const float *y[], int incy,
float alpha,
float *A[], int lda,
int b, int m, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= m || j >= n) return;
for (int p = blockIdx.z; p < b; p += gridDim.z) {
atomicAdd(&A[p][j * lda + i],
alpha * x[p][i * incx] * y[p][j * incy]);
}
}
static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
const float *alpha,
const float *x[], int incx,
const float *y[], int incy,
float *A[], int lda,
int batchCount) {
dim3 block(m, n, 1);
dim3 grid(1, 1, batchCount);
cublasPointerMode_t mode;
cudaError_t err;
if (incx == 1) {
if (block.x > 32) {
grid.x = (block.x + 31)/32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
} else {
if (block.y > 32) {
grid.y = (block.y + 31)/32;
block.y = 32;
}
if (block.x * block.y > 512) {
grid.x = (block.x + 15) / 16;
block.x = 16;
}
}
if (grid.x * grid.y * grid.z > 65535) {
if (grid.x * grid.y > 65535)
return CUBLAS_STATUS_INVALID_VALUE;
grid.z = (65535 / (grid.x * grid.y));
}
cublasGetPointerMode(handle, &mode);
if (mode == CUBLAS_POINTER_MODE_HOST) {
_sgerBH_gen_small<<<grid, block>>>(x, incx, y, incy, *alpha, A, lda,
batchCount, m, n);
} else {
return CUBLAS_STATUS_INVALID_VALUE;
}
err = cudaGetLastError();
if (err != cudaSuccess)
return CUBLAS_STATUS_EXECUTION_FAILED;
return CUBLAS_STATUS_SUCCESS;
}
static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
cudaError_t err;
PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
if (aa == NULL) { return -1; }
err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
cudaMemcpyHostToDevice);
Py_DECREF(aa);
if (err != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, "Cannot copy index data to GPU(%s)",
cudaGetErrorString(err));
return -1;
}
return 0;
}
"""

    def c_support_code_apply(self, node, name):
        """Per-apply static scratch buffers (grown on demand, never freed)."""
        return """
/* statics are initialized with 0 */
static float **%(n)s_out_list;
static const float **%(n)s_x_list;
static const float **%(n)s_y_list;
static size_t %(n)s_list_len;
static npy_intp *%(n)s_xIdx;
static size_t %(n)s_xIdx_len;
static npy_intp *%(n)s_yIdx;
static size_t %(n)s_yIdx_len;
static int %(n)s_prep(int b, int i, int j) {
int s = b*i*j;
if (%(n)s_list_len < s) {
device_free(%(n)s_x_list);
device_free(%(n)s_y_list);
device_free(%(n)s_out_list);
%(n)s_x_list = (const float **) device_malloc(s*sizeof(float *));
if (%(n)s_x_list == NULL) return -1;
%(n)s_y_list = (const float **) device_malloc(s*sizeof(float *));
if (%(n)s_y_list == NULL) return -1;
%(n)s_out_list = (float **) device_malloc(s*sizeof(float *));
if (%(n)s_out_list == NULL) return -1;
%(n)s_list_len = s;
}
if (%(n)s_xIdx_len < b*i) {
device_free(%(n)s_xIdx);
%(n)s_xIdx = (npy_intp*) device_malloc(b*i*sizeof(npy_intp));
if (%(n)s_xIdx == NULL) return -1;
%(n)s_xIdx_len = b*i;
}
if (%(n)s_yIdx_len < b*j) {
device_free(%(n)s_yIdx);
%(n)s_yIdx = (npy_intp*) device_malloc(b*j*sizeof(npy_intp));
if (%(n)s_yIdx == NULL) return -1;
%(n)s_yIdx_len = b*j;
}
return 0;
}
""" % dict(n=name)

    def c_code(self, node, name, inputs, outputs, sub):
        """Copy/alias `o` into the output, build pointer lists, then run a
        batched rank-1 (sger-style) update accumulating into the output."""
        o, x, y, xIdx, yIdx, alpha = inputs
        out = outputs[0]
        if self.inplace:
            # In-place: the output aliases `o` directly.
            res = """
Py_XDECREF(%(out)s);
%(out)s = %(o)s;
Py_INCREF(%(out)s);
""" % dict(out=out, o=o)
        else:
            # Non-inplace: allocate a fresh 4d output and copy `o` into it.
            res = """
if (CudaNdarray_prep_output(&%(out)s, 4, CudaNdarray_HOST_DIMS(%(o)s)))
{
// Python error already set
%(fail)s
}
if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
//Error message already set
%(fail)s
}
""" % dict(out=out, o=o, fail=sub['fail'])
        return res + """
if (%(name)s_prep(CudaNdarray_HOST_DIMS(%(x)s)[0],
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(y)s)[1]) == -1) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory.");
%(fail)s
}
if (SparseBlockOuter_copy(%(xIdx)s, %(name)s_xIdx) == -1)
{ %(fail)s }
if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
{ %(fail)s }
{
dim3 block;
dim3 grid;
block.x = CudaNdarray_HOST_DIMS(%(x)s)[1];
block.y = CudaNdarray_HOST_DIMS(%(y)s)[1];
grid.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
if (block.x > 32) {
grid.x = (block.x + 31) / 32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
SparseBlockOuter_fill_lists<<<grid, block>>>(
CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list,
%(name)s_y_list,
%(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
%(name)s_yIdx, PyArray_DIM(%(yIdx)s, 1));
}
{
cublasStatus_t err;
int str_y = CudaNdarray_HOST_STRIDES(%(y)s)[2];
if (str_y == 0) str_y = 1;
int str_x = CudaNdarray_HOST_STRIDES(%(x)s)[2];
if (str_x == 0) str_x = 1;
int str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2];
if (str_out == 0) str_out = 1;
err = SgerBatched(handle,
CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2],
(float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list, str_y,
%(name)s_x_list, str_x,
%(name)s_out_list, str_out,
CudaNdarray_HOST_DIMS(%(x)s)[0] *
CudaNdarray_HOST_DIMS(%(x)s)[1] *
CudaNdarray_HOST_DIMS(%(y)s)[1]);
if (err != CUBLAS_STATUS_SUCCESS) {
if (err == CUBLAS_STATUS_INVALID_VALUE) {
/* The current code would be much too slow for sizes any larger
than this. */
PyErr_SetString(PyExc_ValueError,
"SgerBatched failed, probably because you have your "
"block size too big. The current limit is 65535 for "
"iSize * oSize.");
} else {
PyErr_Format(PyExc_RuntimeError, "SgerBatched failed(%%s)",
cublasGetErrorString(err));
}
%(fail)s
}
}""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
            alpha=alpha, fail=sub['fail'])

    def c_code_cache_version(self):
        # Bump when the generated C code changes.
        return (11,)
# Preconstructed op instances: non-inplace and inplace (destroys input 0).
gpu_sparse_block_outer = GpuSparseBlockOuter(False)
gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
///////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cnmem.h"
#include <cstddef>
#include <vector>
#include <cuda_runtime_api.h>
#if !defined(WIN32) && defined(_MSC_VER)
#define WIN32
#endif
#ifdef WIN32
#include <Windows.h>
#else
#include <pthread.h>
#endif
// Allocation granularity in bytes; requested sizes are rounded up to a multiple of this.
#define CNMEM_GRANULARITY 512
///////////////////////////////////////////////////////////////////////////////////////////////////
// Map a cnmem status code to its symbolic name (C linkage for the public API).
extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
    if( status == CNMEM_STATUS_SUCCESS )
        return "CNMEM_STATUS_SUCCESS";
    if( status == CNMEM_STATUS_CUDA_ERROR )
        return "CNMEM_STATUS_CUDA_ERROR";
    if( status == CNMEM_STATUS_INVALID_ARGUMENT )
        return "CNMEM_STATUS_INVALID_ARGUMENT";
    if( status == CNMEM_STATUS_NOT_INITIALIZED )
        return "CNMEM_STATUS_NOT_INITIALIZED";
    if( status == CNMEM_STATUS_OUT_OF_MEMORY )
        return "CNMEM_STATUS_OUT_OF_MEMORY";
    // Any unrecognized (e.g. future) code falls through here.
    return "CNMEM_STATUS_UNKNOWN_ERROR";
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// --- Debug/error reporting macros (compiled out by default via `#if 0`). ---
#if 0
#ifdef WIN32
#define CNMEM_DEBUG_ERROR(...) do { \
    fprintf(stderr, "Error at line: %d\n", __LINE__); \
    fprintf(stderr, __VA_ARGS__); \
} while(0)
#else
#include <execinfo.h>
// Print the current call stack to stderr (POSIX only; uses backtrace(3)).
static inline void printBacktrace() {
    void *stackBuffer[64];
    int numAddresses = backtrace((void**) &stackBuffer, 64);
    char **addresses = backtrace_symbols(stackBuffer, numAddresses);
    for( int i = 0 ; i < numAddresses ; ++i ) {
        fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
    }
    free(addresses);
}
#define CNMEM_DEBUG_ERROR(...) do { \
    fprintf(stderr, "Error at line: %d\n", __LINE__); \
    fprintf(stderr, __VA_ARGS__); \
    fprintf(stderr, "Backtrace:\n"); \
    printBacktrace(); \
} while(0)
#endif
#else
#define CNMEM_DEBUG_ERROR(...)
#endif
// --- Verbose tracing (disabled by default). ---
#if 0
#define CNMEM_DEBUG_INFO printf
#else
#define CNMEM_DEBUG_INFO(...)
#endif
#if 0 // Enable/disable assertions
#include <cassert>
#define CNMEM_ASSERT assert
#else
#define CNMEM_ASSERT(...)
#endif
// --- Early-return error-checking helpers. Each evaluates its argument once
// --- and returns the mapped cnmem status from the enclosing function on failure.
#define CNMEM_CHECK_TRUE(cond, error) do { \
    if( !(cond) ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \
        return error; \
    } \
} while(0)
#define CNMEM_CHECK(call) do { \
    cnmemStatus_t status = (call); \
    if( status != CNMEM_STATUS_SUCCESS ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \
                          cnmemGetErrorString(status)); \
        return status; \
    } \
} while(0)
// Variant that releases `mutex` before returning on failure.
#define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \
    cnmemStatus_t status = (call); \
    if( status != CNMEM_STATUS_SUCCESS ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \
                          cnmemGetErrorString(status)); \
        (mutex).unlock(); \
        return status; \
    } \
} while(0)
// CUDA-call variants: cudaErrorMemoryAllocation maps to OUT_OF_MEMORY,
// every other CUDA failure maps to CUDA_ERROR.
#define CNMEM_CHECK_CUDA(call) do { \
    cudaError_t cudaError = (call); \
    if( cudaError == cudaErrorMemoryAllocation ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        return CNMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cudaError != cudaSuccess ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        return CNMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
#define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \
    cudaError_t cudaError = (call); \
    if( cudaError == cudaErrorMemoryAllocation ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        (mutex).unlock(); \
        return CNMEM_STATUS_OUT_OF_MEMORY; \
    } \
    else if( cudaError != cudaSuccess ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \
                          cudaGetErrorString(cudaError)); \
        (mutex).unlock(); \
        return CNMEM_STATUS_CUDA_ERROR; \
    } \
} while(0)
// --- OS-level call checking (Win32 uses GetLastError, POSIX the return code). ---
#ifdef WIN32
#define CNMEM_CHECK_WIN32(call, error_code) do { \
    SetLastError(0); /* Clean the flag. */ \
    call; \
    DWORD status = GetLastError(); \
    if( status ) \
        return error_code; \
} while(0)
#else
#define CNMEM_CHECK_PTHREAD(call, error_code) do { \
    int status = call; \
    if( status ) { \
        CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \
        return error_code; \
    } \
} while(0)
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cnmem {
// Round `m` up to the nearest multiple of `n` (n must be > 0).
static inline std::size_t ceilInt(std::size_t m, std::size_t n) {
    CNMEM_ASSERT(n > 0);
    const std::size_t quotient = (m + n - 1) / n;
    return quotient * n;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Thin portable mutex wrapper: CRITICAL_SECTION on Win32, pthread mutex elsewhere.
class Mutex {
#ifdef WIN32
    // `mutable` so the const lock()/unlock() members can enter the section.
    mutable CRITICAL_SECTION mCriticalSection;
#else
    pthread_mutex_t mMutex;
#endif
public:
    /// Initialize the mutex.
    cnmemStatus_t initialize();
    /// Finalize the mutex.
    cnmemStatus_t finalize();
    /// Lock the mutex.
    cnmemStatus_t lock() const;
    /// Unlock the mutex.
    cnmemStatus_t unlock() const;
};

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::initialize() {
#ifdef WIN32
    CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
#if 0
    // Recursive-mutex variant, kept for reference but compiled out.
    pthread_mutexattr_t attr;
    CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR);
    CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR);
    CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::finalize() {
#ifdef WIN32
    CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::lock() const {
#ifdef WIN32
    CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    // Cast away const: the pthread mutex is logically mutable state.
    CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}

///////////////////////////////////////////////////////////////////////////////////////////////////
cnmemStatus_t Mutex::unlock() const {
#ifdef WIN32
    CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR);
#else
    CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR);
#endif
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// A node in a singly linked list of memory regions managed by a Manager.
class Block {
    char *mRegion;          // Pointer to the memory region on the device.
    std::size_t mBytes;     // Size of the region in bytes.
    Block *mSucc;           // Next block in the (address-sorted) list.
    bool mHead;             // True if obtained from parent->allocate or cudaMalloc.

public:
    /// Create a block.
    Block(char *data, std::size_t size, Block *next, bool isHead) :
        mRegion(data), mBytes(size), mSucc(next), mHead(isHead) {}

    /// The data.
    inline const char* getData() const { return mRegion; }
    /// The data (mutable).
    inline char* getData() { return mRegion; }
    /// The size of the block.
    inline std::size_t getSize() const { return mBytes; }
    /// The next block in the linked list.
    inline const Block* getNext() const { return mSucc; }
    /// The next block in the linked list (mutable).
    inline Block* getNext() { return mSucc; }
    /// Is it a head block.
    inline bool isHead() const { return mHead; }

    /// Change the next block.
    inline void setNext(Block *next) { mSucc = next; }
    /// Change the size of the block.
    inline void setSize(std::size_t size) { mBytes = size; }
    /// Set the head flag.
    inline void setHeadFlag(bool isHead) { mHead = isHead; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// A (possibly hierarchical) pool allocator for one device/stream pair.
// Roots own cudaMalloc'ed memory; children allocate from their parent.
class Manager {
    /// The parent manager.
    Manager *mParent;
    /// The children managers.
    std::vector<Manager*> mChildren;
    /// The GPU device where the memory is allocated.
    int mDevice;
    /// The stream this manager is associated with. It could be NULL.
    cudaStream_t mStream;
    /// Is the stream blocking?
    bool mIsStreamBlocking;
    /// The list of used blocks.
    Block *mUsedBlocks;
    /// The list of free blocks.
    Block *mFreeBlocks;
    /// The managed memory size.
    std::size_t mSize;
    /// The flags.
    unsigned mFlags;
    /// To support multi-threading. Each manager has its own mutex.
    Mutex mMutex;

public:
    /// Create an uninitialized manager.
    Manager();
    /// Dtor.
    ~Manager();

    /// Allocate a block of memory.
    cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
    /// Release a block of memory.
    cnmemStatus_t release(void *ptr);
    /// Release memory. It returns true if we have no memory leak.
    cnmemStatus_t releaseAllUnsafe();
    /// Reserve memory for a manager.
    cnmemStatus_t reserve(std::size_t size);
    /// Steal memory from another manager.
    cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
    /// Print the full memory state.
    cnmemStatus_t printMemoryState(FILE *file) const;

    /// The amount of used memory.
    inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
        return getMemoryUnsafe(usedMemory, mUsedBlocks);
    }
    /// The amount of free memory.
    inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
        return getMemoryUnsafe(freeMemory, mFreeBlocks);
    }

    /// Get a specific child based on the stream id.
    cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
    /// Get a specific child based on its index.
    cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
    /// Add a new child.
    cnmemStatus_t addChild(Manager *manager);
    /// The number of children.
    cnmemStatus_t getNumChildren(std::size_t &numChildren) const;

    /// The associated device.
    inline int getDevice() const { return mDevice; }
    /// The flags.
    inline unsigned getFlags() const { return mFlags; }
    /// Get the mutex.
    inline const Mutex* getMutex() const { return &mMutex; }
    /// The size allocated to that manager.
    inline std::size_t getSize() const { return mSize; }
    /// The CUDA stream.
    inline cudaStream_t getStream() const { return mStream; }
    /// Define the parent.
    inline void setParent(Manager *parent) { mParent = parent; }
    /// Define the device.
    inline void setDevice(int device) { mDevice = device; }
    /// Define the stream (also records whether it is a blocking stream).
    inline cnmemStatus_t setStream(cudaStream_t stream) {
        mStream = stream;
#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
        mIsStreamBlocking = false;
#elif CUDART_VERSION < 5050
        mIsStreamBlocking = true;
#else
        unsigned flags = 0;
        CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
        mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
#endif
        return CNMEM_STATUS_SUCCESS;
    }
    /// Define the flags.
    inline void setFlags(unsigned flags) { mFlags = flags; }

private:
    /// The member functions below which are marked "Unsafe" are not thread-safe when called on a
    /// same Manager object. Make sure they are called by a single thread in that case.

    /// Allocate a new block and add it to the free list.
    cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
    /// Release a block from the active list.
    cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
    /// Find the best free node based on the size.
    cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
    /// Extract a node from the list of free blocks.
    cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
    /// Give a free block from that manager.
    cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
    /// Steal a block from another manager.
    cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
    /// The memory consumption of a list.
    cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
    /// Print an internal linked list.
    cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
};
///////////////////////////////////////////////////////////////////////////////////////////////////
// Construct an empty manager: no parent, no memory, device -1 (unset).
Manager::Manager()
    : mParent(NULL)
    , mChildren()
    , mDevice(-1)
    , mStream(NULL)
    , mIsStreamBlocking(false)
    , mUsedBlocks(NULL)
    , mFreeBlocks(NULL)
    , mSize(0)
    , mFlags(CNMEM_FLAGS_DEFAULT)
    , mMutex() {
    // The return status of initialize() is deliberately ignored here (ctor).
    mMutex.initialize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear down: release everything, but only if the device is still selectable.
Manager::~Manager() {
    if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
        return;
    }
    releaseAllUnsafe();
    mMutex.finalize();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register `manager` as a child of this manager (thread-safe).
cnmemStatus_t Manager::addChild(Manager *manager) {
    CNMEM_CHECK(mMutex.lock());
    mChildren.push_back(manager);
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Best-fit allocation from this manager's pool. On success `ptr` points into
// the pool; on exhaustion (after optionally growing) returns OUT_OF_MEMORY
// with ptr == NULL.
cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
    CNMEM_CHECK(mMutex.lock());
    // If the client is not blocking, we have to explicitly synchronize before giving one buffer.
    if( !isBlocking ) {
        CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
    }
    // Find the best fit.
    Block *best = NULL, *prev = NULL;
    CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
    // If there's no block left in the list of free blocks (with a sufficient size). Request a new block.
    if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
        CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
    }
    // Make sure we do have a block or quit.
    if( !best ) {
        ptr = NULL;
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Split the free block if needed.
    CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
    // Push the node to the list of used nodes.
    best->setNext(mUsedBlocks);
    mUsedBlocks = best;
    // Return the new pointer into memory.
    ptr = mUsedBlocks->getData();
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Acquire a fresh chunk (from the parent, or cudaMalloc at the root) and
// insert it into the free list, which is kept sorted by data address.
// Outputs: `curr` is the new block, `prev` its predecessor in the free list.
cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
    // Reset the outputs.
    curr = prev = NULL;
    // Try to allocate data from the parent or the device.
    void *data = NULL;
    if( mParent ) {
        CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
    }
    else {
        CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
        CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
        CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
    }
    // If it failed, there's an unexpected issue.
    CNMEM_ASSERT(data);
    // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
    Block *next = mFreeBlocks;
    for( ; next && next->getData() < data ; next = next->getNext() ) {
        prev = next;
    }
    curr = new Block((char*) data, size, next, true);
    if( !curr ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    if( prev ) {
        prev->setNext(curr);
    }
    else {
        mFreeBlocks = curr;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Detach `curr` from the free list, splitting it first when it is larger than
// `size`; the remainder stays in the free list with its head flag set to
// `stolen`. `prev` must be curr's predecessor (or NULL if curr is the head).
cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
    // We have two cases: 1/ It is the right size so we keep it or 2/ it is too large and we split the node.
    Block *next;
    if( curr->getSize() == size ) {
        next = curr->getNext();
    }
    else {
        std::size_t remaining = curr->getSize()-size;
        Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
        if( !newBlock ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        next = newBlock;
        curr->setSize(size);
    }
    // Redo the "branching" in the nodes.
    if( prev ) {
        prev->setNext(next);
    }
    else {
        mFreeBlocks = next;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Best-fit scan of the free list: pick the smallest block whose size is
// >= `size`. Outputs NULL in `best` when no block is large enough.
cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
    best = NULL, prev = NULL;
    for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) {
        if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) {
            best = temp;
            prev = tempPrev;
        }
        tempPrev = temp;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Find the child manager bound to `stream`; INVALID_ARGUMENT if none matches.
cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
    CNMEM_CHECK(mMutex.lock());
    std::size_t i = 0, numChildren = mChildren.size();
    for( ; i < numChildren ; ++i ) {
        if( mChildren[i]->mStream == stream ) {
            manager = mChildren[i];
            break;
        }
    }
    CNMEM_CHECK(mMutex.unlock());
    return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Fetch the i-th child; INVALID_ARGUMENT if the index is out of range.
cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
    CNMEM_CHECK(mMutex.lock());
    if( i >= mChildren.size() ) {
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    manager = mChildren[i];
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Sum the sizes of every block in the list starting at `head`.
cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
    std::size_t total = 0;
    const Block *node = head;
    while( node ) {
        total += node->getSize();
        node = node->getNext();
    }
    size = total;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
#if 0
// NOTE(review): dead code (compiled out). As written it would not build:
// CNMEM_CHECK_OR_UNLOCK takes a mutex argument and `status` is undeclared.
cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
    CNMEM_CHECK(mMutex.lock());
    CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head));
    CNMEM_CHECK(mMutex.unlock());
    return status;
}
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
// Thread-safe count of registered child managers.
cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
    CNMEM_CHECK(mMutex.lock());
    numChildren = mChildren.size();
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Donate a free block of at least `size` bytes to the caller. Synchronizes
// the stream first so the memory is idle; the Block node itself is deleted
// and ownership of the region transfers with (blockData, blockSize).
cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
    // Make sure the block is not in use any more. It could be too coarse grain and we may change
    // it in the future.
    CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
    // Init the returned values to 0.
    blockData = NULL;
    blockSize = 0;
    // Find the best node to steal and reserve it.
    Block *best = NULL, *prev = NULL;
    CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
    if( !best ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
    blockData = best->getData();
    blockSize = best->getSize();
    // Release the memory used by that block.
    delete best;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump one block list (name, total size, then one line per node) to `file`.
cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
    // Reuse getMemoryUnsafe instead of duplicating its summation loop.
    std::size_t size = 0;
    CNMEM_CHECK(getMemoryUnsafe(size, head));
    fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
    for( const Block *curr = head ; curr ; curr = curr->getNext() ) {
        fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
            (std::size_t) curr,
            (std::size_t) curr->getData(),
            (std::size_t) curr->getSize(),
            (std::size_t) curr->getNext(),
            (std::size_t) curr->isHead ());
    }
    fprintf(file, "|\n");
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Print the full memory state (summary line plus used and free lists) of this manager
// to "file", then recurse into the parent so the whole chain up to the root is shown.
// The snapshot of this manager is taken under its own mutex; the parent takes its own.
cnmemStatus_t Manager::printMemoryState(FILE *file) const {
    CNMEM_CHECK(mMutex.lock());
    // The stream pointer doubles as a printable identifier for this manager.
    std::size_t streamCode = (std::size_t) mStream;
    std::size_t usedMemory, freeMemory;
    CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
    CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
    fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
        mParent ? "child" : "root",
        mDevice,
        streamCode,
        usedMemory,
        freeMemory);
    CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
    CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
    fprintf(file, "\n");
    CNMEM_CHECK(mMutex.unlock());
    // Walk up the tree after releasing our own lock.
    if( mParent ) {
        CNMEM_CHECK(mParent->printMemoryState(file));
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return a previously allocated pointer to this manager's pool. Releasing NULL is a
// no-op (mirrors free(NULL)); a pointer this manager does not own is reported as
// CNMEM_STATUS_INVALID_ARGUMENT.
cnmemStatus_t Manager::release(void *ptr) {
    // NULL is always accepted.
    if( !ptr ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Serialize against concurrent allocate/release calls.
    CNMEM_CHECK(mMutex.lock());
    // Scan the used list for the block whose payload is "ptr", keeping its predecessor.
    Block *pred = NULL;
    Block *node = mUsedBlocks;
    while( node != NULL && node->getData() != ptr ) {
        pred = node;
        node = node->getNext();
    }
    // Unknown pointer: unlock and report the bad argument.
    if( node == NULL ) {
        CNMEM_CHECK(mMutex.unlock());
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    // Hand the block back to the free list (possibly coalescing with neighbours).
    cnmemStatus_t result = releaseBlockUnsafe(node, pred);
    CNMEM_CHECK(mMutex.unlock());
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Release everything this manager (and its children, recursively) holds. Used blocks
// are force-released only on roots; free head blocks are either given back to the
// parent (children) or cudaFree'd (roots). Caller must hold whatever synchronization
// is needed -- this routine takes no locks ("Unsafe").
cnmemStatus_t Manager::releaseAllUnsafe() {
    // Destroy the children if any.
    for( std::size_t i = 0; i < mChildren.size(); ++i ) {
        Manager *child = mChildren[i];
        CNMEM_CHECK(child->releaseAllUnsafe());
        delete child;
    }
    mChildren.clear();
    // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
    if( !mParent ) {
        // releaseBlockUnsafe moves the list head into the free list each iteration.
        while( mUsedBlocks ) {
            CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
        }
    }
    // We should be having only free blocks that are head blocks. Release those blocks.
    while( mFreeBlocks ) {
        if( mParent ) {
            // A child's memory belongs to its parent: hand it back rather than freeing.
            CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
        }
        else if( mFreeBlocks->isHead() ) {
            // Roots own the device memory: only "head" blocks correspond to actual
            // cudaMalloc allocations, so only those are cudaFree'd.
            void *data = mFreeBlocks->getData();
            CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
            CNMEM_CHECK_CUDA(cudaFree(data));
            CNMEM_DEBUG_INFO(">> success\n");
        }
        Block *block = mFreeBlocks;
        mFreeBlocks = mFreeBlocks->getNext();
        delete block;
    }
    // We shouldn't have any used block left. Or, it means the user is causing memory leaks!
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Move "curr" (whose predecessor in the used list is "prev", or NULL if it is the
// list head) from the used list into the address-sorted free list, coalescing it
// with adjacent free blocks when they are contiguous in memory. Blocks are never
// merged across "head" boundaries, i.e. across distinct cudaMalloc/steal regions.
// No locking is done ("Unsafe"); the caller holds the manager's mutex.
cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
    // The current node cannot be NULL!
    CNMEM_ASSERT(curr != NULL);
    // Change the connection of the node.
    if( prev ) {
        prev->setNext(curr->getNext());
    }
    else {
        mUsedBlocks = curr->getNext();
    }
    // Find the location where this block should be added to the free list.
    // The free list is kept sorted by data address, which makes merging possible.
    prev = NULL;
    Block *iter = mFreeBlocks;
    for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
        prev = iter;
    }
    // Keep track of the successor of pred. We may lose track of it in the following "else".
    Block *next = prev ? prev->getNext() : mFreeBlocks;
    // We first check if we can merge the block with its predecessor in the list and curr can be merged.
    if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
        prev->setSize(prev->getSize() + curr->getSize());
        delete curr;
        curr = prev;
    }
    else if( prev ) {
        prev->setNext(curr);
    }
    else {
        mFreeBlocks = curr;
    }
    // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
    if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
        curr->setSize(curr->getSize() + next->getSize());
        curr->setNext(next->getNext());
        delete next;
    }
    else {
        curr->setNext(next);
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Pre-allocate "size" bytes for this manager's pool and remember the reserved size.
// Thread-safe wrapper around allocateBlockUnsafe; mSize is only updated on success.
cnmemStatus_t Manager::reserve(std::size_t size) {
    CNMEM_CHECK(mMutex.lock());
    // Out-params of allocateBlockUnsafe; their values are not used here.
    Block *allocated = NULL, *pred = NULL;
    CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(allocated, pred, size), mMutex);
    mSize = size;
    CNMEM_CHECK(mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Try to satisfy an allocation of "size" bytes by stealing memory from a related
// manager: from our children when we have any, otherwise through our parent. On
// success the stolen region becomes a new head block of our "used" list and its
// pointer is returned through "stolen". No locking ("Unsafe"); caller holds locks.
cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
    // If we cannot steal, don't even try.
    if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
        stolen = NULL;
        return CNMEM_STATUS_INVALID_ARGUMENT;
    }
    // The stolen block.
    void *data = NULL; std::size_t dataSize = 0;
    if( !mChildren.empty() ) {
        CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
    }
    else if( mParent ) {
        CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
    }
    // Make sure we do have a block of memory or quit.
    if( !data ) {
        stolen = NULL;
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Push the block in the used list.
    // NOTE(review): the NULL check below only fires with a non-throwing operator new;
    // with the default throwing new it is dead code -- confirm the build configuration.
    mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
    if( !mUsedBlocks ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // Return the new pointer into memory.
    stolen = data;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Steal a block of at least "size" bytes from one of our children, then carve the
// stolen region out of our own "used" list so the bookkeeping stays exact. On
// success, "data"/"dataSize" describe the region obtained from the child.
//
// Fix: the previous version maintained two locals that were written but never read
// ("prev", tracking the predecessor of the matching used-list node, and "result",
// tracking the node describing the stolen region); that dead bookkeeping is removed.
// NOTE(review): when the stolen region starts mid-block (sizeBefore > 0), the
// out-params data/dataSize are advanced to describe the tail piece before returning;
// callers appear to rely on the original values -- confirm this is intended.
cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, ::size_t size) {
    // No block found and no room to grow: ask each child in turn to donate a block.
    data = NULL;
    for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
        Manager *child = mChildren[i];
        if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
            break;
        }
    }
    // If no memory space found, simply return. We have failed to allocate.
    if( !data ) {
        return CNMEM_STATUS_OUT_OF_MEMORY;
    }
    // We got a region from a child. Find the node of our "used" list that contains it.
    Block *curr = mUsedBlocks;
    for( ; curr ; curr = curr->getNext() ) {
        if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
            break;
        }
    }
    // Curr points to the node which contains that memory region.
    CNMEM_ASSERT(curr);
    // If it is exactly the same memory region, we are done!!!
    if( curr->getData() == data && curr->getSize() == dataSize ) {
        return CNMEM_STATUS_SUCCESS;
    }
    // Otherwise curr must be split into up to three pieces: [before][stolen][after].
    // Keep track of the node following curr before we rewire the list.
    Block *next = curr->getNext();
    std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
    std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
    // If we have no space between curr->getData and the stolen data, just shrink curr.
    if( sizeBefore == 0 ) {
        curr->setSize(dataSize);
    }
    else {
        // Shrink curr to the "before" piece and insert a new node for the stolen region.
        curr->setSize(sizeBefore);
        Block *block = new Block((char*) data, dataSize, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
        data = (char*) data + dataSize;
        dataSize = sizeAfter;
    }
    // We have space at the end so we may need to add a new node for the "after" piece.
    if( sizeAfter > 0 ) {
        Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
        if( !block ) {
            return CNMEM_STATUS_OUT_OF_MEMORY;
        }
        curr->setNext(block);
        curr = block;
    }
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Global singleton holding one Manager per device ordinal. Its lifetime is governed
// by an explicit reference count (retain()/release()); validity is tracked by a magic
// number in sCtxCheck so that check() works even before create() has run (static
// storage is zero-initialized).
class Context {
    /// Use a magic number to specify that the context is valid.
    enum { CTX_VALID = 0x1f5632a3 };
    /// The reference counting mechanism.
    int mRefCount;
    /// The mutex to increase/decrease the reference counter. TODO: Use atomics.
    Mutex mMutex;
    /// The memory managers, indexed by device ordinal.
    std::vector<Manager> mManagers;
    /// The global context.
    static Context *sCtx;
    /// Use a magic number to specify that the context was created.
    static int sCtxCheck;
public:
    /// Ctor. A new context starts with a reference count of 1.
    Context() : mRefCount(1) { mMutex.initialize(); }
    /// Dtor. Releases all managers' memory.
    ~Context();
    /// Get the managers.
    inline std::vector<Manager>& getManagers() { return mManagers; }
    /// Get a single manager associated with a device.
    inline Manager& getManager(int i) { return mManagers[i]; }
    /// Create the global context.
    static cnmemStatus_t create();
    /// Check that the context was created.
    static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; }
    /// Get the global context.
    static Context* get();
    /// Retain.
    static cnmemStatus_t retain();
    /// Release.
    static cnmemStatus_t release();
};
// Definitions of the global context singleton and its validity marker. Both are
// zero-initialized at static-initialization time and set by Context::create().
Context *Context::sCtx;
int Context::sCtxCheck;
///////////////////////////////////////////////////////////////////////////////////////////////////
// Destructor: release every valid manager's memory (switching the current device to
// each manager's device first), then restore the caller's current device. The CUDA
// return codes are unchecked here -- a destructor has no way to report errors.
Context::~Context() {
    int oldDevice;
    cudaGetDevice(&oldDevice);
    for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) {
        if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers.
            cudaSetDevice(mManagers[i].getDevice());
            mManagers[i].releaseAllUnsafe();
        }
    }
    mManagers.clear();
    mMutex.finalize();
    cudaSetDevice(oldDevice);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Create the global context singleton and mark it as valid.
// NOTE(review): if create() is called while a context already exists, the previous
// instance is overwritten and leaked. The only caller in view is cnmemInit; confirm
// that double initialization is not a supported use case.
cnmemStatus_t Context::create() {
    sCtx = new Context;
    sCtxCheck = CTX_VALID;
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return the global context. Callers must ensure create() has already run and the
// context has not been destroyed; this is only enforced by an assertion.
Context* Context::get() {
    CNMEM_ASSERT(Context::check());
    return Context::sCtx;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Increment the reference counter of the global context under its mutex.
cnmemStatus_t Context::retain() {
    Context *ctx = sCtx;
    CNMEM_CHECK(ctx->mMutex.lock());
    ++ctx->mRefCount;
    CNMEM_CHECK(ctx->mMutex.unlock());
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Decrement the reference counter of the global context; destroy the context when it
// drops to zero. The counter is updated under the context mutex, but the delete
// happens outside the lock -- the mutex is owned by the object being destroyed.
cnmemStatus_t Context::release() {
    CNMEM_CHECK(sCtx->mMutex.lock());
    int refCount = --sCtx->mRefCount;
    CNMEM_CHECK(sCtx->mMutex.unlock());
    if( refCount == 0 ) { // Kill the context.
        delete sCtx;
        Context::sCtx = NULL;
        Context::sCtxCheck = 0;
    }
    return CNMEM_STATUS_SUCCESS;
}
} // namespace cnmem
///////////////////////////////////////////////////////////////////////////////////////////////////
extern "C" {
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////
// Initialize the library: create the global context, one root manager per listed
// device, and one child manager per named stream. Must be called once, by a single
// thread, before any other cnmem function.
// Fixes: "devices" was dereferenced without a NULL check, a negative device ordinal
// would have indexed the manager vector out of bounds, and the status of
// Context::create() was silently discarded.
cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
    // Make sure we have at least one device declared and a valid descriptor array.
    CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
    CNMEM_CHECK_TRUE(devices != NULL, CNMEM_STATUS_INVALID_ARGUMENT);
    // Find the largest ID of the device, rejecting invalid ordinals on the way.
    int maxDevice = 0;
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_TRUE(devices[i].device >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
        if( devices[i].device > maxDevice ) {
            maxDevice = devices[i].device;
        }
    }
    // Create the global context.
    CNMEM_CHECK(cnmem::Context::create());
    cnmem::Context *ctx = cnmem::Context::get();
    // Allocate one manager slot per device ordinal up to the largest requested one.
    std::vector<cnmem::Manager> &managers = ctx->getManagers();
    managers.resize(maxDevice+1);
    // Create a root manager for each device and create the children.
    int oldDevice;
    CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
    for( int i = 0 ; i < numDevices ; ++i ) {
        CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
        std::size_t size = devices[i].size;
        // size == 0 means "let the library choose": take half of the device memory.
        if( size == 0 ) {
            cudaDeviceProp props;
            CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
            size = props.totalGlobalMem / 2;
        }
        CNMEM_CHECK_TRUE(size > 0, CNMEM_STATUS_INVALID_ARGUMENT);
        cnmem::Manager &manager = ctx->getManager(devices[i].device);
        manager.setDevice(devices[i].device);
        manager.setFlags(flags);
        // Round the reservation up to the pool granularity.
        size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
        CNMEM_CHECK(manager.reserve(size));
        // One child manager per named stream. Children are always allowed to grow:
        // their requests fall back to the root when their own reservation runs out.
        for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
            cnmem::Manager *child = new cnmem::Manager;
            child->setParent(&manager);
            child->setDevice(devices[i].device);
            child->setStream(devices[i].streams[j]);
            child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
            if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
                CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
            }
            CNMEM_CHECK(manager.addChild(child));
        }
    }
    // Restore the caller's current device.
    CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Tear down the library: equivalent to a final release of the global context.
// Per the header documentation, not thread-safe; call after all workers have joined.
cnmemStatus_t cnmemFinalize() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Increment the reference count of the global context (see cnmemRelease).
cnmemStatus_t cnmemRetain() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::retain();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Decrement the reference count of the global context; the context (and all managed
// memory) is destroyed when the count reaches zero.
cnmemStatus_t cnmemRelease() {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    return cnmem::Context::release();
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Register a new named stream: attach a child manager for it under the current
// device's root manager. The NULL stream is rejected (it is handled by the root).
// Fix: the status returned by Manager::addChild was silently ignored; it is now
// propagated with CNMEM_CHECK, consistent with the cnmemInit code path.
cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *child = new cnmem::Manager;
    child->setParent(&root);
    child->setDevice(device);
    child->setStream(stream);
    // Children may always grow: they fall back to the root when out of memory.
    child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
    CNMEM_CHECK(root.addChild(child));
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Allocate "size" bytes from the manager serving "stream" on the current device.
// ptr == NULL with size == 0 is a no-op; size == 0 stores NULL into *ptr. When the
// chosen manager is out of memory, the call locks every child manager plus the root
// and attempts to steal a buffer from another manager (see the header docs).
cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    if( !ptr && !size ) {
        return CNMEM_STATUS_SUCCESS;
    }
    else if( !size ) {
        ptr[0] = NULL;
        return CNMEM_STATUS_SUCCESS;
    }
    CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Route the request to the root manager, or the named stream's child manager.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Pool sizes are always multiples of the granularity.
    size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
    cnmemStatus_t result = manager->allocate(ptr[0], size);
    // We failed to allocate but there might still be a buffer available in another manager. Try to
    // steal it.
    if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
        // Try to acquire locks on all the children.
        std::size_t numChildren;
        CNMEM_CHECK(root.getNumChildren(numChildren));
        std::vector<const cnmem::Mutex*> mutexes(numChildren);
        std::size_t numLocked = 0;
        for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
            cnmem::Manager *child;
            CNMEM_CHECK(root.getChild(child, i));
            mutexes[numLocked] = child->getMutex();
            if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
                break;
            }
        }
        // One lock failed, quit. Reduce the damage as much as possible, though.
        if( numLocked != numChildren ) {
            for( std::size_t i = 0 ; i < numLocked ; ++i ) {
                // Best effort: the unlock status is ignored on this bail-out path.
                cnmemStatus_t lockStatus = mutexes[i]->unlock();
            }
            return CNMEM_STATUS_UNKNOWN_ERROR;
        }
        // Grab the lock on the root, first.
        const cnmem::Mutex *rootMutex = root.getMutex();
        CNMEM_CHECK(rootMutex->lock());
        // We acquired all the lock so we try to steal a node from another child.
        if( numLocked == mutexes.size() ) {
            result = manager->stealUnsafe(ptr[0], size);
        }
        for( std::size_t i = 0 ; i < numLocked ; ++i ) {
            cnmemStatus_t lockStatus = mutexes[i]->unlock();
            if( lockStatus != CNMEM_STATUS_SUCCESS ) {
                // Starting from now we are panicking!!! One lock failed to be released, we try
                // we others. We could also give up because we are already screwed. I don't know
                // what's best! Comment are welcome.
                result = lockStatus;
            }
        }
        CNMEM_CHECK(rootMutex->unlock());
    }
    return result;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Return "ptr" to the manager that serves "stream" on the current device.
// Freeing NULL is a no-op, mirroring free().
cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    if( !ptr ) {
        return CNMEM_STATUS_SUCCESS;
    }
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Route the release to the root manager, or the named stream's child manager.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *target = &root;
    if( stream != NULL ) {
        CNMEM_CHECK(root.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->release(ptr);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Report the free and total (free + used) bytes managed by the manager serving
// "stream" on the current device. Linear in the number of blocks (see header docs).
cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Pick the root manager, or the child registered for the named stream.
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *manager = &root;
    if( stream ) {
        CNMEM_CHECK(root.getChildFromStream(manager, stream));
    }
    CNMEM_ASSERT(manager);
    // Take both measurements under a single lock so they are mutually consistent.
    const cnmem::Mutex *mutex = manager->getMutex();
    CNMEM_CHECK(mutex->lock());
    CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex);
    size_t usedMem;
    CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex);
    CNMEM_CHECK(mutex->unlock());
    totalMem[0] = usedMem + freeMem[0];
    return CNMEM_STATUS_SUCCESS;
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Dump the memory state of the manager serving "stream" on the current device
// (and, via the manager, its ancestors) to "file".
cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) {
    CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
    int device;
    CNMEM_CHECK_CUDA(cudaGetDevice(&device));
    // Select the manager: the device root, or the child registered for "stream".
    cnmem::Manager &root = cnmem::Context::get()->getManager(device);
    cnmem::Manager *target = &root;
    if( stream != NULL ) {
        CNMEM_CHECK(root.getChildFromStream(target, stream));
    }
    CNMEM_ASSERT(target);
    return target->printMemoryState(file);
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // extern "C"
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
/** Status codes returned by every cnmem entry point. */
typedef enum
{
    CNMEM_STATUS_SUCCESS = 0,       /**< The call completed successfully. */
    CNMEM_STATUS_CUDA_ERROR,        /**< A CUDA runtime call failed. */
    CNMEM_STATUS_INVALID_ARGUMENT,  /**< An argument was invalid (e.g. a NULL pointer). */
    CNMEM_STATUS_NOT_INITIALIZED,   /**< cnmemInit has not been called. */
    CNMEM_STATUS_OUT_OF_MEMORY,     /**< No suitable memory could be found or allocated. */
    CNMEM_STATUS_UNKNOWN_ERROR      /**< An unexpected internal error (e.g. a lock failure). */
} cnmemStatus_t;
/* ********************************************************************************************* */
/** Flags controlling a memory manager's behaviour, passed to ::cnmemInit. */
typedef enum
{
    CNMEM_FLAGS_DEFAULT = 0,       ///< Default flags.
    CNMEM_FLAGS_CANNOT_GROW = 1,   ///< Prevent the manager from growing its memory consumption.
    CNMEM_FLAGS_CANNOT_STEAL = 2,  ///< Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
/** Description of one device to manage, passed to ::cnmemInit. */
typedef struct cnmemDevice_t_
{
    /** The device number. */
    int device;
    /** The size to allocate for that device. If 0, the implementation chooses the size. */
    size_t size;
    /** The number of named streams associated with the device. The NULL stream is not counted. */
    int numStreams;
    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
    cudaStream_t *streams;
    /** The size reserved for each streams. It can be 0. It can also be NULL. */
    size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each, named stream an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
 * make sure no kernel uses a given buffer before stealing it) and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a
 * complexity linear in the number of allocated blocks, so do not call it in performance
 * critical sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example,
 * file == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
// Convolution border modes (mirroring Theano's conv border modes): FULL produces
// every partial overlap, VALID keeps only fully-overlapping positions.
enum { ConvMode_FULL, ConvMode_VALID };
// Dispatch a 2D convolution of "img" by "kern" into "out" on the GPU. "mode" is one
// of the ConvMode_* values; subsample_rows/cols are the strides; "version" selects a
// specific kernel implementation (-1 = autodetect, per the comment below); "verbose"
// enables debug prints.
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
/*
* version: -1, autodetect, >=0 a specific version to use.
* If it can't be executed, we revert to the reference implementation
*/
int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0,
int max_threads_dim0 = 512
)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (verbose>1)
{
fprintf(stderr,
"INFO: Running conv_valid version=%d,"
" MACRO kern_width=%d with inputs:\n",
version, THEANO_KERN_WID);
fprintf(stderr,
"INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr,
"INFO: out dim: %i %i %i %i out stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols);
}
//Check the output size is valid
if (!(CudaNdarray_HOST_DIMS(out)[2] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2]- CudaNdarray_HOST_DIMS(kern)[2] + 1, subsample_rows) ||
CudaNdarray_HOST_DIMS(out)[3] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3]- CudaNdarray_HOST_DIMS(kern)[3] + 1, subsample_cols) ||
CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0] ||
CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0] ||
CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1])) {
PyErr_SetString(PyExc_ValueError, "GpuConv: sizes don't match");
return -1;
}
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
const int img_len=CudaNdarray_HOST_DIMS(img)[2];
const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
const int out_len=CudaNdarray_HOST_DIMS(out)[2];
const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
const int img_stride_stack= CudaNdarray_HOST_STRIDES(img)[1];
const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = CudaNdarray_is_c_contiguous(img);
bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
bool out_contiguous = CudaNdarray_is_c_contiguous(out);
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//If the lower 2 dims are c_contiguous but flipped, unflipping the
// strides and not flipping the kernel in shared memory
//allows using a version that needs fewer registers (so is faster).
//The "unflipped" variables keep the original value when
//we don't need to unflip, and the new value when we do unflip.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata;
int kern_stride_col_unflipped=kern_stride_col;
int kern_stride_row_unflipped=kern_stride_row;
if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_stride_col_unflipped=1;
kern_stride_row_unflipped=kern_wid;
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
}
//If we remove the restriction
//img_size_byte+kern_size_byte>8*1024, we can enter a condition where
//we would lower the occupancy due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//there is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
//TODO: if not c_contiguous, lower max_thread as we use 22
//registers per thread and we won't execute 2 blocks on one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
if (0)
fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>;
}
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //there is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int version_back = version;
//TODO: if not c_contiguous, lower max_thread as we use 22 registers per thread and we won't execute 2 blocks on one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split.
// The split is needed in case we have too many threads.
// This happens frequently if the kernel length is big.
// Big kernels are frequent in the gradient.
//version 8 needs a minimum kernel length as we force the split.
//version 8 is needed to test this kernel template parameter more easily.
//version 13 loads only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false;
}
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to constraint c)
int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
{
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, "
"grid.x=%i, grid.y=%i, shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
} // else no good nb_splits was found
}
if (1 && (version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1);
int n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
void (*f)(int, int, int, int,
int, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
float*, int, int, int, int,
int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
else
f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],
img_stride_row, img_stride_col,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols, initial_reduce_boundary);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (1 && !work_complete) //conv_reference_valid
{
int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1)
{
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid,
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
}
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid,
kern_len, kern_wid,
out_len, out_wid,
img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
}
else
{
if (verbose)
fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts));
return -1;
}
}
if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" CudaNdarray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0;
}
// GPU "full"-mode batched 2-D convolution:
//   out[b, k] = sum over s of img[b, s] convolved (full) with kern[k, s],
// with optional subsampling of the output rows/cols.
//
// Tries several CUDA kernel implementations in decreasing order of
// specialization; each attempt that fails at launch falls through to the
// next, ending with a slow reference kernel.
//
// Parameters
//   img, kern, out    : 4-D CudaNdarrays (shapes asserted below).
//   subsample_rows/cols : output subsampling factors.
//   version           : force a specific implementation (-1 = heuristic).
//   verbose           : 0 silent, 1 log the kernel used, >1 dump shapes
//                       and launch parameters.
//   max_threads_dim0  : device limit on threads per block along x.
//
// Returns 0 on success, -1 with a Python exception set on failure.
int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
                      CudaNdarray * out, int subsample_rows,
                      int subsample_cols, int version = -1, int verbose=0,
                      int max_threads_dim0=512)
{
    //144 is the biggest static shared size used with compiling this file.
    const int shared_avail = SHARED_SIZE - 150;
    int work_complete = 0;
    if (img->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required img of 4D");
        return -1;
    }
    if (kern->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required kern of 4D");
        return -1;
    }
    if (out->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "required out of 4D");
        return -1;
    }
    // check the size of the output matrix
    // (full-mode output is img + kern - 1 per spatial dim, then subsampled)
    assert (CudaNdarray_HOST_DIMS(out)[2] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1, subsample_rows));
    assert (CudaNdarray_HOST_DIMS(out)[3] == ceil_intdiv(CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1, subsample_cols));
    assert (CudaNdarray_HOST_DIMS(out)[0] == CudaNdarray_HOST_DIMS(img)[0]);
    assert (CudaNdarray_HOST_DIMS(out)[1] == CudaNdarray_HOST_DIMS(kern)[0]);
    assert (CudaNdarray_HOST_DIMS(img)[1] == CudaNdarray_HOST_DIMS(kern)[1]);
    // Cache dimensions and strides in local names.
    // Layout: img (nbatch, nstack, img_len, img_wid),
    //         kern (nkern, nstack, kern_len, kern_wid),
    //         out (nbatch, nkern, out_len, out_wid).
    const int nstack=CudaNdarray_HOST_DIMS(kern)[1];
    const int nbatch=CudaNdarray_HOST_DIMS(img)[0];
    const int nkern=CudaNdarray_HOST_DIMS(kern)[0];
    const int img_wid=CudaNdarray_HOST_DIMS(img)[3];
    const int img_len=CudaNdarray_HOST_DIMS(img)[2];
    const int kern_wid=CudaNdarray_HOST_DIMS(kern)[3];
    const int kern_len=CudaNdarray_HOST_DIMS(kern)[2];
    const int out_wid=CudaNdarray_HOST_DIMS(out)[3];
    const int out_len=CudaNdarray_HOST_DIMS(out)[2];
    const int img_stride_col= CudaNdarray_HOST_STRIDES(img)[3];
    const int img_stride_row=CudaNdarray_HOST_STRIDES(img)[2];
    const int img_stride_stack=CudaNdarray_HOST_STRIDES(img)[1];
    const int img_stride_batch=CudaNdarray_HOST_STRIDES(img)[0];
    const int kern_stride_col= CudaNdarray_HOST_STRIDES(kern)[3];
    const int kern_stride_row=CudaNdarray_HOST_STRIDES(kern)[2];
    const int kern_stride_stack= CudaNdarray_HOST_STRIDES(kern)[1];
    const int kern_stride_nkern=CudaNdarray_HOST_STRIDES(kern)[0];
    // Per-plane element counts and byte sizes, used for the shared-memory
    // budget checks below.
    const int img_size=img_len*img_wid;
    const int kern_size=kern_len*kern_wid;
    const int out_size=out_len*out_wid;
    const int img_size_byte = img_size*sizeof(float);
    const int kern_size_byte = kern_size*sizeof(float);
    //padded image sizes
    // (the image is zero-padded by kern-1 on each side for full conv)
    const int img_wid_padded=img_wid+2*kern_wid-2;
    const int img_len_padded=img_len+2*kern_len-2;
    const int img_size_padded=img_len_padded * img_wid_padded;
    const int img_size_padded_byte = img_size_padded*sizeof(float);
    //const int out_size_byte = out_size*sizeof(float); // unused
    // THEANO_KERN_WID is a compile-time specialization of the kernel
    // width (0 means "generic"); reject a mismatched runtime kernel.
    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
          (THEANO_KERN_WID == 0))){
        PyErr_Format(PyExc_ValueError,
                     "ERROR: This GpuConv code was compiled for"
                     " %d kernel columns, but the kernel we received"
                     " had %d columns!",
                     THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
        return -1;
    }
    // Contiguity flags drive which (templated) kernel variants are legal.
    bool subsample = subsample_rows!=1 || subsample_cols!=1;
    bool img_contiguous = CudaNdarray_is_c_contiguous(img);
    bool kern_contiguous = CudaNdarray_is_c_contiguous(kern);
    bool out_contiguous = CudaNdarray_is_c_contiguous(out);
    bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
    bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
    bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
    bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
    //if the lower 2 dims are c_contiguous but flipped, unflipping the
    //stride and not flipping the kernel in shared memroy
    //allow to use a version that use less registers(so is faster)
    //the unflipped version of variable have the original value when
    //we don't need to unflip it, but have the new value when we unflip it.
    bool kern_flipped=true;
    bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
    float * kern_data_unflipped = kern->devdata;
    int kern_stride_col_unflipped=kern_stride_col;
    int kern_stride_row_unflipped=kern_stride_row;
    if(kern_stride_col_unflipped==-1 && kern_stride_row_unflipped==-kern_wid){
        //the last two dimensions are c_contiguous but flipped!
        // Point at the last element and walk forward instead.
        kern_stride_col_unflipped=1;
        kern_stride_row_unflipped=kern_wid;
        kern_flipped=false;
        kern_contiguous_2d_unflipped = true;
        kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
    }
    if (verbose>1)
    {
        printf("INFO: Running conv_full version=%d,"
               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
               CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
               CudaNdarray_HOST_STRIDES(img)[0],
               CudaNdarray_HOST_STRIDES(img)[1],
               CudaNdarray_HOST_STRIDES(img)[2],
               CudaNdarray_HOST_STRIDES(img)[3]);
        printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
               CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
               CudaNdarray_HOST_STRIDES(kern)[0],
               CudaNdarray_HOST_STRIDES(kern)[1],
               CudaNdarray_HOST_STRIDES(kern)[2],
               CudaNdarray_HOST_STRIDES(kern)[3]);
        printf("INFO: out dim: %i %i %i %i out stride: %i %i %i %i\n",
               CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1],
               CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
               CudaNdarray_HOST_STRIDES(out)[0],
               CudaNdarray_HOST_STRIDES(out)[1],
               CudaNdarray_HOST_STRIDES(out)[2],
               CudaNdarray_HOST_STRIDES(out)[3]);
    }
    // ---- Attempt 1: conv_full_patch_stack_padded (versions 3/4/5) ----
    // Keeps a padded image plane plus one kernel plane in shared memory.
    if (!subsample &&
        out_contiguous &&
        (version==3||version==4||version==5||version==-1) &&
        out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
        (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
        (kern_len > 1 || (img_size_padded_byte+kern_size_byte)<=shared_avail) &&
        !work_complete) //conv_full_patch_stack_padded
    {
        //version 3 without split
        //version 4 with split (more registers)
        //version 5 with split (more registers) low mem version(some restriction and still more register)
        int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
        if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
        if(kern_len==1 && version==5){
            //version 5 don't support kern_len==1 as 1%0 return -1.
            version=-1;
            if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
        }
        if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
        //we pass by ceil_intdiv in case the out_len is not a multiple
        //of nb_split, we want nb_split the number of iteration.
        //Max of 16k of shared memory
        if(version==5)
            while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
        //327 as we use 25 register
        //version 5 will have only 1 block running at a time, so we
        //can use 32 registers per threads, but there is some other stuff that
        //for the limit to bu lower then 512.
        int max_thread = (version!=5?327:450);
        while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
        if(version==-1 && out_size>max_threads_dim0)version=4;
        if(version==-1)version=3;
        // NOTE(review): version can no longer be -1 here, so the next two
        // arms are dead code; only the final "version==3 && nb_split!=1"
        // arm can fire.  Kept as-is to preserve behavior.
        if(version==-1 && nb_split>1) version=4;
        else if(version==-1) version=3;
        //force version 4 when more than 1 split are needed to always execute.
        else if(version==3 && nb_split!=1) version=4;
        assert(version!=3 || nb_split==1);
        assert(version!=5 || kern_len>1);
        assert(version!=-1);
        dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
        dim3 grid(nbatch,nkern);
        int shared_size=img_size_padded_byte + kern_size_byte;
        if(version==5)
            shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
        // Function-pointer dispatch over the template instantiations;
        // the macro below picks the variant matching contiguity,
        // version, and kernel-flip state.
        void (*f)(float*, float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int,
                  int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
        if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
        else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
        else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
        else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
        else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
        else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
        else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
        else assert(false);
        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size>>>
            (img->devdata, kern_data_unflipped, out->devdata,
             img_len, img_wid, kern_len, kern_wid, nkern, nstack,
             img_stride_col, img_stride_row, img_stride_stack,
             img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
             kern_stride_stack, kern_stride_nkern);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose>1)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, threads.z=%i,"
                        " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
                        " out_len=%i, nb_split=%i, version=%i\n",
                        threads.x, threads.y, threads.z,
                        grid.x, grid.y, shared_size,
                        threads.x * threads.y * threads.z,
                        out_len, nb_split, version);
            if (verbose)
                fprintf(stderr,
                        "INFO: used 'conv_full_patch_stack_padded'"
                        " nb_split=%d low_mem=%s\n",
                        nb_split, (version==5?"true":"false"));
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, threads.z=%i,"
                        " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
                        " out_len=%i, nb_split=%i, version=%i\n",
                        threads.x, threads.y, threads.z,
                        grid.x, grid.y, shared_size,
                        threads.x * threads.y * threads.z,
                        out_len, nb_split, version);
            if (verbose)
                fprintf(stderr,
                        "INFO: impl 'conv_full_patch_stack_padded' %s %s"
                        " failed (%s), trying next implementation\n",
                        version==3?"no split": "split",
                        (version==5?"low_mem":"not_low_mem"),
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 2: conv_full_patch (version 0) ----
    // Simple variant: whole image + kernel in shared memory; nstack==1 only.
    if (!subsample && c_contiguous &&
        (version==0||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        nstack == 1 &&// don't implement the stack in the kernel.
        img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_patch
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
        conv_full_patch<<< grid, threads, shared_size>>>
            (img->devdata,
             kern->devdata,
             out->devdata,
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y, shared_size,
                        threads.x * threads.y);
            if (verbose)
                fprintf(stderr,
                        "INFO: impl 'conv_full_patch' failed (%s),"
                        " trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 3: conv_full_load_everything (version 1) ----
    // Permanently disabled with `false &&` because its tests fail.
    if (false && !subsample && //disabled as test fail for this kernel
        (version==1||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
        nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_load_everything
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch);
        int shared_size=(img_size + kern_size)*nstack*sizeof(float);
        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
        //typeof(conv_full_load_everything<0>) f = ;
        void (*f)(float*, float*, float*,
                  int, int, int, int, int, int,
                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
        f = conv_full_load_everything<THEANO_KERN_WID>;
        f<<< grid, threads, shared_size>>>
            (img->devdata,
             kern->devdata,
             out->devdata,
             img_len, img_wid,
             kern_len, kern_wid,
             nkern, nstack,
             CudaNdarray_HOST_STRIDES(img)[3],
             CudaNdarray_HOST_STRIDES(img)[2],
             CudaNdarray_HOST_STRIDES(img)[1],
             CudaNdarray_HOST_STRIDES(img)[0],
             CudaNdarray_HOST_STRIDES(kern)[3],
             CudaNdarray_HOST_STRIDES(kern)[2],
             CudaNdarray_HOST_STRIDES(kern)[1],
             CudaNdarray_HOST_STRIDES(kern)[0]
             );
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y, shared_size,
                        threads.x * threads.y);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
                        " failed (%s), trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Attempt 4: conv_full_patch_stack (version 2) ----
    // Like conv_full_patch but loops over the stack dimension.
    if (!subsample &&
        img_batch_stack_contiguous &&
        out_contiguous &&
        (version==2||version==-1) &&
        out_size<=max_threads_dim0 &&//Maximum of X threads by block
        img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
        !work_complete) //conv_full_patch_stack
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
        void (*f)(float*, float*, float*,
                  int, int, int, int,
                  int, int, int, int,
                  int, int, int, int);
        // Pick the template instantiation matching the 2-D contiguity of
        // img and kern.
        if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
        else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
        else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
        else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
        f<<< grid, threads, shared_size>>>(
            img->devdata,
            kern->devdata,
            out->devdata,
            img_len, img_wid,
            kern_len, kern_wid,
            nkern, nstack,img_stride_col, img_stride_row,
            kern_stride_col, kern_stride_row,
            kern_stride_stack, kern_stride_nkern);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose)
                fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr,
                        "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        threads.x, threads.y, grid.x, grid.y,
                        shared_size, threads.x * threads.y);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
                        cudaGetErrorString(sts));
        }
    }
    // ---- Last resort: conv_reference_full ----
    // Slow but general reference kernel; supports subsampling and any
    // strides.  Failure here is fatal.
    if (1 && !work_complete) //conv_reference_full
    {
        if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        // Dead debug-dump block, kept disabled with `if (0)`.
        if (0)
        {
            if (verbose)
                fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose)
                fprintf(stderr, "  img : %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(img)[0],
                        CudaNdarray_HOST_DIMS(img)[1],
                        CudaNdarray_HOST_DIMS(img)[2],
                        CudaNdarray_HOST_DIMS(img)[3],
                        img->devdata,
                        CudaNdarray_HOST_STRIDES(img)[0],
                        CudaNdarray_HOST_STRIDES(img)[1],
                        CudaNdarray_HOST_STRIDES(img)[2],
                        CudaNdarray_HOST_STRIDES(img)[3]);
            if (verbose)
                fprintf(stderr, "  kern: %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(kern)[0],
                        CudaNdarray_HOST_DIMS(kern)[1],
                        CudaNdarray_HOST_DIMS(kern)[2],
                        CudaNdarray_HOST_DIMS(kern)[3],
                        kern->devdata,
                        CudaNdarray_HOST_STRIDES(kern)[0],
                        CudaNdarray_HOST_STRIDES(kern)[1],
                        CudaNdarray_HOST_STRIDES(kern)[2],
                        CudaNdarray_HOST_STRIDES(kern)[3]
                    );
            if (verbose)
                fprintf(stderr, "  out : %i %i %i %i %p %i %i %i %i\n",
                        CudaNdarray_HOST_DIMS(out)[0],
                        CudaNdarray_HOST_DIMS(out)[1],
                        CudaNdarray_HOST_DIMS(out)[2],
                        CudaNdarray_HOST_DIMS(out)[3],
                        out->devdata,
                        CudaNdarray_HOST_STRIDES(out)[0],
                        CudaNdarray_HOST_STRIDES(out)[1],
                        CudaNdarray_HOST_STRIDES(out)[2],
                        CudaNdarray_HOST_STRIDES(out)[3]);
            if (verbose)
                fprintf(stderr, "  launch params: %i %i %i\n",
                        outsize, n_blocks, n_threads);
            if (verbose)
                fprintf(stderr, "  subsample params: %i %i\n",
                        subsample_rows, subsample_cols);
        }
        conv_reference_full<<<n_blocks, n_threads>>>(
            CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
            CudaNdarray_HOST_DIMS(img)[1],
            CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
            CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
            CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
            img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
            CudaNdarray_HOST_STRIDES(img)[1],
            CudaNdarray_HOST_STRIDES(img)[2],
            CudaNdarray_HOST_STRIDES(img)[3],
            kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
            CudaNdarray_HOST_STRIDES(kern)[1],
            CudaNdarray_HOST_STRIDES(kern)[2],
            CudaNdarray_HOST_STRIDES(kern)[3],
            out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
            CudaNdarray_HOST_STRIDES(out)[1],
            CudaNdarray_HOST_STRIDES(out)[2],
            CudaNdarray_HOST_STRIDES(out)[3],
            subsample_rows, subsample_cols);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose)
                fprintf(stderr, "INFO: used 'conv_reference_full' version"
                        " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
                        " nkern=%d nstack=%d subsample=%d\n",
                        img_len,img_wid, kern_len, kern_wid,
                        out_len, out_wid, nbatch, nkern, nstack, subsample);
            work_complete = true;
        }
        else
        {
            if (verbose)
                fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
                        " shared_size=%i, nb_threads=%i\n",
                        n_threads, 1, n_blocks, 1, 0, n_threads);
            if (verbose)
                fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
                        " trying next implementation\n",
                        cudaGetErrorString(sts));
            PyErr_Format(PyExc_RuntimeError,
                         "ERROR: all implementations failed for"
                         " CudaNdarray_conv_full! (%s)",
                         cudaGetErrorString(sts));
            return -1;
        }
    }
    return 0;
}
// Top-level entry point for the GPU 2-D convolution.
//
// Computes the expected output shape for the requested mode, re-uses the
// caller-supplied 'out' array when its layout already matches (its refcount
// is then incremented), otherwise allocates a fresh one, and dispatches to
// CudaNdarray_conv_valid / CudaNdarray_conv_full.
//
// Returns a new reference to the output CudaNdarray, or NULL with a Python
// exception set on failure.  If 'out' is not re-used, its refcount is left
// untouched.
PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
                 CudaNdarray * out, const int mode,
                 const int subsample_rows, const int subsample_cols,
                 const int version, const int verbose,
                 const int max_threads_dim0 = 512
                 )
{
    // Both operands must be 4-D: (batch, stack, rows, cols) for img and
    // (nkern, stack, rows, cols) for kern.
    if (img->nd != 4 || kern->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
        return NULL;
    }
    // Spatial extent of the (un-subsampled) convolution result.
    int conv_rows, conv_cols;
    if (mode == ConvMode_VALID)
    {
        conv_rows = CudaNdarray_HOST_DIMS(img)[2] - CudaNdarray_HOST_DIMS(kern)[2] + 1;
        conv_cols = CudaNdarray_HOST_DIMS(img)[3] - CudaNdarray_HOST_DIMS(kern)[3] + 1;
    }
    else
    {
        conv_rows = CudaNdarray_HOST_DIMS(img)[2] + CudaNdarray_HOST_DIMS(kern)[2] - 1;
        conv_cols = CudaNdarray_HOST_DIMS(img)[3] + CudaNdarray_HOST_DIMS(kern)[3] - 1;
    }
    // Expected output shape: (nbatch, nkern, out_rows, out_cols).
    int want_dim[4];
    want_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
    want_dim[1] = CudaNdarray_HOST_DIMS(kern)[0];
    want_dim[2] = ceil_intdiv(conv_rows, subsample_rows);
    want_dim[3] = ceil_intdiv(conv_cols, subsample_cols);
    // Re-use 'out' only when it is 4-D, C-contiguous, and of exactly the
    // expected shape; otherwise allocate a new array (which may fail and
    // leave 'result' NULL).
    const bool reuse_out = (out != NULL)
        && (out->nd == 4)
        && CudaNdarray_is_c_contiguous(out)
        && (CudaNdarray_HOST_DIMS(out)[0] == want_dim[0])
        && (CudaNdarray_HOST_DIMS(out)[1] == want_dim[1])
        && (CudaNdarray_HOST_DIMS(out)[2] == want_dim[2])
        && (CudaNdarray_HOST_DIMS(out)[3] == want_dim[3]);
    CudaNdarray * result = NULL;
    if (reuse_out)
    {
        result = out;
        Py_INCREF(result);
        if (verbose)
            fprintf(stderr,
                    "INFO: Conv is reusing the 'out' argument"
                    " structure.\n");
    }
    else
    {
        if (verbose)
        {
            if (out)
                fprintf(stderr,
                        "INFO: Conv is ignoring 'out' argument with wrong"
                        " structure.\n");
            else
                fprintf(stderr,
                        "INFO: Conv don't have an 'out' argument"
                        " structure.\n");
        }
        result = (CudaNdarray*)CudaNdarray_NewDims(4, want_dim);
        // result might be NULL here; handled below.
    }
    // Dispatch.  An unrecognized mode performs no convolution and simply
    // returns the (possibly freshly-allocated) output, matching the
    // original short-circuit behavior.
    int err = 0;
    if (result == NULL)
        err = 1;
    else if (mode == ConvMode_VALID)
        err = CudaNdarray_conv_valid(img, kern, result,
                                     subsample_rows, subsample_cols,
                                     version, verbose,
                                     max_threads_dim0);
    else if (mode == ConvMode_FULL)
        err = CudaNdarray_conv_full(img, kern, result,
                                    subsample_rows, subsample_cols,
                                    version, verbose,
                                    max_threads_dim0);
    if (err)
    {
        // Drop the reference we own (a no-op when result is NULL).
        Py_XDECREF(result);
        return NULL;
    }
    return (PyObject*)result;
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
// NOTE(review): this kernel has no nstack/nkern parameters, so it
// presumably handles a single stack/kernel plane — confirm against callers.
__global__ void
conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
    // Full-mode output extent; stored in shared ints (written redundantly
    // by every thread with the same values).
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    // Partition dynamic shared memory: image plane first, then kernel.
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    img+=img_len*img_wid*batch_id;//the good batch
    load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid);
    __syncthreads();
    // Each thread computes nb_split output rows, stepping by
    // out_len/nb_split (this loop variable shadows the outer out_row).
    for(int out_row=ty;out_row<out_len;out_row+=out_len/nb_split){
        float sum = 0.0f;
        int img_row = out_row;
        for (int row=0; row < kern_len; row++) {//loop over row
            // Flipped-kernel access: walk the image backwards.
            int inverse_row = (img_row-row);
            if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
            const float* idx_in=&d_img[inverse_row*img_wid];
            const float* idx_kern=&d_kern[row*kern_wid];
            int img_col = out_col;
            int col=0,last=0;
            for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
                if(last<0 ||last>=(img_wid))continue;//col outside the image
                sum+=idx_in[last]*idx_kern[col];
            }
        }
        out[batch_id*out_len*out_wid+//the output image
            out_row*out_wid+out_col] = sum;
    }
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void
conv_full_patch( float* img, float* kern, float* out,
                 int img_len, int img_wid,
                 int kern_len, int kern_wid, int nkern, int nstack)
{
    // Full-mode output extent; shared ints written redundantly by all
    // threads with identical values.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    // Dynamic shared memory layout: image plane, then kernel plane.
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    kern+=kern_len*kern_wid*nstack*blockIdx.y;//the good nkern
    img+=img_len*img_wid*batch_id;//the good batch
    load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
    // Last arg 'true': the kernel is flipped while loading.
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid, true);
    __syncthreads();
    float sum = 0.0f;
    for (int row=0; row < kern_len; row++) {//loop over row
        // Skip kernel rows that fall outside the image (zero padding).
        if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
        const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
        const float* idx_kern=&d_kern[row*kern_wid];
        // Clamp the column range to the part of the kernel row that
        // overlaps the image.
        int col=0;
        int max_col=kern_wid;
        int img_col=out_col-kern_wid+1;
        max_col=min(max_col,img_wid-img_col);
        if(img_col<0){col=-img_col;img_col+=col;}
        for (; col < max_col; col++, img_col++) {//loop over col
            sum+=idx_in[col]*idx_kern[col];
        }
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        out_wid*out_len*blockIdx.y+//the output image
        out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread compute only one value for the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
conv_full_patch_stack( float* img, float* kern, float* out,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid, int nkern, int nstack,
                       int img_stride_col, int img_stride_row,
                       int kern_stride_col, int kern_stride_row,
                       int kern_stride_stack, int kern_stride_nkern)
{
    // Full-mode output extent; shared ints written redundantly by all
    // threads with identical values.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
    // Per-block base pointers into the current kernel / batch image.
    float __shared__ *kern_, *img_;
    extern __shared__ float s_data[];
    const int batch_id = blockIdx.x;
    const int nkern_id = blockIdx.y;
    const int out_col = threadIdx.x;
    const int out_row = threadIdx.y;
    const int thread_id = threadIdx.y*blockDim.x+ threadIdx.x;
    // Dynamic shared memory layout: one image plane, then one kernel plane.
    float* d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float* d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    kern_=kern+kern_stride_nkern*nkern_id;//the good nkern
    img_=img+img_len*img_stride_row*(nstack*batch_id);//the good batch
    float sum = 0.0f;
    // Accumulate over the stack dimension, reloading shared memory for
    // each stack plane.
    for (int stack = 0;stack<nstack;stack++){
        load_to_shared(d_img, img_+stack*img_len*img_stride_row, thread_id,nb_thread_id,img_wid,img_len,img_stride_col, img_stride_row,false,img_c_contiguous_2d);
        // 'true' flips the kernel while loading into shared memory.
        load_to_shared(d_kern, kern_+stack*kern_stride_stack, thread_id,nb_thread_id,kern_wid,kern_len,kern_stride_col,kern_stride_row,true,kern_c_contiguous_2d);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
            // Skip kernel rows falling outside the image (zero padding).
            if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
            const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
            const float* idx_kern=&d_kern[row*kern_wid];
            // Clamp column range to the overlap of kernel row and image.
            int col=0;
            int max_col=kern_wid;
            int img_col=out_col-kern_wid+1;
            max_col=min(max_col,img_wid-img_col);
            if(img_col<0){col=-img_col;img_col+=col;}
            for (; col < max_col; col++, img_col++) {//loop over col
                sum+=idx_in[col]*idx_kern[col];
            }
        }
        //Needed as not all thread finish at the same time the loop
        //And we don't want to overwrite the shared memory.
        __syncthreads();
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
        out_wid*out_len*blockIdx.y+//the output image
        out_row*out_wid+out_col] = sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template c_contiguous: if true, the image and kernel have are c_contiguous.(use less registers)
* template split: if true, each thread compute more than 1 output pixel.
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
nb_rows = blockDim.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
float * d_img=&s_data[kern_len*kern_wid];//size of [see fct doc];
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
img_wid_valid=img_wid+2*kern_wid-2;
if(!split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
const int out_row = ty;//output row
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
//The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}else{//low_mem version
//don't need to fill the last rows padding as this is done later.
fill(d_img,img_wid_valid*((kern_len+nb_rows-1)+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some row as when we change of out_row we lost the last load du to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid,
len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
//TODO: fill the last row padding only when needed.
//We always fill the last rows padding event when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
/**
 * Strided dot product of `i` elements, unrolled at compile time by
 * recursive halving (recursion depth O(log i) instead of O(i)).
 *
 * x, y : pointers to the first element of each operand.
 * sx, sy : distance (in floats) between consecutive elements of x and y.
 *
 * Bug fix: the second half must advance x by sx*(i/2), not sy*(i/2).
 * The strides differ at the call site (image stride img_len*img_wid vs
 * kernel stride kern_len*kern_wid), so using sy for x read the wrong
 * image elements.
 */
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
    // First half covers elements [0, i/2); second half starts at element
    // i/2, i.e. offset sx*(i/2) into x and sy*(i/2) into y.
    return everything_dot<i/2>(x, sx, y, sy)
        + everything_dot<(i+1)/2>(x + sx*(i/2), sx, y + sy*(i/2), sy);
}
// Base case: an empty dot product contributes nothing.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
    return 0.0f;
}
// Base case: a single product term (the strides are irrelevant here).
template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
{
    return y[0] * x[0];
}
/**
 * Full convolution where one thread block keeps the whole image stack AND
 * one kernel (all its stack slices) in shared memory at once.
 *
 * grid = nb_batch ; thread block = (out_wid, out_len) — one thread per
 * output pixel.  Loops over the nkern kernels inside the block.
 * dynamic shared memory: nstack*img_len*img_wid + nstack*kern_len*kern_wid
 * floats.
 *
 * template NSTACK: when > 0, the stack loop of the inner dot product is
 * unrolled at compile time via everything_dot<NSTACK>; when 0, a runtime
 * loop over nstack is used.
 *
 * Bug fix: the boundary guards used `irow > img_len` / `icol > img_wid`,
 * which let irow == img_len (resp. icol == img_wid) through and read one
 * row/column past the image in shared memory (into the next stack slice,
 * or into d_kern for the last slice).  Valid indices are [0, img_len) and
 * [0, img_wid), so the guards must use `>=`.
 */
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
                  int img_len, int img_wid,
                  int kern_len, int kern_wid, int nkern, int nstack,
                  int img_stride_col, int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern)
{
    // Every thread writes identical values into these shared scalars, so
    // no synchronization is needed before using them.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len + kern_len - 1;
    out_wid = img_wid + kern_wid - 1;
    nb_thread_id = blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    const int out_col = threadIdx.x;//output col
    const int out_row = threadIdx.y;//output row
    const int thread_id = out_row*out_wid + out_col;
    float * d_img=&s_data[0]; //size [nstack * IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[nstack * img_len * img_wid];//size [nstack * KERNEL_LEN * KERNEL_WID];
    img += blockIdx.x * img_stride_batch;//the good batch
    // load the image to shared memory (all stack slices at once)
    for (int i = thread_id; i < nstack * img_len * img_wid; i += nb_thread_id)
    {
        int stack = i / (img_wid*img_len);
        int row = (i % (img_wid*img_len)) / img_wid;
        int col = (i % (img_wid*img_len)) % img_wid;
        d_img[i] = img[stack*img_stride_stack +row*img_stride_row +col*img_stride_col];
    }
    // The image load above is covered by the __syncthreads() below, which
    // runs before the first read of d_img.
    for (int kern_idx = 0; kern_idx < nkern; ++kern_idx, kern += kern_stride_nkern)
    {
        // load the kernel into shared memory and flip it
        for (int i = thread_id; i < nstack * kern_len * kern_wid; i += nb_thread_id)
        {
            int stack = i / (kern_wid*kern_len);
            int row = (i % (kern_wid*kern_len)) / kern_wid;
            int col = (i % (kern_wid*kern_len)) % kern_wid;
            d_kern[stack*kern_len*kern_wid + (kern_len-1-row)*kern_wid + (kern_wid-1-col)]
                = kern[stack*kern_stride_stack +row*kern_stride_row +col*kern_stride_col];
        }
        __syncthreads();
        float sum = 0.0f;
        for (int row=0; row < kern_len; ++row)
        {
            // Image row touched by kernel row `row` for this output pixel;
            // out-of-image rows contribute zero (skip them).
            int irow = out_row - kern_len+1+row;
            if (irow < 0 || irow >= img_len) continue;
            for (int col = 0; col < kern_wid; ++col)
            {
                int icol = out_col - kern_wid+1+col;
                if (icol < 0 || icol >= img_wid) continue;
                if (NSTACK > 0)
                {
                    // Compile-time unrolled dot product across the stack
                    // dimension (stride = one full slice for each operand).
                    sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
                                                  d_kern + row*kern_wid+col, kern_len*kern_wid);
                }
                else
                {
                    for (int stack = 0; stack < nstack; ++stack)
                    {
                        sum += d_img[stack*img_len*img_wid + irow*img_wid + icol] * d_kern[stack*kern_len*kern_wid+row*kern_wid+col];
                    }
                }
            }
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            out_wid*out_len*kern_idx+//the output image
            out_row*out_wid+out_col] = sum;
        __syncthreads(); //don't start loading another kernel until we're done here
    }
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
int new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int inverse_row = (new_m-j);
const %(type)s* idx_in=&in[inverse_row*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last];
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
*/
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
#include <stdint.h>
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
*/
#define MIN(a, b) ((a) < (b) ? (a) : (b) )
#define MAX(a, b) ((a) < (b) ? (b) : (a) )
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
const uintptr_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
/*
 * Cooperative copy of N consecutive floats from global memory (src) into
 * shared memory (dst), with the work divided round-robin among nb_thread
 * threads (the caller is responsible for any __syncthreads()).
 * flipped: when true, write elements in reversed order
 * (dst[j] = src[N-1-j]) — used to flip a kernel while loading it.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
    if (nb_thread < 64)
    {
        // Few threads: plain strided copy, one element per thread per step.
        if(flipped)
            //TODO very slow on device before 1.3.
            // make access to kern sequential and access to d_kern flipped.
            for(int i=thread_id;i<N;i+=nb_thread)
                dst[i]=src[N - 1 - i];
            //dst[N-1-i]=src[i];
        else
        {
            for(int i = thread_id; i < N; i += nb_thread)
            {
                dst[i] = src[i];
            }
        }
    }
    else
    {
        nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
        // Threads whose id is >= the rounded-down count sit this copy out.
        // Global memory:
        // <-------------------------------------->
        //      A   A   A   A   A // points of 256-byte alignment
        //          dddddddddddddddddddddd // layout of src in global memory
        if (thread_id < nb_thread)
        {
            // Start from the 256-byte-aligned address at or below src so
            // every warp issues aligned (coalesced) loads; positions that
            // fall before src are read-skipped by the guard below.
            const float * my_src_ptr = (const float *)(
                    ((uintptr_t)src) & COALESCED_ALIGN);
            my_src_ptr += thread_id;
            while (my_src_ptr < src + N)
            {
                if (my_src_ptr >= src)
                {
                    int i = my_src_ptr - src;
                    if (flipped)
                    {
                        dst[N - 1 - i] = *my_src_ptr;
                    }
                    else
                    {
                        dst[i] = *my_src_ptr;
                    }
                }
                my_src_ptr += nb_thread;
            }
        }
    }
}
/*
* We load from global memory to shared memory. The outer if is optimized away at compilation.
*/
/*
 * 2-d strided variant: cooperatively copy an nb_row x nb_col patch from
 * global memory (src, with the given element strides) to shared memory
 * (dst, dense), optionally flipped.  At every call site c_contiguous is a
 * compile-time constant, so the outer branch is optimized away.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
                               int nb_thread, const int nb_col, const int nb_row,
                               const int stride_col, const int stride_row,
                               const bool flipped=false, const bool c_contiguous=true){
    if (c_contiguous)
    {
        // Dense source: defer to the 1-d copy (coalesced fast path).
        load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
        return;
    }
    const int total = nb_row * nb_col;
    if (flipped)
    {
        // XXX slow: per-element division/modulo to recover (row, col).
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            // Write in fully reversed order to flip the patch.
            dst[total - 1 - idx] = src[r * stride_row + c * stride_col];
        }
    }
    else
    {
        // XXX slow: per-element division/modulo to recover (row, col).
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[idx] = src[r * stride_row + c * stride_col];
        }
    }
}
// Set N consecutive floats starting at dst to `value`; the work is split
// round-robin among nb_thread cooperating threads (this thread handles
// indices thread_id, thread_id + nb_thread, ...).
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N)
    {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/*
* We load from global memory to shared memory. The outer if is optimized away at compilation.
 * We put the image at the center of another one. Useful to pad an image with 0.
*/
/*
 * Cooperatively copy an nb_row x nb_col patch from global memory into
 * shared memory, centering each row inside a destination row that is
 * wid_pad elements wider on both sides.  Only the payload columns are
 * written here — zeroing the padding is the caller's job (see fill()).
 * c_contiguous is a compile-time constant at the call sites, so the outer
 * branch costs nothing.
 */
__device__ void load_padded_col_to_shared(float * dst, const float * src,
                                          const int thread_id, const int nb_thread,
                                          const int nb_col, const int nb_row,
                                          const int stride_col, const int stride_row,
                                          const int wid_pad, const bool c_contiguous=true){
    const int dst_wid = nb_col + 2 * wid_pad;
    const int total = nb_row * nb_col;
    if (c_contiguous)
    {
        // Dense source: linear reads, padded writes.
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[r * dst_wid + wid_pad + c] = src[idx];
        }
    }
    else
    {
        // Strided source: compute the (row, col) address explicitly.
        for (int idx = thread_id; idx < total; idx += nb_thread)
        {
            const int r = idx / nb_col;
            const int c = idx % nb_col;
            dst[r * dst_wid + wid_pad + c] = src[r * stride_row + c * stride_col];
        }
    }
}
// Compile-time-unrolled dot product of `i` contiguous floats from `data`
// and `kern` (unit stride).  Recursive halving keeps the template
// recursion depth at O(log i); both halves advance by i/2 elements.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
                                                      const float *kern){
    return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
    //return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
// Base case: a single product term.
template<> __device__ float convolutionRowNoFlip<1>(const float *data,
                                                    const float *kern){
    return kern[0] * data[0];
}
// Base case: an empty sum.
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
                                                    const float *kern){
    return 0.0f;
}
/*
 * Accumulate one convolution row into `sum`.
 * template KERN_WIDTH: when > 0, the row width is known at compile time
 * and the fully unrolled template version above is used (kern_wid is then
 * ignored); when 0, a runtime loop handles any width, partially unrolled
 * by the compiler.
 */
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
                                     const float *data,
                                     const float *kern, const int kern_wid){
    if(KERN_WIDTH>0)
        sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
    else
#pragma unroll 8
        for (int col=0; col < kern_wid; col++) {//loop over col
            sum+=data[col]*kern[col];
        }
}
// Compile-time switch between `dst += value` (accumulate into an existing
// output) and `dst = value` (overwrite it).
template<bool accumulate>
__device__ void store_or_accumulate(float& dst, const float value){
    if (accumulate)
    {
        dst += value;
    }
    else
    {
        dst = value;
    }
}
/**
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* Don't implement the stack.
* each thread compute only one value for the output if split is false
* thread block size=out_wid, out_len(or less then out_len if split is true)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+kern_len*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template split: if true, each thread computes more than 1 output pixel
 *                 When true, allows output images bigger than 512 pixels.
* Use more registers.
*/
/*
 * Valid convolution with the whole image and the whole kernel cached in
 * shared memory.  grid = (batch, kern); one thread per output pixel, or
 * several rows per thread when split==true (allows out_len > blockDim.y).
 */
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
conv_patch( float* img, float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack)
{
    // All threads write identical values to these shared scalars, so no
    // synchronization is needed before using them.
    int __shared__ out_len, out_wid, nb_thread_id;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    __shared__ int batch_id, kern_id;
    batch_id = blockIdx.x;
    kern_id = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    const int thread_id = ty*blockDim.x + tx;
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    // Offsets computed from logical sizes: this kernel assumes fully
    // C-contiguous img/kern.  Only one stack slice (img_len*img_wid
    // floats) is loaded below — the stack is not implemented here;
    // NOTE(review): presumably callers only use it with nstack==1 — confirm.
    kern+=kern_len*kern_wid*nstack*kern_id;
    img+=img_len*img_wid*(nstack*batch_id);
    load_to_shared(d_img, img, thread_id,nb_thread_id,img_len*img_wid);
    // flipped_kern: reverse the kernel while loading (real convolution).
    load_to_shared(d_kern, kern, thread_id,nb_thread_id,kern_len*kern_wid,flipped_kern);
    __syncthreads();
    if(!split){
        int out_row = ty;//output row
        float sum = 0.0f;
        for (int row=0; row < kern_len; row++) {//loop over row
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            blockIdx.y*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
    }else{
        // Each thread sweeps several output rows, blockDim.y apart.
        for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
            float sum = 0.0f;
            for (int row=0; row < kern_len; row++) {//loop over row
                const float* idx_kern=&d_kern[row*kern_wid];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
            out[batch_id*out_wid*out_len*nkern+//the good batch
                kern_id*out_wid*out_len+//the output image
                out_row*out_wid+out_col] = sum;
        }
    }
}
/**
* As conv_patch, but implement the stack in the kernel.
 * I keep it separated from conv_patch as it uses more registers and this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split==false else it compute more than 1 values
* thread block size=out_wid, out_len/X (X is any number, optimized value is ceil(out_len/N)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+(preload_full_kern?KERNEL_LEN:1)*kern_wid
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows(1 for normal convolution)
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/
/*
 * Valid convolution with the image cached in shared memory and a loop over
 * the stack (channel) dimension.  grid = (batch, kern); each thread
 * produces one output pixel (several rows per thread when split==true).
 * The kernel is either fully preloaded (preload_full_kern) or streamed one
 * row at a time.  dx/dy are the row/col subsample strides (used only when
 * the `subsample` template flag is true).
 */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
conv_patch_stack( float* img, float* kern, float* out,
                  int img_len, int img_wid, int kern_len, int kern_wid,
                  int out_len, int out_wid,
                  int nkern, int nstack, int img_stride_col,int img_stride_row,
                  int img_stride_stack, int img_stride_batch,
                  int kern_stride_col, int kern_stride_row,
                  int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
    // All threads write the same value; no sync needed before use.
    int __shared__ nb_thread_id;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    int kern_id = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = out_row*out_wid + out_col;
    float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
    float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
    if(!split){
        kern+=kern_stride_nkern*kern_id;//the good nkern
        img+=img_stride_batch*batch_id;//the good batch
        float sum = 0.0f;
        for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
                                                img+=img_stride_stack){
            load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
                           img_stride_col, img_stride_row, false, img_c_contiguous_2d);
            if(preload_full_kern)
                load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
                               kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
            __syncthreads();
            for (int row=0; row < kern_len; row++) {//loop over row
                if(!preload_full_kern){
                    // Stream one kernel row at a time into the single-row
                    // buffer; the surrounding syncs keep loads and reads
                    // from overlapping across threads.
                    __syncthreads();
                    int idx2;
                    if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
                    else idx2=(row)*kern_stride_row;
                    load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                    __syncthreads();
                }
                const float* idx_kern;
                if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                else idx_kern=d_kern;
                const float* idx_in;
                if(subsample)
                    idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
                else
                    idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
            __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
        }
        store_or_accumulate<accumulate>(
                out[batch_id*out_wid*out_len*nkern+//the good batch
                    out_wid*out_len*kern_id+//the output image
                    out_row*out_wid+out_col],sum);
    }else{
        float __shared__ *kern_, *img_;
        int __shared__ out_len_max;
        kern_=kern+kern_stride_nkern*kern_id;//the good nkern
        img_=img+img_stride_batch*batch_id;//the good batch
        //out_len_max must be higher than out_len as we need all threads when we load the image, as blockDim.y is not always a multiple of out_len.
        out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
        //TODO: inverse the out_row and stack loop to not load the data as frequently!
        //TODO: does this happen elsewhere?
        for(;out_row<out_len_max;out_row+=blockDim.y){
            float sum = 0.0f;
            for (int stack = 0;stack<nstack;stack++){
                //TODO: load only the part of the image needed or put the partial result in shared memory
                int idx1=img_stride_stack*stack;
                load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
                               img_stride_col, img_stride_row, false, img_c_contiguous_2d);
                if(preload_full_kern){
                    int idx2=kern_stride_stack*stack;
                    load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
                                   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                }
                __syncthreads();
                for (int row=0; row < kern_len; row++) {//loop over row
                    if(!preload_full_kern){
                        __syncthreads();
                        int idx2=kern_stride_stack*stack;
                        if(flipped_kern)
                            idx2+=(kern_len-row-1)*kern_stride_row;
                        else
                            idx2+=(row)*kern_stride_row;
                        load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
                                       kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
                        __syncthreads();
                    }
                    const float* idx_kern;
                    if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
                    else idx_kern=d_kern;
                    const float* idx_in;
                    if(subsample)
                        idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
                    else
                        idx_in=&d_img[(row+out_row)*img_wid+out_col];
                    //if needed as on Fermi reading an out-of-bound index from shared memory generates an error.
                    //Not needed on earlier generations as they worked anyway. Removing the if generates the good code
                    //as we store the result of only the good thread.
                    //This was with nvcc 3.0 on a GTX470 card.
                    if(out_row<out_len)
                        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
                }
                __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
            }
            if(out_row<out_len)
                store_or_accumulate<accumulate>(
                        out[batch_id*out_wid*out_len*nkern+//the good batch
                            out_wid*out_len*kern_id+//the output image
                            out_row*out_wid+out_col],sum);
        }
    }
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
 * I keep it separated as it uses more registers.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* thread block size=out_wid, out_len, ceil_intdiv(kern_len/nb_split)
* grid block size=batch_id, nkern
* dynamic shared memory: img_len*img_wid+kern_wid*(preload_full_kern?kern_len:thread_z)+out_size*thread_z
*
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img have are collon and row contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/
/*
 * Valid convolution where the kernel-row loop is also parallelized over
 * threadIdx.z; each z-slice accumulates a partial sum which is reduced in
 * shared memory at the end.  grid = (batch, kern);
 * blockDim = (out_wid, out_len, ceil(kern_len/nb_split)).
 */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out,
                         int img_len, int img_wid, int kern_len, int kern_wid,
                         int nkern, int nstack, int img_stride_col,int img_stride_row,
                         int img_stride_stack, int img_stride_batch,
                         int kern_stride_col, int kern_stride_row,
                         int kern_stride_stack, int kern_stride_nkern)
{
    //int __shared__ out_len, out_wid, nb_thread_id;
    //out_len = img_len - kern_len + 1;
    //out_wid = img_wid - kern_wid + 1;
    // Output size is implied by the launch configuration.
    const int out_wid = blockDim.x;
    const int out_len = blockDim.y;
    const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    extern __shared__ float s_data[];
    int batch_id = blockIdx.x;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tz = threadIdx.z;
    int out_col = tx;//output col
    int out_row = ty;//output row
    const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
    //d_img size [IMAGE_LEN * IMAGE_WID];
    float * d_img=&s_data[0];
    //d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
    float * d_kern=&s_data[img_len * img_wid];
    //d_reduce size [n_threads]
    //N.B. this overlaps with d_img and d_kern!  Safe only because the
    //reduction happens after the last __syncthreads() of the stack loop.
    float * d_reduce=&s_data[0];
    float sum = 0.0f;
    kern+=kern_stride_nkern*blockIdx.y;//the good nkern
    img+=img_stride_batch*batch_id;//the good batch
    for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
                                            img+=img_stride_stack){
        __syncthreads();
        load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
                       img_stride_col, img_stride_row, false, c_contiguous);
        if(split && ! preload_full_kern){
            // Stream the kernel blockDim.z rows at a time.
            for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
                //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
                // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
                // details, but that seemed really weird. tricky bug to find too.
                int idx3 = flipped_kern
                    ? max((kern_len - (int)blockDim.z - first_row),0)
                    : first_row;
                int len3 = min(blockDim.z, kern_len - first_row);
                __syncthreads();
                load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
                               kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
                __syncthreads();
                const float* idx_kern=&d_kern[tz*kern_wid];
                const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
                float sum2 = 0;
                // Guard: only len3 kernel rows are valid in this chunk.
                if(tz<len3)
                    convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
                sum+=sum2;
            }
        }else if(split){
            // Full kernel preloaded; each z-slice handles rows tz, tz+blockDim.z, ...
            load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                           kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
            __syncthreads();
            for(int row=tz;row<kern_len;row+=blockDim.z){
                const float* idx_kern=&d_kern[row*kern_wid];
                const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
                convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
            }
        }else{
            // blockDim.z == kern_len: exactly one kernel row per z-slice.
            int row = tz;//The row of the kernel.
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
            load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                           kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
            __syncthreads();
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
    }
    //reduce no sync because previous loop ends with sync
    d_reduce[thread_id]=sum;
    __syncthreads();
    if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
        //sum=0;
        // Fold in the partial sums of the other z-slices.
        for(int i=1;i<blockDim.z;i++){
            sum+=d_reduce[thread_id+i*out_wid*out_len];
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            out_wid*out_len*blockIdx.y+//the output image
            out_row*out_wid+out_col] = sum;
    }
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
* each thread compute only one value for the output
* Don't implement the stack and nkern in the kernel.
* thread block size=out_wid
* grid block size=out_len,batch_id
* dynamic shared memory: kern_len*img_wid+kern_len*kern_wid
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
/*
 * Valid convolution for images that do not fit in shared memory: each
 * thread block caches only the kern_len image rows it needs plus the
 * (pre-flipped) kernel.  Stack and nkern are not looped over here.
 * thread block = out_wid ; grid = (out_len, nb_batch*nb_kern)
 * dynamic shared memory: kern_len*img_wid + kern_len*kern_wid floats
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows( float* img, float* kern, float* out,
           int img_len, int img_wid, int kern_len, int kern_wid,
           int nkern, int nstack,
           int img_stride_col, int img_stride_row,
           int img_stride_stack, int img_stride_batch,
           int kern_stride_col, int kern_stride_row,
           int kern_stride_stack, int kern_stride_nkern)
{
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
    float __shared__ *d_img, *d_kern;
    extern __shared__ float s_data[];

    // Every thread writes identical values, so no sync is required here.
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    // blockIdx.y packs both the batch index and the kernel index.
    batch_id = blockIdx.y / nkern;
    kern_id  = blockIdx.y % nkern;

    const int out_col   = threadIdx.x;  // output column
    const int out_row   = blockIdx.x;   // output row
    const int thread_id = threadIdx.x;

    d_img  = &s_data[0];                  // [kern_len * img_wid]
    d_kern = &s_data[kern_len * img_wid]; // [kern_len * kern_wid]

    // Point at the kern_len image rows this block consumes, and at the
    // requested kernel.
    const float *img_base  = img + img_stride_batch*batch_id
                                 + out_row*img_stride_row;
    const float *kern_base = kern + kern_stride_nkern*kern_id;

    load_to_shared(d_img, img_base, thread_id, nb_thread_id, img_wid, kern_len,
                   img_stride_col, img_stride_row, false, c_contiguous);
    // flipped=true: the kernel is reversed while it is loaded.
    load_to_shared(d_kern, kern_base, thread_id, nb_thread_id, kern_wid, kern_len,
                   kern_stride_col, kern_stride_row, true, c_contiguous);
    __syncthreads();

    float acc = 0.0f;
    for (int r = 0; r < kern_len; r++) { // loop over kernel rows
        convolutionRowNoFlip<KERN_WIDTH>(acc,
                                         &d_img[r*img_wid + out_col],
                                         &d_kern[r*kern_wid],
                                         kern_wid);
    }
    out[(batch_id*nkern + kern_id)*out_wid*out_len  // the good output image
        + out_row*out_wid + out_col] = acc;
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
* we store kern_len row of the image and the full kernel in the shared memory
* each thread compute only one value for the output
* thread block size=out_wid, block_len
* grid block size=intceil(out_len/block_len),nb_batch*nb_kern
* dynamic shared memory: (kern_len+block_len-1)*img_wid+kern_len*kern_wid
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
/*
 * Valid convolution, with the stack loop, for images that do not fit in
 * shared memory: only kern_len + blockDim.y - 1 image rows are cached per
 * channel iteration.  thread block = (out_wid, block_len);
 * grid = (ceil(out_len/block_len), nb_batch*nb_kern).
 */
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
conv_rows_stack( float* img, float* kern, float* out,
                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                 const int nkern, const int nstack,
                 const int img_stride_col, const int img_stride_row,
                 const int img_stride_stack, const int img_stride_batch,
                 const int kern_stride_col, const int kern_stride_row,
                 const int kern_stride_stack, const int kern_stride_nkern)
{
    // Every thread writes identical values, so no sync is required here.
    int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
    float __shared__ *d_img, *d_kern;
    out_len = img_len - kern_len + 1;
    out_wid = img_wid - kern_wid + 1;
    nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
    batch_id= blockIdx.y/nkern;
    kern_id = blockIdx.y%nkern;
    nb_rows = blockDim.y;
    // Number of image rows this block needs, clipped at the image bottom.
    int rows_to_read = MIN(
            kern_len + nb_rows - 1,
            img_len - blockIdx.x * nb_rows);
    /**
     * Every thread ultimately computes one value in the output, at coordinates
     * out[ batch_id, kern_id, out_row, out_col]
     *
     * The batch_id and kern_id are packed into blockIdx.y. out_row and out_col
     * are the threadIdx.x and threadIdx.y.
     *
     * Every thread block deals only with one image, and one filter kernel.
     */
    extern __shared__ float s_data[];
    const int out_col = threadIdx.x;//output col
    const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
    const int shared_row = threadIdx.y;
    const int thread_id  = threadIdx.y*blockDim.x+threadIdx.x;
    /*
     * The kernel works by looping over channels (aka colours, aka the stack).
     * On each iteration, a thread block loads one channel of all the image rows that
     * it needs to use, and one channel slice of one kernel.
     */
    d_img=&s_data[0];//size of [(KERN_LEN+block_len-1) * IMAGE_WID];
    d_kern=&s_data[(kern_len+nb_rows-1) * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
    float sum = 0.0f;
    for (int stack = 0; stack < nstack; stack++){
        int offset =
            img_stride_batch * batch_id
            + img_stride_stack * stack
            //blockIdx.x is which chunk of nb_rows this thread block deals with
            + img_stride_row * (blockIdx.x * nb_rows);
        load_to_shared(
                d_img,        // dst
                img+offset,   // src
                thread_id,    // linear position in block
                nb_thread_id, // number of threads
                img_wid,      // cols in image to read
                rows_to_read, // number of rows to read
                img_stride_col, // img[i, j, k, l] to img[i, j, k, l + 1]
                img_stride_row, // img[i, j, k, l] to img[i, j, k + 1, l]
                false,          // flip while reading
                c_contiguous);
        offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
        // flipped=true: reverse the kernel while loading it.
        load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
                       kern_stride_col, kern_stride_row, true, c_contiguous);
        __syncthreads();
        for (int row=0; row < kern_len; row++) {//loop over row
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
            convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads();//to be sure all threads have finished before we modify the shared memory.
    }
    // Threads past the last valid output row did redundant work; only the
    // in-range ones store their result.
    if (out_row < out_len)
        out[batch_id*out_wid*out_len*nkern+//the good batch
            kern_id*out_wid*out_len+//the output image
            out_row*out_wid+out_col] = sum;
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
* we store block_len row of the image(at a time) and one or all kernel row in the shared memory
* each thread compute only one value for the output
* thread block size=out_wid, block_len
* grid block size=intceil(out_len/block_len),nb_batch*nb_kern
* dynamic shared memory: block_len * img_wid+(preload_full_kern?kern_len:1)*kern_wid
* Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
conv_rows_stack2( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): these block-uniform scalars live in __shared__ memory and are
// written by every thread without synchronization. All threads store the same
// value, so the race is benign in practice, but it is an unconventional pattern.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
out_len = img_len - kern_len + 1;//'valid' mode: number of output rows
out_wid = img_wid - kern_wid + 1;//'valid' mode: number of output cols
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;//blockIdx.y encodes both batch and kernel index
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;//== block_len: number of image rows resident in shared memory
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
// Partition the dynamic shared buffer: image rows first, then kernel row(s).
d_img=&s_data[0];//size of [nb_rows * IMAGE_WID];
d_kern=&s_data[nb_rows*img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
int _idx2=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
_idx2+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
__syncthreads();
// Prime the circular buffer with the first nb_rows-1 image rows; the one
// extra row each iteration needs is streamed in inside the loop below.
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//select the right image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the right top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
}
if(!preload_full_kern){
// Only one kernel row resides in shared memory: load it row-by-row,
// in flipped order (kern_len-row-1) as required by the convolution.
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//select the right kernel from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous);
}
__syncthreads();
//Guard needed on Fermi, where reading an out-of-bounds index from shared
//memory generates an error; earlier GPU generations worked without it.
//Removing the if still generates correct results, as only the valid
//thread's result is stored. (Observed with nvcc 3.0 on a GTX470 card.)
if(out_row<out_len){
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the right batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and add a loop for it (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len>512)
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
// One block per output element (grid-stride over outputs). Threads of a
// block split the (stack, kernel-row) partial sums between them; the
// partials are then combined with a shared-memory tree reduction.
const int outsize = nB * nK * out_len * out_wid;
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x; // stack (channel) index handled by this thread
int rr = threadIdx.y; // kernel row handled by this thread
int img_rr = iR_logical + kern_len - 1 - rr; // image row (kernel flipped)
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f;
if(stack_loop){
// blockDim.x may be smaller than stacklen: loop over the extra stacks.
for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C; // image walked backwards: the kernel is flipped
}
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1)
{
// Single-thread block: the partial sum is already the full sum.
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
else
{
reducebuf[reduceIdx] = sum;
__syncthreads();
int reduce_boundary = initial_reduce_boundary;
// add in the terms above the reduce boundary
// (presumably initial_reduce_boundary is the largest power of two below
// the thread count, so this first fold handles the non-power-of-two
// excess -- TODO confirm against the launch-side code)
if (reduceIdx + reduce_boundary < (blockDim.x * blockDim.y))
reducebuf[reduceIdx] += reducebuf[reduce_boundary +reduceIdx];
reduce_boundary >>= 1;
// there are an equal number of terms above and below the reduce_boundary
while (reduce_boundary)
{
__syncthreads();
if (reduceIdx < reduce_boundary)
{
reducebuf[reduceIdx] += reducebuf[reduce_boundary + reduceIdx];
}
reduce_boundary >>= 1;
}
if (reduceIdx == 0)
{
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = reducebuf[0];
}
}
}
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference implementation of 'valid' mode convolution (with stack).
 *
 * Works for any size of image and kernel; uses no shared memory for the data.
 * Each thread grid-strides over the flattened physical output and computes
 * one element as a triple loop over (stack, kernel row, kernel col); the
 * kernel is read flipped in both spatial dimensions (true convolution).
 *
 * Fix vs. previous version: numThreads and outsize were kept in __shared__
 * memory and written by every thread with no synchronization. The values are
 * block-uniform, so the race was benign in practice, but per-thread constants
 * are both well-defined and cheaper than shared memory.
 */
__global__ void
conv_reference_valid(int nB, int nK, int stacklen,
                     int img_len, int img_wid,
                     int kern_len, int kern_wid,
                     int out_len, int out_wid, //physical
                     float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
                     float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                     float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
                     int subsample_rows, int subsample_cols)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    const int outsize = nB * nK * out_len * out_wid;
    for (int i = idx; i < outsize; i += numThreads) //physical
    {
        // Decompose the linear output index into (batch, kernel, row, col).
        int ii = i;
        const int iB = ii % nB; // output batch index
        ii = ii / nB;
        const int iK = ii % nK; // output kernel index
        ii = ii / nK;
        const int iR_physical = ii % out_len; // output row (post-subsample)
        const int iC_physical = ii / out_len; // output column (post-subsample)
        const int iR_logical = iR_physical * subsample_rows;
        const int iC_logical = iC_physical * subsample_cols;
        float sum = 0.0f;
        for (int ss = 0; ss < stacklen; ++ss)
        {
            for (int rr = 0; rr < kern_len; ++rr)
            {
                const int img_rr = iR_logical + kern_len - 1 - rr; // flipped row
                for (int cc = 0; cc < kern_wid; ++cc)
                {
                    const int img_cc = iC_logical + kern_wid - 1 - cc; // flipped col
                    const float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
                    const float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
                    sum += k_0 * i_0;
                }
            }
        }
        out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
    }
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
/**
 * Reference implementation of 'full' mode convolution (with stack).
 *
 * Works for any size of image and kernel; uses no shared memory for the data.
 * Each thread grid-strides over the flattened physical output and computes
 * one element; image coordinates falling outside the image are skipped
 * (implicit zero padding of 'full' mode).
 *
 * Fix vs. previous version: numThreads and physical_outsize were kept in
 * __shared__ memory and written by every thread with no synchronization. The
 * values are block-uniform, so the race was benign in practice, but
 * per-thread constants are both well-defined and cheaper than shared memory.
 */
__global__ void
conv_reference_full(int nB, int nK, int stacklen,
                    int img_len, int img_wid,
                    int kern_len, int kern_wid,
                    int out_len, int out_wid, //physical dimensions
                    float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
                    float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
                    float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C,
                    int subsample_rows, int subsample_cols)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    const int physical_outsize = nB * nK * out_len * out_wid;
    for (int i = idx; i < physical_outsize; i += numThreads)
    {
        // Decompose the linear output index into (batch, kernel, row, col).
        int ii = i;
        const int iB = ii % nB; // output batch index
        ii = ii / nB;
        const int iK = ii % nK; // output kernel index
        ii = ii / nK;
        const int iR_physical = ii % out_len; // output row (post-subsample)
        const int iC_physical = ii / out_len; // output column (post-subsample)
        const int iR_logical = iR_physical * subsample_rows;
        const int iC_logical = iC_physical * subsample_cols;
        float sum = 0.0f;
        for (int ss = 0; ss < stacklen; ++ss)
        {
            for (int rr = 0; rr < kern_len; ++rr)
            {
                const int img_rr = iR_logical - rr;
                if ((img_rr >= 0) && (img_rr < img_len))
                {
                    for (int cc = 0; cc < kern_wid; ++cc)
                    {
                        const int img_cc = iC_logical - cc;
                        if ((img_cc >= 0) && (img_cc < img_wid))
                        {
                            const float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
                            const float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
                            sum += k_0 * i_0;
                        }
                    }
                }
            }
        }
        out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
    }
}
#endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
// Grid-stride loop: each thread starts at its global linear index and strides
// by the total number of launched threads, so any n is covered by any
// grid/block configuration.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
inline int GET_BLOCKS(const int N) {
    // Ceiling division: enough CUDA_NUM_THREADS-sized blocks to cover N items.
    const int rounded_up = N + CUDA_NUM_THREADS - 1;
    return rounded_up / CUDA_NUM_THREADS;
}
// (Adapted from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
__global__ void dilated_im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col) {
// Each index identifies one (channel, output position) pair; the thread
// copies the whole kernel_h x kernel_w x kernel_d dilated patch for that
// position into the matching entries of data_col. Patch positions that fall
// in the implicit zero padding are written as 0.
CUDA_KERNEL_LOOP(index, n) {
// Decompose the linear index into (c_im, h_col, w_col, d_col).
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First output channel produced from input channel c_im.
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the patch in (possibly padded) image coords.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i * dilation_h;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j * dilation_w;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k * dilation_d;
// Zero-fill when the dilated tap is outside the image.
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * dilation_h * (width * depth) +
j * dilation_w * depth +
k * dilation_d] : 0;
// Advance one output channel (one tap of the unrolled patch).
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
// Same unfolding as dilated_im3d2col_kernel, specialized for dilation == 1
// (saves the dilation multiplications in the inner loops).
__global__ void im3d2col_kernel(const int n, const float* data_im,
const int height, const int width, const int depth,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
// Decompose the linear index into (c_im, h_col, w_col, d_col).
const int w_index = index / depth_col;
const int h_index = w_index / width_col;
const int d_col = index % depth_col;
const int h_col = h_index % height_col;
const int w_col = w_index % width_col;
const int c_im = h_index / height_col;
// First output channel produced from input channel c_im.
const int c_col = c_im * kernel_h * kernel_w * kernel_d;
// Top-left-front corner of the patch in (possibly padded) image coords.
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
const int d_offset = d_col * stride_d - pad_d;
float* data_col_ptr = data_col;
data_col_ptr += c_col * (height_col * width_col * depth_col) +
h_col * (width_col * depth_col) + w_col * depth_col + d_col;
const float* data_im_ptr = data_im;
data_im_ptr += c_im * (height * width * depth) +
h_offset * (width * depth) + w_offset * depth + d_offset;
for (int i = 0; i < kernel_h; ++i)
{
int h_im = h_offset + i;
for (int j = 0; j < kernel_w; ++j)
{
int w_im = w_offset + j;
for (int k = 0; k < kernel_d; ++k)
{
int d_im = d_offset + k;
// Zero-fill when the tap is outside the image (padding region).
*data_col_ptr = (h_im >= 0 && w_im >= 0 && d_im >= 0 &&
h_im < height && w_im < width && d_im < depth) ?
data_im_ptr[i * (width * depth) + j * depth + k] : 0;
// Advance one output channel (one tap of the unrolled patch).
data_col_ptr += height_col * width_col * depth_col;
}
}
}
}
}
/**
 * Unfold a (channels, height, width, depth) image into the 2D "columns"
 * buffer data_col so that a 3D convolution becomes a single GEMM.
 * One CUDA thread is launched per (channel, output position) pair; each
 * copies one kernel-sized (possibly dilated) patch.
 */
void im3d2col(const float* data_im, const int channels,
              const int height, const int width, const int depth,
              const int kernel_h, const int kernel_w, const int kernel_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_col)
{
    // Effective (dilated) kernel extents.
    const int span_h = (kernel_h - 1) * dilation_h + 1;
    const int span_w = (kernel_w - 1) * dilation_w + 1;
    const int span_d = (kernel_d - 1) * dilation_d + 1;
    // Output grid dimensions after padding and striding.
    const int height_col = (height + 2 * pad_h - span_h) / stride_h + 1;
    const int width_col = (width + 2 * pad_w - span_w) / stride_w + 1;
    const int depth_col = (depth + 2 * pad_d - span_d) / stride_d + 1;
    const int num_kernels = channels * height_col * width_col * depth_col;
    const bool has_dilation =
        (dilation_h != 1) || (dilation_w != 1) || (dilation_d != 1);
    if (has_dilation) {
        dilated_im3d2col_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_im,
            height, width, depth,
            kernel_h, kernel_w, kernel_d,
            dilation_h, dilation_w, dilation_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_col);
    } else {
        // No dilation: use the cheaper specialized kernel.
        im3d2col_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_im,
            height, width, depth,
            kernel_h, kernel_w, kernel_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_col);
    }
}
// CUDA kernel for the case of dilation
__global__ void dilated_col2im3d_kernel(
const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int dilation_h, const int dilation_w, const int dilation_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
// Inverse of dilated_im3d2col_kernel: one thread per image element, each
// accumulating every data_col entry that was unfolded from that element.
// No atomics are needed because each thread owns exactly one output.
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decompose the linear index into (c_im, h_im, w_im, d_im), in padded coords.
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
int kernel_extent_d = (kernel_d - 1) * dilation_d + 1;
// compute the start and end of the output
// (range of output positions whose dilated patch can cover this element)
const int d_col_start = (d_im < kernel_extent_d) ? 0 : (d_im - kernel_extent_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// TODO: use LCM of stride and dilation to avoid unnecessary loops
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// Offset of this element inside the candidate patch.
int h_k = (h_im - h_col * stride_h);
int w_k = (w_im - w_col * stride_w);
int d_k = (d_im - d_col * stride_d);
// Only taps landing exactly on the dilation grid contribute.
if (h_k % dilation_h == 0 && w_k % dilation_w == 0 && d_k % dilation_d == 0) {
h_k /= dilation_h;
w_k /= dilation_w;
d_k /= dilation_d;
int data_col_index = c_im * kernel_h * kernel_w * kernel_d * height_col * width_col * depth_col +
h_k * kernel_w * kernel_d * height_col * width_col * depth_col +
w_k * kernel_d * height_col * width_col * depth_col +
d_k * height_col * width_col * depth_col +
h_col * width_col * depth_col +
w_col * depth_col +
d_col;
val += data_col[data_col_index];
}
}
}
}
data_im[index] = val;
}
}
// Inverse of im3d2col_kernel (dilation == 1): one thread per image element,
// each accumulating every data_col entry that was unfolded from that element.
__global__ void col2im3d_kernel(const int n, const float* data_col,
const int height, const int width, const int depth,
const int channels,
const int kernel_h, const int kernel_w, const int kernel_d,
const int pad_h, const int pad_w, const int pad_d,
const int stride_h, const int stride_w, const int stride_d,
const int height_col, const int width_col, const int depth_col,
float* data_im)
{
CUDA_KERNEL_LOOP(index, n)
{
float val = 0;
// Decompose the linear index into (c_im, h_im, w_im, d_im), in padded coords.
const int d_im = index % depth + pad_d;
const int w_index = index / depth;
const int w_im = w_index % width + pad_w;
const int h_index = w_index / width;
const int h_im = h_index % height + pad_h;
const int c_im = h_index / height;
// compute the start and end of the output
// (range of output positions whose patch can cover this element)
const int d_col_start = (d_im < kernel_d) ? 0 : (d_im - kernel_d) / stride_d + 1;
const int d_col_end = min(d_im / stride_d + 1, depth_col);
const int w_col_start = (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
const int w_col_end = min(w_im / stride_w + 1, width_col);
const int h_col_start = (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
const int h_col_end = min(h_im / stride_h + 1, height_col);
// Closed-form data_col indexing: `offset` is the index at output position
// (0,0,0) with the kernel-tap offsets implied by (h_im, w_im, d_im); each
// coeff_* then advances one output step while the in-patch tap moves back
// by the corresponding stride, so the inner loop is a pure add.
int offset =
(c_im * kernel_h * kernel_w * kernel_d + h_im * kernel_w * kernel_d +
w_im * kernel_d + d_im) * height_col * width_col * depth_col;
int coeff_h_col = (1 - stride_h * kernel_w * kernel_d * height_col) * width_col * depth_col;
int coeff_w_col = (1 - stride_w * kernel_d * height_col * width_col) * depth_col;
int coeff_d_col = (1 - stride_d * height_col * width_col * depth_col);
for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col + d_col * coeff_d_col];
}
}
}
data_im[index] = val;
}
}
/**
 * Fold the 2D "columns" buffer data_col back into a
 * (channels, height, width, depth) image, summing overlapping patches.
 * One CUDA thread per image element; each sums all of its contributions, so
 * no atomic operations are required.
 */
void col2im3d(const float* data_col, const int channels,
              const int height, const int width, const int depth,
              const int patch_h, const int patch_w, const int patch_d,
              const int dilation_h, const int dilation_w, const int dilation_d,
              const int pad_h, const int pad_w, const int pad_d,
              const int stride_h, const int stride_w, const int stride_d,
              float* data_im)
{
    // Effective (dilated) patch extents.
    const int span_h = (patch_h - 1) * dilation_h + 1;
    const int span_w = (patch_w - 1) * dilation_w + 1;
    const int span_d = (patch_d - 1) * dilation_d + 1;
    // Output grid dimensions after padding and striding.
    const int height_col = (height + 2 * pad_h - span_h) / stride_h + 1;
    const int width_col = (width + 2 * pad_w - span_w) / stride_w + 1;
    const int depth_col = (depth + 2 * pad_d - span_d) / stride_d + 1;
    // One thread per *image* element (bottom dimension), summing over the
    // top dimensions inside the kernel — avoids atomics entirely.
    const int num_kernels = channels * height * width * depth;
    const bool has_dilation =
        (dilation_h != 1) || (dilation_w != 1) || (dilation_d != 1);
    if (has_dilation) {
        dilated_col2im3d_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col,
            height, width, depth, channels,
            patch_h, patch_w, patch_d,
            dilation_h, dilation_w, dilation_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_im);
    } else {
        // No dilation: use the cheaper specialized kernel.
        col2im3d_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col,
            height, width, depth, channels,
            patch_h, patch_w, patch_d,
            pad_h, pad_w, pad_d,
            stride_h, stride_w, stride_d,
            height_col, width_col, depth_col,
            data_im);
    }
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter, Nicolas Ballas
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// Adaptation for 3d
/**
 * 3D correlation implemented Caffe-style: im2col/col2im plus cuBLAS GEMM.
 *
 * direction == 0: forward pass — writes `top` from `bottom` and `weight`
 * direction == 1: gradient wrt. weights — writes `weight` from `bottom`/`top`
 * direction == 2: gradient wrt. inputs — writes `bottom` from `weight`/`top`
 *
 * Shapes (all operands must be 5D and C-contiguous):
 *   bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
 *   weight: (nFilters, nChannels, kH, kW, kD)
 *   top:    (batchSize, nFilters, topHeight, topWidth, topDepth)
 *
 * dH/dW/dD are the subsampling strides, dilH/dilW/dilD the filter dilation,
 * padH/padW/padD the implicit zero padding.
 *
 * Returns the array that was written to (an alias of bottom, weight or top;
 * its refcount is NOT changed here — see the note at the end), or NULL with
 * a Python exception set on failure.
 *
 * Fixes vs. previous version:
 *  - `#undef _CONV_FLOORDIV` undef'd the wrong name (the macro defined is
 *    `_CONV_FLOORDIV_X`), so the helper macro leaked past this function.
 *  - An unrecognized `direction` fell through and returned an uninitialized
 *    pointer; it now raises ValueError.
 *  - Error paths consistently return NULL (they mixed NULL and 0).
 */
CudaNdarray* corr3dMM(CudaNdarray *const bottom,
                      CudaNdarray *const weight,
                      CudaNdarray *const top,
                      const int direction,
                      const int dH = 1,
                      const int dW = 1,
                      const int dD = 1,
                      const int dilH = 1,
                      const int dilW = 1,
                      const int dilD = 1,
                      const int padH = 0,
                      const int padW = 0,
                      const int padD = 0)
{
    // ---- Validate layout: every operand must be 5D and C-contiguous ----
    if (bottom->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires bottom of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires bottom to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(bottom)[0],
                CudaNdarray_HOST_STRIDES(bottom)[1],
                CudaNdarray_HOST_STRIDES(bottom)[2],
                CudaNdarray_HOST_STRIDES(bottom)[3],
                CudaNdarray_HOST_STRIDES(bottom)[4]);
        return NULL;
    }
    if (weight->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires weight of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires weight to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(weight)[0],
                CudaNdarray_HOST_STRIDES(weight)[1],
                CudaNdarray_HOST_STRIDES(weight)[2],
                CudaNdarray_HOST_STRIDES(weight)[3],
                CudaNdarray_HOST_STRIDES(weight)[4]);
        return NULL;
    }
    if (top->nd != 5)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorr3dMM requires top of 5D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM requires top to be C-contiguous, "
                "but strides are: %d %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(top)[0],
                CudaNdarray_HOST_STRIDES(top)[1],
                CudaNdarray_HOST_STRIDES(top)[2],
                CudaNdarray_HOST_STRIDES(top)[3],
                CudaNdarray_HOST_STRIDES(top)[4]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    const int bottomDepth = CudaNdarray_HOST_DIMS(bottom)[4];
    // weights: (nFilters, nChannels, rows, columns, depth)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    const int kD = CudaNdarray_HOST_DIMS(weight)[4];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1])
    {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorr3dMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    const int dil_kD = (kD - 1) * dilD + 1;
    // top: (batchSize, nFilters, topHeight, topWidth, topDepth)
    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
    const int topDepthNoDD = (bottomDepth + 2*padD - dil_kD);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    const int topDepth = _CONV_FLOORDIV_X(topDepthNoDD, dD) + 1;
    // Fix: previously "#undef _CONV_FLOORDIV", which is not the name defined
    // above, so the helper macro leaked past this function.
#undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
        nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
        topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
        topWidth != CudaNdarray_HOST_DIMS(top)[3] ||
        topDepth != CudaNdarray_HOST_DIMS(top)[4])
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM shape inconsistency:\n"
                " bottom shape: %d %d %d %d %d\n"
                " weight shape: %d %d %d %d %d\n"
                " top shape: %d %d %d %d %d (expected %d %d %d %d %d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth, bottomDepth,
                nFilters, nChannels, kH, kW, kD,
                CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                CudaNdarray_HOST_DIMS(top)[4],
                batchSize, nFilters, topHeight, topWidth, topDepth);
        return NULL;
    }
    // ---- Temporary "columns" buffer shared by all three directions ----
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH * kD;        // one unrolled patch per row
    col_dim[1] = topHeight * topWidth * topDepth; // one output position per column
    CudaNdarray* col = (CudaNdarray*) CudaNdarray_NewDims(2, col_dim);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorr3dMM failed to allocate working memory of %d x %d\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0]; // one batch item
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];       // one batch item
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0)
    { // forward pass
        output = top;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im3d2col
            im3d2col(bottom->devdata + n * bottom_stride,
                     nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    N_, M_, K_,
                    &one,
                    col->devdata, N_,
                    weight->devdata, K_,
                    &zero,
                    top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1)
    {
        // backprop wrt. weights
        output = weight;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. weights could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // First, im2col
            im3d2col(bottom->devdata + n * bottom_stride, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD,
                     col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    K_, M_, N_,
                    &one,
                    col->devdata, N_,
                    top->devdata + n * top_stride, N_,
                    (n == 0) ? &zero : &one,
                    weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2)
    {
        // backprop wrt. inputs
        output = bottom;
        // Degenerate sizes: nothing to compute, just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                                         CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorr3dMM grad wrt. inputs could not fill the output with zeros: %s",
                             cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im3d
        // Iterate over batch
        for (int n = 0; n < batchSize; n++)
        {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_T,
                    N_, K_, M_,
                    &one,
                    top->devdata + n * top_stride, N_,
                    weight->devdata, K_,
                    &zero,
                    col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
            // col2im3d back to the data
            col2im3d(col->devdata, nChannels,
                     bottomHeight, bottomWidth, bottomDepth,
                     kH, kW, kD,
                     dilH, dilW, dilD,
                     padH, padW, padD,
                     dH, dW, dD, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess)
            {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorr3dMM encountered a CUDA error in col2im: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorr3dMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else
    {
        // Robustness fix: an unknown direction previously fell through and
        // returned an uninitialized pointer.
        PyErr_Format(PyExc_ValueError,
                "GpuCorr3dMM: direction must be 0, 1 or 2 (got %d)",
                direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // allocation and refcounting is done in BaseGpuCorr3dMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
// This uses a lot of code from Caffe (http://caffe.berkeleyvision.org/);
// sources are clearly marked. Below we reproduce the original license of
// the Caffe software.
/*
Copyright (c) 2014, The Regents of the University of California (Regents)
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#undef _GLIBCXX_ATOMIC_BUILTINS
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/caffe_common.hpp)
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// CUDA: thread number configuration.
// Use 1024 threads per block, which requires cuda sm_2x or above,
// or fall back to attempt compatibility (best of luck to you).
#if __CUDA_ARCH__ >= 200
const int CUDA_NUM_THREADS = 1024;
#else
const int CUDA_NUM_THREADS = 512;
#endif
// CUDA: number of blocks for threads.
// CUDA: number of blocks needed to cover N elements at CUDA_NUM_THREADS
// threads per block (ceiling division).
inline int GET_BLOCKS(const int N) {
  const int threads_per_block = CUDA_NUM_THREADS;
  return (N + threads_per_block - 1) / threads_per_block;
}
// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/util/im2col.cu)
// Kernels for fast unfold + copy
// CUDA kernel for the case of dilation
// Unfold (im2col) with dilation: one thread per patch position
// (channel, output row, output column).  Each thread copies its
// kernel_h x kernel_w dilated patch from the image into the corresponding
// rows of the column buffer, substituting 0 for taps that fall into the
// implicit zero-padding region.
//
// n is channels * height_col * width_col (total patch positions); data_col
// is laid out row-major as
// (channels * kernel_h * kernel_w) x (height_col * width_col).
__global__ void dilated_im2col_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the flat index into (channel c_im, output row h_col,
    // output column w_col).
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    // First row of the column buffer written by this thread.
    const int c_col = c_im * kernel_h * kernel_w;
    // Top-left corner of the receptive field in image coordinates
    // (may be negative because of the padding offset).
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i * dilation_h;
        int w_im = w_offset + j * dilation_w;
        // Copy the tap if it lies inside the image, otherwise zero-pad.
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
                data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;
        // Advance one full row of the column buffer (next kernel tap).
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// Unfold (im2col) without dilation: identical to dilated_im2col_kernel with
// dilation_h == dilation_w == 1, kept as a separate kernel so the common
// case avoids the dilation multiplications.  One thread per patch position.
__global__ void im2col_kernel(const int n, const float* data_im,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_col) {
  CUDA_KERNEL_LOOP(index, n) {
    // Decompose the flat index into (channel c_im, output row h_col,
    // output column w_col).
    const int h_index = index / width_col;
    const int h_col = h_index % height_col;
    const int w_col = index % width_col;
    const int c_im = h_index / height_col;
    // First row of the column buffer written by this thread.
    const int c_col = c_im * kernel_h * kernel_w;
    // Top-left corner of the receptive field (may be negative due to padding).
    const int h_offset = h_col * stride_h - pad_h;
    const int w_offset = w_col * stride_w - pad_w;
    float* data_col_ptr = data_col;
    data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
    const float* data_im_ptr = data_im;
    data_im_ptr += (c_im * height + h_offset) * width + w_offset;
    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        int h_im = h_offset + i;
        int w_im = w_offset + j;
        // Copy the tap if inside the image, otherwise zero-pad.
        *data_col_ptr =
            (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
                data_im_ptr[i * width + j] : 0;
        // Advance one full row of the column buffer (next kernel tap).
        data_col_ptr += height_col * width_col;
      }
    }
  }
}
// Host-side launcher for the im2col kernels: computes the output geometry
// and dispatches to the plain or dilated kernel as appropriate.
void im2col(const float* data_im, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    float* data_col) {
  // Effective extent of the dilated kernel along each axis.
  const int dil_kernel_h = (kernel_h - 1) * dilation_h + 1;
  const int dil_kernel_w = (kernel_w - 1) * dilation_w + 1;
  // Spatial dimensions of the unfolded (column) output.
  const int height_col = (height + 2 * pad_h - dil_kernel_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dil_kernel_w) / stride_w + 1;
  // Launch one thread per (channel, output row, output column) triple;
  // each thread copies one single-channel patch.
  const int num_kernels = channels * height_col * width_col;
  const bool unit_dilation = (dilation_h == 1) && (dilation_w == 1);
  if (unit_dilation) {
    im2col_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(
        num_kernels, data_im, height, width, kernel_h, kernel_w,
        pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
  } else {
    dilated_im2col_kernel<<<GET_BLOCKS(num_kernels),
                            CUDA_NUM_THREADS>>>(
        num_kernels, data_im, height, width, kernel_h, kernel_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
        width_col, data_col);
  }
}
// CUDA kernel for the case of dilation
// Fold (col2im) with dilation: the transpose of dilated_im2col_kernel.
// One thread per *image* element; each thread sums every column-buffer
// entry that the unfold copied from its pixel, which avoids atomics.
// n is channels * height * width.
__global__ void dilated_col2im_kernel(const int n, const float* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    float val = 0;
    // Pixel coordinates shifted into padded space, plus the channel.
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
    int kernel_extent_h = (kernel_h - 1) * dilation_h + 1;
    // compute the start and end of the output: the range of patch
    // positions whose (dilated) receptive field covers this pixel.
    const int w_col_start =
        (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // TODO: use LCM of stride and dilation to avoid unnecessary loops
    for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) {
      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
        // Offset of this pixel inside the candidate patch; only taps on
        // the dilation grid actually touched the pixel.
        int h_k = (h_im - h_col * stride_h);
        int w_k = (w_im - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *
                                    height_col + h_col) * width_col + w_col;
          val += data_col[data_col_index];
        }
      }
    }
    data_im[index] = val;
  }
}
// Fold (col2im) without dilation: the transpose of im2col_kernel.
// One thread per image element, summing the column-buffer entries that
// were copied from it (no atomics needed).  n is channels * height * width.
__global__ void col2im_kernel(const int n, const float* data_col,
    const int height, const int width, const int channels,
    const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int height_col, const int width_col,
    float* data_im) {
  CUDA_KERNEL_LOOP(index, n) {
    float val = 0;
    // Pixel coordinates shifted into padded space, plus the channel.
    const int w_im = index % width + pad_w;
    const int h_im = (index / width) % height + pad_h;
    const int c_im = index / (width * height);
    // compute the start and end of the output: patch positions whose
    // receptive field covers this pixel.
    const int w_col_start =
        (w_im < kernel_w) ? 0 : (w_im - kernel_w) / stride_w + 1;
    const int w_col_end = min(w_im / stride_w + 1, width_col);
    const int h_col_start =
        (h_im < kernel_h) ? 0 : (h_im - kernel_h) / stride_h + 1;
    const int h_col_end = min(h_im / stride_h + 1, height_col);
    // equivalent implementation, no dilation: fold the per-iteration index
    // computation of the dilated kernel into a fixed base offset plus
    // per-row/per-column coefficients (Caffe's closed-form rewrite).
    int offset =
        (c_im * kernel_h * kernel_w + h_im * kernel_w + w_im) * height_col * width_col;
    int coeff_h_col = (1 - stride_h * kernel_w * height_col) * width_col;
    int coeff_w_col = (1 - stride_w * height_col * width_col);
    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        val += data_col[offset + h_col * coeff_h_col + w_col * coeff_w_col];
      }
    }
    data_im[index] = val;
  }
}
// Host-side launcher for the col2im kernels: computes the column-buffer
// geometry and dispatches to the plain or dilated kernel.
void col2im(const float* data_col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
    const int dilation_h, const int dilation_w,
    const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, float* data_im) {
  // Effective extent of the dilated patch along each axis.
  const int dil_patch_h = (patch_h - 1) * dilation_h + 1;
  const int dil_patch_w = (patch_w - 1) * dilation_w + 1;
  const int height_col = (height + 2 * pad_h - dil_patch_h) / stride_h + 1;
  const int width_col = (width + 2 * pad_w - dil_patch_w) / stride_w + 1;
  // One thread per bottom (image) element: each thread gathers all the
  // column entries that map to its pixel, which avoids atomic adds.
  const int num_kernels = channels * height * width;
  const bool unit_dilation = (dilation_h == 1) && (dilation_w == 1);
  if (unit_dilation) {
    col2im_kernel<<<GET_BLOCKS(num_kernels),
                    CUDA_NUM_THREADS>>>(
        num_kernels, data_col, height, width, channels, patch_h, patch_w,
        pad_h, pad_w, stride_h, stride_w,
        height_col, width_col, data_im);
  } else {
    dilated_col2im_kernel<<<GET_BLOCKS(num_kernels),
                            CUDA_NUM_THREADS>>>(
        num_kernels, data_col, height, width, channels, patch_h, patch_w,
        dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
        height_col, width_col, data_im);
  }
}
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
// and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
// GEMM-based 2D correlation and its gradients, computed on the GPU.
//
// direction == 0: forward pass (valid correlation); result written to `top`.
// direction == 1: gradient wrt. weights; result written to `weight`.
// direction == 2: gradient wrt. inputs (full convolution); written to `bottom`.
//
// dH, dW are the strides, dilH, dilW the filter dilation, padH, padW the
// implicit zero-padding.  All three arrays must be 4D, C-contiguous and of
// mutually consistent shapes; the one acting as output must already be
// allocated by the caller.  Returns an alias of the output array on success
// (refcount NOT incremented here -- see the note at the end), or NULL with a
// Python exception set on failure.
CudaNdarray* corrMM(CudaNdarray *const bottom,
                    CudaNdarray *const weight,
                    CudaNdarray *const top,
                    const int direction,
                    const int dH = 1,
                    const int dW = 1,
                    const int dilH = 1,
                    const int dilW = 1,
                    const int padH = 0,
                    const int padW = 0)
{
    // --- Validate dimensionality and contiguity of all three arrays ---
    if (bottom->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(bottom))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires bottom to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(bottom)[0],
                CudaNdarray_HOST_STRIDES(bottom)[1],
                CudaNdarray_HOST_STRIDES(bottom)[2],
                CudaNdarray_HOST_STRIDES(bottom)[3]);
        return NULL;
    }
    if (weight->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(weight))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires weight to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(weight)[0],
                CudaNdarray_HOST_STRIDES(weight)[1],
                CudaNdarray_HOST_STRIDES(weight)[2],
                CudaNdarray_HOST_STRIDES(weight)[3]);
        return NULL;
    }
    if (top->nd != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!CudaNdarray_is_c_contiguous(top))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires top to be C-contiguous, "
                "but strides are: %d %d %d %d\n",
                CudaNdarray_HOST_STRIDES(top)[0],
                CudaNdarray_HOST_STRIDES(top)[1],
                CudaNdarray_HOST_STRIDES(top)[2],
                CudaNdarray_HOST_STRIDES(top)[3]);
        return NULL;
    }
    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const int batchSize = CudaNdarray_HOST_DIMS(bottom)[0];
    const int nChannels = CudaNdarray_HOST_DIMS(bottom)[1];
    const int bottomHeight = CudaNdarray_HOST_DIMS(bottom)[2];
    const int bottomWidth = CudaNdarray_HOST_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const int nFilters = CudaNdarray_HOST_DIMS(weight)[0];
    const int kH = CudaNdarray_HOST_DIMS(weight)[2];
    const int kW = CudaNdarray_HOST_DIMS(weight)[3];
    if (nChannels != CudaNdarray_HOST_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const int dil_kH = (kH - 1) * dilH + 1;
    const int dil_kW = (kW - 1) * dilW + 1;
    // top: (batchSize, nFilters, topHeight, topWidth)
    const int topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const int topWidthNoDW = (bottomWidth + 2*padW - dil_kW);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
    #define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
    const int topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const int topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
    // Fix: the previous `#undef _CONV_FLOORDIV` named a macro that was never
    // defined, so _CONV_FLOORDIV_X leaked past this function.
    #undef _CONV_FLOORDIV_X
    if (batchSize != CudaNdarray_HOST_DIMS(top)[0] ||
            nFilters != CudaNdarray_HOST_DIMS(top)[1] ||
            topHeight != CudaNdarray_HOST_DIMS(top)[2] ||
            topWidth != CudaNdarray_HOST_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM shape inconsistency:\n"
                "  bottom shape: %d %d %d %d\n"
                "  weight shape: %d %d %d %d\n"
                "  top shape: %d %d %d %d (expected %d %d %d %d)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                CudaNdarray_HOST_DIMS(top)[0], CudaNdarray_HOST_DIMS(top)[1],
                CudaNdarray_HOST_DIMS(top)[2], CudaNdarray_HOST_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }
    // Create temporary columns: the im2col scratch buffer shared by all
    // three directions, of shape (nChannels*kH*kW, topHeight*topWidth).
    int col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    CudaNdarray* col = (CudaNdarray*)CudaNdarray_NewDims(2, col_dim);
    if (NULL == col)
    {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %d x %d\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }
    // Define some useful variables
    const int bottom_stride = CudaNdarray_HOST_STRIDES(bottom)[0];
    const int top_stride = CudaNdarray_HOST_STRIDES(top)[0];
    // GEMM dimensions: weight is (M_, K_), col is (K_, N_), top is (M_, N_).
    const int K_ = col_dim[0];
    const int N_ = col_dim[1];
    const int M_ = nFilters;
    const float one = 1.0f;
    const float zero = 0.0f;
    CudaNdarray *output;
    if (direction == 0) { // forward pass
        output = top;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, dilH, dilW,
                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm: top[n] = weight . col  (cuBLAS is column-major,
            // so the row-major product is expressed with swapped operands).
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_N,
                    N_, M_, K_,
                    &one,
                    col->devdata, N_,
                    weight->devdata, K_,
                    &zero,
                    top->devdata + n * top_stride, N_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original grouped-convolution reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else if (direction == 1) { // backprop wrt. weights
        output = weight;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM grad wrt. weights could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // First, im2col
            im2col(bottom->devdata + n * bottom_stride, nChannels, bottomHeight,
                   bottomWidth, kH, kW, dilH, dilW,
                   padH, padW, dH, dW, col->devdata);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in im2col: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_T, CUBLAS_OP_N,
                    K_, M_, N_,
                    &one,
                    col->devdata, N_,
                    top->devdata + n * top_stride, N_,
                    (n == 0) ? &zero : &one,
                    weight->devdata, K_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original grouped-convolution reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else if (direction == 2) { // backprop wrt. inputs
        output = bottom;
        // Degenerate (empty) problem: just zero the output.
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            cudaError_t err = cudaMemset(output->devdata, 0,
                    CudaNdarray_SIZE(output) * sizeof(real));
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM grad wrt. inputs could not fill the output with zeros: %s",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (int n = 0; n < batchSize; n++) {
            // gemm into columns
            cublasStatus_t status = cublasSgemm(handle,
                    CUBLAS_OP_N, CUBLAS_OP_T,
                    N_, K_, M_,
                    &one,
                    top->devdata + n * top_stride, N_,
                    weight->devdata, K_,
                    &zero,
                    col->devdata, N_);
            if (status != CUBLAS_STATUS_SUCCESS) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUBLAS error: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cublasGetErrorString(status));
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
            col2im(col->devdata, nChannels, bottomHeight, bottomWidth,
                   kH, kW, dilH, dilW, padH, padW,
                   dH, dW, bottom->devdata + n * bottom_stride);
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                PyErr_Format(PyExc_RuntimeError,
                        "GpuCorrMM encountered a CUDA error in col2im: %s\n"
                        "This could be a known bug in CUDA, please see the "
                        "GpuCorrMM() documentation.\n",
                        cudaGetErrorString(err));
                Py_DECREF(col);
                return NULL;
            }
        }
        // (For the original reference, see
        // https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
    }
    else {
        // Fix: previously an unrecognized direction fell through with
        // `output` uninitialized (undefined behavior); report it instead.
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM: direction must be 0, 1 or 2, got %d",
                direction);
        Py_DECREF(col);
        return NULL;
    }
    // Free temporary columns
    Py_DECREF(col);
    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
This source diff could not be displayed because it is too large. You can view the blob instead.
#ifndef _CUDA_NDARRAY_H
#define _CUDA_NDARRAY_H
#include <algorithm>
// Defines for Python 2/3 compatibility.
#if PY_MAJOR_VERSION >= 3
// Py3k treats all ints as longs. This one is not caught by npy_3kcompat.h.
#define PyNumber_Int PyNumber_Long
#include "numpy/npy_3kcompat.h"
// Py3k strings are unicode, these mimic old functionality.
//
// NOTE: npy_3kcompat.h replaces PyString_X with PyBytes_X, which breaks
// compatibility with some functions returning text.
#define PyString_Check PyUnicode_Check
#define PyString_FromString PyUnicode_FromString
#define PyString_AsString PyUnicode_AsUTF8
#define PyString_FromStringAndSize PyUnicode_FromStringAndSize
#define PyString_Size PyUnicode_GET_SIZE
#define PyInt_FromSize_t PyLong_FromSize_t
// Python 3 expects a PyObject* as the first argument to PySlice_GetIndicesEx().
#define SLICE_CAST(x) (x)
#else
// Python 2 expects a PySliceObject* as the first argument to PySlice_GetIndicesEx().
#define SLICE_CAST(x) ((PySliceObject*)(x))
#endif // end #if PY_MAJOR_VERSION >= 3
#ifndef Py_TYPE
# define Py_TYPE(o) ((o)->ob_type)
#endif
#ifndef Py_REFCNT
# define Py_REFCNT(o) ((o)->ob_refcnt)
#endif
#include <numpy/arrayobject.h>
#include <stdio.h>
#include <stdint.h>
#ifndef SIZE_MAX
#define SIZE_MAX ((size_t)-1)
#endif
// Cuda GPUs only accept a single representation for NaN whereas CPU may have
// more than one. So it's better to use the CUDA one to be sure
#ifdef NAN
#undef NAN
#endif
#include <math_constants.h>
#define NAN CUDART_NAN_F
#include <cublas_v2.h>
#ifdef _WIN32
# ifdef _CUDA_NDARRAY_C
# define DllExport __declspec( dllexport )
# else
# define DllExport __declspec( dllimport )
# endif
# define ALWAYS_INLINE
#else //else _WIN32
# define DllExport __attribute__((visibility ("default")))
# define ALWAYS_INLINE __attribute__((always_inline))
#endif
typedef float real;
#define REAL_TYPENUM 11
#ifdef __DEVICE_EMULATION__
#define NUM_VECTOR_OP_BLOCKS 4096
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 1 //This prevents printf from getting tangled up
#else
#define NUM_VECTOR_OP_BLOCKS 4096 //Max number of blocks to launch. Should be read from device properties. (#10)
#define NUM_VECTOR_OP_THREADS_PER_BLOCK 256 //Should be read from device properties. (#10)
#endif
#if 1
// Do not wait after every kernel & transfer.
#define CNDA_THREAD_SYNC
#else
// This is useful for using normal profiling tools
#define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is negligible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024)
#endif
#define VERBOSE_DEVICE_MALLOC 1
#define NO_VERBOSE_DEVICE_MALLOC 0
/* Use this handle to make cublas calls */
extern DllExport cublasHandle_t handle;
/**
* Allocation and freeing of device memory should go through these functions so
* that the lib can track memory usage.
*
* device_malloc will set the Python error message before returning None.
* device_free will return nonzero on failure (after setting the python error message)
*
* Set the Python error
*/
DllExport void * device_malloc(size_t size);
DllExport void * device_malloc(size_t size, int verbose);
DllExport int device_free(void * ptr);
DllExport void *get_work_mem(size_t sz);
// Pointer to one int on the device.
// Used in CudaNdarray_TakeFrom and in an op
// to signal that an out-of-bounds error occurred.
// When it is allocated, it should always be 0.
// So if there is an error, we must reset it to 0 BEFORE we raise the error;
// this spares us from having to reset it to 0 before each use.
extern DllExport int* err_var;
static inline int init_err_var(){
if (err_var == NULL) {
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
return -1;
}
}
return 0;
}
// Copy the device-side error flag (err_var) back to the host and raise a
// Python IndexError if a kernel reported an out-of-bounds index.
// Returns 0 when no error is pending, -1 otherwise (Python exception set).
static inline int check_err_var(){
    // Sentinel: -10 could be any value different from 0 -- it lets us
    // distinguish a failed copy from a genuine zero flag.
    int cpu_err_var=-10;
    cudaError_t err;
    CNDA_BEGIN_ALLOW_THREADS
    // As we execute cudaMemcpy on the default stream, it waits
    // for all kernels (on all streams) to be finished before
    // starting to copy
    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
                     cudaMemcpyDeviceToHost);
    CNDA_END_ALLOW_THREADS
    if (cudaSuccess != err) {
        PyErr_Format(
            PyExc_RuntimeError,
            "Cuda error: %s when trying to get the error"
            " value.\\n",
            cudaGetErrorString(err));
        return -1;
    }
    if (cpu_err_var != 0) {
        PyErr_Format(
            PyExc_IndexError,
            "One of the index value is out of bound. Error code: %i.\\n",
            cpu_err_var);
        // Reset the flag to 0 now, so callers need not clear it before
        // each use (see the err_var comment above).
        err = cudaMemset((void*)err_var, 0, sizeof(int));
        if (cudaSuccess != err) {
            PyErr_Format(PyExc_MemoryError,
                         "Error setting device error code to 0 after having"
                         " an index error. %s", cudaGetErrorString(err));
            return -1;
        }
        return -1;
    }
    return 0;
}
// Integer division of a by b, bumping the truncated quotient by one whenever
// there is a remainder (matches the ceiling for non-negative operands).
template <typename T>
static T ceil_intdiv(T a, T b)
{
    T quotient = a / b;
    if (a % b != 0)
        quotient += 1;
    return quotient;
}
/**
 * struct CudaNdarray
 *
 * This is a Python type: a float32 ("real") n-dimensional array whose data
 * buffer lives in CUDA device memory, with shape/stride metadata kept on the
 * host and lazily mirrored to the device.
 */
struct CudaNdarray
{
    PyObject_HEAD
    /**
     * base:
     * either NULL or a pointer to a fellow CudaNdarray into which this one is viewing.
     * This pointer is never followed, except during Py_DECREF when we do not need it any longer.
     */
    PyObject * base;
    /* Type-specific fields go here. */
    //GpuTensorType::VoidTensor * vt;
    int nd; //the number of dimensions of the tensor
    // Clients should access host_structure via the CudaNdarray_HOST_DIMS /
    // CudaNdarray_HOST_STRIDES accessor functions.
    // Layout: dim0..dim(nd-1), stride0..stride(nd-1), then the log2-dims
    // shadow written by CudaNdarray_set_dim.
    int * host_structure; //dim0, dim1, ... stride0, stride1, ...
    int data_allocated; //the number of bytes allocated for devdata
    //device pointers (allocated by cudaMalloc)
    // Nonzero while dev_structure matches host_structure; cleared by
    // cnda_mark_dev_structure_dirty.
    mutable int dev_structure_fresh;
    //dev_structure should be accessed via the functions like
    //CudaNdarray_DEV_DIMS, otherwise may not be
    //synchronized with host_structure. The accessor functions will allocate it when needed.
    mutable int * dev_structure; //dim0, dim1, ..., stride0, stride1, ...
    real* devdata; //pointer to data element [0,..,0].
};
// Elementwise operation selectors used by kernels elsewhere in this module.
enum operator_t
{
    IADD=0, // in-place addition
    IDIV,   // in-place division
    CPY,    // plain copy
    N_ELEMWISE_OPS // This is to know the number of operation
};
/*
* Return a CudaNdarray whose 'nd' dimensions are all 0.
* if nd==-1, it is not initialized.
*
* Set the Python error
*/
DllExport PyObject *
CudaNdarray_New(int nd=-1);
/**
* Return 1 for a CudaNdarray otw 0
*/
DllExport int
CudaNdarray_Check(const PyObject * ob);
/**
* Return 1 for a CudaNdarray otw 0
*/
DllExport int
CudaNdarray_CheckExact(const PyObject * ob);
/**
* Return true for a C-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_c_contiguous(const CudaNdarray * self);
/**
* Return true for a F-contiguous CudaNdarray, else false
*/
DllExport bool
CudaNdarray_is_f_contiguous(const CudaNdarray * self);
/****
* Returns the number of elements necessary in host_structure and dev_structure for a given number of dimensions.
*/
DllExport int cnda_structure_size(int nd);
/*
* This describes the shape of the ndarray. The array
* of dimensions is itself stored on the host.
* If you need to access the dimensions array from inside
* a kernel, use CudaNdarray_DEVICE_DIMS.
*/
DllExport const int *
CudaNdarray_HOST_DIMS(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
// Mark the device-side copy of the dims/strides metadata as stale so the
// CudaNdarray_DEV_* accessors refresh it before their next use.
DllExport inline void ALWAYS_INLINE
cnda_mark_dev_structure_dirty(CudaNdarray * self)
{
    self->dev_structure_fresh = 0;
}
DllExport int
CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase);
// Default: do not ignore sync of dev and host structures in comparing, and do not ignore difference in base pointers
DllExport int
CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
/****
* Set the dimension[idx] to value d.
*
* Updates the log2dim shadow array.
*
* Does not sync structure to device.
*/
// Set dimension `idx` of `self` to `d`, keeping the log2-dims shadow array
// in sync and flagging the device-side structure for re-upload.
// Does not sync the structure to the device.
DllExport inline void ALWAYS_INLINE
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{
    // Warn (but still proceed) on suspicious arguments.
    if (idx < 0 || idx >= self->nd || d < 0)
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_dim arguments: self->ndim=%i, idx=%i stride=%i\n",
                self->nd, idx, d);
    if (d == self->host_structure[idx])
        return;  // unchanged: nothing to update
    self->host_structure[idx] = d;
    // log2-dims shadow: log2(d) when d is an exact power of two, else -1.
    const int log2d = (int)log2((double)d);
    self->host_structure[idx + 2 * self->nd] = (d == (1 << log2d)) ? log2d : -1;
    cnda_mark_dev_structure_dirty(self);
}
/**
 * Set stride[idx] to s, dirtying the device-side structure on change.
 */
DllExport inline void ALWAYS_INLINE
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{
    // Warn (but do not abort) on a suspicious index.
    if (idx < 0 || idx >= self->nd)
    {
        fprintf(stderr, "WARNING: probably bad CudaNdarray_set_stride arguments: %i %i\n", idx, s);
    }

    // No-op when the stride is unchanged, so the device copy stays fresh.
    if (s == CudaNdarray_HOST_STRIDES(self)[idx])
        return;

    self->host_structure[idx + self->nd] = s;
    cnda_mark_dev_structure_dirty(self);
}
/***
* Update dependent variables from the contents of CudaNdarray_HOST_DIMS(self) and CudaNdarray_HOST_STRIDES(self)
*
* This means: recalculate the log2dims and transfer structure to the card
*/
DllExport int cnda_copy_structure_to_device(const CudaNdarray * self);
/* CudaNdarray_DEV_DIMS gives the same information as CudaNdarray_HOST_DIMS,
* but stored on the GPU. Use this pointer when it needs to be accessed
* from inside a CUDA kernel.
*/
DllExport const int *CudaNdarray_DEV_DIMS(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
// The following 4 macro are here to help make c code generator that work on
// both PyArray and CudaNdarray. This is at least used for Subtensor and
// GpuSubtensor
#define CudaNdarray_DIMS CudaNdarray_HOST_DIMS
#define CudaNdarray_NDIM(self) self->nd
#define CudaNdarray_STRIDES CudaNdarray_HOST_STRIDES
#define CudaNdarray_BYTES CudaNdarray_DEV_DATA
/**
* Return the number of elements in the ndarray (product of the dimensions)
*/
DllExport size_t CudaNdarray_SIZE(const CudaNdarray *self);
static PyObject *CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure);
/**
* Allocate a new CudaNdarray with room for given number of dimensions
*
* No Storage space is allocated (and all dimensions are 0)
*
* Set the Python error
*/
DllExport PyObject * CudaNdarray_new_nd(const int nd);
/**
* [Re]allocate a CudaNdarray with access to 'nd' dimensions.
*
* Note: This does not allocate storage for data, or free
* pre-existing storage.
*
* Set the Python error
*/
DllExport inline int ALWAYS_INLINE
CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{
    // Re-allocation is only needed when the rank actually changes.
    if (nd != self->nd)
    {
        // Drop the device-side structure first; it is re-created lazily by
        // cnda_copy_structure_to_device when needed.
        if (self->dev_structure)
        {
            if (device_free(self->dev_structure))
            {
                // NOTE(review): presumably device_free set the Python
                // error here -- confirm.
                return -1;
            }
            self->dev_structure = NULL;
        }

        if (self->host_structure)
        {
            free(self->host_structure);
            self->host_structure = NULL;
            self->nd = -1;  // mark as having no structure at all
        }
        // nd == -1 is the "just free everything" request.
        if (nd == -1) return 0;

        self->host_structure = (int*)malloc(cnda_structure_size(nd)*sizeof(int));
        if (NULL == self->host_structure)
        {
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
        //initialize all dimensions and strides to 0
        for (int i = 0; i < cnda_structure_size(nd); ++i)
        {
            self->host_structure[i] = 0;
        }

        //The device structure will be created in cnda_copy_structure_to_device
        //if needed.
        self->nd = nd;
        self->dev_structure_fresh = 0;
    }
    return 0;
}
/**
* CudaNdarray_alloc_contiguous
*
* Allocate storage space for a tensor of rank 'nd' and given dimensions.
* (No-op if self already has a contiguous tensor of the right dimensions)
*
 * If fortran is non-zero, a Fortran order is made, otherwise it is a C order.
*
* Note: CudaNdarray_alloc_contiguous is templated to work for both int dimensions and npy_intp dimensions
*/
template<typename inttype>
static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
                                        const inttype * dim, int fortran=0)
{
    // allocate an empty ndarray with c_contiguous access
    // return 0 on success
    size_t size = 1; //set up the strides for contiguous tensor
    assert (nd >= 0);
    // Here we modify the host structure to have the desired shape and
    // strides. This does not cause the storage to be freed or reallocated.
    if (CudaNdarray_set_nd(self, nd))
    {
        return -1;
    }
    // Strides accumulate `size` in the direction of the requested layout;
    // broadcastable (length-1) dimensions get stride 0.
    if (fortran)
    {
        for (int i = 0; i < nd; i++)
        {
            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
            CudaNdarray_set_dim(self, i, dim[i]);
            //Detect overflow on unsigned integer
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
                             "Can't store in size_t for the bytes requested %llu * %llu",
                             (unsigned long long)size, (unsigned long long)dim[i]);
                return -1;
            }
            size = size * dim[i];
        }
    }
    else
    {
        for (int i = nd-1; i >= 0; --i)
        {
            CudaNdarray_set_stride(self, i, (dim[i] == 1) ? 0 : size);
            CudaNdarray_set_dim(self, i, dim[i]);
            //Detect overflow on unsigned integer
            // Bug fix: this message used to read "%llu * 4" while passing
            // only one argument; report both factors like the Fortran
            // branch above.
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
                             "Can't store in size_t for the bytes requested %llu * %llu",
                             (unsigned long long)size, (unsigned long long)dim[i]);
                return -1;
            }
            size = size * dim[i];
        }
    }

    // Detect overflow on unsigned integer
    if (size > (SIZE_MAX / sizeof(real))) {
        PyErr_Format(PyExc_RuntimeError,
                     "Can't store in size_t for the bytes requested %llu",
                     (unsigned long long)size);
        return -1;
    }

    // If the allocated buffer is already of the right size, we don't need to
    // do anything else.
    // Note: self->data_allocated is 0 for a view, so views will fail this
    // check and be turned into independent arrays below.
    if (self->data_allocated == size)
    {
        return 0;
    }

    // The structure of self will be reused with newly allocated memory.
    // If self was a view, we should remove the reference to its base.
    // (If base was already NULL, the following has no effect.)
    Py_XDECREF(self->base);
    self->base = NULL;

    // If self is a view, do not try to free its memory
    if (self->data_allocated && device_free(self->devdata))
    {
        self->devdata = NULL;
        self->data_allocated = 0;
        return -1;
    }

    self->devdata = (float*)device_malloc(size*sizeof(real));
    if (size && !self->devdata)
    {
        // Allocation failed: leave self structurally empty and consistent.
        CudaNdarray_set_nd(self, -1);
        self->data_allocated = 0;
        self->devdata = 0;
        return -1;
    }
    if (0)
        fprintf(stderr,
                "Allocated devdata %p (self=%p)\n",
                self->devdata,
                self);
    self->data_allocated = size;

    return 0;
}
/*
* Return a CudaNdarray whose 'nd' dimensions are set to dims, and allocated.
* Set the python error.
*/
/*
 * Return a freshly allocated CudaNdarray with the given rank and dims.
 * Sets the Python error and returns NULL on failure.
 */
template<typename inttype>
static PyObject *CudaNdarray_NewDims(int nd, const inttype * dims)
{
    CudaNdarray *out = (CudaNdarray*)CudaNdarray_New();
    if (!out)
    {
        PyErr_SetString(PyExc_MemoryError,
                        "Failed to allocate the CudaNdarray structure.");
        return NULL;
    }
    // Give the new array contiguous storage of the requested shape.
    if (CudaNdarray_alloc_contiguous(out, nd, dims))
    {
        Py_DECREF(out);
        return NULL;
    }
    return (PyObject*)out;
}
/**
* CudaNdarray_set_device_data
*
* Set self to be a view of given `data`, owned by existing CudaNdarray `base`.
*/
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base);
DllExport int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo);
/**
* Return an independent copy of self
*/
DllExport PyObject * CudaNdarray_Copy(const CudaNdarray * self);
/**
* Return a new object obtained by summing over the dimensions for which there is a 1 in the mask.
*/
DllExport PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask);
/**
 * Reshape self to the new shape given by the tuple shape.
*/
DllExport PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape);
/**
* Transfer the contents of numpy array `obj` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
*/
DllExport int CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj);
/**
* Transfer the contents of CudaNdarray `other` to `self`.
*
* self is reallocated to have the correct dimensions if necessary.
* TODO: WRITEME: what does "if necessary" mean?
* TODO: we use this to implement set/inc subtensor, where self is a view of
* the original tensor so that we write only to the subtensor. How
* do we ensure that self is not reallocated in this case?
*
* unbroadcast: if true, this means that other is broadcastable in some
* dimensions, and the result, self, is not.
* ie, if unbroadcast=false, we must do the broadcasting
* operation as part of the copy.
* e.g. suppose self and other are 2D matrices and other
* has only one row. Then we need to copy this row several
* times when copying to self.
*
* Set the Python error
*/
DllExport int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
const CudaNdarray * other, bool unbroadcast = false);
/**
* Transfer the contents of CudaNdarray `self` to a new numpy ndarray.
*/
DllExport PyObject *
CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args = NULL);
DllExport PyObject *
CudaNdarray_ZEROS(int n, int * dims);
/**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/
/**
 * True iff strides describe a packed row-major (C order) layout.
 * Length-1 dimensions are ignored, since their stride is irrelevant.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{
    // Walk from the innermost dimension out, tracking the stride a packed
    // layout would have, and bail out on the first mismatch.
    int expected_stride = 1;
    for (int i = self->nd - 1; i >= 0; --i)
    {
        const int d = CudaNdarray_HOST_DIMS(self)[i];
        if (d == 1)
            continue;
        if (CudaNdarray_HOST_STRIDES(self)[i] != expected_stride)
            return false;
        expected_stride *= d;
    }
    return true;
}
/**
* True iff the strides look like [1, dim[0], dim[0]*dim[1], ...]
*/
/**
 * True iff strides describe a packed column-major (Fortran order) layout.
 * Length-1 dimensions are ignored, since their stride is irrelevant.
 */
DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_f_contiguous(const CudaNdarray * self)
{
    // In Fortran order the leftmost dimension is innermost; track the
    // stride a packed layout would have and bail out on a mismatch.
    int expected_stride = 1;
    for (int i = 0; i < self->nd; i++)
    {
        const int d = CudaNdarray_HOST_DIMS(self)[i];
        if (d == 1)
            continue;
        if (CudaNdarray_HOST_STRIDES(self)[i] != expected_stride)
            return false;
        expected_stride *= d;
    }
    return true;
}
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_prod(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_min(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_reduce_max(CudaNdarray * self, CudaNdarray * A);
DllExport int CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern);
DllExport PyObject*
CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args);
// Set the Python error
int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
// Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions.
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
// or a pointer to an ndarray of the right size. In the last case it will
// not change.
// If fortran is non-zero, a fortran order is expected/created
//
// Set the Python error
DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
const int * dims, int fortran = 0);
/**
 * Map a cuBLAS status code to a short human-readable description.
 * Never returns NULL; unrecognized codes get a generic message.
 */
DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus_t err){
    switch(err) {
    case CUBLAS_STATUS_SUCCESS:
        return "success";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "the library was not initialized";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "the resource allocation failed";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "the parameters n<0 or incx,incy=0";
// Some cuBLAS versions do not define these codes, so guard them.
#ifdef CUBLAS_STATUS_ARCH_MISMATCH
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "required device feature not present";
#endif
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "an access to GPU memory space failed";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "the function failed to launch on the GPU";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "an internal operation failed";
#ifdef CUBLAS_STATUS_NOT_SUPPORTED
    case CUBLAS_STATUS_NOT_SUPPORTED:
        return "unsupported function";
#endif
    default:
        return "unknown code";  // bug fix: was misspelled "unknow code"
    }
}
#endif
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
#ifndef CUDNN_HELPER_H
#define CUDNN_HELPER_H
#include <cudnn.h>
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
#ifndef CUDNN_VERSION
#include <assert.h>
// Here we define the R2 API in terms of functions in the R1 interface
// This is only for what we use
/* R1 compatibility: map a cuDNN status code to a human-readable message.
   (cuDNN itself only provides this function from R2 on.)
   Bug fix: corrected typos in the returned messages
   ("initialized(Is ... enought?)" and "Ressource"). */
static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
  switch (err) {
  case CUDNN_STATUS_SUCCESS:
    return "The operation completed successfully.";
  case CUDNN_STATUS_NOT_INITIALIZED:
    return "The handle was not initialized (is your driver recent enough?).";
  case CUDNN_STATUS_ALLOC_FAILED:
    return "Resource allocation failed inside the library.";
  case CUDNN_STATUS_BAD_PARAM:
    return "An incorrect value was passed in.";
  case CUDNN_STATUS_ARCH_MISMATCH:
    return "The current GPU does not support the required features (only cc 3.0+ are supported).";
  case CUDNN_STATUS_MAPPING_ERROR:
    return "An access to GPU memory space failed (probably due to a failure to bind texture).";
  case CUDNN_STATUS_EXECUTION_FAILED:
    return "A kernel failed to execute.";
  case CUDNN_STATUS_INTERNAL_ERROR:
    return "An internal cuDNN operation failed.";
  case CUDNN_STATUS_NOT_SUPPORTED:
    return "The combination of parameters is not currently supported.";
  default:
    return "Unknown error code.";
  }
}
// some macros to help support cudnn R1 while using R2 code.
#define cudnnCreateTensorDescriptor cudnnCreateTensor4dDescriptor
#define cudnnDestroyTensorDescriptor cudnnDestroyTensor4dDescriptor
#define cudnnSetFilter4dDescriptor cudnnSetFilterDescriptor
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
/* R2-style Nd tensor setter implemented on top of the R1 4d API.
   Only rank-4 tensors are expressible in R1; any other rank is rejected. */
static inline cudnnStatus_t
cudnnSetTensorNdDescriptor(
  cudnnTensorDescriptor_t tensorDesc,
  cudnnDataType_t dataType,
  int nbDims,
  const int dimA[],
  const int strideA[]) {
  if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetTensor4dDescriptorEx(
    tensorDesc, dataType,
    dimA[0], dimA[1], dimA[2], dimA[3],
    strideA[0], strideA[1], strideA[2], strideA[3]);
}
/* R2-style output-shape query implemented with the R1 API.
   NOTE(review): inputTensorDesc and filterDesc are accepted only for
   signature compatibility and are ignored -- the R1 call derives the
   output shape from convDesc alone. */
static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim(
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensorDescriptor_t inputTensorDesc,
  const cudnnFilterDescriptor_t filterDesc,
  int *n,
  int *c,
  int *h,
  int *w) {
  return cudnnGetOutputTensor4dDim(convDesc, CUDNN_CONVOLUTION_FWD,
                                   n, c, h, w);
}
typedef int cudnnConvolutionFwdAlgo_t;
typedef int cudnnConvolutionFwdPreference_t;
#define CUDNN_CONVOLUTION_FWD_NO_WORKSPACE 0
/* R1 has no forward-algorithm selection: always report algorithm 0.
   All other parameters are ignored. */
static inline cudnnStatus_t
cudnnGetConvolutionForwardAlgorithm(
  cudnnHandle_t handle,
  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensorDescriptor_t destDesc,
  cudnnConvolutionFwdPreference_t preference,
  size_t memoryLimitInbytes,
  cudnnConvolutionFwdAlgo_t *algo) {
  *algo = 0;
  return CUDNN_STATUS_SUCCESS;
}
/* R1 exposes no explicit workspace, so always report 0 bytes.
   All descriptor and algo parameters are ignored. */
static inline cudnnStatus_t
cudnnGetConvolutionForwardWorkspaceSize(
  cudnnHandle_t handle,
  const cudnnTensorDescriptor_t srcDesc,
  const cudnnFilterDescriptor_t filterDesc,
  const cudnnConvolutionDescriptor_t convDesc,
  const cudnnTensor4dDescriptor_t destDesc,
  cudnnConvolutionFwdAlgo_t algo,
  size_t *sizeInBytes) {
  *sizeInBytes = 0;
  return CUDNN_STATUS_SUCCESS;
}
/* R2-signature forward convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionForward_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
  const cudnnConvolutionDescriptor_t convDesc,
  cudnnConvolutionFwdAlgo_t algo,
  void *workSpace,
  size_t workSpaceSizeInBytes,
  const void *beta,
  const cudnnTensorDescriptor_t destDesc,
  void *destData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the output.
     algo, workSpace and workSpaceSizeInBytes have no R1 equivalent. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  return cudnnConvolutionForward(handle, srcDesc, srcData,
                                 filterDesc, filterData,
                                 convDesc, destDesc, destData,
                                 accumulate);
}
#define cudnnConvolutionForward cudnnConvolutionForward_v2
/* R2-signature filter-gradient convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionBackwardFilter_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
  const void *beta,
  const cudnnFilterDescriptor_t gradDesc,
  void *gradData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the gradient. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  return cudnnConvolutionBackwardFilter(handle, srcDesc, srcData,
                                        diffDesc, diffData,
                                        convDesc, gradDesc, gradData,
                                        accumulate);
}
#define cudnnConvolutionBackwardFilter cudnnConvolutionBackwardFilter_v2
/* R2-signature data-gradient convolution backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnConvolutionBackwardData_v2(
  cudnnHandle_t handle,
  const void *alpha,
  const cudnnFilterDescriptor_t filterDesc,
  const void *filterData,
  const cudnnTensorDescriptor_t diffDesc,
  const void *diffData,
  const cudnnConvolutionDescriptor_t convDesc,
  const void *beta,
  const cudnnTensorDescriptor_t gradDesc,
  void *gradData) {
  /* R1 has no alpha/beta scaling: alpha must be 1, and beta only selects
     between overwriting (0.0) and accumulating into (1.0) the gradient. */
  assert(*(float *)alpha == 1.0);
  const float beta_value = *(float *)beta;
  cudnnAccumulateResult_t accumulate;
  if (beta_value == 0.0) {
    accumulate = CUDNN_RESULT_NO_ACCUMULATE;
  } else if (beta_value == 1.0) {
    accumulate = CUDNN_RESULT_ACCUMULATE;
  } else {
    assert(0 && "beta must be 0.0 or 1.0");
  }
  /* This function needs the casting because its params are not
     declared as const */
  return cudnnConvolutionBackwardData(handle,
                                      (cudnnFilterDescriptor_t)filterDesc,
                                      filterData,
                                      (cudnnTensorDescriptor_t)diffDesc,
                                      diffData,
                                      (cudnnConvolutionDescriptor_t)convDesc,
                                      (cudnnTensorDescriptor_t)gradDesc,
                                      gradData,
                                      accumulate);
}
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
/* R2-style Nd pooling setter implemented with the R1 2d API. */
static inline cudnnStatus_t
cudnnSetPoolingNdDescriptor(
  cudnnPoolingDescriptor_t poolingDesc,
  const cudnnPoolingMode_t mode,
  int nbDims,
  const int windowDimA[],
  const int paddingA[],
  const int strideA[]) {
  /* Only 2d, unpadded pooling is expressible in R1. */
  if (nbDims != 2)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (paddingA[0] != 0 || paddingA[1] != 0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetPoolingDescriptor(poolingDesc, mode,
                                   windowDimA[0], windowDimA[1],
                                   strideA[0], strideA[1]);
}
/* R2-style Nd pooling query implemented with the R1 2d getter.
   Always reports nbDims == 2 and zero padding (R1 pooling is unpadded). */
static inline cudnnStatus_t
cudnnGetPoolingNdDescriptor(
  const cudnnPoolingDescriptor_t poolingDesc,
  const int nbDimsRequested,
  cudnnPoolingMode_t *mode,
  int *nbDims,
  int windowA[],
  int paddingA[],
  int strideA[]) {
  int win0, win1, str0, str1;
  cudnnStatus_t err;
  // The caller must have room for at least the two spatial dimensions.
  if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;
  err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
                                  &str0, &str1);
  if (err != CUDNN_STATUS_SUCCESS) return err;
  *nbDims = 2;
  paddingA[0] = 0;
  paddingA[1] = 0;
  windowA[0] = win0;
  windowA[1] = win1;
  strideA[0] = str0;
  strideA[1] = str1;
  return CUDNN_STATUS_SUCCESS;
}
/* R2-signature pooling forward backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnPoolingForward_v2(
  cudnnHandle_t handle,
  const cudnnPoolingDescriptor_t poolingDesc,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const void *beta,
  const cudnnTensorDescriptor_t destDesc,
  void *destData) {
  /* R1 pooling cannot scale: only alpha == 1 and beta == 0 are expressible. */
  if (*(float*)alpha != 1.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (*(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingForward(handle, poolingDesc, srcDesc, srcData,
                             destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2
/* R2-signature pooling backward backed by the R1 entry point. */
static inline cudnnStatus_t
cudnnPoolingBackward_v2(
  cudnnHandle_t handle,
  const cudnnPoolingDescriptor_t poolingDesc,
  const void *alpha,
  const cudnnTensorDescriptor_t srcDesc,
  const void *srcData,
  const cudnnTensorDescriptor_t srcDiffDesc,
  const void *srcDiffData,
  const cudnnTensorDescriptor_t destDesc,
  const void *destData,
  const void *beta,
  const cudnnTensorDescriptor_t destDiffDesc,
  void *destDiffData) {
  /* R1 pooling cannot scale: only alpha == 1 and beta == 0 are expressible. */
  if (*(float*)alpha != 1.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  if (*(float *)beta != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingBackward(handle, poolingDesc,
                              srcDesc, srcData,
                              srcDiffDesc, srcDiffData,
                              destDesc, destData,
                              destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2
//Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else
// r2 rc1 and rc2 do not have the same macro defined
// I didn't check if this is the right combination, but as we do not wrap the padding interface, it is fine for now.
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING ((cudnnPoolingMode_t)1)
#endif
#endif
from __future__ import absolute_import, print_function, division
import pkg_resources
import theano
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
try:
    from theano.sandbox.cuda import cuda_ndarray
    # Raw CudaNdarray transpose helper; used below to convert between C
    # and Fortran memory order.
    dimshuffle = cuda_ndarray.cuda_ndarray.dimshuffle
except ImportError:
    pass

# True when scikits.cuda's CULA bindings imported successfully on this host.
cula_available = False

try:
    from scikits.cuda import cula
    cula_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

# CULA is initialized lazily, on the first GpuSolve.make_thunk call; this
# flag guards against double initialization.
cula_initialized = False
class GpuSolve(GpuOp):
    """
    CULA GPU solver OP.

    Solves ``A x = b`` (or the transposed system, depending on ``trans``)
    on the GPU through CULA's ``culaDeviceSgels``.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    __props__ = ('trans',)

    def __init__(self, trans='N'):
        # BLAS-style flag: 'N' for the plain system, 'T'/'C' for the
        # (conjugate) transposed one.
        self.trans = trans
        super(GpuSolve, self).__init__()

    def output_type(self, inp):
        # The result is a GPU array of the same rank, non-broadcastable.
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_node(self, inp1, inp2):
        # Both operands must be 2-d arrays living on the GPU.
        inp1 = as_cuda_ndarray_variable(inp1)
        inp2 = as_cuda_ndarray_variable(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

    def make_thunk(self, node, storage_map, _, no_recycling, impl=None):
        # Initialize CULA the first time it is needed
        global cula_initialized

        if not cula_available:
            raise RuntimeError('Cula is not available and '
                               'GpuSolve Op can not be constructed.')

        if not cula_initialized:
            cula.culaInitialize()
            cula_initialized = True

        # Storage cells for this apply's inputs and outputs.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            # size of the matrices to invert
            z = outputs[0]

            # Matrix
            A = inputs[0][0]

            # Solution vectors
            b = inputs[1][0]

            # A is not explicitly converted between C and F order, instead we
            # switch the "transpose" flag
            if self.trans in ('T', 'C'):
                trans = 'N'
            else:
                trans = 'T'

            # Convert b to F-order from c-order.
            b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

            # This copy forces allocation of a new C-contiguous buffer
            # and returns it.
            A_cpy = A.copy()
            b_cpy = b_cpy.copy()

            def cula_gpu_solve(A_, b_, trans='T'):
                # In-place least-squares solve via CULA; the solution is
                # written into b_'s device buffer.
                A_shape = A_.shape
                b_shape = b_.shape

                assert(len(A_shape) == 2)
                assert(len(b_shape) == 2)

                # Check that the operand shapes are conformable for the
                # requested (possibly transposed) system.
                if trans in ['T', 'C']:
                    l, n = A_shape
                    k, m = b_shape
                    if n != k:
                        raise ValueError('A and b must be aligned.')
                elif trans in ['N']:
                    n, l = A_shape
                    k, m = b_shape
                    if l != m:
                        raise ValueError('A and b must be aligned.')
                else:
                    raise ValueError('Invalid value for trans')

                # Leading dimensions as required by the LAPACK-style API.
                lda = max(1, n)
                ldb = max(1, n, l)

                # construct pointer arrays needed for culaDeviceSgels
                # Cula requires you to pass a pointer for A and b.
                A_ptr = A_.gpudata
                b_ptr = b_.gpudata

                cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
                return A_, b_

            # NOTE(review): the returned values are unused -- the solve
            # mutates b_cpy's device memory in place.
            A_pycuda, b_pycuda = cula_gpu_solve(A_cpy, b_cpy, trans)

            # Convert b to F-order from c-order and assign it to output:
            b_cpy = b_cpy.reshape(b.shape[::-1])
            b_cpy = dimshuffle(b_cpy, (1, 0))
            z[0] = b_cpy

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
# Module-level singleton instance of the op (no transpose).
gpu_solve = GpuSolve()
This source diff could not be displayed because it is too large. You can view the blob instead.
#section support_code
static cudnnHandle_t _handle = NULL;
/* Fill a cuDNN Nd tensor descriptor from a CudaNdarray's dims/strides.
   Returns 0 on success, -1 (with the Python error set) on failure. */
static int
c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
  int dim = CudaNdarray_NDIM(var);
  int *strides = (int *)malloc(dim * sizeof(int));
  int default_str = 1;
  int return_value = 0;

  if (strides != NULL) {
    // cuDNN needs explicit strides everywhere, but CudaNdarray stores 0 as
    // the stride of broadcastable dimensions; substitute the packed
    // (C-contiguous) stride there, accumulated from the innermost dim out.
    for (int i = dim-1; i >= 0; i--)
    {
      if (CudaNdarray_HOST_STRIDES(var)[i])
        strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
      else
        strides[i] = default_str;
      default_str *= CudaNdarray_HOST_DIMS(var)[i];
    }

    cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                   CudaNdarray_HOST_DIMS(var),
                                                   strides);
    if (err != CUDNN_STATUS_SUCCESS) {
      // Bug fix: the adjacent string literals used to concatenate into
      // "...: %sdim=%d" with no separator between the cuDNN error text
      // and the dimension count.
      PyErr_Format(PyExc_RuntimeError,
                   "Could not set tensorNd descriptor: %s."
                   " dim=%d",
                   cudnnGetErrorString(err), dim);
      return_value = -1;
    }
  } else {
    PyErr_Format(PyExc_MemoryError,
                 "Could not allocate memory for strides array of size %d.",
                 dim);
    return_value = -1;
  }

  free(strides);  // free(NULL) is a no-op, so this is safe on both paths
  return return_value;
}
/* Fill a cuDNN Nd filter descriptor from a CudaNdarray's dims.
   Returns 0 on success, -1 (with the Python error set) on failure. */
static int
c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
  /* cuDNN filter descriptors carry no strides, so only packed
     (C-contiguous) kernels can be described. */
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  const int nd = CudaNdarray_NDIM(var);
  cudnnStatus_t status =
      cudnnSetFilterNdDescriptor_v4(desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
                                    nd, CudaNdarray_HOST_DIMS(var));
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "Could not set filter descriptor: %s."
               " dims= %d",
               cudnnGetErrorString(status), nd);
  return -1;
}
#section init_code

{
  // Create the process-wide cuDNN handle (_handle, declared in
  // support_code) used by every call in this module.
  cudnnStatus_t err;
  if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
                 cudnnGetErrorString(err));
    // Module-init functions return NULL on Python 3 and void on Python 2.
#if PY_MAJOR_VERSION >= 3
    return NULL;
#else
    return;
#endif
  }
}
#section support_code_struct

// Per-apply cuDNN descriptors: created in init_code_struct, destroyed in
// cleanup_code_struct.
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);

/* Keep track, from one execution to another, of the dimension of the data
   and the algorithms, if any, that were selected according to these
   dimensions and according to the amount of memory available at that time.
   Note: Implementation selection for backward convolution only exists
   starting at V3. */
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);

// Start from NULL so cleanup_code_struct can tell which descriptors were
// actually created if one of the allocations below fails.
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
               cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

// Zero the cached shapes: no algorithm has been selected yet.
for (int i = 0; i < 5; i++)
{
  APPLY_SPECIFIC(previous_input_shape)[i] = 0;
  APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
  APPLY_SPECIFIC(previous_output_shape)[i] = 0;
}

APPLY_SPECIFIC(previous_algo_set) = false;

// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#section cleanup_code_struct

// Destroy only the descriptors that were actually created;
// init_code_struct leaves the others NULL on failure.
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
  cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code_struct
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size\n");
return 1;
}
int nb_dim = CudaNdarray_NDIM(input);
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
cudaError_t err2 = cudaMemset((*output)->devdata, 0,
CudaNdarray_SIZE(*output) * sizeof(real));
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv could not fill the output with zeros: %s",
cudaGetErrorString(err2));
return 1;
}
return 0;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1;
if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or heuristics if in one of the two following cases :
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation of none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the inputs and the kernels
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously choosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the input and kernel
// shapes. Either by choosing one according to heuristics or by making
// cuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
err = cudnnFindConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
requestedCount,
&count,
&choosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
chosen_algo = choosen_algo_perf.algo;
}
else
{
// The implementation should be chosen using heuristics based on the
// input shapes and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo;
APPLY_SPECIFIC(previous_algo_set) = true;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
}
}
else
{
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
if (0){
char * a;
switch(chosen_algo){
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM:
a = "implicit gemm (0)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM:
a = "precomp gemm (1)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_GEMM:
a = "gemm (2)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT:
a = "direct (3)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_FFT:
a = "fft (4)";
break;
case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING:
a = "fft tiling (5)";
break;
#if CUDNN_VERSION > 5000
case CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD:
a = "winograd (6)";
break;
#endif
}
printf("GpuDNNConv: algo %s\n", a);
}
// The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// The tiled-FFT implementation (only in V4 onward) does not support
// strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't.
// Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(input)[2];
int input_w = CudaNdarray_HOST_DIMS(input)[3];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
else
{
// chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
}
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
chosen_algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// TODO: Print a warning
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
chosen_algo,
&worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
err = cudnnConvolutionForward(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
}
return 0;
}
#section support_code_struct

/*
 * Gradient of a cuDNN convolution with respect to the input image
 * (wraps cudnnConvolutionBackwardData).
 *
 * Parameters:
 *   kerns  - filters used by the forward convolution.
 *   output - gradient of the cost w.r.t. the convolution output.
 *   im     - image-shaped array; provides the shape of the result and,
 *            when beta != 0, its initial contents.
 *   desc   - cuDNN convolution descriptor (padding, strides, mode).
 *   alpha, beta - accumulation scalars: *input = alpha * grad + beta * *input.
 *   input  - output argument; receives the gradient w.r.t. the image.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
                        CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **input) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }
  int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
  /* In-place variant: the result aliases the im buffer. */
  Py_XDECREF(*input);
  *input = im;
  Py_INCREF(*input);
#else
  if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
    return 1;
  /* beta != 0 means the previous contents of *input contribute to the
     result, so seed the output with im before the accumulating call. */
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
    return 1;
#endif
  /* Degenerate case: empty batch or empty filters give an all-zero
     gradient without calling cuDNN. */
  if (CudaNdarray_DIMS(im)[0] == 0 || CudaNdarray_DIMS(kerns)[0] == 0 || CudaNdarray_DIMS(kerns)[1] == 0) {
    cudaError_t err2 = cudaMemset((*input)->devdata, 0,
                                  CudaNdarray_SIZE(*input) * sizeof(real));
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. inputs could not fill the output with zeros: %s",
                   cudaGetErrorString(err2));
      return 1;
    }
    return 0;
  }
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  /* Sanity check: the received output gradient must have the shape the
     forward convolution would have produced for this input/kerns/desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              nb_dim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  if (nb_dim == 4) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
      return 1;
    }
  } else if (nb_dim == 5) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
        (CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)expected_output_dims[4],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[4]);
      return 1;
    }
  }
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdDataAlgo_t chosen_algo;
    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases :
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the kernels and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
                          APPLY_SPECIFIC(previous_kerns_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }
      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the kernel and output
        // shapes. Either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choose the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdDataAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
                                                          APPLY_SPECIFIC(kerns),
                                                          APPLY_SPECIFIC(output),
                                                          desc,
                                                          APPLY_SPECIFIC(input),
                                                          requestedCount,
                                                          &count,
                                                          &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.
          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }
          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
                                                         APPLY_SPECIFIC(kerns),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(input),
                                                         CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                                                         free,
                                                         &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradI: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }
        // Store the shapes of the kernels and output as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_kerns_shape)[i] =
            CudaNdarray_HOST_DIMS(kerns)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
            CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }
    /* Debugging aid: flip the 0 to 1 to log which algorithm was picked. */
    if (0){
      char * a;
      switch(chosen_algo){
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_0:
        a = "implicit gemm (0)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_1:
        a = "precomp gemm (1)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT:
        a = "fft (2)";
        break;
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING:
        a = "fft tiling (3)";
        break;
#if CUDNN_VERSION > 5000
      case CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD:
        a = "winograd (4)";
        break;
#endif
      }
      printf("GpuDNNConvGI: algo %s\n", a);
    }
    // The FFT implementation (only in V3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // The tiled-FFT implementation (only in V4 onward) does not support
    // strides.
    // If the chosen implementation is FFT or tiled-FFT, validate that it can
    // be used on the current data and default on a safe implementation if it
    // can't.
    // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
    // defined only for 2d-filters
    if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
         chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradI: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }
      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(kerns)[3];
      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(*input)[2];
      int input_w = CudaNdarray_HOST_DIMS(*input)[3];
      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
      {
        if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
            input_w > 1024 || (filter_h == 1 && filter_w == 1))
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
      else
      {
        // chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
        if (stride[0] != 1 || stride[1] != 1)
        {
          chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
        }
      }
    }
    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
                                                       APPLY_SPECIFIC(kerns),
                                                       APPLY_SPECIFIC(output),
                                                       desc,
                                                       APPLY_SPECIFIC(input),
                                                       chosen_algo,
                                                       &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradI: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;
    // Perform the convolution
    err = cudnnConvolutionBackwardData(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/*
 * Gradient of a cuDNN convolution with respect to the filters
 * (wraps cudnnConvolutionBackwardFilter).
 *
 * Parameters:
 *   input  - images used by the forward convolution.
 *   output - gradient of the cost w.r.t. the convolution output.
 *   km     - filter-shaped array; provides the shape of the result and,
 *            when beta != 0, its initial contents.
 *   desc   - cuDNN convolution descriptor (padding, strides, mode).
 *   alpha, beta - accumulation scalars: *kerns = alpha * grad + beta * *kerns.
 *   kerns  - output argument; receives the gradient w.r.t. the filters.
 *
 * Returns 0 on success, 1 on failure with a Python exception set.
 */
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
                        CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
                        float alpha, float beta, CudaNdarray **kerns) {
  cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
  if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuDnnConv images and kernel must have the same stack size\n");
    return 1;
  }
  int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE
  /* In-place variant: the result aliases the km buffer. */
  Py_XDECREF(*kerns);
  *kerns = km;
  Py_INCREF(*kerns);
#else
  if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
    return 1;
  /* beta != 0 means the previous contents of *kerns contribute to the
     result, so seed the output with km before the accumulating call. */
  if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
    return 1;
#endif
  /* Degenerate case: empty batch or empty filters give an all-zero
     gradient without calling cuDNN. */
  if (CudaNdarray_DIMS(input)[0] == 0 || CudaNdarray_DIMS(km)[0] == 0 || CudaNdarray_DIMS(km)[1] == 0) {
    cudaError_t err2 = cudaMemset((*kerns)->devdata, 0,
                                  CudaNdarray_SIZE(*kerns) * sizeof(real));
    if (err2 != cudaSuccess) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv grad wrt. weights could not fill the output with zeros: %s",
                   cudaGetErrorString(err2));
      return 1;
    }
    return 0;
  }
  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
    return 1;
  if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
    return 1;
  if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
    return 1;
  /* Sanity check: the received output gradient must have the shape the
     forward convolution would have produced for this input/kerns/desc. */
  int expected_output_dims[5] = {0};
  err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
                                              nb_dim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  if (nb_dim == 4) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3])) {
      /* All varargs are cast to long int, so every conversion must be %ld.
         (This previously used %d for the third field, which is undefined
         behaviour with a long argument and desynchronizes the following
         conversions; conv_gi's message has the correct specifiers.) */
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3]);
      return 1;
    }
  } else if (nb_dim == 5) {
    if ((CudaNdarray_HOST_DIMS(output)[0] != expected_output_dims[0]) ||
        (CudaNdarray_HOST_DIMS(output)[1] != expected_output_dims[1]) ||
        (CudaNdarray_HOST_DIMS(output)[2] != expected_output_dims[2]) ||
        (CudaNdarray_HOST_DIMS(output)[3] != expected_output_dims[3]) ||
        (CudaNdarray_HOST_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
                   " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
                   (long int)expected_output_dims[0], (long int)expected_output_dims[1],
                   (long int)expected_output_dims[2], (long int)expected_output_dims[3],
                   (long int)expected_output_dims[4],
                   (long int)CudaNdarray_HOST_DIMS(output)[0], (long int)CudaNdarray_HOST_DIMS(output)[1],
                   (long int)CudaNdarray_HOST_DIMS(output)[2], (long int)CudaNdarray_HOST_DIMS(output)[3],
                   (long int)CudaNdarray_HOST_DIMS(output)[4]);
      return 1;
    }
  }
  {
    size_t worksize;
    void *workspace;
    cudnnConvolutionBwdFilterAlgo_t chosen_algo;
    if (CHOOSE_ALGO)
    {
      // A new convolution implementation should be selected, based either on
      // timing or heuristics, if in one of the two following cases :
      // - The implementation should only be chosen during the first execution
      //   of an apply node and this is the first execution of the apply node.
      // - The implementation should be chosen as often as necessary and the
      //   shapes of the inputs differ from the last time an implementation
      //   was chosen.
      bool reuse_previous_algo;
      if (CHOOSE_ALGO_ONCE)
      {
        // Only choose a new implementation if none has been chosen before.
        reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
      }
      else
      {
        // Reuse the previous implementation if the inputs and the outputs
        // have the same shapes as they had when the previous implementation
        // was selected
        bool same_shapes = true;
        for (int i = 0; (i < nb_dim) && same_shapes; i++)
        {
          same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
                          APPLY_SPECIFIC(previous_input_shape)[i]);
          same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
                          APPLY_SPECIFIC(previous_output_shape)[i]);
        }
        reuse_previous_algo = same_shapes;
      }
      // If the previously chosen implementation can't be reused, select a
      // new one based on the shapes of the current inputs
      if (!reuse_previous_algo)
      {
        // Obtain a convolution algorithm appropriate for the input and output
        // shapes. Either by choosing one according to heuristics or by making
        // cuDNN time every implementation and choose the best one.
        if (CHOOSE_ALGO_TIME)
        {
          // Time the different implementations to choose the best one
          int requestedCount = 1;
          int count;
          cudnnConvolutionBwdFilterAlgoPerf_t choosen_algo_perf;
          err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
                                                            APPLY_SPECIFIC(input),
                                                            APPLY_SPECIFIC(output),
                                                            desc,
                                                            APPLY_SPECIFIC(kerns),
                                                            requestedCount,
                                                            &count,
                                                            &choosen_algo_perf);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: "
                         "%s", cudnnGetErrorString(err));
            return 1;
          }
          chosen_algo = choosen_algo_perf.algo;
        }
        else
        {
          // Choose the convolution implementation using heuristics based on the
          // shapes of the inputs and the amount of memory available.
          // Get the amount of available memory
          size_t free = 0, total = 0;
          cudaError_t err2 = cudaMemGetInfo(&free, &total);
          if (err2 != cudaSuccess){
            cudaGetLastError();
            fprintf(stderr,
                    "Error when trying to find the memory information"
                    " on the GPU: %s\n", cudaGetErrorString(err2));
            return 1;
          }
          // Use heuristics to choose the implementation
          err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
                                                           APPLY_SPECIFIC(input),
                                                           APPLY_SPECIFIC(output),
                                                           desc,
                                                           APPLY_SPECIFIC(kerns),
                                                           CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                                                           free,
                                                           &chosen_algo);
          if (err != CUDNN_STATUS_SUCCESS) {
            PyErr_Format(PyExc_RuntimeError,
                         "GpuDnnConvGradW: error selecting convolution algo: %s",
                         cudnnGetErrorString(err));
            return 1;
          }
        }
        // Store the shapes of the inputs and kernels as well as the chosen
        // algorithm for future use.
        APPLY_SPECIFIC(previous_bwd_f_algo) = chosen_algo;
        APPLY_SPECIFIC(previous_algo_set) = true;
        for (int i = 0; i < nb_dim; i++)
        {
          APPLY_SPECIFIC(previous_input_shape)[i] =
            CudaNdarray_HOST_DIMS(input)[i];
          APPLY_SPECIFIC(previous_output_shape)[i] =
            CudaNdarray_HOST_DIMS(output)[i];
        }
      }
      else
      {
        // Reuse the previously chosen convolution implementation
        chosen_algo = APPLY_SPECIFIC(previous_bwd_f_algo);
      }
    }
    else
    {
      chosen_algo = CONV_ALGO;
    }
    /* Debugging aid: flip the 0 to 1 to log which algorithm was picked. */
    if (0){
      char * a;
      switch(chosen_algo){
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0:
        a = "algo 0 (0)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1:
        a = "algo 1 (1)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT:
        a = "fft (2)";
        break;
      case CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3:
        a = "algo 3 (3)";
        break;
      }
      printf("GpuDNNConvGW: algo %s\n", a);
    }
    // The FFT implementation (only in v3 and onward) does not support strides,
    // 1x1 filters or inputs with a spatial dimension larger than 1024.
    // If the chosen implementation is FFT, validate that it can be used
    // on the current data and default on a safe implementation if it
    // can't.
    if (chosen_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && nb_dim == 4)
    {
      // Extract the properties of the convolution descriptor
      int nd;
      int pad[2];
      int stride[2];
      int upscale[2];
      cudnnConvolutionMode_t mode;
      cudnnDataType_t data_type;
      err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
                                            upscale, &mode, &data_type);
      if (err != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "GpuDnnConvGradW: error getting convolution properties: %s",
                     cudnnGetErrorString(err));
        return 1;
      }
      // Extract the spatial size of the filters
      int filter_h = CudaNdarray_HOST_DIMS(*kerns)[2];
      int filter_w = CudaNdarray_HOST_DIMS(*kerns)[3];
      // Extract the spatial size of the input
      int input_h = CudaNdarray_HOST_DIMS(input)[2];
      int input_w = CudaNdarray_HOST_DIMS(input)[3];
      // Ensure that the selected implementation supports the requested
      // convolution. Fall back to a safe implementation otherwise.
      if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
          input_w > 1024 || (filter_h == 1 && filter_w == 1))
      {
        chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
      }
    }
    // Infer required workspace size from the chosen implementation
    err = cudnnGetConvolutionBackwardFilterWorkspaceSize(_handle,
                                                         APPLY_SPECIFIC(input),
                                                         APPLY_SPECIFIC(output),
                                                         desc,
                                                         APPLY_SPECIFIC(kerns),
                                                         chosen_algo,
                                                         &worksize);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConvGradW: error getting worksize: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    // Allocate workspace for the convolution
    workspace = get_work_mem(worksize);
    if (workspace == NULL && worksize != 0)
      return 1;
    // Perform the convolution
    err = cudnnConvolutionBackwardFilter(
      _handle,
      (void *)&alpha,
      APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
      APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
      desc,
      chosen_algo,
      workspace, worksize,
      (void *)&beta,
      APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
  }
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
"""
This file implements 3 different versions of the elemwise op on the
gpu. Only NaiveAlgo is used, and it is not very naive now.
The elemwise functions are also used with scalar operations, so it can
happen that ndim is 0, as with all scalar types.
"""
from __future__ import absolute_import, print_function, division
import logging
import numpy
from theano.scalar.basic import upgrade_to_float_no_complex, complex_types
from theano.scalar.basic_scipy import Erfinv
from six import StringIO
from six.moves import xrange
from theano import Apply
from theano import gof, scalar
_logger_name = 'theano.sandbox.cuda.elemwise'
_logger = logging.getLogger(_logger_name)
def _logical_scalar(x):
return numpy.all(x.type.broadcastable)
def get_str_list_logical_scalar(node, value_str='ii_i%i_value',
                                data_str='ii_i%i_data[0]'):
    """Return one C expression string per input of *node*: the
    ``value_str`` template for logically-scalar inputs (all dimensions
    broadcastable) and the ``data_str`` template otherwise, each
    formatted with the input's position."""
    return [(value_str if _logical_scalar(inp) else data_str) % pos
            for pos, inp in enumerate(node.inputs)]
class SupportCodeError(Exception):
    """
    Raised when a GPU elemwise implementation cannot be auto-generated
    because the scalar Op defines c_support_code_apply().
    Plain Op.c_support_code is supported.
    """
class NaiveAlgo(object):
"""
Parameters
----------
scalar_op
The scalar operation to execute on each element.
sync
If True, will wait after the kernel launch and check for error call.
"""
verbose = 0 # 1, 2 or 3 for more verbose output.
@property
def cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (20, self.verbose, self.sync, ver)
else:
return ver
def __init__(self, scalar_op, sync=True, inplace_pattern=None):
if inplace_pattern is None:
inplace_pattern = {}
try:
code = scalar_op.c_support_code_apply(None, "nodename")
if code:
raise SupportCodeError(scalar_op)
except gof.utils.MethodNotDefined:
pass
self.scalar_op = scalar_op
self.sync = sync
self.inplace_pattern = inplace_pattern
def c_src_kernel(self, node, nodename, nd):
    """Generate the CUDA source of an nd-dimensional elemwise kernel.

    The kernel receives the element count, the shape, and a
    (data pointer, per-dimension strides) pair for every input and
    output; each thread walks elements with a flat index stride of
    the total thread count, decomposes the flat index into
    per-dimension positions, offsets every pointer accordingly, then
    applies the scalar op's C code.  Returns the source as a string.
    """
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
        self.scalar_op.__class__.__name__, nodename, nd), file=sio)
    if (nd):
        print("\t,", ", ".join("const int dim%i" % i
                               for i in xrange(nd)), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        s = ", ".join(["const float * i%i_data" % ipos] +
                      ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)])
        print("\t,", s, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        s = ", ".join(["float * o%i_data" % ipos] +
                      ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
        print("\t,", s, file=sio)
    # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
    # print >> sio, "\t,", "float * o%i_data" % ipos
    print("\t)\n{", file=sio)
    print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
    print(" const int numThreads = blockDim.x * gridDim.x;", file=sio)
    # For each input that is a scalar which has been broadcasted to a tensor,
    # load it into a local variable
    for ipos, i in enumerate(node.inputs):
        if _logical_scalar(i):
            print(" const float ii_i%i_value = i%i_data[0];" % (ipos, ipos), file=sio)
    # loop over the elements to be treated by this kernel call
    print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
    # calculate the data pointers for all arguments
    print(" int ii = i;", file=sio)
    for ipos, i in enumerate(node.inputs):
        if not _logical_scalar(i):
            print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
    for ipos, i in enumerate(node.outputs):
        print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
    # Decompose the flat index from the last (fastest) dimension to the
    # first, advancing each non-scalar pointer by pos * stride.
    for d in xrange(nd - 1, -1, -1):
        if d > 0:
            print(" int pos%i = ii %% dim%i;" % (d, d), file=sio)
            print(" ii = ii / dim%i;" % d, file=sio)
        else:
            print(" int pos%i = ii;" % d, file=sio)
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio)
        for ipos, i in enumerate(node.outputs):
            print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio)
    # perform the scalar operation on the input and output references
    # TODO: What if the scalar_op needs support_code??
    for ipos, i in enumerate(node.outputs):
        print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
    # The scalar op's C code is generated against a dummy scalar Apply
    # so the per-element expressions plug into the pointers set up above.
    task_code = self.scalar_op.c_code(
        Apply(self.scalar_op,
              [scalar.Scalar(dtype=input.type.dtype).make_variable()
               for input in node.inputs],
              [scalar.Scalar(dtype=output.type.dtype).make_variable()
               for output in node.outputs]),
        nodename + '_scalar_',
        get_str_list_logical_scalar(node),
        ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
        sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
    print(" ", task_code, file=sio)
    for ipos, _ in enumerate(node.outputs):
        print("ii_o%i_data[0] = o%i_i;" % (ipos, ipos), file=sio)
    print(" }", file=sio)
    # indent = " "*(4*d+7)
    # for ipos, i in enumerate(node.inputs):
    #     print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
    print("}", file=sio)
    # print sio.getvalue()
    return sio.getvalue()
def c_src_kernel_tiling(self, node, nodename):
    """Generate the tiled CUDA source of an elemwise kernel.

    Unlike c_src_kernel, grid blocks cover the two leading dimensions
    and the thread block covers the trailing ones, so no flat-index
    decomposition is needed.  Only nd == 4 is generated; for other
    ranks the returned source is empty.  Returns the source as a
    string.
    """
    # The kernel is intended to be structured roughly like this:
    # (fixed: the outermost loop iterates blockIdx.x / gridDim.x;
    # the previous version of this comment showed blockIdx.y twice)
    """
    static __global__ void kernel()
    {
        for (int v = blockIdx.x; v < dim0; v += gridDim.x)
        {
            for (int w = blockIdx.y; w < dim1; w += gridDim.y)
            {
                for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                {
                    for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                    {
                        for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                        {
                            out[v * out_stride[0] + ...] = f(in1[...], in2[...])
                        }
                    }
                }
            }
        }
    }
    """
    nd = node.outputs[0].type.ndim
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    if nd in (4,):
        # print some leading comments to make the code easier to read
        print("// %s" % str(node.op), file=sio)
        print("// node.op.destroy_map=%s" % str(
            getattr(node.op, 'destroy_map', None)), file=sio)
        for ipos, i in enumerate(node.inputs):
            print("// Input ", ipos, str(i.type), file=sio)
        for ipos, i in enumerate(node.outputs):
            print("// Output ", ipos, str(i.type), file=sio)
        print(
            "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
            (self.scalar_op.__class__.__name__,
             nodename,
             'tiling%i' % nd), file=sio)
        if (nd):
            print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print("\t,", s, file=sio)
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print("\t,", s, file=sio)
        # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        # print >> sio, "\t,", "float * o%i_data" % ipos
        print("\t)\n{", file=sio)
        # For each input that is a scalar which has been broadcasted to a tensor,
        # load it into a local variable
        print(" __shared__ float value0[%i];" % len(node.inputs), file=sio)
        print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio)
        # print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
        print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print(" value0[%i] = i%i_data[0];" % (ipos, ipos), file=sio)
        for ipos in xrange(nd):
            print(" shared_dims[%i] = dim%i;" % (ipos, ipos), file=sio)
        print(" }", file=sio)
        print(" __syncthreads();", file=sio)
        if (nd == 4):
            print("""
            for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
            {
                for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                {
                    //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                    for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                    {
                        //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                        for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                        {
            """, file=sio)
        else:
            raise NotImplementedError()
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
        for ipos, i in enumerate(node.outputs):
            print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
        for d in xrange(nd):
            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print(" ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d), file=sio)
            for ipos, i in enumerate(node.outputs):
                print(" ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d), file=sio)
        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(
                self.scalar_op,
                [scalar.Scalar(
                    dtype=input.type.dtype).make_variable()
                 for input in node.inputs],
                [scalar.Scalar(
                    dtype=output.type.dtype).make_variable()
                 for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node, value_str='value0[%i]'),
            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print(" ", task_code, file=sio)
        print(" }" * nd, file=sio)
        # TODO: insert runtime stride checks that select the best loop order either here, or in
        # the host code that launched the kernel (host code probably better spot)
        # indent = " "*(4*d+7)
        # for ipos, i in enumerate(node.inputs):
        #     print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print("}", file=sio)
    # Debug aid only: dumping the whole generated kernel to stdout on
    # every call pollutes program output, so it is disabled here, like
    # the matching commented-out print in c_src_kernel above.
    # print(sio.getvalue())
    return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename):
    """
    Generate CUDA source for a tiled elemwise kernel variant that walks
    pointers (one per argument per dimension) instead of recomputing
    offsets, to lower register pressure.

    The kernel applies to problems with <= 5 dimensions.
    """
    nd = node.outputs[0].type.ndim
    n_in = len(node.inputs)
    n_out = len(node.outputs)
    sio = StringIO()
    # NOTE(review): only nd == 2 gets past this guard, yet the loop-nest
    # emitter below handles only nd == 4, so the generated kernel body is
    # empty for nd == 2.  This experimental path looks unused; confirm.
    if nd not in (2,):
        return sio.getvalue()
    # print some leading comments to make the code easier to read
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    # Kernel signature: numEls, then per-dimension sizes, then for each
    # argument a base pointer followed by its per-dimension strides.
    print(
        "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
        (self.scalar_op.__class__.__name__,
         nodename,
         'tiling%i_less_registers' % nd), file=sio)
    if (nd):
        print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        s = ", ".join(["const float * i%i_data_0" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
        print("\t,", s, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
        print("\t,", s, file=sio)
    # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
    # print >> sio, "\t,", "float * o%i_data" % ipos
    print("\t)\n{", file=sio)
    # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
    use_shared_stride = False
    use_shared_limits = False

    def decl_limits(nd):
        # Declare shared storage for the per-dimension end pointers
        # (only in shared-limits mode).
        if use_shared_limits:
            print("__shared__ float * limits[%(nd)s];" % locals(), file=sio)

    def stride(io, p, d):
        # C expression for the stride of argument `p` ('i' or 'o')
        # along dimension `d`.
        if use_shared_stride:
            return "s%s_str[%i][%i]" % (io, p, d)
        else:
            return "%s%i_str_%i" % (io, p, d)

    def limits(d):
        # C lvalue naming the end pointer for dimension `d`.
        if use_shared_limits:
            return "limits[%i]" % d
        else:
            return "limits%i" % d

    def decl_shared_stride(nin, nout, nd):
        # Copy all strides into shared memory (only in shared-stride mode).
        # NOTE(review): the loops below mix the `nin` parameter with the
        # closed-over `n_out` instead of the `nout` parameter; harmless for
        # current callers, but confirm before reusing.
        if not use_shared_stride:
            return
        print("""
__shared__ int si_str[%(nin)s][%(nd)s];
__shared__ int so_str[%(nout)s][%(nd)s];
if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
""" % locals(), file=sio)
        for i in xrange(nin):
            for d in xrange(nd):
                print("si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(), file=sio)
        for i in xrange(n_out):
            for d in xrange(nd):
                print("so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(), file=sio)
        print("} __syncthreads();", file=sio)

    def calc_limit(d):
        # Emit the end-pointer for dimension `d`, computed from output 0's
        # stride and size along that dimension.
        s = stride('o', 0, d)
        lname = limits(d)
        if use_shared_limits:
            print("if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
            if d == 0:
                print("%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
            else:
                dm1 = d - 1
                print("%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)
            print("} __syncthreads();", file=sio)
        else:
            if d == 0:
                print("const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(), file=sio)
            else:
                dm1 = d - 1
                print("const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(), file=sio)

    def decl_ptrs(d, offset):
        # Declare walking pointers for dimension `d`, derived from the
        # pointers of dimension d-1 plus a thread/block-dependent offset.
        dm1 = d - 1
        assert dm1 >= 0
        for i in xrange(n_in):
            s = stride('i', i, d)
            print("const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)
        for i in xrange(n_out):
            s = stride('o', i, d)
            print("float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(), file=sio)

    def inc_ptrs(d, amt):
        # Advance all walking pointers of dimension `d` by `amt` strides.
        for i in xrange(n_in):
            s = stride('i', i, d)
            print("i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)
        for i in xrange(n_out):
            s = stride('o', i, d)
            print("o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(), file=sio)

    def while_limit(d):
        # Open the loop over dimension `d`: iterate until output 0's
        # pointer reaches the precomputed end pointer.
        lname = limits(d)
        print("while (o0_data_%(d)s < %(lname)s) { " % locals(), file=sio)

    def end_while(d):
        print("}", file=sio)

    def task_code(d):
        # Inline the scalar op's C code, reading/writing through the
        # innermost walking pointers.
        print(self.scalar_op.c_code(
            Apply(
                self.scalar_op,
                [scalar.Scalar(dtype=input.type.dtype).make_variable()
                 for input in node.inputs],
                [scalar.Scalar(dtype=output.type.dtype).make_variable()
                 for output in node.outputs]),
            nodename + '_scalar_',
            ['i%i_data_%i[0]' % (ipos, d) for ipos,
             i in enumerate(node.inputs)],
            ['o%i_data_%i[0]' % (ipos, d) for ipos,
             i in enumerate(node.outputs)],
            sub=dict(fail='return;')), file=sio)
        # TODO: set a failure code somehow!!!

    # Emit the 4-deep loop nest (blocks over dims 0/1, threads over 2/3).
    if nd == 4:
        decl_shared_stride(n_in, n_out, nd)
        decl_limits(nd)
        calc_limit(0)
        inc_ptrs(0, 'blockIdx.x')
        while_limit(0)
        if 1:
            calc_limit(1)
            decl_ptrs(1, 'blockIdx.y')
            while_limit(1)
            if 1:
                calc_limit(2)
                decl_ptrs(2, 'threadIdx.y')
                while_limit(2)
                if 1:
                    calc_limit(3)
                    decl_ptrs(3, 'threadIdx.x')
                    while_limit(3)
                    if 1:
                        task_code(3)
                        inc_ptrs(3, 'blockDim.x')
                    end_while(3)
                    inc_ptrs(2, 'blockDim.y')
                end_while(2)
                inc_ptrs(1, 'gridDim.y')
            end_while(1)
            inc_ptrs(0, 'gridDim.x')
        end_while(0)
    print("}", file=sio)
    # NOTE(review): debug print of the generated source to stdout --
    # presumably a leftover; confirm before removing.
    print(sio.getvalue())
    return sio.getvalue()
def c_src_kernel_Ccontiguous(self, node, nodename):
    """
    Generate CUDA source for the kernel used when every argument is
    C-contiguous: a single flat loop in which each thread starts at its
    global index and advances by the total number of threads.
    """
    sio = StringIO()
    # print 'C_SRC_KERNEL', sio.getvalue()
    print("// %s" % str(node.op), file=sio)
    print("// node.op.destroy_map=%s" % str(
        getattr(node.op, 'destroy_map', None)), file=sio)
    for ipos, i in enumerate(node.inputs):
        print("// Input ", ipos, str(i.type), file=sio)
    for ipos, i in enumerate(node.outputs):
        print("// Output ", ipos, str(i.type), file=sio)
    # Signature: numEls plus one bare data pointer per argument
    # (no strides needed in the contiguous case).
    print("static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (self.scalar_op.__class__.__name__, nodename), file=sio)
    # declare inputs
    for ipos, i in enumerate(node.inputs):
        print("\t,", "const float * i%i_data" % ipos, file=sio)
    # declare outputs
    for ipos, i in enumerate(node.outputs):
        print("\t,", "float * o%i_data" % ipos, file=sio)
    print("\t)\n{", file=sio)
    print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
    print(" const int numThreads = blockDim.x * gridDim.x;", file=sio)
    # For each input that is a scalar which has been broadcasted to a tensor,
    # load it into a local variable
    for ipos, i in enumerate(node.inputs):
        if _logical_scalar(i):
            print(" const float ii_i%i_value = i%i_data[0];" % (ipos, ipos), file=sio)
    # loop over the elements to be treated by this kernel call
    print(" for (int i = idx; i < numEls; i += numThreads) {", file=sio)
    # perform the scalar operation on the input and output references
    # TODO: What if the scalar_op needs support_code??
    # Compute into a local temporary per output, then store once below.
    for ipos, i in enumerate(node.outputs):
        print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
    task_code = self.scalar_op.c_code(
        Apply(
            self.scalar_op,
            [scalar.Scalar(dtype=input.type.dtype).make_variable()
             for input in node.inputs],
            [scalar.Scalar(dtype=output.type.dtype).make_variable()
             for output in node.outputs]),
        nodename + '_scalar_',
        # , ['i%i_data[i]'%ipos for ipos,
        # i in enumerate(node.inputs)]
        get_str_list_logical_scalar(node, data_str='i%i_data[i]'),
        ['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
        sub=dict(fail='return;'))
    # TODO: set a failure code somehow!!!
    print(" ", task_code, file=sio)
    # write the temporaries back to global memory
    for ipos, _ in enumerate(node.outputs):
        print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
    print(" }", file=sio)
    print("}", file=sio)
    # print sio.getvalue()
    return sio.getvalue()
def c_src_callkernel(self, node, nodename):
    """
    Generate the host-side C function that selects and launches a kernel.

    The emitted ``callkernel_<nodename>`` function serves three goals:

    1. Stride unpacking: it accepts input and output arguments as
       ``float *, int *`` pairs and constructs a kernel call where the
       arguments are passed as ``float *, int, int, int ...``.
    2. It recognizes when dimensions can be collapsed as being contiguous
       (or broadcast in every argument), merging such dimensions so the
       kernel does less index arithmetic.
    3. It special-cases scalar elements, which may always be collapsed;
       both the contiguous and strided kernels keep them in registers to
       lower the number of memory accesses.

    TODO: make a special case for broadcasting, to store the data in
    shared memory.

    Bug fixes relative to the historical version (verbose-only paths):
    - ``'% (ipos)s'`` (stray space) made the debug ``local_str`` /
      ``local_ostr`` dumps raise ``ValueError: unsupported format
      character`` -- now ``'%(ipos)s'`` as in the parallel dumps above.
    - ``"nd_collapse_ %s["`` produced an invalid C++ identifier
      (``nd_collapse_ 0[``) -- the variable is declared ``nd_collapse_0``.
    """
    nd = node.outputs[0].type.ndim
    nb_inputs = len(node.inputs)
    nb_outputs = len(node.outputs)
    d = dict()
    # input_params and output_params go into the function declaration/definition
    input_params = ", ".join(
        "const float * i%i_data, const int * i%i_str" % (ipos, ipos)
        for ipos in xrange(len(node.inputs)))
    output_params = ", ".join(
        "float * o%i_data, const int * o%i_str" % (ipos, ipos)
        for ipos in xrange(len(node.outputs)))
    # input_args and output_args go into the recursive call.
    input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                           for ipos in xrange(len(node.inputs)))
    output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                            for ipos in xrange(len(node.outputs)))
    prod_dims = '*'.join(
        ["dims[%i]" % di for di in xrange(nd)] + ['1'])
    scalar_op = self.scalar_op.__class__.__name__
    sio = StringIO()
    # Helper: mark dimensions whose stride*size equals the previous
    # dimension's stride (i.e. the pair is jointly contiguous).
    print("""
static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[])
{
//can we collapse dims[i] and dims[i-1]
for(int i=nd-1;i>0;i--){
if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd
collapse[i]=1;
}else collapse[i]=0;
}
}
""" % locals(), file=sio)
    print("""
static int callkernel_%(nodename)s(unsigned int numEls, const int d,
const int * dims,
%(input_params)s,
%(output_params)s)
{
numEls = %(prod_dims)s;
""" % locals(), file=sio)
    if self.verbose:
        print("""
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
""" % locals(), file=sio)
        print(
            'std::cerr << ' + " << ' ' << ".join(
                ['" "'] +
                list("dims[%i]" % di for di in xrange(nd)) +
                ["'\\n';"]),
            file=sio)
        if self.verbose > 1:
            # Dump each argument's base pointer and strides.
            for ipos in xrange(len(node.inputs)):
                istrings = [
                    "i%s_str[%i]" % (ipos, di) for di in xrange(nd)]
                ipositions = " << ' ' << ".join(
                    ["i%s_data" % ipos] + istrings)
                print("""
std::cerr << " %(ipos)s data strides" << %(ipositions)s << "\\n";
""" % dict(ipos=ipos, ipositions=ipositions), file=sio)
            for ipos in xrange(len(node.outputs)):
                print("""
std::cerr << " %(ipos)s data strides" <<
""" % locals() + " << ' ' << ".join(
                    ["o%s_data" % ipos] +
                    list(
                        "o%s_str[%i]" % (ipos, di) for di in xrange(nd)
                    )) +
                    ''' << "\\n"; ''', file=sio)
    # collapse dimension that are broadcast in all inputs.
    # need to be done before contiguous collapse as it will break it.
    # do the dimensions and the strides
    if nd > 0:
        print("int local_dims[%(nd)s];" % locals(), file=sio)
    else:
        print("int *local_dims=NULL;", file=sio)
    if nb_inputs > 0 and nd > 0:
        print("""
int local_str[%(nb_inputs)s][%(nd)s];
int local_ostr[%(nb_outputs)s][%(nd)s];
""" % locals(), file=sio)
    else:
        # Dummy 1x1 arrays so the code below still compiles.
        print("""
int local_str[1][1];
int local_ostr[1][1];
""", file=sio)
    print("""
int nd_collapse = %(nd)s;
for(int i=0;i<%(nd)s;i++){//init new dim
local_dims[i]=dims[i];
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.inputs)):
        print("""
for(int i=0;i<%(nd)s;i++){//init new strides
local_str[%(ipos)s][i]=i%(ipos)s_str[i];
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.outputs)):
        print("""
for(int i=0;i<%(nd)s;i++){//init new strides
local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
}
""" % locals(), file=sio)
    if self.verbose > 2:
        print('std::cerr <<"before broadcast collapse\\n";', file=sio)
        print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
        print('std::cerr << "local_dims";', file=sio)
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' % locals(), file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                print(
                    'std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
                    ' << " " << '.join(["local_str[%s][%s]" % (ipos, x)
                                        for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                print(
                    'std::cerr << " local_ostr inputs %(ipos)s: " <<' %
                    locals() +
                    ' << " " << '.join(
                        ["local_ostr[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
    # Remove dimensions of size 1 that are broadcast in every argument.
    print("""
for(int id=0;id<nd_collapse;id++){

bool all_broadcast=true;
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
}
if(all_broadcast){
for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_str[input_id][j-1]=local_str[input_id][j];
}
}
for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
local_ostr[output_id][j-1]=local_ostr[output_id][j];
}
}
nd_collapse--; id--;
}
}
""" % locals(), file=sio)
    if self.verbose > 2:
        print('std::cerr <<"after broadcast collapse\\n";', file=sio)
        print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
        print('std::cerr << "local_dims";', file=sio)
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' %
                  locals(), file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                print('std::cerr << " local_str %(ipos)s: " <<' %
                      locals() + ' << " " << '.join(
                          ["local_str[%s][%s]" %
                           (ipos, x) for x in xrange(nd)]) +
                      '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                print(
                    'std::cerr << " local_ostr %(ipos)s: " <<' %
                    locals() + ' << " " << '.join(
                        ["local_ostr[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
    # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
    # this is a good idea because we make less index calculation in the gpu.
    if nd > 0:
        print("int nd_collapse_[%(nd)s] = {" %
              locals() + ','.join(
                  ['1' for x in xrange(nd)]) + "};", file=sio)
    else:
        print("int *nd_collapse_ = NULL;", file=sio)
    for ipos in xrange(len(node.inputs)):
        if not _logical_scalar(node.inputs[ipos]):
            if nd > 0:
                print("""
int nd_collapse_%(ipos)s[%(nd)s] = {""" %
                      locals() +
                      ','.join(['1' for x in xrange(nd)]) +
                      "};", file=sio)
            else:
                print("""
int * nd_collapse_%(ipos)s = NULL;""" %
                      locals(), file=sio)
            # A dimension stays collapsible only if it is collapsible in
            # every non-scalar input.
            print("""
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
""" % locals(), file=sio)
            if self.verbose > 1:
                print("""
std::cerr<< "nd_collapse_%(ipos)s "<<
""" % locals(), file=sio)
                # BUG FIX: was "nd_collapse_ %s[" (stray space), which
                # emitted the invalid C++ identifier "nd_collapse_ 0[".
                print(' << " " << '.join(["nd_collapse_%s[" %
                                          ipos + str(i) + "]" for i in xrange(nd)]),
                      file=sio)
                print('<< "\\n";', file=sio)
    # update the local stride.
    for ipos in xrange(len(node.inputs)):
        print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
}
}
""" % locals(), file=sio)
    for ipos in xrange(len(node.outputs)):
        print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
}
}
""" % locals(), file=sio)
    # update the local dims.
    print("""
for(int i=nd_collapse-1;i>0;i--){
if(nd_collapse_[i]==1){
local_dims[i-1]*=local_dims[i];//set new dims
for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
local_dims[j-1]=local_dims[j];
}
}
""" % locals(), file=sio)
    # update the new number of dim
    print("""
for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--;
}
if(nd_collapse == 1 """ % locals(), file=sio)
    # When everything collapsed to one contiguous dimension, use the
    # C-contiguous kernel (nd_collapse==0 selects it in the switch below).
    l = ["local_str[%s][nd_collapse-1]==1 " %
         ipos for ipos in xrange(len(node.inputs)) if not
         _logical_scalar(node.inputs[ipos])]
    l += ["local_ostr[%s][nd_collapse-1]==1 " %
          ipos for ipos in xrange(len(node.outputs)) if not
          _logical_scalar(node.outputs[ipos])]
    if len(l) > 0:
        print(" && ", " && ".join(l), file=sio)
    print("""){nd_collapse=0;} """, file=sio)
    if self.verbose:
        print('std::cerr <<"after can_collapse\\n";', file=sio)
        print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio)
    if self.verbose > 1:
        for d in xrange(nd):
            print('std::cerr << " " << local_dims[%(d)s]; ' %
                  locals(),
                  file=sio)
        print('std::cerr << "\\n";', file=sio)
        if nd > 0:
            for ipos in xrange(len(node.inputs)):
                # BUG FIX: was '% (ipos)s' (stray space) which raised
                # ValueError: unsupported format character at codegen time.
                print(
                    'std::cerr << " local_str %(ipos)s: " <<' %
                    locals() + ' << " " << '.join(
                        ["local_str[%s][%s]" %
                         (ipos, x) for x in xrange(nd)]) +
                    '<<"\\n";', file=sio)
            for ipos in xrange(len(node.outputs)):
                # BUG FIX: same stray-space format bug as above.
                print('std::cerr << " local_ostr %(ipos)s: " <<' %
                      locals() + ' << " " << '.join(
                          ["local_ostr[%s][%s]" %
                           (ipos, x) for x in xrange(nd)]) +
                      '<<"\\n";', file=sio)

    def launch_Ccontiguous(nodename, scalar_op, sync=True):
        # Emit the launch + error check for the flat C-contiguous kernel.
        kernel_call_args = ["numEls"]
        for ipos in xrange(len(node.inputs)):
            kernel_call_args.append("i%i_data" % ipos)
        for ipos in xrange(len(node.outputs)):
            kernel_call_args.append("o%i_data" % ipos)
        kernel_call_args = ", ".join(kernel_call_args)
        verb = ""
        if self.verbose:
            verb = 'std::cerr << " Running ccontiguous version\\n";'
        print("""
//first use at least a full warp
int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
//next start adding multiprocessors
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
//std::cerr << "calling callkernel returned\\n";
""" % locals(), file=sio)
        if sync:
            print("""
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
"GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
n_blocks, threads_per_block,
"kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
return -1;
}
%(verb)s
return 0;
""" % locals(), file=sio)
        else:
            print(" return 0; " % locals(), file=sio)

    def launch_General(nodename, scalar_op, force_nd, sync=True):
        # Emit the launch + error check for the strided kernel of rank
        # `force_nd`, passing the collapsed dims and strides.
        # kernel_call_args are used to invoke the cuda kernel
        local = "local_"
        kernel_call_args = ["numEls"]
        kernel_call_args.extend(
            local + "dims[%i]" %
            di for di in xrange(force_nd))
        for ipos in xrange(len(node.inputs)):
            kernel_call_args += ["i%i_data" % ipos] + list(
                local + "str[%i][%i]" %
                (ipos, di) for di in xrange(force_nd))
            # strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
            # kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
        for ipos in xrange(len(node.outputs)):
            kernel_call_args += ["o%i_data" % ipos] + list(
                local + "ostr[%i][%i]" %
                (ipos, di) for di in xrange(force_nd))
            # strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
            # kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
        if self.verbose:
            print("""
std::cerr << " Running general version with %(force_nd)s dims\\n";
""" % locals(), file=sio)
            print("std::cerr << " + ' << " " << '.join(
                kernel_call_args) + ' << "\\n";', file=sio)
            # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
        kernel_call_args = ", ".join(kernel_call_args)
        print("""
//first use at least a full warp
int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE
//next start adding multiprocessors
int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS
// next start adding more warps per multiprocessor
if (threads_per_block * n_blocks < numEls)
threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
""" % locals(), file=sio)
        if sync:
            print("""
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
"GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
n_blocks, threads_per_block,
"kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
return -1;
}
return 0;
""" % locals(), file=sio)
        else:
            print(" return 0; " % locals(), file=sio)
    # Dispatch on the collapsed rank: 0 selects the contiguous kernel.
    print("if(numEls==0) return 0;", file=sio)
    print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" %
          locals(), file=sio)
    print("case 0: {", file=sio)
    launch_Ccontiguous(nodename, scalar_op, self.sync)
    print(" } break;", file=sio)
    for i in xrange(1, nd + 1):
        print("case " + str(i) + ": {", file=sio)
        launch_General(nodename, scalar_op, i, self.sync)
        print(" } break;", file=sio)
    print("}", file=sio)  # end case
    print("return -2;", file=sio)  # should not get to this point
    print("}", file=sio)  # end fct
    # N.B. cudaGetLastError is called by c_code
    return sio.getvalue()
def c_support_code_apply(self, node, nodename):
    """
    Return the per-apply C support code: two helper #defines followed by
    every kernel variant (one per rank from 1 to the output's rank, the
    C-contiguous special case, and the host-side dispatcher).
    """
    ndim = node.outputs[0].type.ndim
    defines = """
#define INTDIV_POW2(a, b) (a >> b)
#define INTMOD_POW2(a, b) (a & ((1<<b)-1))
"""
    pieces = [self.c_src_kernel(node, nodename, rank)
              for rank in xrange(1, ndim + 1)]
    pieces.append(self.c_src_kernel_Ccontiguous(node, nodename))
    pieces.append(self.c_src_callkernel(node, nodename))
    return defines + "".join(pieces)
def c_support_code(self):
    """The elemwise op needs exactly the scalar op's support code."""
    scalar_support = self.scalar_op.c_support_code()
    return scalar_support
def c_code(self, node, nodename, inputs, outputs, sub):
    """
    Emit host-side C code that validates input shapes, allocates or
    reuses the outputs (honoring ``self.inplace_pattern``), and invokes
    ``callkernel_<nodename>`` (generated by ``c_src_callkernel``).
    """
    d = dict(sub)
    nd = node.outputs[0].type.ndim
    d.update(locals())
    sio = StringIO()
    nin = len(inputs)
    nout = len(outputs)
    fail = sub['fail']
    opname = str(self.scalar_op)
    initial_dims = ','.join('1' for i in xrange(nd))
    # NOTE(review): the `if 1 or ...` guard is always true -- presumably a
    # leftover debugging switch.
    if 1 or self.scalar_op == scalar.pow:
        print("""
//std::cerr << "C_CODE %(opname)s START\\n";
//standard elemwise size checks
""" % locals(), file=sio)
    # dims[] accumulates the broadcasted output shape, starting at all 1s.
    if nd > 0:
        print("""
int dims[%(nd)s] = {%(initial_dims)s};
""" % locals(), file=sio)
    else:
        print("""
int *dims = NULL;
""", file=sio)
    # check that all inputs have valid dimensions
    emitted_inames = {}
    for id, iname in enumerate(inputs):
        if iname in emitted_inames:
            assert emitted_inames[iname] is node.inputs[id]
            continue
        # with python 2.4 (at least), if a broadcastable pattern is made of
        # numpy.bool_ instead of bool, calling int() once is not enough.
        broadcasts = map(int, map(int, node.inputs[id].broadcastable))
        broadcasts = ', '.join(map(str, broadcasts))
        nd = node.inputs[id].ndim
        if nd > 0:
            print("""
int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
""" % locals(), file=sio)
        else:
            print("""
int *broadcasts_%(iname)s = NULL;
""" % locals(), file=sio)
        emitted_inames[iname] = node.inputs[id]
    # check that all inputs have valid dimensions
    emitted_inames = {}
    for id, iname in enumerate(inputs):
        if iname in emitted_inames:
            continue
        # Merge each input's shape into dims[], rejecting mismatches that
        # are not explained by broadcasting.
        print("""
//std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
if (%(nd)s != %(iname)s->nd)
{
PyErr_Format(PyExc_TypeError,
"need %(nd)s dims, not %%i", %(iname)s->nd);
%(fail)s;
}
for (int i = 0; i< %(nd)s; ++i)
{
dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
if ((!(broadcasts_%(iname)s[i] &&
CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) &&
(dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
{
//std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Input dimension mis-match. Input"
" %(id)d (indices start at 0) has shape[%%i] == %%i"
", but the output's size on that axis is %%i.",
i,
CudaNdarray_HOST_DIMS(%(iname)s)[i],
dims[i]
);
%(fail)s;
}
}
""" % locals(), file=sio)
        emitted_inames[iname] = True
    # check that all outputs have valid dimensions
    for idx, oname in enumerate(outputs):
        if idx not in self.inplace_pattern.keys():
            # Not inplace: reuse a pre-existing output only if its shape
            # matches and it is C-contiguous; otherwise allocate afresh.
            print("""
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
Py_DECREF(%(oname)s);
%(oname)s = NULL;
}
}
if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
{
Py_XDECREF(%(oname)s);
%(oname)s = NULL;
}
if (NULL == %(oname)s)
{
%(oname)s = (CudaNdarray*)CudaNdarray_New();
if (!%(oname)s)
{
//error string already set
%(fail)s;
}
if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
{
//error string already set
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals(), file=sio)
        else:
            # Inplace: the output aliases the designated input; its shape
            # must already match the broadcasted dims.
            input_idx = self.inplace_pattern[idx]
            iname = inputs[input_idx]
            print("""
Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s;
Py_INCREF(%(oname)s);
for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
{
PyErr_Format(PyExc_ValueError,
"GpuElemwise. Output dimension mis-match. Output"
" %(idx)d (indices start at 0), working inplace"
" on input %(input_idx)s, has shape[%%i] == %%i"
", but the output's size on that axis is %%i.",
i,
CudaNdarray_HOST_DIMS(%(oname)s)[i],
dims[i]
);
Py_DECREF(%(oname)s);
%(oname)s = NULL;
%(fail)s;
}
}
//std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
//std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
""" % locals(), file=sio)
    # Finally, call the dispatcher with every argument's device pointer
    # and host-side strides; on failure, drop the outputs and fail.
    print("""
{
//new block so that failure gotos don't skip over variable initialization
//std::cerr << "calling callkernel\\n";
if (callkernel_%(nodename)s(1, 0, dims
""" % locals(), file=sio)
    for iname in inputs:
        print("""
, CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
""" % locals(), file=sio)
    for oname in outputs:
        print("""
, CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
""" % locals(), file=sio)
    print("""
))
{
// error
""", file=sio)
    for oname in outputs:
        print("""
Py_DECREF(%(oname)s);
%(oname)s = NULL;
""" % locals(), file=sio)
    print("""
%(fail)s;
}
else // no error
{
}
}
//std::cerr << "C_CODE %(opname)s END\\n";
""" % locals(), file=sio)
    # print sio.getvalue()
    return sio.getvalue()
class ErfinvGPU(Erfinv):
    """
    Provides a c-code implementation of the inverse error function for GPU.

    Notes
    -----
    We do not add this c_code to theano.scalar.basic_scipy.Erfinv, as we
    currently rely on Nvidia's cublas library to provide the erfinv
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfinv does not have c_code as scipy does not
    export the required C function.
    """
    def c_headers(self):
        # erfinv() comes from the CUDA math/cublas headers.
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            # BUG FIX: the error used to pass the `type` builtin instead of
            # the offending input type.
            raise NotImplementedError('type not supported',
                                      node.inputs[0].type)
        return "%(z)s = erfinv(%(x)s);" % locals()
erfinv_gpu = ErfinvGPU(upgrade_to_float_no_complex, name='erfinv_gpu')
# NOTE(review): this class derives from Erfinv, not Erfcx, so any behavior
# inherited from the base (e.g. the Python-side implementation) is erfinv's
# while the GPU c_code computes erfcx.  This looks like a copy-paste slip --
# it should presumably derive from theano.scalar.basic_scipy.Erfcx, but that
# name is not visibly in scope here, so only flagging it.
class ErfcxGPU(Erfinv):
    """
    Provides a c-code implementation of the scaled complementary error function
    for GPU.

    Notes
    -----
    We do not add this c_code to theano.scalar.basic_scipy.Erfcx, as we
    currently rely on Nvidia's cublas library to provide the erfcx
    c-implementation (which requires different c_headers). As it stands,
    theano.scalar.basic_scipy.Erfcx does not have c_code as scipy does not
    export the required C function.
    """
    def c_headers(self):
        # erfcx() comes from the CUDA math/cublas headers.
        return ['math_functions.h', 'cublas_v2.h']

    def c_code(self, node, name, inp, out, sub):
        x, = inp
        z, = out
        if node.inputs[0].type in complex_types:
            # BUG FIX: the error used to pass the `type` builtin instead of
            # the offending input type.
            raise NotImplementedError('type not supported',
                                      node.inputs[0].type)
        return "%(z)s = erfcx(%(x)s);" % locals()
erfcx_gpu = ErfcxGPU(upgrade_to_float_no_complex, name='erfcx_gpu')
from __future__ import absolute_import, print_function, division
import theano
import copy
from theano import Op
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.sandbox.cuda.basic_ops import gpu_flatten
from theano.tensor.extra_ops import CumOp
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host, HostFromGpu
from theano.sandbox.cuda import register_opt as register_gpu_opt
class GpuCumsum(CumOp, GpuOp):
"""
Parameters
----------
axis
Can not be None. If you want the array flatten, do it before.
"""
# Highest input rank this op supports (1D, 2D and 3D arrays).
SUPPORTED_NDIMS = 3
__props__ = ('axis', 'max_threads_dim0', 'max_grid_size1', 'max_grid_size2')

def __init__(self, axis):
    """axis: int -- the axis to accumulate over (never None)."""
    self.axis = axis
    # Device launch limits; left unset here and filled in lazily by
    # make_thunk from the active CUDA device's properties.
    self.max_threads_dim0 = None
    self.max_grid_size1 = None
    self.max_grid_size2 = None
# We must reuse the same method, not reimplement and call it.
# Otherwise DebugMode will print many warnings.
perform = Op.perform
def make_node(self, x):
    """
    Build the Apply node for a GPU cumulative sum of `x` over ``self.axis``.

    Raises
    ------
    TypeError
        If `x` is not a CudaNdarrayType variable.
    NotImplementedError
        If `x` has more than SUPPORTED_NDIMS dimensions.
    ValueError
        If ``self.axis`` is out of bounds for `x`.
    """
    assert x.dtype == 'float32'
    if not isinstance(x.type, CudaNdarrayType):
        raise TypeError('x must be a CudaNdarrayType', x)
    if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        raise NotImplementedError('Only cumsum on 1D, 2D and 3D array are supported right now!')
    if self.axis >= x.ndim or self.axis < -x.ndim:
        # BUG FIX: the message used field '{1}' with a single positional
        # argument, so this raise itself crashed with an IndexError
        # instead of reporting the bad axis.
        raise ValueError('axis(={0}) out of bounds'.format(self.axis))
    return theano.Apply(self, [x], [x.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling, impl=None):
    """
    Compile the node, first filling in the device launch limits
    (max threads / max grid sizes) from the active CUDA device when they
    are not known yet.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op  # copy.copy is shallow: the op is shared
    if node_.op.max_threads_dim0 is None or node_.op.max_grid_size1 is None or node_.op.max_grid_size2 is None:
        cuda = theano.sandbox.cuda
        device_id = cuda.use.device_number
        if device_id is None:
            # No device selected yet: initialize CUDA just enough to be
            # able to query device properties, without moving any
            # computation or shared variables to the GPU.
            cuda.use("gpu",
                     force=False,
                     default_to_move_computation_to_gpu=False,
                     move_shared_float32_to_gpu=False,
                     enable_cuda=False,
                     test_driver=True)
            device_id = cuda.use.device_number
        cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
        prop = cuda_ndarray.device_properties(device_id)
        # Cache the limits on the op so later compilations skip the query.
        node_.op.max_threads_dim0 = prop['maxThreadsDim0']
        node_.op.max_grid_size1 = prop['maxGridSize1']
        node_.op.max_grid_size2 = prop['maxGridSize2']
    return super(GpuCumsum, node_.op).make_thunk(node_, storage_map,
                                                 compute_map, no_recycling, impl)
def __str__(self):
    """Render as e.g. ``GpuCumsum{0}`` (class name plus axis)."""
    return "{0}{{{1}}}".format(self.__class__.__name__, self.axis)
def c_code_cache_version(self):
    """Bump this tuple whenever the generated C code changes."""
    version = (9,)
    return version
    def c_support_code_apply(self, node, nodename):
        """
        Return the CUDA support code (device kernels plus a host-side
        driver) implementing the cumulative sum.

        The generated code contains a work-efficient parallel scan
        (reduction phase then reverse phase over a shared-memory buffer),
        per-block partial sums that are themselves scanned recursively by
        the host function ``cumSum_<nodename>``, a fix-up kernel
        propagating block sums, and a small kernel handling the last
        element when the scanned length is odd.
        """
        return """
        __device__
        void k_reductionPhase_%(nodename)s(float* partialCumSum) {
            // Traverse down from leaves to root building partial sums at internal nodes in the tree.
            for (unsigned int stride = 1; stride <= blockDim.x; stride *= 2) {
                __syncthreads();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index < blockDim.x*2) {
                    partialCumSum[index] += partialCumSum[index - stride];
                }
            }
        }

        __device__
        void k_reversePhase_%(nodename)s(float* partialCumSum) {
            // Traverse back up the tree building the scan from the partial sums
            for (unsigned int stride = exp2(ceil(log2((float)blockDim.x))); stride > 0; stride /= 2) {
                __syncthreads();
                unsigned int index = (threadIdx.x + 1) * (stride * 2) - 1;
                if(index + stride < blockDim.x*2) {
                    partialCumSum[index + stride] += partialCumSum[index];
                }
            }
        }

        __device__
        void k_fetchData_%(nodename)s(float* partialCumSum, float* input, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
            // blockIdx.y and blockIdx.z represents the current independent cumsum
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            partialCumSum[threadIdx.x*2]     = input[idx_even];
            partialCumSum[threadIdx.x*2 + 1] = input[idx_odd];
        }

        __device__
        void k_pushData_%(nodename)s(float* partialCumSum, float* output, int globalThreadID, dim3 dataStrides, int offsetY, int offsetZ) {
            __syncthreads();
            // blockIdx.y and blockIdx.z represents the current independent cumsum
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            output[idx_even] = partialCumSum[threadIdx.x*2];
            output[idx_odd]  = partialCumSum[threadIdx.x*2 + 1];
        }

        __global__
        void k_cumadd_%(nodename)s(float* input, float* output, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, int beforeLastElementIdx, int lastElementIdx) {
            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            int dataOffsetY_input = idY * inputStrides.y + idZ * inputStrides.z;
            int dataOffsetY_output = idY * outputStrides.y + idZ * outputStrides.z;
            int idx_last_input = lastElementIdx*inputStrides.x + dataOffsetY_input;
            int idx_last_output = lastElementIdx*outputStrides.x + dataOffsetY_output;
            int idx_beforelast = beforeLastElementIdx*outputStrides.x + dataOffsetY_output;
            output[idx_last_output] = input[idx_last_input] + output[idx_beforelast];
        }

        __global__
        void k_finalCumSum_%(nodename)s(float* output, float* blockSum, int nbElementsPerCumsum, dim3 dataStrides, int offsetY, int offsetZ) {
            int globalThreadID = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

            // Check if current has data to process.
            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                return;
            }

            int idY = blockIdx.y + offsetY;
            int idZ = blockIdx.z + offsetZ;
            const float currentBlockSum = blockSum[blockIdx.x*(gridDim.y*gridDim.z) + idY*gridDim.z + idZ];

            int offset = idY * dataStrides.y + idZ * dataStrides.z;
            int idx_even = (globalThreadID*2    ) * dataStrides.x + offset;
            int idx_odd  = (globalThreadID*2 + 1) * dataStrides.x + offset;
            output[idx_even] += currentBlockSum;
            output[idx_odd] += currentBlockSum;
        }

        __global__
        void k_blockCumSum_%(nodename)s(float* input, float* output, int nbElementsPerCumsum, dim3 inputStrides, dim3 outputStrides, int offsetY, int offsetZ, float* blockSum) {
            // Regarding blockIdx and threadIdx, 'Cumsum' is always performed along the X axis.
            // The Y and Z axis of the grid will contain all independent cumsums of the 2D/3D case.

            int globalThreadID = blockIdx.x * blockDim.x + threadIdx.x;

            // Check if current thread has data to process.
            if (globalThreadID >= ceil(nbElementsPerCumsum/2.0)) {
                return;
            }

            extern __shared__ float partialCumSum[];

            // Load data in shared memory
            k_fetchData_%(nodename)s(partialCumSum, input, globalThreadID, inputStrides, offsetY, offsetZ);

            // Use a dichotomy approach to compute the cumsum (i.e. balanced binary tree).
            // The tree is sweeped from the leaves to the root and from the root to the leaves.
            // Similar to http://www.umiacs.umd.edu/~ramani/cmsc828e_gpusci/ScanTalk.pdf
            k_reductionPhase_%(nodename)s(partialCumSum);
            k_reversePhase_%(nodename)s(partialCumSum);

            // Write the final output to global memory
            k_pushData_%(nodename)s(partialCumSum, output, globalThreadID, outputStrides, offsetY, offsetZ);

            if (blockSum != NULL){
                if (threadIdx.x == blockDim.x - 1) {
                    blockSum[blockIdx.x*(gridDim.y*gridDim.z) + (blockIdx.y + offsetY)*gridDim.z + blockIdx.z + offsetZ] = partialCumSum[threadIdx.x*2 + 1];
                }
            }
        }

        int cumSum_%(nodename)s(CudaNdarray* input, CudaNdarray* output, int axis, int maxThreads, int maxGridY, int maxGridZ) {
            int shape[3] = { 1, 1, 1 };
            dim3 inputStrides(0, 0, 0);
            dim3 outputStrides(0, 0, 0);

            switch (CudaNdarray_NDIM(input))
            {
                case 1:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    break;
                case 2:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    shape[1] = CudaNdarray_HOST_DIMS(input)[1];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
                    break;
                case 3:
                    shape[0] = CudaNdarray_HOST_DIMS(input)[0];
                    shape[1] = CudaNdarray_HOST_DIMS(input)[1];
                    shape[2] = CudaNdarray_HOST_DIMS(input)[2];
                    inputStrides.x = CudaNdarray_HOST_STRIDES(input)[0];
                    inputStrides.y = CudaNdarray_HOST_STRIDES(input)[1];
                    inputStrides.z = CudaNdarray_HOST_STRIDES(input)[2];
                    outputStrides.x = CudaNdarray_HOST_STRIDES(output)[0];
                    outputStrides.y = CudaNdarray_HOST_STRIDES(output)[1];
                    outputStrides.z = CudaNdarray_HOST_STRIDES(output)[2];
                    break;
                default:
                    return -1;
            }

            if (shape[axis] <= 1) {
                CudaNdarray_CopyFromCudaNdarray(output, input);
                return 0;
            }

            // Perform cumsum on array of even size.
            int nbElementsPerCumsum = shape[axis] - (shape[axis] %% 2);

            // Determine how many elements can be processed in one block.
            int dimBlockX = ceil( min(nbElementsPerCumsum, 2*maxThreads) / 2.0);

            // Determine how many blocks are needed in total.
            int dimGridX = ceil(nbElementsPerCumsum / (2.0*dimBlockX));  // Nb. of blocks needed per cumsum.
            int dimGridY;  // Nb. of independent cumsums (width).
            int dimGridZ;  // Nb. of independent cumsums (height).

            int tmp;
            switch (axis)
            {
                case 0:
                    dimGridY = shape[1];
                    dimGridZ = shape[2];
                    break;
                case 1:
                    dimGridY = shape[0];
                    dimGridZ = shape[2];

                    tmp = inputStrides.x;
                    inputStrides.x = inputStrides.y;
                    inputStrides.y = tmp;

                    tmp = outputStrides.x;
                    outputStrides.x = outputStrides.y;
                    outputStrides.y = tmp;
                    break;
                case 2:
                    dimGridY = shape[1];
                    dimGridZ = shape[0];

                    tmp = inputStrides.x;
                    inputStrides.x = inputStrides.z;
                    inputStrides.z = tmp;

                    tmp = outputStrides.x;
                    outputStrides.x = outputStrides.z;
                    outputStrides.z = tmp;
                    break;
                default:
                    return -1;
            }

            const int shapeBlockSum[2] = { dimGridX, dimGridY*dimGridZ };
            CudaNdarray* deviceBlockSum = (CudaNdarray*) CudaNdarray_NewDims(2, shapeBlockSum);

            // Perform `maxGridY`*`maxGridZ` cumsums in parallel.
            for (int offsetY = 0; offsetY < dimGridY; offsetY += maxGridY){
                int localDimGridY = min(dimGridY - offsetY, maxGridY);
                for (int offsetZ = 0; offsetZ < dimGridZ; offsetZ += maxGridZ){
                    int localDimGridZ = min(dimGridZ - offsetZ, maxGridZ);
                    dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
                    dim3 dimBlock(dimBlockX, 1, 1);  // One cumsum per block.
                    int sharedBytes = (2*dimBlockX) * sizeof(float);

                    k_blockCumSum_%(nodename)s<<<dimGrid, dimBlock, sharedBytes>>>
                    (
                        CudaNdarray_DEV_DATA(input),
                        CudaNdarray_DEV_DATA(output),
                        nbElementsPerCumsum,
                        inputStrides,
                        outputStrides,
                        offsetY,
                        offsetZ,
                        CudaNdarray_DEV_DATA(deviceBlockSum)
                    );

                    if (dimGridX > 1) {
                        // Do a cumsum over the blockSum (recursive).
                        if (cumSum_%(nodename)s(deviceBlockSum, deviceBlockSum, 0, maxThreads, maxGridY, maxGridZ) == -1){
                            Py_DECREF(deviceBlockSum);
                            return -1;
                        }

                        // Since there are more than one block (i.e. `dimGridX > 1`)
                        //  report partial cumsums of previous blocks to subsequents ones.
                        dim3 dimGrid(dimGridX, localDimGridY, localDimGridZ);
                        dim3 dimBlock(dimBlockX, 1, 1);
                        k_finalCumSum_%(nodename)s<<<dimGrid, dimBlock>>>
                        (
                            CudaNdarray_DEV_DATA(output),
                            CudaNdarray_DEV_DATA(deviceBlockSum),
                            nbElementsPerCumsum,
                            outputStrides,
                            offsetY,
                            offsetZ
                        );
                    }

                    // If shape[axis] is odd, the last element is compute manually
                    if (shape[axis] != nbElementsPerCumsum){
                        dim3 dimGrid(1, localDimGridY, localDimGridZ);
                        dim3 dimBlock(1, 1, 1);
                        k_cumadd_%(nodename)s<<<dimGrid, dimBlock>>>
                        (
                            CudaNdarray_DEV_DATA(input),
                            CudaNdarray_DEV_DATA(output),
                            inputStrides,
                            outputStrides,
                            offsetY,
                            offsetZ,
                            shape[axis]-2,
                            shape[axis]-1
                        );
                    }
                }
            }
            Py_DECREF(deviceBlockSum);
            CNDA_THREAD_SYNC;
            return 0;
        }
        """ % locals()
    def c_code(self, node, nodename, inames, onames, sub):
        """
        Return the C code applying the op: (re)allocate the output if
        needed, then call the host driver ``cumSum_<nodename>`` emitted by
        ``c_support_code_apply``.

        Raises
        ------
        NotImplementedError
            If the GPU limits were not filled in by ``make_thunk``.
        """
        x, = inames
        z, = onames
        # We assume array has been already flattened if needed.
        axis = self.axis if self.axis is not None else 0
        fail = sub['fail']

        max_threads_dim0 = self.max_threads_dim0
        max_grid_size1 = self.max_grid_size1
        max_grid_size2 = self.max_grid_size2
        if max_threads_dim0 is None or max_grid_size1 is None or max_grid_size2 is None:
            raise NotImplementedError("GpuCumsum.c_code should not be called "
                                      "directly. It should be called by "
                                      "make_thunk() that add some information "
                                      "related to the selected GPU.")

        code = """
        const int* shape = CudaNdarray_HOST_DIMS(%(x)s);
        bool needAllocation = !%(z)s || CudaNdarray_NDIM(%(x)s) != CudaNdarray_NDIM(%(z)s);

        int axis = %(axis)s;
        if (axis < 0) {
            // Convert negative axis to positive axis.
            axis += CudaNdarray_NDIM(%(x)s);
        }

        // If output is already allocated, check if its shape matches the input's one.
        if (!needAllocation) {
            for (int i= 0; i < CudaNdarray_NDIM(%(x)s); ++i) {
                if (CudaNdarray_HOST_DIMS(%(x)s)[i] != CudaNdarray_HOST_DIMS(%(z)s)[i]) {
                    needAllocation = true;
                }
            }
        }

        if (needAllocation){
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*) CudaNdarray_NewDims(CudaNdarray_NDIM(%(x)s), shape);
        }

        if (!%(z)s) {
            %(fail)s;
        }

        { // Namespace for kernel calls //
            if (cumSum_%(nodename)s(%(x)s, %(z)s, axis, %(max_threads_dim0)s, %(max_grid_size1)s, %(max_grid_size2)s) == -1){
                %(fail)s;
            }

            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s.\\n",
                             "cumSum_%(nodename)s",
                             cudaGetErrorString(sts));
                %(fail)s;
            }
        }
        """ % locals()

        return code
def values_eq_approx_high_tol(a, b):
    """
    Compare `a` and `b` approximately, loosening the tolerance for large
    arrays.

    Needed so DebugMode does not raise spurious errors from rounding
    differences: with big inputs the GPU cumsum changes the order of the
    additions.
    """
    # For float32 the default rtol is 1e-5; relax it for big inputs.
    rtol = 5e-5 if a.size > 100000 else None
    return CudaNdarrayType.values_eq_approx(a, b, rtol=rtol)
@register_gpu_opt()
@local_optimizer([CumOp])
def use_gpu_cumsum(node):
    """
    Local optimizer: replace a float32 CumOp (mode 'add') whose input is
    a transfer from the GPU with a GpuCumsum, keeping the result on host.
    """
    if type(node.op) is not CumOp:
        return None
    if node.inputs[0].dtype != 'float32':
        return None
    owner = node.inputs[0].owner
    if not (owner and isinstance(owner.op, HostFromGpu)):
        return None
    if node.op.mode != 'add':
        return None

    axis = node.op.axis
    x = node.inputs[0]
    if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    x = gpu_from_host(x)

    if axis is None and x.ndim > 1:
        # GpuCumsum assumes the array has already been flattened when
        # cumsumming over all elements.
        x = gpu_flatten(x)
    if axis is None:
        axis = 0

    ret = host_from_gpu(GpuCumsum(axis)(x))
    # Loosen DebugMode's comparison: GPU summation order differs.
    ret.tag.values_eq_approx = values_eq_approx_high_tol
    return [ret]
from __future__ import absolute_import, print_function, division
import numpy as np
import theano
import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse
if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
CudaNdarray)
if pycuda_available:
import pycuda.gpuarray
try:
import scikits.cuda
from scikits.cuda import fft, cublas
scikits.cuda.misc.init()
scikits_cuda_available = True
except (ImportError, Exception):
scikits_cuda_available = False
# TODO: investigate the effect of enabling fastmath on FFT performance
# (how can it be enabled?).
# base class for shared code between scikits.cuda-based ops
class ScikitsCudaOp(GpuOp):
    """
    Base class with the boilerplate shared by the scikits.cuda-based
    ops below (equality by type, node construction, availability check).
    """

    def __eq__(self, other):
        # Ops of the same class are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return type(self).__name__

    def output_type(self, inp):
        # Subclasses describe their output variable type here.
        raise NotImplementedError

    def make_node(self, inp):
        inp = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [self.output_type(inp)()])

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # Fail loudly at thunk-creation time when scikits.cuda is absent.
        if not scikits_cuda_available:
            raise RuntimeError(
                "scikits.cuda is needed for all GPU fft implementation,"
                " including fftconv.")
class CuFFTOp(ScikitsCudaOp):
    """
    Real-to-complex FFT on the GPU via scikits.cuda, batched over the
    first dimension. The complex output is stored as float32 with an
    extra trailing dimension of length 2 (real/imag).
    """

    def output_type(self, inp):
        # add one extra dim for real/imag
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim + 1))

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # The base-class call only performs the scikits.cuda
        # availability check (it raises if unavailable).
        super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # One-element lists used as mutable cells so the thunk closure
        # can cache the FFT plan across calls.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            output_shape = list(input_shape)
            # DFT of real input is symmetric, no need to store
            # redundant coefficients
            output_shape[-1] = output_shape[-1] // 2 + 1
            # extra dimension with length 2 for real/imag
            output_shape += [2]
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # I thought we'd need to change the type on output_pycuda
            # so it is complex64, but as it turns out scikits.cuda.fft
            # doesn't really care either way and treats the array as
            # if it is complex64 anyway.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64,
                                   batch=input_shape[0])

            fft.fft(input_pycuda, output_pycuda, plan[0])

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
class CuIFFTOp(ScikitsCudaOp):
    """
    Complex-to-real inverse FFT on the GPU via scikits.cuda, batched
    over the first dimension. The complex input is float32 with a
    trailing length-2 real/imag dimension; the output drops it.

    NOTE: the result is NOT rescaled by 1/N here — callers must rescale
    manually (see the comment before the return).
    """

    def output_type(self, inp):
        # remove extra real/imag dim
        return CudaNdarrayType(
            broadcastable=[False] * (inp.type.ndim - 1))

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # The base-class call only performs the scikits.cuda
        # availability check (it raises if unavailable).
        super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)

        from theano.misc.pycuda_utils import to_gpuarray
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # Mutable cells caching the FFT plan across thunk calls.
        plan_input_shape = [None]
        plan = [None]

        def thunk():
            input_shape = inputs[0][0].shape

            # construct output shape
            # chop off the extra length-2 dimension for real/imag
            output_shape = list(input_shape[:-1])
            # restore full signal length
            output_shape[-1] = (output_shape[-1] - 1) * 2
            output_shape = tuple(output_shape)

            z = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if z[0] is None or z[0].shape != output_shape:
                z[0] = CudaNdarray.zeros(output_shape)

            input_pycuda = to_gpuarray(inputs[0][0])
            # input_pycuda is a float32 array with an extra dimension,
            # but will be interpreted by scikits.cuda as a complex64
            # array instead.
            output_pycuda = to_gpuarray(z[0])

            # only initialise plan if necessary
            if plan[0] is None or plan_input_shape[0] != input_shape:
                plan_input_shape[0] = input_shape
                plan[0] = fft.Plan(output_shape[1:], np.complex64, np.float32,
                                   batch=output_shape[0])

            fft.ifft(input_pycuda, output_pycuda, plan[0])
            # strangely enough, enabling rescaling here makes it run
            # very, very slowly. so do this rescaling manually
            # afterwards!

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
def to_complex_gpuarray(x, copyif=False):
    """
    Adapted version of theano.misc.pycuda_utils.to_gpuarray that takes
    an array with an extra trailing dimension of length 2 for
    real/imaginary parts, and turns it into a complex64 PyCUDA
    GPUArray.

    Parameters
    ----------
    x : CudaNdarray
        Float32 array whose last dimension has length 2.
    copyif : bool
        When True, non-contiguous input is copied; when False it raises.
    """
    if not isinstance(x, CudaNdarray):
        raise ValueError("We can transfer only CudaNdarray "
                         "to pycuda.gpuarray.GPUArray")
    # The trailing dimension holds the real/imag pair.
    assert x.shape[-1] == 2
    assert x.dtype == 'float32'

    # Walk the strides from the innermost axis outwards to decide
    # C-contiguity; axes of length 1 may carry arbitrary strides.
    expected = 1
    contiguous = True
    for axis in reversed(range(x.ndim)):
        if x.shape[axis] == 1:
            continue
        if x._strides[axis] != expected:
            contiguous = False
            break
        expected *= x.shape[axis]

    if not contiguous:
        if copyif:
            x = x.copy()
        else:
            raise ValueError("We were asked to not copy memory, "
                             "but the memory is not c contiguous.")

    # x is now C-contiguous: view each trailing float32 pair as one
    # complex64 element.
    return pycuda.gpuarray.GPUArray(x.shape[:-1], np.complex64, base=x,
                                    gpudata=x.gpudata)
def bptrs(a):
    """
    Pointer array when input represents a batch of matrices.
    Taken from scikits.cuda tests/test_cublas.py.
    """
    start = a.ptr
    step = a.strides[0]
    stop = start + a.shape[0] * step
    return pycuda.gpuarray.arange(start, stop, step,
                                  dtype=cublas.ctypes.c_void_p)
def sc_complex_dot_batched(bx_gpu, by_gpu, bc_gpu, transa='N', transb='N',
                           handle=None):
    """
    Uses cublasCgemmBatched to compute a bunch of complex dot products
    in parallel.

    Parameters
    ----------
    bx_gpu, by_gpu : pycuda GPUArray, complex64, shape (batch, ., .)
        The two operand batches.
    bc_gpu : pycuda GPUArray, complex64, shape (batch, ., .)
        Destination batch, overwritten with the results.
    transa, transb : str
        'N', 'T' or 'C' transpose flags for bx and by respectively.
    handle
        Optional cuBLAS handle; defaults to scikits.cuda's global one.
    """
    if handle is None:
        handle = scikits.cuda.misc._global_cublas_handle

    assert len(bx_gpu.shape) == 3
    assert len(by_gpu.shape) == 3
    assert len(bc_gpu.shape) == 3
    assert bx_gpu.dtype == np.complex64
    assert by_gpu.dtype == np.complex64
    assert bc_gpu.dtype == np.complex64

    # Get the shapes of the arguments
    bx_shape = bx_gpu.shape
    by_shape = by_gpu.shape

    # Perform matrix multiplication for 2D arrays:
    alpha = np.complex64(1.0)
    beta = np.complex64(0.0)

    transa = transa.lower()
    transb = transb.lower()

    if transb in ['t', 'c']:
        N, m, k = by_shape
    elif transb in ['n']:
        N, k, m = by_shape
    else:
        raise ValueError('invalid value for transb')

    if transa in ['t', 'c']:
        N2, l, n = bx_shape
    elif transa in ['n']:
        N2, n, l = bx_shape
    else:
        raise ValueError('invalid value for transa')

    if l != k:
        raise ValueError('objects are not aligned')

    if N != N2:
        raise ValueError('batch sizes are not the same')

    if transb == 'n':
        lda = max(1, m)
    else:
        lda = max(1, k)

    if transa == 'n':
        ldb = max(1, k)
    else:
        ldb = max(1, n)

    ldc = max(1, m)

    # construct pointer arrays needed for cublasCgemmBatched
    bx_arr = bptrs(bx_gpu)
    by_arr = bptrs(by_gpu)
    bc_arr = bptrs(bc_gpu)

    # NOTE(review): the operands and transpose flags are passed in
    # swapped order (by before bx) — presumably to bridge the arrays'
    # C (row-major) layout with cuBLAS's column-major convention, as the
    # leading dimensions lda/ldb computed above suggest. Confirm against
    # the cuBLAS cublasCgemmBatched documentation before modifying.
    cublas.cublasCgemmBatched(handle, transb, transa, m, n, k, alpha,
                              by_arr.gpudata, lda, bx_arr.gpudata, ldb,
                              beta, bc_arr.gpudata, ldc, N)
class BatchedComplexDotOp(ScikitsCudaOp):
    """
    This version uses cublasCgemmBatched under the hood, instead of
    doing multiple cublasCgemm calls.
    """

    def make_node(self, inp1, inp2):
        inp1 = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp1))
        inp2 = basic_ops.gpu_contiguous(
            basic_ops.as_cuda_ndarray_variable(inp2))

        assert inp1.dtype == "float32"
        assert inp2.dtype == "float32"
        assert inp1.ndim == 4  # (batch, a, b, real/imag)
        assert inp2.ndim == 4

        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_thunk(self, node, storage_map, _, _2, impl=None):
        # Base-class call only checks that scikits.cuda is available.
        super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            bx = inputs[0]
            by = inputs[1]

            input_shape_x = bx[0].shape  # (batch, a, b, 2)
            input_shape_y = by[0].shape  # (batch, b, c, 2)

            output_shape = (input_shape_x[0], input_shape_x[1],
                            input_shape_y[2], 2)  # (batch, a, c, 2)

            bz = outputs[0]

            # only allocate if there is no previous allocation of the
            # right size.
            if bz[0] is None or bz[0].shape != output_shape:
                bz[0] = CudaNdarray.zeros(output_shape)

            # View the float32 arrays (trailing real/imag pair) as
            # complex64 GPUArrays for cuBLAS.
            input_bx_pycuda = to_complex_gpuarray(bx[0])
            input_by_pycuda = to_complex_gpuarray(by[0])
            output_b_pycuda = to_complex_gpuarray(bz[0])

            # fancy native batched version
            sc_complex_dot_batched(input_bx_pycuda, input_by_pycuda,
                                   output_b_pycuda)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk
# Module-level singleton instances of the ops defined above.
cufft = CuFFTOp()
cuifft = CuIFFTOp()
batched_complex_dot = BatchedComplexDotOp()
def mult_and_reduce(input_fft_v, filters_fft_v, input_shape=None,
                    filter_shape=None):
    """
    Elementwise complex multiplication in the Fourier domain with a
    reduction over input channels, implemented as a batched complex dot.

    Parameters
    ----------
    input_fft_v
        It's (b, ic, i0, i1//2 + 1, 2).
    filters_fft_v
        It's (oc, ic, i0, i1//2 + 1, 2).
    input_shape, filter_shape
        Optional static shapes; when None the symbolic shapes are used.

    Returns
    -------
    Symbolic variable of shape (b, oc, i0, i1//2 + 1, 2).
    """

    if input_shape is None:
        input_shape = input_fft_v.shape  # symbolic

    if filter_shape is None:
        filter_shape = filters_fft_v.shape  # symbolic

    b, ic, i0, i1_f, _ = input_shape
    oc = filter_shape[0]

    # reshape to flatten the dimensions that are multiplied elemwise
    input_r = input_fft_v.reshape((b, ic, i0 * i1_f, 2))
    filters_r = filters_fft_v.reshape((oc, ic, i0 * i1_f, 2))

    # shuffle for batched dot product
    input_s = input_r.dimshuffle(2, 0, 1, 3)    # (i0 * i1_f, b, ic, 2)
    filters_s = filters_r.dimshuffle(2, 1, 0, 3)  # (i0 * i1_f, ic, oc, 2)

    # Contract over ic for every frequency bin at once.
    output_s = batched_complex_dot(input_s, filters_s)

    # shuffle again
    output_r = output_s.dimshuffle(1, 2, 0, 3)

    # reshape to unflatten
    output = output_r.reshape((b, oc, i0, i1_f, 2))

    return output
def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
               border_mode='valid', pad_last_dim=False):
    """
    Perform a convolution through fft.

    Only support input which will be even on the last dimension
    (width). All other dimensions can be anything and the filters can
    have an even or odd width.

    If you must use input which has an odd width, you can either pad
    it or use the `pad_last_dim` argument which will do it for you and
    take care to strip the padding before returning. Don't use this
    argument if you are not sure the input is odd since the padding is
    unconditional and will make even input odd, thus leading to
    problems.

    On valid mode the filters must be smaller than the input.

    Parameters
    ----------
    input
        (b, ic, i0, i1).
    filters
        (oc, ic, f0, f1).
    image_shape, filter_shape
        Optional static shapes; symbolic shapes are used when None.
    border_mode : {'valid', 'full'}
    pad_last_dim
        Unconditionally pad the last dimension of the input
        to turn it from odd to even. Will strip the
        padding before returning the result.
    """

    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape

    if filter_shape is None:
        filter_shape = filters.shape

    # batch size, input channels, input dim 0, input dim 1
    b, ic, i0, i1 = image_shape
    # output channels, input channels, filter dim 0, filter dim 1
    oc, ic_, f0, f1 = filter_shape

    # pad filters/image to output shape
    if border_mode == 'valid':
        o0 = i0
        if pad_last_dim:
            o1 = i1 + 1
            input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
                                           input)
        else:
            o1 = i1
            input_padded = input
        filters_padded = T.zeros((oc, ic, o0, o1), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1],
                                         filters)
    elif border_mode == 'full':
        # In this particular case, the values of (o0, o1) represent
        # the dimensions of the work buffer more than the actual dimensions
        # of the desired output.
        o0 = i0 + 2 * (f0 - 1)
        o1 = i1 + 2 * (f1 - 1)
        if pad_last_dim:
            o1 = o1 + 1
        # We line up the filters and the images in a way
        # such that the filters are tightly placed against the
        # top-left of the array, and the images intersect with
        # them on one pixel. The top-left pixel of the images
        # is the bottom-right pixel of the filters when we
        # do the layout here.
        filters_padded = T.zeros((oc, ic, o0, o1), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1],
                                         filters)
        input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
        input_padded = T.set_subtensor(input_padded[:, :, (f0 - 1):(f0 - 1 + i0), (f1 - 1):(f1 - 1 + i1)],
                                       input)
    else:
        raise ValueError('invalid mode')

    # Runtime guard: the real-to-complex FFT layout requires an even width.
    input_padded = T.opt.Assert("in conv2d_fft: width is not even")(
        input_padded, T.eq(o1 % 2, 0))

    # reshape for FFT
    input_flat = input_padded.reshape((b * ic, o0, o1))
    filters_flat = filters_padded.reshape((oc * ic, o0, o1))

    # perform FFT
    input_fft_flat = cufft(input_flat)  # (b * ic, o0, o1//2 + 1, 2)
    filters_fft_flat = cufft(filters_flat)  # (oc * ic, o0, o1//2 + 1, 2)

    # unfold ic dimension
    input_fft_v_shape = (b, ic, o0, o1 // 2 + 1, 2)
    filters_fft_v_shape = (oc, ic, o0, o1 // 2 + 1, 2)
    input_fft_v = input_fft_flat.reshape(input_fft_v_shape)
    filters_fft_v = filters_fft_flat.reshape(filters_fft_v_shape)

    # (b, oc, o0, o1//2 + 1, 2)
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)

    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1 // 2 + 1, 2))

    # perform IFFT
    output_flat = cuifft(output_fft_flat)  # (b * oc, o0, o1)

    # reshape
    output_circ = output_flat.reshape((b, oc, o0, o1))  # circular!

    # Now we extract the region of interest.
    # We just cut it out from the output_circ
    # array that was used for the computation.
    # We do not need to handle pad_last_dim in a
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
                             (f1 - 1):(f1 - 1 + i1 - f1 + 1)]
    elif border_mode == 'full':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
                             (f1 - 1):(f1 - 1 + i1 + f1 - 1)]
    else:
        raise ValueError('invalid mode')

    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.
    output = (1.0 / T.cast(o0 * o1, 'float32')) * output

    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
               border_mode='valid', pad_last_dim=False):
    """
    Perform a convolution through fft.

    Only supports input whose shape is even on the last dimension.
    All other dimensions can be anything and the filters can
    have an even or odd last dimension.

    The semantics associated with the last three dimensions
    are not important as long as they are in the same order between
    the inputs and the filters. For example, when the convolution
    is done on a sequence of images, they could be either
    (duration, height, width) or (height, width, duration).

    If you must use input which has an odd width, you can either pad
    it or use the `pad_last_dim` argument which will do it for you and
    take care to strip the padding before returning. pad_last_dim checks
    that the last dimension is odd before the actual padding

    On valid mode the filters must be smaller than the input.

    Parameters
    ----------
    input
        (b, ic, i0, i1, i2).
    filters
        (oc, ic, f0, f1, i2).
    image_shape, filter_shape
        Optional static shapes; symbolic shapes are used when None.
    border_mode : {'valid', 'full'}.
    pad_last_dim
        Unconditionally pad the last dimension of the input
        to turn it from odd to even. Will strip the
        padding before returning the result.
    """

    # use symbolic shapes to compute shape info at runtime if not specified
    if image_shape is None:
        image_shape = input.shape

    if filter_shape is None:
        filter_shape = filters.shape

    # batch size, input channels, input dim 0, input dim 1
    b, ic, i0, i1, i2 = image_shape
    # output channels, input channels, filter dim 0, filter dim 1
    oc, ic_, f0, f1, f2 = filter_shape

    # Check that the last dimension is odd
    is_odd = T.eq(T.mod(input.shape[4], 2), 1)

    # pad filters/image to output shape
    if border_mode == 'valid':
        o0 = i0
        o1 = i1
        o2 = i2
        input_padded = input
        if pad_last_dim:
            # Pad only when the last dimension is actually odd.
            o2 = ifelse(is_odd, o2 + 1, o2)
            input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
            input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1, :i2],
                                           input)
        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
                                         filters)

    elif border_mode == 'full':

        # In this particular case, the values of (o0, o1) represent
        # the dimensions of the work buffer more than the actual dimensions
        # of the desired output.
        o0 = i0 + 2 * (f0 - 1)
        o1 = i1 + 2 * (f1 - 1)
        o2 = i2 + 2 * (f2 - 1)

        if pad_last_dim:
            o2 = ifelse(is_odd, o2 + 1, o2)

        # We line up the filters and the images in a way
        # such that the filters are tightly placed against the
        # top-left of the array, and the images intersect with
        # them on one pixel. The top-left pixel of the images
        # is the bottom-right pixel of the filters when we
        # do the layout here.

        filters_padded = T.zeros((oc, ic, o0, o1, o2), dtype='float32')
        filters_padded = T.set_subtensor(filters_padded[:, :, :f0, :f1, :f2],
                                         filters)

        input_padded = T.zeros((b, ic, o0, o1, o2), dtype='float32')
        input_padded = T.set_subtensor(input_padded[:, :, (f0 - 1):(f0 - 1 + i0), (f1 - 1):(f1 - 1 + i1), (f2 - 1):(f2 - 1 + i2)],
                                       input)
    else:
        raise ValueError('invalid mode')

    # NOTE(review): unlike conv2d_fft, there is no runtime Assert here
    # checking that o2 is even — confirm whether that guard was omitted
    # on purpose.

    # reshape for FFT
    input_flat = input_padded.reshape((b * ic, o0, o1, o2))
    filters_flat = filters_padded.reshape((oc * ic, o0, o1, o2))

    # perform FFT
    input_fft_flat = cufft(input_flat)  # (b * ic, o0, o1, o2//2 + 1, 2)
    filters_fft_flat = cufft(filters_flat)  # (oc * ic, o0, o1, o2//2 + 1, 2)

    # Unfold ic dimension.
    # We have to collapse two dimensions together
    # in order to reuse the same `mult_and_reduce`.
    # This explains the o0 * 01 instead of just keeping
    # the two dimensions intact.
    input_fft_v_shape = (b, ic, o0 * o1, o2 // 2 + 1, 2)
    filters_fft_v_shape = (oc, ic, o0 * o1, o2 // 2 + 1, 2)

    input_fft_v = input_fft_flat.reshape(input_fft_v_shape)
    filters_fft_v = filters_fft_flat.reshape(filters_fft_v_shape)

    # (b, oc, o0 * o1, o2//2 + 1, 2)
    output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
                                   input_shape=input_fft_v_shape,
                                   filter_shape=filters_fft_v_shape)
    # output_fft_s = input_fft_v

    # reshape for IFFT
    output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))

    # perform IFFT
    output_flat = cuifft(output_fft_flat)  # (b * oc, o0, o1, o2)

    # reshape
    output_circ = output_flat.reshape((b, oc, o0, o1, o2))  # circular!

    # Now we extract the region of interest.
    # We just cut it out from the output_circ
    # array that was used for the computation.
    # We do not need to handle pad_last_dim in a
    # special way because we specify explicitly here
    # how much values are expected.
    if border_mode == 'valid':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
                             (f1 - 1):(f1 - 1 + i1 - f1 + 1),
                             (f2 - 1):(f2 - 1 + i2 - f2 + 1)]
    elif border_mode == 'full':
        output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
                             (f1 - 1):(f1 - 1 + i1 + f1 - 1),
                             (f2 - 1):(f2 - 1 + i2 + f2 - 1)]
    else:
        raise ValueError('invalid mode')
    # output = output_circ[:, :, :, :, :]

    # Rescale manually. This is just a factor that comes in during the
    # trip through FFT and inverse FFT.
    output = (1.0 / T.cast(o0 * o1 * o2, 'float32')) * output

    # output should now be the result of a batched valid convolution
    # of the input with the filters.
    return basic_ops.as_cuda_ndarray_variable(output)
"""
Helper routines for generating gpu kernels for nvcc.
"""
from __future__ import absolute_import, print_function, division
def nvcc_kernel(name, params, body):
    """
    Return the c code of a kernel function.

    Parameters
    ----------
    name
        Name of the generated ``__global__`` function.
    params
        The parameters to the function as one or more strings.
    body
        The [nested] list of statements for the body of the
        function. These will be separated by ';' characters.
    """
    paramstr = ', '.join(params)

    # Flatten one level of nesting in the statement list.
    statements = []
    for stmt in body:
        if isinstance(stmt, (list, tuple)):
            statements.extend(stmt)
        else:
            statements.append(stmt)
    bodystr = ';\n'.join(statements)

    return """__global__ void %(name)s (%(paramstr)s)
    {
        %(bodystr)s;
    }
    """ % locals()
def code_version(version):
    """
    Decorator to support version-based cache mechanism.

    Tags the decorated function with a ``code_version`` attribute so
    code generators can combine versions when computing cache keys.

    Parameters
    ----------
    version : tuple
        The version tag.

    Raises
    ------
    TypeError
        If `version` is not a tuple.
    """
    if not isinstance(version, tuple):
        raise TypeError('version must be tuple', version)

    def attach(fn):
        fn.code_version = version
        return fn
    return attach


# Sentinel version tag for unversioned code.
UNVERSIONED = ()
@code_version((1,))
def inline_reduce(N, buf, pos, count, manner_fn):
    """
    Return C++ code for a function that reduces a contiguous buffer.

    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer.
    pos
        Index of executing thread.
    count
        Number of executing threads. (Unused by the generated code;
        kept for interface symmetry with the other helpers.)
    manner_fn
        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).

    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.

    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    """
    # Reduction expressions for the strided accumulation loop and for the
    # final fixed-offset steps (16, 8, 4, 2, 1).
    loop_line = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % (buf))
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    return """
    {
        // This function trashes buf[1..warpSize],
        // leaving the reduction result in buf[0].

        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(N)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the sum of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce.code_version)
def inline_reduce_max(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``max``."""
    def take_max(lhs, rhs):
        return "max(%s, %s)" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, take_max)
@code_version(inline_reduce.code_version)
def inline_reduce_sum(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``+``."""
    def add(lhs, rhs):
        return "%s + %s" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, add)
@code_version(inline_reduce.code_version)
def inline_reduce_min(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``min``."""
    def take_min(lhs, rhs):
        return "min(%s, %s)" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, take_min)
@code_version(inline_reduce.code_version)
def inline_reduce_prod(N, buf, pos, count):
    """Generate C++ code reducing a shared buffer with ``*``."""
    def multiply(lhs, rhs):
        return "%s * %s" % (lhs, rhs)

    return inline_reduce(N, buf, pos, count, multiply)
@code_version((2,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax(N, buf, buf2, threadPos, threadCount):
    """
    Return a list of C statement strings computing a softmax row.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    :Precondition: buf and buf2 contain two identical copies of the input
        to softmax
    :Postcondition: buf contains the softmax, buf2 contains un-normalized
        softmax
    Notes
    -----
    buf and buf2 should be in gpu shared memory, we access it many times.
    We use __i as an int variable in a loop.
    """
    # The statement order is significant: max-reduce, exponentiate,
    # sum-reduce, then normalize, with __syncthreads() between phases.
    return [ # get max of buf (trashing all but buf[0])
        inline_reduce_max(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
        threadCount + '){',
        buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
        buf2 + '[__i] = ' + buf + '[__i]', '}',
        '__syncthreads()',
        inline_reduce_sum(N, buf, threadPos, threadCount),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        # divide each exp() result by the sum to complete the job.
        'for(int __i=' + threadPos + '; __i<' + N +
        '; __i+=' + threadCount + '){',
        buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
        '__syncthreads()',
    ]
@code_version((1,))
def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                               manner_fn, manner_init,
                               b='', stride_b=''):
    """
    Return C++ code for a function that reduces a contiguous buffer.
    Parameters
    ----------
    N
        Length of the buffer.
    buf
        Buffer pointer of size warpSize * sizeof(float).
    pos
        Index of executing thread.
    count
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    manner_fn
        A function that accepts strings of arguments a
        and b, and returns c code for their reduction. (Example:
        return "%(a)s + %(b)s" for a sum reduction).
    manner_init
        A function that accepts strings of arguments a
        and return c code for its initialization.
    :postcondition:
        This function leaves the answer in position 0 of the buffer. The
        rest of the buffer is trashed by this function.
    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    """
    # When a bias pointer is given, fold "x[...] + b[...]" into both the
    # per-thread initialization and the strided accumulation expression.
    if b:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s] +"
                           " %(b)s[%(pos)s * %(stride_b)s]" % locals())
        loop_line = manner_fn("red",
                              manner_init("%(x)s[i * %(stride_x)s] + "
                                          "%(b)s[i * %(stride_b)s]" %
                                          locals()))
    else:
        init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
        loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
                                                 locals()))
    # Second-phase expression combining buf entries, then the pairwise
    # tree-reduction steps (offsets 16, 8, 4, 2, 1).
    loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
    r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
    r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
    r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
    r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos))
    r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos))
    # NOTE(review): the template below writes to a literal `buf` name in
    # "buf[%(pos)s] = red;" rather than %(buf)s, so callers must pass
    # buf == "buf" for that statement to refer to the intended array.
    return """
    {
        // This function trashes buf[1..n_threads],
        // leaving the reduction result in buf[0].
        float red = %(init)s;
        #pragma unroll 16
        for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){
            red = %(loop_line)s;
        }
        buf[%(pos)s] = red;
        __syncthreads();
        if (%(pos)s < warpSize)
        {
            for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize)
            {
                %(buf)s[%(pos)s] = %(loop_line2)s;
            }
            if (%(pos)s < 16)
            {
                //reduce so that %(pos)s 0 has the reduction of everything
                if(%(pos)s + 16 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_16)s;
                if(%(pos)s + 8 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_8)s;
                if(%(pos)s + 4 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_4)s;
                if(%(pos)s + 2 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_2)s;
                if(%(pos)s + 1 < %(N)s)
                    %(buf)s[%(pos)s] = %(r_1)s;
            }
        }
    }
    """ % locals()
@code_version(inline_reduce_fixed_shared.code_version)
def inline_reduce_fixed_shared_max(N, buf, x, stride_x, pos, count,
                                   b='', stride_b=''):
    """Max-reduction specialization of ``inline_reduce_fixed_shared``."""
    def take_max(lhs, rhs):
        return "max(%s, %s)" % (lhs, rhs)

    def identity(expr):
        return expr

    return inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
                                      take_max, identity, b, stride_b)
@code_version((1,) + inline_reduce_max.code_version +
              inline_reduce_sum.code_version)
def inline_softmax_fixed_shared(N, buf, x, stride_x,
                                sm, sm_stride,
                                threadPos, threadCount,
                                b='', stride_b=''):
    """
    Return a list of C statement strings computing a softmax row,
    using only a warpSize-sized shared buffer.

    Parameters
    ----------
    N
        Length of the row, at least warpSize (32).
    buf
        A shared memory buffer of size warpSize * sizeof(float).
    x
        A ptr to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    sm
        A ptr to the gpu memory to store the result.
    sm_stride
        The stride between each sm element.
    threadPos
        Index of executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    :Precondition: buf is empty
    :Postcondition: buf[0] contains the row's reduction results; the
        normalized softmax is written to ``sm``.
    Notes
    -----
    buf should be in gpu shared memory, we access it many times.
    We use tx as an int variable in a loop.
    """
    # NOTE(review): the final write-back loop below hard-codes
    # `threadIdx.x`, `N` and `blockDim.x` instead of using the
    # threadPos/N/threadCount parameters; callers must pass matching
    # names for the generated code to be correct.
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x,
                                       threadPos, threadCount, b, stride_b),
        '__syncthreads()',
        'float row_max = ' + buf + '[0]',
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b),
        '__syncthreads()',
        'float row_sum = ' + buf + '[0]',
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # This set all value correctly
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " exp(%(x)s[tx * %(stride_x)s] +"
            " %(b)s[tx * %(stride_b)s] - row_max)"
            " / row_sum" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "exp(%(x)s[tx * %(stride_x)s] - row_max) / row_sum" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
from __future__ import absolute_import, print_function, division
# This is work in progress
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.tensor.nnet.neighbours import Images2Neibs
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import (
as_cuda_ndarray_variable, host_from_gpu, gpu_from_host)
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
class GpuImages2Neibs(Images2Neibs, GpuOp):
    """
    GPU (CudaNdarray) implementation of the Images2Neibs op: extract
    the neighbourhood patches of a 4d tensor into the rows of a 2d
    output.
    """
    def __init__(self, mode='valid'):
        # Only a subset of the CPU op's modes has a GPU kernel.
        if mode not in ['valid', 'ignore_borders', 'wrap_centered']:
            raise NotImplementedError("Only the mode valid, ignore_borders"
                                      " and wrap_centered"
                                      " have been implemented for the op"
                                      " GpuImages2Neibs")
        self.mode = mode
    def make_node(self, ten4, neib_shape, neib_step):
        """
        Build the Apply node.

        Parameters
        ----------
        ten4
            4d float32 input, converted to a CudaNdarray variable.
        neib_shape
            Integer vector (height, width) of the patches.
        neib_step
            Integer vector of strides between consecutive patches.
        """
        ten4 = as_cuda_ndarray_variable(ten4)
        neib_shape = tensor.as_tensor_variable(neib_shape)
        neib_step = tensor.as_tensor_variable(neib_step)
        assert ten4.ndim == 4
        assert ten4.dtype == 'float32'
        assert neib_shape.ndim == 1
        assert neib_step.ndim == 1
        assert neib_shape.dtype in tensor.integer_dtypes
        assert neib_step.dtype in tensor.integer_dtypes
        # Output is always a 2d matrix: one patch per row.
        return Apply(self, [ten4, neib_shape, neib_step],
                     [CudaNdarrayType(broadcastable=(False, False),
                                      dtype=ten4.type.dtype)()])
    def c_code_cache_version(self):
        return (8,)
    def c_support_code_apply(self, node, nodename):
        """
        Return two CUDA kernels: ``k_multi_warp_less_*`` (one thread per
        patch element, used when the block covers a whole patch) and
        ``k_multi_warp_*`` (general looping version).
        """
        mode = self.mode
        return """
        //a version that use less register but don't work in all case.
        static __global__ void k_multi_warp_less_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const int stride0, const int stride1,
            const int stride2, const int stride3,
            float * global_ten4,
            const int out_s0, const int out_s1,
            float * global_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;
                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                int z_row = b + grid_d*(a + grid_c*
                                        (s + nb_stack*n));
                int i = threadIdx.y;     // loop over c
                {
                    int ten4_2 = i + a * step_x;
                    if("%(mode)s"=="wrap_centered"){
                        ten4_2 -= wrap_centered_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    }
                    int j = threadIdx.x;  // loop over d
                    {
                        int ten4_3 = j + b * step_y;
                        if("%(mode)s"=="wrap_centered"){
                            ten4_3 -= wrap_centered_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        }

                        int ten4_idx = stride3*ten4_3 +
                                       stride2*ten4_2 +
                                       stride1*s + stride0*n;

                        int z_col = j + d * i;
                        int z_idx = z_col * out_s1 +
                                    z_row * out_s0;
                        global_out[z_idx] = global_ten4[ten4_idx];
                    }
                }
            }
        }

        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_batch,
            const int nb_stack,
            const int height,
            const int width,
            const int c,
            const int d,
            const int step_x,
            const int step_y,
            const int grid_c,
            const int grid_d,
            const int stride0, const int stride1,
            const int stride2, const int stride3,
            float * global_ten4,
            const int out_s0, const int out_s1,
            float * global_out
        )
        {
            const int wrap_centered_idx_shift_x = c/2;
            const int wrap_centered_idx_shift_y = d/2;
            for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;
                tblock<nb_batch*nb_stack*grid_c*grid_d;
                tblock+=gridDim.x*blockDim.z){
                const int b = tblock%%grid_d;
                int left = tblock/grid_d;
                const int a = left%%grid_c;
                left = left/grid_c;
                const int s = left%%nb_stack;
                left = left/nb_stack;
                const int n = left;
                if(n>nb_batch)continue;
                if(s>nb_stack)continue;
                if(a>grid_c)continue;
                if(b>grid_d)continue;
                int z_row = b + grid_d*(a + grid_c*
                                        (s + nb_stack*n));
                // loop over c
                for (int i = threadIdx.y; i < c; i+=blockDim.y)
                {
                    int ten4_2 = i + a * step_x;
                    if("%(mode)s"=="wrap_centered"){
                        ten4_2 -= wrap_centered_idx_shift_x;
                        if ( ten4_2 < 0 )
                            ten4_2 += height;
                        else if (ten4_2 >= height)
                            ten4_2 -= height;
                    }
                    // loop over d
                    for (int j = threadIdx.x; j < d; j+=blockDim.x)
                    {
                        int ten4_3 = j + b * step_y;
                        if("%(mode)s"=="wrap_centered"){
                            ten4_3 -= wrap_centered_idx_shift_y;
                            if ( ten4_3 < 0 )
                                ten4_3 += width;
                            else if (ten4_3 >= width)
                                ten4_3 -= width;
                        }

                        int ten4_idx = stride3*ten4_3 +
                                       stride2*ten4_2 +
                                       stride1*s + stride0*n;

                        int z_col = j + d * i;
                        int z_idx = z_col * out_s1 +
                                    z_row * out_s0;
                        global_out[z_idx] = global_ten4[ten4_idx];
                    }
                }
            }
        }

        """ % locals()
    def c_code(self, node, name, inp, out, sub):
        """
        Validate shapes/modes, allocate the 2d output, then launch one of
        the two kernels from ``c_support_code_apply``.
        """
        # NOTE(review): the "pvals wrong rank" / "unis wrong rank" error
        # messages below look copy-pasted from the multinomial op; the
        # inputs here are ten4 and neib_shape.
        ten4, neib_shape, neib_step = inp
        z, = out
        fail = sub['fail']
        mode = self.mode
        return """
        #ifndef CEIL_INTDIV
        #define CEIL_INTDIV(a, b) ((a/b) + ((a %% b) ? 1: 0))
        #endif

        int grid_c = -1;
        int grid_d = -1;

        {
            if (CudaNdarray_NDIM(%(ten4)s) != 4)
            {
                PyErr_Format(PyExc_TypeError, "pvals wrong rank");
                %(fail)s;
            }
            if (PyArray_NDIM(%(neib_shape)s) != 1)
            {
                PyErr_Format(PyExc_TypeError, "unis wrong rank");
                %(fail)s;
            }

            if (PyArray_DIMS(%(neib_shape)s)[0] != 2)
            {
                PyErr_Format(PyExc_ValueError,
                             "neib_shape has to contain two elements");
                %(fail)s;
            }

            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            if ( "%(mode)s" == "wrap_centered") {
                if (c%%2!=1 || d%%2!=1){
                    PyErr_Format(PyExc_TypeError,
        "Images2Neibs: in mode wrap_centered need patch with odd shapes");
                    %(fail)s;
                }
                if ( CudaNdarray_HOST_DIMS(%(ten4)s)[2] < c ||
                     CudaNdarray_HOST_DIMS(%(ten4)s)[3] < d)
                {
                    PyErr_Format(PyExc_TypeError,
                                 "Images2Neibs: in wrap_centered mode, don't"
                                 " support image shapes smaller then the patch"
                                 " shapes: neib_shape=(%%d,%%d),"
                                 " ten4[2:]=[%%d,%%d]",
                                 c, d, CudaNdarray_HOST_DIMS(%(ten4)s)[2],
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                grid_c = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]),
                                     step_x);
                grid_d = CEIL_INTDIV(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]),
                                     step_y);


            }else if ( "%(mode)s" == "valid") {
                if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[2] < c) ||
                     ((((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c) %% step_x)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[0]=%%d, neib_step[0]=%%d and"
                                 " ten4.shape[2]=%%d not consistent",
                                 c, step_x,
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[2]);
                    %(fail)s;
                }
                if ( ((CudaNdarray_HOST_DIMS(%(ten4)s))[3] < d) ||
                     ((((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d) %% step_y)!=0))
                {
                    PyErr_Format(PyExc_TypeError,
                                 "neib_shape[1]=%%d, neib_step[1]=%%d and"
                                 " ten4.shape[3]=%%d not consistent",
                                 d, step_y,
                                 CudaNdarray_HOST_DIMS(%(ten4)s)[3]);
                    %(fail)s;
                }
                //number of patch in height
                grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y);
            }else if ( "%(mode)s" == "ignore_borders") {
                //number of patch in height
                grid_c = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[2]-c)/step_x);
                //number of patch in width
                grid_d = 1+(((CudaNdarray_HOST_DIMS(%(ten4)s))[3]-d)/step_y);
            }else{
                PyErr_Format(PyExc_TypeError,
                             "Images2Neibs: unknow mode '%(mode)s'");
                %(fail)s;
            }

            // new dimensions for z
            const int z_dim1 = c * d;
            const int z_dim0 =  grid_c
                                * grid_d
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[1]
                                * CudaNdarray_HOST_DIMS(%(ten4)s)[0];

            if ((NULL == %(z)s)
                || (CudaNdarray_HOST_DIMS(%(z)s)[0] != z_dim0)
                || (CudaNdarray_HOST_DIMS(%(z)s)[1] != z_dim1))
            {
                Py_XDECREF(%(z)s);
                npy_intp dims[2];
                dims[0] = z_dim0;
                dims[1] = z_dim1;

                %(z)s = (CudaNdarray*)CudaNdarray_NewDims(2, dims);
                if (!%(z)s)
                {
                    PyErr_SetString(PyExc_MemoryError,
                                    "failed to alloc z output");
                    %(fail)s;
                }
            }

        }

        { // NESTED SCOPE

            const int nb_batch = CudaNdarray_HOST_DIMS(%(ten4)s)[0];
            const int nb_stack = CudaNdarray_HOST_DIMS(%(ten4)s)[1];
            const int height = CudaNdarray_HOST_DIMS(%(ten4)s)[2];
            const int width = CudaNdarray_HOST_DIMS(%(ten4)s)[3];

            const int c = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 0);
            const int d = *(dtype_%(neib_shape)s*) PyArray_GETPTR1(
                                                     %(neib_shape)s, 1);
            const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 0);
            const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*)
                                         PyArray_GETPTR1(%(neib_step)s, 1);

            dim3 n_threads(d,c,1);
            //Their is a max of 512 threads per blocks
            while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
            while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;

            //Make bigger block to have better memory access pattern and
            //a higher core utilisation. for smaller patch size

            while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 &&
                  n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
                n_threads.z++;
            }
            int nb_block;
            if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
                nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
            else
                nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
            dim3 n_blocks(std::min(32*1024,nb_block));
            int n_shared = 0;

            void (*f)(int, int, int ,int,
                      int, int, int ,int,
                      int, int,
                      int, int, int, int,
                      float*,
                      int, int,
                      float*);
            if(n_threads.x==d && n_threads.y==c){
                f = k_multi_warp_less_%(name)s;
            }else{
                f = k_multi_warp_%(name)s;
            }

            f<<<n_blocks, n_threads, n_shared>>>(
                nb_batch,
                nb_stack,
                height, width,
                c, d, step_x, step_y,
                grid_c, grid_d,
                CudaNdarray_HOST_STRIDES(%(ten4)s)[0],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[1],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
                CudaNdarray_DEV_DATA(%(ten4)s),
                CudaNdarray_HOST_STRIDES(%(z)s)[0],
                CudaNdarray_HOST_STRIDES(%(z)s)[1],
                CudaNdarray_DEV_DATA(%(z)s)
            );
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s. (grid: %%i x %%i;"
                             " block: %%i x %%i x %%i; shared: %%i)\\n",
                    "k_multi_warp_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z,
                    n_shared);
                %(fail)s;
            }

        } // END NESTED SCOPE
        """ % locals()
def gpu_images2neibs(ten4, neib_shape, neib_step=None, mode='valid'):
    """
    Build a GpuImages2Neibs node on the given inputs.

    NOTE(review): the default ``neib_step=None`` is passed straight to
    ``make_node``, which asserts it is an integer vector — callers are
    presumably expected to always provide a value; confirm against usage.
    """
    return GpuImages2Neibs(mode)(ten4, neib_shape, neib_step)
@local_optimizer([Images2Neibs])
def use_gpu_images2neibs(node):
    """
    Local optimizer: replace a float32 CPU Images2Neibs node (in a
    GPU-supported mode) by the GPU op, moving the data to and from the
    device. Returns None implicitly when the node is not eligible.
    """
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'ignore_borders',
                         'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
if cuda_available:
    # Only register the optimizer when the CUDA backend loaded, since the
    # GPU op and registration helper are only imported in that case.
    register_gpu_opt()(use_gpu_images2neibs)
from __future__ import absolute_import, print_function, division
from theano import Op, Apply
from six import StringIO
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
inline_softmax,
inline_softmax_fixed_shared)
class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
    """
    Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
    """
    # Number of inputs (x, b, y_idx) and outputs (nll, sm, am).
    nin = 3
    nout = 3
    # The op has no fields, so all instances compare (and hash) equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x, b, y_idx):
        """
        Build the Apply node: outputs are the per-row negative
        log-likelihood (nll), the softmax matrix (sm) and the argmax
        vector (am).
        """
        # N.B. won't work when we don't cast y_idx to float anymore
        x = as_cuda_ndarray_variable(x)
        b = as_cuda_ndarray_variable(b)
        y_idx = as_cuda_ndarray_variable(y_idx)
        nll = y_idx.type()
        sm = x.type()
        am = y_idx.type()
        return Apply(self, [x, b, y_idx], [nll, sm, am])
    def c_headers(self):
        # FLT_MAX is used by the kernel for the row-max initialization.
        return ['<float.h>']
    def c_support_code(self):
        """
        Return the CUDA kernel: one block per row, shared-memory
        reductions for the row max/argmax and the exp-sum.
        """
        return """
        __global__ void k_xent_sm_1hot_bias(const int M, const int N,
            const float * x_data, const int xs0, const int xs1,
            const float * b, const int bs0,
            const float * y_idx_data, const int y_idxs0,
            float * nll_data, const int nlls0,
            float * sm_data, const int sms0, const int sms1,
            float * am_data, const int ams0)
        {
          for (int row = blockIdx.x; row < M; row += gridDim.x){

            const float * x = x_data + xs0 * row;
            float * sm = sm_data + sms0 * row;

            extern __shared__ float per_thread_values[];
            __shared__ float row_max, sum, sum_inv;
            __shared__ int row_max_threadIdx;

            float per_thread_row_max, per_thread_sum;
            int per_thread_row_max_j;

            // COMPUTE ROW MAX AND ARGMAX

            // compute separate per-thread maximums and argmax's
            per_thread_row_max = -FLT_MAX;
            per_thread_row_max_j = 0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              float row_ij = x[j*xs1] + b[j*bs0];
              per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j;
              per_thread_row_max = fmaxf(row_ij, per_thread_row_max);
            }
            per_thread_values[threadIdx.x] = per_thread_row_max;

            // wait for access to shared per_thread_values to do final
            // reduction in thread 0
            __syncthreads();

            // Finish the reduction in one go in a single thread. Could be
            // smarter about this with more hierarchical reductions but think
            // this will do for now.
            if (threadIdx.x == 0) {
              // compute overall maximum and the id of the thread which has it
              row_max = -FLT_MAX;
              row_max_threadIdx = 0;
              for (int j = 0; j < blockDim.x; ++j)
              {
                float per_thread_max = per_thread_values[j];
                row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx;
                row_max = fmaxf(per_thread_max, row_max);
              }
            }

            // all threads wait for access to shared row_max and row_maxThreadIdx
            __syncthreads();

            // thread whose max was the overall max writes out the overall argmax:
            if (threadIdx.x == row_max_threadIdx) am_data[row*ams0] = per_thread_row_max_j;

            // COMPUTE SOFTMAX

            // compute the exp and the per-thread sums of exps
            per_thread_sum = 0.0;
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              float row_ij = x[j*xs1] + b[j*bs0];
              float sm_ij = __expf(row_ij - row_max);
              per_thread_sum += sm_ij;
              sm[j * sms1] = sm_ij;
            }
            per_thread_values[threadIdx.x] = per_thread_sum;

            // wait for access to shared per_thread_values to do final
            // reduction in thread 0
            __syncthreads();

            if (threadIdx.x == 0) {
              // compute overall sum
              sum = 0.0;
              for (int j = 0; j < blockDim.x; ++j)
              {
                sum += per_thread_values[j];
              }
              sum_inv = 1.0 / sum;
            }

            // all threads wait for access to shared sum, sum_inv
            __syncthreads();

            // all threads normalize their softmax result using sum_inv
            for (int j = threadIdx.x; j < N; j += blockDim.x)
            {
              sm[j * sms1] *= sum_inv;
            }

            // COMPUTE NEGATIVE LOG-LIKELIHOOD FOR TARGET INDEX
            if (threadIdx.x == 0) {
              const int y_idx = (int)y_idx_data[row * y_idxs0];
              if ((y_idx >= N) || (y_idx < 0))
              {
                  //TODO: set raise an error bit in a global var?
                  nll_data[row*nlls0] = 0.0; // raise some suspicion at least...
              }
              else
              {
                  nll_data[row*nlls0] = - x[y_idx*xs1]
                             - b[y_idx*bs0]
                             + row_max
                             + logf(sum);
              }
            }
          }
        }
        """
    def c_code(self, node, nodename, inp, out, sub):
        """
        Validate ranks/shapes, (re)allocate the three outputs, then
        launch ``k_xent_sm_1hot_bias`` (one block per row).
        """
        x, b, y_idx = inp
        nll, sm, am = out
        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO()
        print("""
        if (CudaNdarray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (CudaNdarray_HOST_DIMS(%(nll)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(nll)s)
            {
                %(fail)s;
            }
        }
        if ((NULL == %(sm)s)
            || (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(sm)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2,
                CudaNdarray_HOST_DIMS(%(x)s));
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should do it up
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (CudaNdarray_HOST_DIMS(%(am)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc am output");
                // no need to decref nll and sm,
                // the cleanup code should do it up
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1],
                                     NUM_VECTOR_OP_THREADS_PER_BLOCK);
            int n_shared_bytes = n_threads * sizeof(float);
            k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
                CudaNdarray_HOST_DIMS(%(x)s)[0],
                CudaNdarray_HOST_DIMS(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(x)s),
                CudaNdarray_HOST_STRIDES(%(x)s)[0],
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(b)s),
                CudaNdarray_HOST_STRIDES(%(b)s)[0],
                CudaNdarray_DEV_DATA(%(y_idx)s),
                CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
                CudaNdarray_DEV_DATA(%(nll)s),
                CudaNdarray_HOST_STRIDES(%(nll)s)[0],
                CudaNdarray_DEV_DATA(%(sm)s),
                CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                CudaNdarray_HOST_STRIDES(%(sm)s)[1],
                CudaNdarray_DEV_DATA(%(am)s),
                CudaNdarray_HOST_STRIDES(%(am)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
                             "The kernel was launched with %%d threads,"
                             " %%d blocks and %%d shared memory\\n",
                             cudaGetErrorString(err),
                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars the cleanup code will do it
                %(fail)s;
            }
        }
        """ % locals(), file=sio)
        return sio.getvalue()
    def c_code_cache_version(self):
        # return ()
        return (5,)
# Singleton instance: the op is stateless, so one shared instance suffices.
gpu_crossentropy_softmax_argmax_1hot_with_bias = \
    GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
    """
    Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
    """
    # Number of inputs (dy, sm, y_idx) and outputs (dx).
    nin = 3
    nout = 1
    """Gradient wrt x of the CrossentropySoftmax1Hot Op"""
    def __init__(self, **kwargs):
        Op.__init__(self, **kwargs)
    # Field-less op: all instances compare and hash equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, dy, sm, y_idx):
        """
        Build the Apply node; the gradient output dx has the same type
        as the softmax input sm.
        """
        dy = as_cuda_ndarray_variable(dy)
        sm = as_cuda_ndarray_variable(sm)
        y_idx = as_cuda_ndarray_variable(y_idx)
        return Apply(self, [dy, sm, y_idx], [sm.type()])
    def c_code_cache_version(self):
        # return ()
        return (8,)
    def c_code(self, node, nodename, inp, out, sub):
        """
        Validate ranks/shapes (allowing a scalar or length-1 ``dnll`` to
        broadcast via a zero stride), allocate dx, and launch the
        per-row gradient kernel.
        """
        dnll, sm, y_idx = inp
        dx, = out
        fail = sub['fail']
        return """
        // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
        const npy_intp %(dnll)s_dims0 = (CudaNdarray_NDIM(%(dnll)s) > 0 ?
                                         CudaNdarray_HOST_DIMS(%(dnll)s)[0] :
                                         (npy_intp) 0);

        // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar
        // or a vector with just one element.
        const npy_intp %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ?
                                            CudaNdarray_HOST_STRIDES(%(dnll)s)[0] :
                                            (npy_intp) 0);

        if ((CudaNdarray_NDIM(%(dnll)s) > 1)
            || (CudaNdarray_NDIM(%(sm)s) != 2)
            || (CudaNdarray_NDIM(%(y_idx)s) != 1))
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            CudaNdarray_HOST_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_Format(PyExc_ValueError,
                         "dnll.shape[0] == %%i, but sm.shape[0] == %%i",
                         %(dnll)s_dims0,
                         CudaNdarray_HOST_DIMS(%(sm)s)[0]);
            %(fail)s;
        }
        if (%(dnll)s_dims0 !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1)
        {
            PyErr_SetString(PyExc_ValueError,
                            "dnll.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "sm.shape[0] != y_idx.shape[0]");
            %(fail)s;
        }
        if ((NULL == %(dx)s)
            || (CudaNdarray_HOST_DIMS(%(dx)s)[0] !=
                CudaNdarray_HOST_DIMS(%(sm)s)[0])
            || (CudaNdarray_HOST_DIMS(%(dx)s)[1] !=
                CudaNdarray_HOST_DIMS(%(sm)s)[1]))
        {
            Py_XDECREF(%(dx)s);
            %(dx)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(dx)s)
                || CudaNdarray_alloc_contiguous(%(dx)s, 2,
                                                CudaNdarray_HOST_DIMS(%(sm)s)))
            {
                Py_XDECREF(%(dx)s);
                %(dx)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256);

            kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
                <<<n_blocks, n_threads>>>(
                        CudaNdarray_HOST_DIMS(%(dx)s)[0],
                        CudaNdarray_HOST_DIMS(%(dx)s)[1],

                        CudaNdarray_DEV_DATA(%(dnll)s),
                        %(dnll)s_strides0,

                        CudaNdarray_DEV_DATA(%(sm)s),
                        CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                        CudaNdarray_HOST_STRIDES(%(sm)s)[1],

                        CudaNdarray_DEV_DATA(%(y_idx)s),
                        CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],

                        CudaNdarray_DEV_DATA(%(dx)s),
                        CudaNdarray_HOST_STRIDES(%(dx)s)[0],
                        CudaNdarray_HOST_STRIDES(%(dx)s)[1]
                );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %%s: %%s.\\n"
                             "The kernel was launched with %%d threads and"
                             " %%d blocks\\n",
                             "kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s",
                             cudaGetErrorString(err), n_threads, n_blocks);
                %(fail)s;
            }
        }
        assert(%(dx)s);
        """ % locals()
    def c_support_code_apply(self, node, nodename):
        """
        Return the gradient kernel: dx = dnll * (sm - 1hot(y_idx)),
        one block per row, threads striding over columns.
        """
        return """
        __global__ void kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s(
           int N, int K,
           const float * dnll, const int dnll_s0,
           const float * sm, const int sm_s0, const int sm_s1,
           const float * y_idx, const int y_idx_s0,
           float * dx, const int dx_s0, const int dx_s1)
        {
            for (int i = blockIdx.x; i < N; i += gridDim.x)
            {
                float dnll_i = dnll[i * dnll_s0];
                int y_i = (int)y_idx[i * y_idx_s0];

                for (int j = threadIdx.x; j < K; j += blockDim.x)
                {
                    if (y_i == j)
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
                    }
                    else
                    {
                        dx[i * dx_s0 + j * dx_s1] =
                            dnll_i * sm[i * sm_s0 + j * sm_s1];
                    }
                    //dx[i * dx_s0 + j * dx_s1] =
                    //    dnll_i * sm[i * sm_s0 + j * sm_s1];
                    //dx[i*dx_s0+j*dx_s1] = 0;
                }
            }
        }
        """ % locals()
# Singleton instance: the op is stateless, so one shared instance suffices.
gpu_crossentropy_softmax_1hot_with_bias_dx = \
    GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp):
    """
    Implement Softmax on the gpu.
    """
    # Field-less op: all instances compare and hash equal.
    def __eq__(self, other):
        return type(self) == type(other)
    def __hash__(self):
        return hash(type(self))
    def __str__(self):
        return self.__class__.__name__
    def make_node(self, x):
        """Build the Apply node; output has the same type as the input."""
        x = as_cuda_ndarray_variable(x)
        return Apply(self, [x], [x.type()])
    def infer_shape(self, node, shape):
        # Softmax is elementwise over rows: output shape == input shape.
        return shape
    def c_code_cache_version(self):
        # Include the inline_softmax template version in the cache key.
        return (9,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        """
        Allocate the output and launch one of two kernels: the
        shared-memory version when two float rows fit in shared memory,
        otherwise the fixed-shared fallback.
        """
        x, = inp
        z, = out
        fail = sub['fail']
        return """
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error");
            %(fail)s;
        }
        if ((NULL == %(z)s) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
             CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
             CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 2,
                                                CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    32 * 1024);
//TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
                                     2 * sizeof(float);

            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {
              //Those numbers are based on not too recent GPU
              //to make them compatible with more GPU.
              //TODO: read the information from the card.
              if(n_shared_bytes < (32 * 1024 - 500)){
                kSoftmax_%(nodename)s
                    <<<
                        n_blocks,
                        n_threads,
                        n_shared_bytes
                    >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
              }else{
                kSoftmax_fixed_shared%(nodename)s
                    <<<
                        n_blocks,
                        n_threads,
                        n_threads * sizeof(float)
                    >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],

                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
              }
              CNDA_THREAD_SYNC;
              cudaError_t err = cudaGetLastError();
              if( cudaSuccess != err)
              {
                  PyErr_Format(PyExc_RuntimeError,
                               "Cuda error: %%s: %%s.\\n Used %%d blocks,"
                               " %%d threads %%d bytes of shared memory",
                               "kSoftmax[_fixed_shared]%(nodename)s",
                               cudaGetErrorString(err),
                               n_blocks, n_threads, n_shared_bytes);
                  %(fail)s;
              }
            }
        }
        assert(%(z)s);
        """ % locals()
    def c_support_code_apply(self, node, nodename):
        """
        Generate the two softmax kernels with ``nvcc_kernel``:
        one copying each row into shared memory, and one using only a
        warpSize-sized shared buffer for wide rows.
        """
        ret1 = nvcc_kernel(
            "kSoftmax_%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  "buf2[tx] = buf[tx]", "}", "__syncthreads()",
                  inline_softmax('N',
                                 'buf',
                                 'buf2',
                                 'threadIdx.x',
                                 'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  # This set all value correctly
                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
                  "__syncthreads()", "}", ])
        ret2 = nvcc_kernel(
            "kSoftmax_fixed_shared%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  "     blockIDX += gridDim.x){",
                  "const float *x_ptr = &x[blockIDX * sx0]",
                  "float *sm_ptr = &sm[blockIDX * sm_s0]",
                  inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                              'sm_ptr', 'sm_s1',
                                              'threadIdx.x',
                                              'blockDim.x'),
                  "__syncthreads()", "}", ])
        return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax()
class GpuSoftmaxWithBias(GpuOp):
    """
    Implement SoftmaxWithBias on the gpu.

    The generated kernels add the bias row-wise (``buf[tx] += b[tx*sb0]``)
    before running the inline softmax, i.e. the output is the row-wise
    softmax of ``x + b``.
    """

    # Number of inputs (x, b) and outputs of the op.
    nin = 2
    nout = 1

    def __eq__(self, other):
        # All instances of this parameterless op are interchangeable.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x, b):
        # NOTE(review): only x is converted with as_cuda_ndarray_variable;
        # b is passed through unchanged — presumably callers supply a CUDA
        # variable for b already.  Confirm against the optimizer that
        # introduces this op.
        x = as_cuda_ndarray_variable(x)
        return Apply(self, [x, b], [x.type()])

    def infer_shape(self, node, shape):
        # Output has the same shape as x.
        return [shape[0]]

    def c_code_cache_version(self):
        # Version of the generated C code; also depends on the version of
        # the inline_softmax snippet embedded in the kernels.
        return (9,) + inline_softmax.code_version

    def c_code(self, node, nodename, inp, out, sub):
        x, b = inp
        z, = out
        fail = sub['fail']
        # Validate ranks and matching dims, (re)allocate z if needed, then
        # launch one of the two kernels from c_support_code_apply depending
        # on whether 2*N floats fit in shared memory.
        return """
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "rank error input");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "rank error for the bias");
            %(fail)s;
        }
        if ((CudaNdarray_HOST_DIMS(%(x)s)[1] !=
            CudaNdarray_HOST_DIMS(%(b)s)[0]))
        {
            PyErr_Format(PyExc_ValueError,
                         "number of columns in x (%%ld)"
                         " does not match length of b (%%ld)",
                         (long int)CudaNdarray_HOST_DIMS(%(x)s)[1],
                         (long int)CudaNdarray_HOST_DIMS(%(b)s)[0]);
            %(fail)s;
        }
        if ((NULL == %(z)s)
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(z)s);
            %(z)s = (CudaNdarray*)CudaNdarray_New();
            if ((NULL == %(z)s)
                || CudaNdarray_alloc_contiguous(%(z)s, 2,
                       CudaNdarray_HOST_DIMS(%(x)s)))
            {
                Py_XDECREF(%(z)s);
                %(z)s = NULL;
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024);
            //TODO, detect the maximum number of thread per block.
            int n_threads = std::min(CudaNdarray_HOST_DIMS(%(x)s)[1], 512);
            int n_shared_bytes = CudaNdarray_HOST_DIMS(%(x)s)[1] *
                                     2 * sizeof(float);
            if (CudaNdarray_HOST_DIMS(%(x)s)[0] > 0)
            {
                if(n_shared_bytes < (32 * 1024 - 500)){
                    kSoftmaxWithBias_%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_shared_bytes
                        >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(b)s),
                            CudaNdarray_HOST_STRIDES(%(b)s)[0],
                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                        );
                }else{
                    kSoftmaxWithBias_fixed_shared%(nodename)s
                        <<<
                            n_blocks,
                            n_threads,
                            n_threads * sizeof(float)
                        >>>(
                            CudaNdarray_HOST_DIMS(%(x)s)[0],
                            CudaNdarray_HOST_DIMS(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(x)s),
                            CudaNdarray_HOST_STRIDES(%(x)s)[0],
                            CudaNdarray_HOST_STRIDES(%(x)s)[1],
                            CudaNdarray_DEV_DATA(%(b)s),
                            CudaNdarray_HOST_STRIDES(%(b)s)[0],
                            CudaNdarray_DEV_DATA(%(z)s),
                            CudaNdarray_HOST_STRIDES(%(z)s)[0],
                            CudaNdarray_HOST_STRIDES(%(z)s)[1]
                        );
                }
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "Cuda error: %%s: %%s. n_blocks=%%d,"
                                 " n_threads=%%d, n_shared_bytes=%%d\\n",
                                 "kSoftmaxWithBias_%(nodename)s",
                                 cudaGetErrorString(err),
                                 n_blocks, n_threads, n_shared_bytes);
                    %(fail)s;
                }
            }
        }
        assert(%(z)s);
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        """Return CUDA source for the two softmax-with-bias kernels.

        Same structure as GpuSoftmax's kernels, with the bias added to
        the staged row before the inline softmax.
        """
        ret1 = nvcc_kernel(
            "kSoftmaxWithBias_%s" % nodename,
            params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
            body=["extern __shared__ float buf[]",
                  "float * buf2 = buf + N",
                  "for (int blockIDX = blockIdx.x; blockIDX < M;"
                  " blockIDX += gridDim.x){",
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
                  # Bias is added before the softmax.
                  "buf[tx] += b[tx * sb0]",
                  "buf2[tx] = buf[tx]", "}",
                  "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
                                                   'threadIdx.x',
                                                   'blockDim.x'),
                  "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
                  "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
                  "__syncthreads()", "}", ])
        ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                           params=['int M', 'int N',
                                   'const float * x',
                                   'const int sx0', 'const int sx1',
                                   'const float * b', 'const int sb0',
                                   'float * sm',
                                   'const int sm_s0', 'const int sm_s1'],
                           body=[
                               "extern __shared__ float buf[]",
                               "for (int blockIDX = blockIdx.x; blockIDX < M;"
                               " blockIDX += gridDim.x){",
                               "const float *x_ptr = &x[blockIDX * sx0]",
                               "float *sm_ptr = &sm[blockIDX * sm_s0]",
                               inline_softmax_fixed_shared('N', 'buf',
                                                           'x_ptr', 'sx1',
                                                           'sm_ptr',
                                                           'sm_s1',
                                                           'threadIdx.x',
                                                           'blockDim.x',
                                                           'b', 'sb0'),
                               "__syncthreads()",
                               "}",
                           ])
        return ret1 + "\n" + ret2
# Module-level singleton instance of the op.
gpu_softmax_with_bias = GpuSoftmaxWithBias()
from __future__ import absolute_import, print_function, division
import distutils
import logging
import os
import subprocess
import sys
from locale import getpreferredencoding
from theano import config
from theano.compat import decode, decode_with
from theano.configdefaults import local_bitwidth
from theano.gof.utils import hash_from_file
from theano.gof.cmodule import (std_libs, std_lib_dirs,
std_include_dirs, dlimport,
Compiler,
get_lib_extension)
from theano.misc.windows import output_subprocess_Popen
_logger = logging.getLogger("theano.sandbox.cuda.nvcc_compiler")

# Path to the nvcc binary.  is_nvcc_available() may rebind this to
# <config.cuda.root>/bin/nvcc when nvcc is not found on $PATH.
nvcc_path = 'nvcc'
# nvcc release string (e.g. "7.5"); set lazily by is_nvcc_available().
nvcc_version = None
def is_nvcc_available():
    """
    Return True iff the nvcc compiler is found.

    Side effects: on success, the module-global ``nvcc_version`` is set
    from ``nvcc --version`` output, and ``nvcc_path`` may be rebound to
    the binary found under ``config.cuda.root``.
    """
    if not config.cuda.enabled:
        return False

    def _detect_version():
        # Parse "... release X.Y" from the last line of `nvcc --version`.
        proc_out = output_subprocess_Popen([nvcc_path, '--version'])
        last_line = decode(proc_out[0]).strip().split('\n')[-1]
        build, version = last_line.split(',')[1].strip().split()
        assert build == 'release'
        global nvcc_version
        nvcc_version = version

    try:
        _detect_version()
    except Exception:
        # nvcc not on $PATH: fall back to cuda.root/bin/nvcc.
        candidate = os.path.join(config.cuda.root, 'bin', 'nvcc')
        if not os.path.exists(candidate):
            return False
        global nvcc_path
        nvcc_path = candidate
        try:
            _detect_version()
        except Exception:
            return False
    return True
# Default runtime search paths embedded into every NVCC link
# (consumed by NVCC_compiler.compile_str via its `rpaths` default).
rpath_defaults = []


def add_standard_rpath(rpath):
    """Record *rpath* as a default -rpath for future NVCC links."""
    rpath_defaults.append(rpath)
class NVCC_compiler(Compiler):
    """Compiler subclass that drives nvcc to build CUDA modules."""

    # amdlibm is a host-compiler feature not supported through nvcc.
    supports_amdlibm = False

    @classmethod
    def try_compile_tmp(cls, src_code, tmp_prefix='', flags=(),
                        try_run=False, output=False, comp_args=False):
        # Delegate to the base-class helper, forcing the nvcc binary.
        return cls._try_compile_tmp(src_code, tmp_prefix, flags,
                                    try_run, output, nvcc_path, comp_args)

    @classmethod
    def try_flags(cls, flag_list, preambule="", body="",
                  try_run=False, output=False, comp_args=False):
        # Delegate to the base-class helper, forcing the nvcc binary.
        return cls._try_flags(flag_list, preambule, body, try_run, output,
                              nvcc_path, comp_args)

    @staticmethod
    def version_str():
        # nvcc_version is populated by is_nvcc_available().
        return "nvcc " + nvcc_version

    @staticmethod
    def compile_args():
        """
        These args will be received by compile_str() in the preargs
        parameter.  They will also be included in the "hard" part of
        the key module.
        """
        flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
        if config.nvcc.fastmath:
            flags.append('-use_fast_math')
        # Tie compiled modules to the exact revision of cuda_ndarray.cuh.
        cuda_ndarray_cuh_hash = hash_from_file(
            os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
        flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
        # NumPy 1.7 Deprecate the old API.
        # The following macro asserts that we don't bring new code
        # that use the old API.
        flags.append("-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
        # If the user didn't specify architecture flags add them
        if not any(['-arch=sm_' in f for f in flags]):
            # We compile cuda_ndarray.cu during import.
            # We should not add device properties at that time.
            # As the device is not selected yet!
            # TODO: re-compile cuda_ndarray when we bind to a GPU?
            import theano.sandbox.cuda
            if hasattr(theano.sandbox, 'cuda'):
                n = theano.sandbox.cuda.use.device_number
                if n is None:
                    _logger.warn(
                        "We try to get compilation arguments for CUDA"
                        " code, but the GPU device is not initialized."
                        " This is probably caused by an Op that work on"
                        " the GPU that don't inherit from GpuOp."
                        " We Initialize the GPU now.")
                    theano.sandbox.cuda.use(
                        "gpu",
                        force=True,
                        default_to_move_computation_to_gpu=False,
                        move_shared_float32_to_gpu=False,
                        enable_cuda=False)
                    n = theano.sandbox.cuda.use.device_number
                p = theano.sandbox.cuda.device_properties(n)
                flags.append('-arch=sm_' + str(p['major']) +
                             str(p['minor']))
        return flags

    @staticmethod
    def compile_str(
            module_name, src_code,
            location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
            rpaths=rpath_defaults, py_module=True, hide_symbols=True):
        """
        Parameters
        ----------
        module_name: str
            This has been embedded in the src_code.
        src_code
            A complete c or c++ source listing for the module.
        location
            A pre-existing filesystem directory where the
            cpp file and .so will be written.
        include_dirs
            A list of include directory names (each gets prefixed with -I).
        lib_dirs
            A list of library search path directory names (each gets
            prefixed with -L).
        libs
            A list of libraries to link with (each gets prefixed with -l).
        preargs
            A list of extra compiler arguments.
        rpaths
            List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
        py_module
            If False, compile to a shared library, but
            do not import as a Python module.
        hide_symbols
            If True (the default), hide all symbols from the library symbol
            table unless explicitly exported.

        Returns
        -------
        module
            Dynamically-imported python module of the compiled code.
            (unless py_module is False, in that case returns None.)

        Notes
        -----
        On Windows 7 with nvcc 3.1 we need to compile in the real directory
        Otherwise nvcc never finish.
        """
        # Remove empty string directory
        include_dirs = [d for d in include_dirs if d]
        lib_dirs = [d for d in lib_dirs if d]

        rpaths = list(rpaths)

        if sys.platform == "win32":
            # Remove some compilation args that cl.exe does not understand.
            # cl.exe is the compiler used by nvcc on Windows.
            # NOTE(review): this mutates the caller's list (and the mutable
            # default) in place, and runs before the `preargs is None`
            # check below — on win32 with preargs=None this would raise.
            # Confirm intended before changing.
            for a in ["-Wno-write-strings", "-Wno-unused-label",
                      "-Wno-unused-variable", "-fno-math-errno"]:
                if a in preargs:
                    preargs.remove(a)
        if preargs is None:
            preargs = []
        else:
            # Copy so later edits don't leak back to the caller.
            preargs = list(preargs)
        if sys.platform != 'win32':
            preargs.append('-fPIC')
        if config.cmodule.remove_gxx_opt:
            preargs = [p for p in preargs if not p.startswith('-O')]

        cuda_root = config.cuda.root

        # The include dirs gived by the user should have precedence over
        # the standards ones.
        include_dirs = include_dirs + std_include_dirs()
        if os.path.abspath(os.path.split(__file__)[0]) not in include_dirs:
            include_dirs.append(os.path.abspath(os.path.split(__file__)[0]))

        libs = libs + std_libs()
        if 'cudart' not in libs:
            libs.append('cudart')

        lib_dirs = lib_dirs + std_lib_dirs()

        if sys.platform != 'darwin':
            # config.dnn.include_path add this by default for cudnn in the
            # new back-end. This should not be used in this back-end. So
            # just remove them.
            lib_dirs = [ld for ld in lib_dirs if
                        not(ld == os.path.join(cuda_root, 'lib') or
                            ld == os.path.join(cuda_root, 'lib64'))]

        if sys.platform != 'darwin':
            # sometimes, the linker cannot find -lpython so we need to tell it
            # explicitly where it is located
            # this returns somepath/lib/python2.x
            python_lib = distutils.sysconfig.get_python_lib(plat_specific=1,
                                                            standard_lib=1)
            python_lib = os.path.dirname(python_lib)
            if python_lib not in lib_dirs:
                lib_dirs.append(python_lib)

        # Pick the source extension: plain .cpp compiles much faster when
        # the code has no device code of its own.
        if (config.nvcc.cudafe == 'heuristic' and not
                any(marker in src_code for marker in ("__global__", "__device__",
                                                      "__host__", "<<<",
                                                      "nvmatrix.cuh"))):
            # only calls existing CUDA functions, can compile much faster
            cppfilename = os.path.join(location, 'mod.cpp')
            src_code = ("#include <cuda.h>\n"
                        "#include <cuda_runtime_api.h>\n" +
                        src_code)
        else:
            # contains CUDA host code or device functions, needs .cu extension
            cppfilename = os.path.join(location, 'mod.cu')
        with open(cppfilename, 'w') as cppfile:
            _logger.debug('Writing module C++ code to %s', cppfilename)
            cppfile.write(src_code)

        lib_filename = os.path.join(
            location, '%s.%s' %
            (module_name, get_lib_extension()))

        _logger.debug('Generating shared lib %s', lib_filename)
        # TODO: Why do these args cause failure on gtx285 that has 1.3
        # compute capability? '--gpu-architecture=compute_13',
        # '--gpu-code=compute_13',

        # Split args: preargs1 go to nvcc itself, preargs2 are forwarded
        # to the host compiler via -Xcompiler.
        preargs1 = []
        preargs2 = []
        for pa in preargs:
            if pa.startswith('-Wl,'):
                # the -rpath option is not understood by the Microsoft linker
                if sys.platform != 'win32' or not pa.startswith('-Wl,-rpath'):
                    preargs1.append('-Xlinker')
                    preargs1.append(pa[4:])
                continue
            for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
                            '-L', '--fmad', '--ftz', '--maxrregcount',
                            '--prec-div', '--prec-sqrt', '--use_fast_math',
                            '-fmad', '-ftz', '-maxrregcount',
                            '-prec-div', '-prec-sqrt', '-use_fast_math',
                            '--use-local-env', '--cl-version=', '-std=']:
                if pa.startswith(pattern):
                    preargs1.append(pa)
                    break
            else:
                preargs2.append(pa)

        # Don't put -G by default, as it slow things down.
        # We aren't sure if -g slow things down, so we don't put it by default.
        cmd = [nvcc_path, '-shared'] + preargs1
        if config.nvcc.compiler_bindir:
            cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])

        if sys.platform == 'win32':
            # add flags for Microsoft compiler to create .pdb files
            preargs2.extend(['/Zi', '/MD'])
            cmd.extend(['-Xlinker', '/DEBUG'])
            # remove the complaints for the duplication of `double round(double)`
            # in both math_functions.h and pymath.h,
            # by not including the one in pymath.h
            cmd.extend(['-D HAVE_ROUND'])
        else:
            if hide_symbols:
                preargs2.append('-fvisibility=hidden')

        if local_bitwidth() == 64:
            cmd.append('-m64')
        else:
            cmd.append('-m32')

        if len(preargs2) > 0:
            cmd.extend(['-Xcompiler', ','.join(preargs2)])

        # We should not use rpath if possible. If the user provided
        # provided an cuda.root flag, we need to add one, but
        # otherwise, we don't add it. See gh-1540 and
        # https://wiki.debian.org/RpathIssue for details.
        if (not type(config.cuda).root.is_default and
                os.path.exists(os.path.join(config.cuda.root, 'lib'))):

            rpaths.append(os.path.join(config.cuda.root, 'lib'))
            if sys.platform != 'darwin':
                # the CUDA libs are universal (contain both 32-bit and 64-bit)
                rpaths.append(os.path.join(config.cuda.root, 'lib64'))
        if sys.platform != 'win32':
            # the -rpath option is not understood by the Microsoft linker
            for rpath in rpaths:
                cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])

        # to support path that includes spaces, we need to wrap it with double quotes on Windows
        path_wrapper = "\"" if os.name == 'nt' else ""
        cmd.extend(['-I%s%s%s' % (path_wrapper, idir, path_wrapper) for idir in include_dirs])
        cmd.extend(['-L%s%s%s' % (path_wrapper, ldir, path_wrapper) for ldir in lib_dirs])
        cmd.extend(['-o', lib_filename])
        cmd.append(os.path.split(cppfilename)[-1])
        cmd.extend(['-l%s' % l for l in libs])
        if sys.platform == 'darwin':
            # This tells the compiler to use the already-loaded python
            # symbols (which should always be the right ones).
            cmd.extend(['-Xcompiler', '-undefined,dynamic_lookup'])

        # Remove "-u Symbol" arguments, since they are usually not
        # relevant for the new compilation, even if they were used for
        # compiling python. If they are necessary, the nvcc syntax is
        # "-U Symbol" with a capital U.
        done = False
        while not done:
            try:
                indexof = cmd.index('-u')
                cmd.pop(indexof)  # Remove -u
                cmd.pop(indexof)  # Remove argument to -u
            except ValueError:
                done = True

        # CUDA Toolkit v4.1 Known Issues:
        # Host linker on Mac OS 10.7 (and 10.6 for me) passes -no_pie option
        # to nvcc this option is not recognized and generates an error
        # http://stackoverflow.com/questions/9327265/nvcc-unknown-option-no-pie
        # Passing -Xlinker -pie stops -no_pie from getting passed
        if sys.platform == 'darwin' and nvcc_version >= '4.1':
            cmd.extend(['-Xlinker', '-pie'])

        # cmd.append("--ptxas-options=-v") #uncomment this to see
        # register and shared-mem requirements
        _logger.debug('Running cmd %s', ' '.join(cmd))
        orig_dir = os.getcwd()
        try:
            os.chdir(location)
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
            console_encoding = getpreferredencoding()
            nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
            nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
        finally:
            os.chdir(orig_dir)

        # Forward interesting stderr lines, dropping known-noise warnings.
        for eline in nvcc_stderr.split('\n'):
            if not eline:
                continue
            if 'skipping incompatible' in eline:
                # ld is skipping an incompatible library
                continue
            if 'declared but never referenced' in eline:
                continue
            if 'statement is unreachable' in eline:
                continue
            _logger.info("NVCC: %s", eline)

        if p.returncode:
            # On failure, dump the numbered source and the filtered
            # compiler output to ease debugging, then raise.
            for i, l in enumerate(src_code.split('\n')):
                print(i + 1, l, file=sys.stderr)
            print('===============================', file=sys.stderr)
            # filter the output from the compiler
            for l in nvcc_stderr.split('\n'):
                if not l:
                    continue
                # filter out the annoying declaration warnings
                try:
                    if l[l.index(':'):].startswith(': warning: variable'):
                        continue
                    if l[l.index(':'):].startswith(': warning: label'):
                        continue
                except Exception:
                    pass
                print(l, file=sys.stderr)
            print(nvcc_stdout)
            print(cmd)
            raise Exception('nvcc return status', p.returncode,
                            'for cmd', ' '.join(cmd))
        elif config.cmodule.compilation_warning and nvcc_stdout:
            print(nvcc_stdout)

        if nvcc_stdout:
            # On Windows, nvcc print useless stuff by default
            if sys.platform != 'win32':
                # this doesn't happen to my knowledge
                print("DEBUG: nvcc STDOUT", nvcc_stdout, file=sys.stderr)

        if py_module:
            # touch the __init__ file
            open(os.path.join(location, "__init__.py"), 'w').close()
            return dlimport(lib_filename)
This source diff could not be displayed because it is too large. You can view the blob instead.
from __future__ import absolute_import, print_function, division
from functools import wraps
import numpy
from theano import tensor, scalar as scal, Constant
from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError)
from theano.sandbox.cuda.basic_ops import (
GpuFromHost, HostFromGpu, host_from_gpu, GpuDimShuffle, GpuElemwise, GpuReshape)
# Scalar float32 constant 1.0; cloned by output_merge to replace beta.
_one = scal.constant(numpy.asarray(1.0, dtype='float32'))
def grab_cpu_scalar(v, nd):
    """
    Return a CPU scalar variable equivalent to *v* when *v* is an
    nd-dimensional broadcast of a scalar (via DimShuffle/GpuDimShuffle,
    possibly behind a GpuFromHost transfer, or a broadcastable
    Constant).  Return None otherwise.
    """
    owner = v.owner
    if owner is None:
        # No producing node: only a fully-broadcastable constant qualifies.
        if isinstance(v, Constant) and v.broadcastable == (True,) * nd:
            return v.dimshuffle(())
        return None
    op = owner.op
    if isinstance(op, GpuDimShuffle) and op.new_order == ('x',) * nd:
        return host_from_gpu(owner.inputs[0])
    if isinstance(op, DimShuffle) and op.new_order == ('x',) * nd:
        return owner.inputs[0]
    if isinstance(op, GpuFromHost):
        # Look through the host->gpu transfer.
        return grab_cpu_scalar(owner.inputs[0], nd=nd)
    return None
def find_node(v, cls, ignore_clients=False):
    """
    Dig through possibly redundant gpu<->host transfers for the node
    whose op is an instance of *cls*; return it, or None.

    Unless *ignore_clients* is True, only follow variables with exactly
    one client.
    """
    if v.owner is None or not (ignore_clients or len(v.clients) == 1):
        return None
    node = v.owner
    if isinstance(node.op, cls):
        return node
    if isinstance(node.op, GpuFromHost):
        inner = node.inputs[0]
        if (inner.owner is not None and
                (ignore_clients or len(inner.clients) == 1) and
                isinstance(inner.owner.op, HostFromGpu)):
            # Skip the round-trip transfer and keep digging.
            # (The recursion uses the default ignore_clients, as the
            # original implementation did.)
            return find_node(inner.owner.inputs[0], cls)
    return None
def is_equal(var, val):
    """
    Return True iff *var* is a constant always equal to *val* (a python
    value); False otherwise, including when *var* is not constant.
    """
    try:
        return get_scalar_constant_value(var) == val
    except NotScalarConstantError:
        return False
def alpha_merge(cls, alpha_in, beta_in):
    """
    Decorator factory for local optimizers that fold a scalar
    multiplication (``lr * Op(...)``) into the op's own alpha/beta
    inputs at positions *alpha_in* and *beta_in*.

    The decorated *maker* is called as ``maker(targ, *inputs)`` with the
    matched node and its rescaled inputs.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Match: elementwise scalar multiplication with two inputs.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.mul and
                    node.nin == 2):
                # One side must be the target op, the other a CPU scalar.
                targ = find_node(node.inputs[0], cls)
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    if targ is None:
                        return
                    lr = grab_cpu_scalar(node.inputs[0],
                                         nd=targ.outputs[0].ndim)
                else:
                    lr = grab_cpu_scalar(node.inputs[1],
                                         nd=targ.outputs[0].ndim)
                if lr is None or targ is None:
                    return None
                inputs = list(targ.inputs)
                try:
                    c = get_scalar_constant_value(lr)
                    if c == 0:
                        # Multiplying by 0 zeroes both coefficients.
                        inputs[alpha_in] = lr
                        inputs[beta_in] = lr
                    elif c == 1:
                        # Multiplying by 1 is a no-op on the coefficients.
                        inputs[alpha_in] = targ.inputs[alpha_in]
                        inputs[beta_in] = targ.inputs[beta_in]
                    else:
                        inputs[alpha_in] = lr * targ.inputs[alpha_in]
                        inputs[beta_in] = lr * targ.inputs[beta_in]
                except NotScalarConstantError:
                    # Non-constant scalar: rescale symbolically.
                    inputs[alpha_in] = lr * targ.inputs[alpha_in]
                    inputs[beta_in] = lr * targ.inputs[beta_in]
                return maker(targ, *inputs)
        return opt
    return wrapper
def output_merge(cls, alpha_in, beta_in, out_in):
    """
    Decorator factory for local optimizers that fold an elementwise
    addition (``Op(...) + W``) into the op's output input at position
    *out_in*, setting beta (position *beta_in*) to 1.

    Only applies when the op's current beta is exactly 0 and W has the
    same broadcastable pattern as the current output input.
    """
    def wrapper(maker):
        @local_optimizer([GpuElemwise])
        @wraps(maker)
        def opt(node):
            # Match: elementwise addition with two inputs.
            if (isinstance(node.op, GpuElemwise) and
                    node.op.scalar_op == scal.add and
                    node.nin == 2):
                targ = find_node(node.inputs[0], cls)
                W = node.inputs[1]
                if targ is None:
                    targ = find_node(node.inputs[1], cls)
                    W = node.inputs[0]
                if targ is None:
                    return None
                if not is_equal(targ.inputs[beta_in], 0.0):
                    # other cases are too complex for now
                    return None
                if W.broadcastable != targ.inputs[out_in].broadcastable:
                    # May change later to do the broadcast, but it's
                    # under discussion.
                    return None
                inputs = list(targ.inputs)
                inputs[out_in] = W
                # beta = 1 so the existing output content is accumulated.
                inputs[beta_in] = _one.clone()
                return maker(targ, *inputs)
        return opt
    return wrapper
def pad_dims(input, leftdims, rightdims):
    """Reshape *input* to exactly ``leftdims + rightdims`` dimensions.

    Used to adapt pooling inputs with arbitrary non-pooling dimensions
    to the fixed dimensionality the GPU pooling ops expect: the last
    *rightdims* axes are kept as-is while the leading axes are either
    padded with singleton dimensions or collapsed into a single batch
    axis so that exactly *leftdims* leading axes remain.  Use
    `unpad_dims` to reshape back.

    Examples
    --------
    - (3, 5, 7) with (2, 2)      -> (3, 1, 5, 7)  (pad)
    - (3, 5, 7, 9) with (2, 2)   -> unchanged
    - (3, 5, 7, 9, 11) with (2, 2) -> (15, 7, 9, 11)  (collapse)
    - (3, 5, 7, 9) with (2, 3)   -> (3, 1, 5, 7, 9)  (pad)
    """
    assert input.ndim >= rightdims
    target_ndim = leftdims + rightdims
    if input.ndim == target_ndim:
        # Already the right rank, nothing to do.
        return input

    trailing_shape = input.shape[-rightdims:]
    leading_ndim = input.ndim - rightdims
    if leading_ndim < leftdims:
        # Too few leading axes: prepend singleton dimensions.
        ones = tensor.as_tensor([1] * (leftdims - leading_ndim))
        new_shape = tensor.join(0, ones,
                                input.shape[:leading_ndim],
                                trailing_shape)
    else:
        # Too many leading axes: merge the first ones into one batch axis.
        merged_ndim = leading_ndim - leftdims + 1
        # prod gives a scalar; pad to a length-1 vector for tensor.join.
        batch = tensor.shape_padright(tensor.prod(input.shape[:merged_ndim]), 1)
        new_shape = tensor.join(0, batch,
                                input.shape[merged_ndim:leading_ndim],
                                trailing_shape)

    new_shape = tensor.cast(new_shape, 'int64')
    return GpuReshape(target_ndim)(input, new_shape)
def unpad_dims(output, input, leftdims, rightdims):
    """Reshape *output* back to *input*'s original dimensionality.

    Inverse of `pad_dims`: keeps the last *rightdims* axes of *output*
    and restores the original leading axes from *input*.
    """
    if output.ndim == input.ndim:
        # pad_dims was a no-op, so is the inverse.
        return output
    restored_shape = tensor.join(0,
                                 input.shape[:-rightdims],
                                 output.shape[-rightdims:])
    return GpuReshape(input.ndim)(output, restored_shape)
from __future__ import absolute_import, print_function, division
import numpy
import theano.gof
from theano.compat import PY3
from theano.sandbox.cuda import CudaNdarrayType, GpuOp
from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb
from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
# Shortcut to the global Theano configuration object.
config = theano.config
class CURAND_Base(GpuOp):
    """
    Base class for a random number generator implemented in CURAND.

    The random number generator itself is an opaque reference managed by
    CURAND. This Op uses a generic-typed shared variable to point to a CObject
    that encapsulates this opaque reference.

    Each random variable is created with a generator of None.
    The actual random number generator is allocated from the seed, on the first
    call to allocate random numbers (see c_code).

    Parameters
    ----------
    output_type
        A theano type (e.g. tensor.fvector).
    seed: int
    destructive
        True or False (on the generator)

    Notes
    -----
    One caveat is that the random number state is simply not serializable.
    Consequently, attempts to serialize functions compiled with these
    random numbers will fail.
    """

    def __init__(self, output_type, seed, destructive):
        theano.gof.Op.__init__(self)
        self.destructive = destructive
        if self.destructive:
            # Declare the in-place update of input 0 (the generator).
            self.destroy_map = {0: [0]}
        self.seed = seed
        self.output_type = output_type
        # Only float32 output is supported by the generated code paths used.
        assert output_type.dtype == "float32"

    def as_destructive(self):
        """
        Return a destructive version of self.
        """
        return self.__class__(self.output_type, self.seed, destructive=True)

    def _config(self):
        """
        Return a tuple of attributes that define the Op.
        """
        return (self.destructive,
                self.output_type,
                self.seed,
                )

    def __eq__(self, other):
        return type(self) == type(other) and self._config() == other._config()

    def __hash__(self):
        return hash((type(self), self._config()))

    def __str__(self):
        return (self.__class__.__name__ + "{inplace=%s, out_dtype=%s}" %
                (self.destructive, self.output_type))

    def make_node(self, generator, size):
        # Outputs: the (possibly updated) generator and the sample tensor.
        return theano.gof.Apply(self, [generator, size],
                                [generator.type(), self.output_type()])

    @classmethod
    def new_auto_update(cls, generator, ndim, dtype, size, seed):
        """
        Return a symbolic sample from generator.

        cls dictates the random variable (e.g. uniform, normal).
        """
        v_size = theano.tensor.as_tensor_variable(size)
        if ndim is None:
            ndim = get_vector_length(v_size)
        self = cls(output_type=CudaNdarrayType((False,) * ndim),
                   seed=seed,
                   destructive=False)
        o_gen, sample = self(generator, cast(v_size, 'int32'))
        sample.generator = generator  # for user
        sample.update = (generator, o_gen)  # for CURAND_RandomStreams
        generator.default_update = o_gen  # for pfunc uses this attribute
        return sample

    def c_headers(self):
        return ["curand.h"]

    def c_libraries(self):
        return ['curand']

    def c_support_code(self):
        # Destructor for the CObject/capsule holding the CURAND generator.
        return """
        #if PY_MAJOR_VERSION >= 3
        void free_generator(PyObject *_gen)
        {
            curandGenerator_t * gen = (curandGenerator_t*)NpyCapsule_AsVoidPtr(_gen);
        #else
        void free_generator(void *_gen)
        {
            curandGenerator_t * gen = (curandGenerator_t*)_gen;
        #endif
            curandStatus_t err = curandDestroyGenerator(*gen);
            if (err != CURAND_STATUS_SUCCESS)
            {
                fprintf(stderr, "Failure (%i) in destroying CURAND generator.\\n",
                        (int)err);
            }
            free(gen);
        }
        """

    def c_code(self, node, nodename, inp, out, sub):
        i_generator, size = inp
        o_generator, o_sample = out
        destructive = int(self.destructive)
        ndim = self.output_type.ndim
        o_type_num = numpy.asarray(0, dtype=self.output_type.dtype).dtype.num
        fail = sub['fail']
        seed = self.seed
        # Subclasses supply the actual curandGenerate* call.
        call_string = self._curand_call_str(o_sample=o_sample)
        if self.output_type.dtype == 'float32':
            otype = 'float'
        else:
            otype = 'double'
        code = """
        //////// <code generated by CURAND_Base>
        int odims[%(ndim)s];
        int n_elements = 1;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check((PyObject*)%(o_sample)s)
                || (CudaNdarray_NDIM(%(o_sample)s) != %(ndim)s));

        if (PyArray_NDIM(%(size)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "size must be vector");
            %(fail)s
        }
        if (PyArray_DIMS(%(size)s)[0] != %(ndim)s)
        {
            PyErr_Format(PyExc_ValueError, "size must have length %%i (not %%i)",
                %(ndim)s, PyArray_DIMS(%(size)s)[0]);
            %(fail)s
        }
        if (PyArray_TYPE(%(size)s) != NPY_INT32)
        {
            PyErr_SetString(PyExc_ValueError, "size must be int32");
            %(fail)s
        }
        for (int i = 0; i < %(ndim)s; ++i)
        {
            odims[i] = ((npy_int32*)PyArray_GETPTR1(%(size)s, i))[0];
            n_elements *= odims[i];
            must_alloc_sample = (must_alloc_sample
                    || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
        }
        if (must_alloc_sample)
        {
            Py_XDECREF(%(o_sample)s);
            %(o_sample)s = (CudaNdarray*)CudaNdarray_NewDims(%(ndim)s, odims);
            if(!%(o_sample)s)
            {
                %(fail)s;
            }
        }
        if (!PyCObject_Check(%(i_generator)s))
        {
            // allocate a new generator for o_generator
            Py_XDECREF(%(o_generator)s);
            curandGenerator_t * gen = (curandGenerator_t*)malloc(sizeof(curandGenerator_t));
            assert(gen);
            if (CURAND_STATUS_SUCCESS !=
                    curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)) {
                PyErr_Format(PyExc_RuntimeError, "Failed to initialize curand generator");
                %(fail)s;
            }
            if (CURAND_STATUS_SUCCESS !=
                    curandSetPseudoRandomGeneratorSeed(*gen,%(seed)s))
            {
                PyErr_Format(PyExc_RuntimeError, "Failed to set curand generator seed");
                %(fail)s;
            }
            %(o_generator)s = PyCObject_FromVoidPtr(gen, &free_generator);
            assert (%(i_generator)s == Py_None);
        }
        else if (%(destructive)s)
        {
            // use i_generator for o_generator
            Py_XDECREF(%(o_generator)s);
            Py_INCREF(%(i_generator)s);
            %(o_generator)s = %(i_generator)s;
        }
        else
        {
            // copy i_generator for o_generator
            PyErr_Format(PyExc_NotImplementedError, "non-destructive CURAND generation");
            %(fail)s;
        }
        {
            curandGenerator_t * gen = (curandGenerator_t*)PyCObject_AsVoidPtr(%(o_generator)s);
            curandStatus_t err = %(call_string)s
            if (err != CURAND_STATUS_SUCCESS)
            {
                PyErr_Format(PyExc_RuntimeError, "curand error generating random normals %%i", (int)err);
                %(fail)s;
            }
            cudaThreadSynchronize();
        }
        //////// </ code generated by CURAND_Base>
        """ % locals()
        if PY3:
            # Python 3 has no PyCObject; the support code and this template
            # both switch to the NpyCapsule wrappers.
            code = code.replace("PyCObject", "NpyCapsule")
        return code

    def c_code_cache_version(self):
        return (5,)
class CURAND_Normal(CURAND_Base):
    """
    Op to draw normal numbers using CURAND.

    Samples are standard-normal (mean 0.0, stddev 1.0); scaling and
    shifting are done symbolically by CURAND_RandomStreams.normal().
    """

    def _curand_call_str(self, **kwargs):
        # Spliced into CURAND_Base.c_code as the generation call.
        return """curandGenerateNormal(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
                n_elements,
                0.0, 1.0);
        """ % kwargs
class CURAND_Uniform(CURAND_Base):
    """
    Op to draw uniform numbers using CURAND.

    Samples cover the unit interval; rescaling to [low, high) is done
    symbolically by CURAND_RandomStreams.uniform().
    """

    def _curand_call_str(self, **kwargs):
        # Spliced into CURAND_Base.c_code as the generation call.
        return """ curandGenerateUniform(*gen,
                CudaNdarray_DEV_DATA(%(o_sample)s),
                n_elements);
        """ % kwargs
class CURAND_RandomStreams(object):
    """
    RandomStreams instance that creates CURAND-based random variables.

    One caveat is that generators are not serializable.

    Parameters
    ----------
    seed : int
        Base seed; each random variable created gets its own distinct
        seed derived from it (see next_seed).
    """

    def __init__(self, seed):
        self._start_seed = seed
        self._cur_seed = seed
        self._has_lost_states = False  # True if self.state_updates incomplete
        self.state_updates = []

    def updates(self):
        """
        List of all (old, new) generator update pairs created by this
        instance.
        """
        return list(self.state_updates)

    def next_seed(self):
        """
        Return a unique seed for initializing a random variable.
        """
        self._cur_seed += 1
        return self._cur_seed - 1

    def __getstate__(self):
        rval = dict(self.__dict__)
        # the CObject used to store updates cannot be serialized;
        # record that states were lost so it is detectable after unpickling.
        rval['state_updates'] = []
        rval['_has_lost_states'] = True
        return rval

    @staticmethod
    def _check_size(size):
        # Shared validation for uniform()/normal(): size must be a tuple
        # of ints/Variables, or a 1-d Theano variable.
        msg = "size must be a tuple of int or a Theano variable"
        if isinstance(size, tuple):
            assert all([isinstance(i, int) or isinstance(i, Variable)
                        for i in size]), msg
        else:
            assert isinstance(size, Variable) and size.ndim == 1, msg

    def uniform(self, size, low=0.0, high=1.0, ndim=None,
                dtype=config.floatX):
        """
        Return symbolic tensor of uniform numbers in [low, high).

        Parameters
        ----------
        size
            A tuple of int/Theano variables, or a 1-d Theano variable
            giving the output shape.
        """
        self._check_size(size)
        generator = theano.shared(None)  # makes a generic shared variable
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
                                           self.next_seed())
        self.state_updates.append(u.update)
        rval = u * (high - low) + low
        if u.type.broadcastable != rval.type.broadcastable:
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of '
                'low and `high` arguments'
            )
        return rval

    def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
               dtype=config.floatX):
        """
        Return symbolic tensor of normally-distributed numbers
        (mean *avg*, standard deviation *std*).

        Parameters
        ----------
        size
            Can be a list of integer or Theano variable (ex: the shape
            of other Theano Variable)
        """
        self._check_size(size)
        generator = theano.shared(None)  # makes a generic shared variable
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
                                          self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
            # BUG FIX: the message used to name `low`/`high` (copied from
            # uniform) and was missing a space between the two string
            # literals; normal's scale/shift arguments are avg and std.
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of '
                '`avg` and `std` arguments'
            )
        return rval
@local_optimizer([CURAND_Base])
def local_destructive(node):
    """
    Graph rewrite: replace a non-destructive CURAND op with its
    in-place (destructive) counterpart.
    """
    op = node.op
    # op might be gpu version
    if not isinstance(op, CURAND_Base) or op.destructive:
        return False
    inplace_op = op.as_destructive()
    return inplace_op.make_node(*node.inputs).outputs
# Register the destructive-replacement rewrite in the global optimizer
# database at position 99 with the 'fast_run' and 'inplace' tags.
optdb.register('CURAND_destructive',
               opt.in2out(local_destructive, ignore_newtrees=True),
               99, 'fast_run', 'inplace')
ctheano.sandbox.cuda.type
CudaNdarray_unpickler
p1
(cnumpy.core.multiarray
_reconstruct
p2
(cnumpy
ndarray
p3
(I0
tS'b'
tRp4
(I1
(I1
tcnumpy
dtype
p5
(S'f4'
I0
I1
tRp6
(I3
S'<'
NNNI-1
I-1
I0
tbI00
S'\x00\x00(\xc2'
tbtR.
\ No newline at end of file
No preview for this file type
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论