Commit 023320fe authored by Ian Goodfellow

added Conv3D and related ops

parent e8c50c78
import theano
import theano.tensor as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
from util import strutil
from .Conv3D import Conv3D
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
class GpuConv3D(theano.Op):
""" GPU implementation of Conv3D """
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return '%s' % (self.__class__.__name__)
def make_node(self, V, W, b, d):
"""
:param V: Visible unit, input
:param W: Weights, filter
:param b: bias
:param d: strides when moving the filter over the input
"""
V_ = as_cuda_ndarray_variable(V)
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
fail = sub['fail']
H = outputs[0]
codeSource = """
///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n");
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ //extra scope so fail doesn't jump over declarations
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
%(fail)s
}
{ // extra scope so fail works
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
%(fail)s
}
{ // extra scope so fail works
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s
}
}
{ // extra scope so fail will not cross declarations
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
bool work_complete = false;
if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_rows_stack
dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
dim3 threads(outputDur);
int shared_size=0;
conv_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
ws3,ws2,ws1,ws4,ws0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D >
"""
return strutil.renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = out_dur
//grid block size =(out_len*out_wid, nb kern *nb batch)
//
conv_rows_stack( float* img, float* kern, float* bias, float* out,
int img_len, int img_wid, int img_dur,
int kern_height, int kern_wid, int kern_dur,
int nkern, int input_channels,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_ochannel, int img_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_okern)
{
int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
out_len = int( (img_len - kern_height) / dr )+1;
out_wid = int( (img_wid - kern_wid) / dc )+1;
out_dur = int( (img_dur - kern_dur) / dt )+1;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y - batch_id*nkern;
const int out_row = blockIdx.x%out_len;
const int out_col = blockIdx.x/out_len;
const int out_frame=threadIdx.x;
img += batch_id*img_stride_batch + out_row*dr*img_stride_row + out_col*dc*img_stride_col+out_frame*dt*img_stride_frame;
kern += kern_id*kern_stride_okern;
float sum = 0.0f;
for (int z = 0; z < input_channels; z++) {//1 for first layer
for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
}
}
}
out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
out_frame*nkern+//the output frame
out_row*out_wid*out_dur*nkern+//the output row
out_col*out_dur*nkern + //the output_col
kern_id //the output image (channel)
] = sum + bias[kern_id];
}
}
"""
return codeSource#renderString(codeSource,locals())
gpu_convd = GpuConv3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv3d(node):
    """Lift Conv3D to the GPU when some input already lives there.

    Applies only to float32 graphs.  Returns a replacement node list or
    None (implicitly) when the optimization does not apply.
    """
    if isinstance(node.op, Conv3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]
import theano
import theano.tensor as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
from util import strutil
from .ConvGrad3D import ConvGrad3D
class GpuConvGrad3D(theano.Op):
""" GPU version of gradient of ConvGrad3D with respect to W """
def make_node(self, V, d, WShape, dCdH):
"""
:param V: visible
:param d: strides
:param WShape: shapes of the weights -> shape of this op output
:param dCdH: other input with what V will be convolved.
"""
V_ = as_cuda_ndarray_variable(V)
d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH)
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])
def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs
print "GpuConvGrad3D python code (warning not updated to new format)"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d
dCdW = N.zeros(WShape, dtype=V.dtype)
#block
for j in xrange(0,WShape[0]):
for z in xrange(0,WShape[1]):
for k in xrange(0,WShape[2]):
for l in xrange(0,WShape[3]):
#threads
for m in xrange(0,WShape[4]):
#thread
for i in xrange(0,batchSize):
for p in xrange(0,outputHeight):
for q in xrange(0,outputWidth):
for r in xrange(0,outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
output_storage[0][0] = dCdW
def c_code(self, node, nodename, (V,d,WShape,dCdH), outputs, sub):
fail = sub['fail']
dCdW = outputs[0]
codeSource = """
///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n");
//Check dimensionality of inputs
if (%(dCdH)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: dCdH must be a 5-d CudaNdArray");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: V must be a 5-d CudaNdArray");
%(fail)s
}
if (%(WShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be a 1-d CudaNdArray");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: d must be a 1-d CudaNdArray");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: 3 stride lengths arguments expected(for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ // for fail
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
if (%(WShape)s->dimensions[0] != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must specify a 5-d shape");
%(fail)s
}
if (!PyArray_ISCONTIGUOUS(%(WShape)s))
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be contiguous");
%(fail)s
}
{ //for fail
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels = WShape[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (WShape[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
%(fail)s
}
{ //extra scope so fail works
const int filterHeight = WShape[1];
const int filterWidth = WShape[2];
const int filterDur = WShape[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
%(fail)s
}
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a width of %%i but V is only %%i pixels wide", filterWidth, vidWidth);
%(fail)s
}
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Compute correctl sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%i,%%i,%%i,%%i,%%i)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, CudaNdarray_HOST_DIMS(%(dCdH)s)[0], CudaNdarray_HOST_DIMS(%(dCdH)s)[1], CudaNdarray_HOST_DIMS(%(dCdH)s)[2] ,CudaNdarray_HOST_DIMS(%(dCdH)s)[3], CudaNdarray_HOST_DIMS(%(dCdH)s)[4] );
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConvGrad3D: Could not allocated dCdW");
%(fail)s
}
}
{ //for fail
const int dcdhs4 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[4];
const int dcdhs3 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[3];
const int dcdhs1 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[1];
const int dcdhs2 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[2];
const int dcdhs0 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[0];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
bool out_contiguous = CudaNdarray_is_c_contiguous(%(dCdW)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool work_complete = false;
if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_complete){
//conv_rows_stack
dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
dim3 threads(WShape[3]);
int shared_size=0;
convgrad_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(dCdH)s), CudaNdarray_DEV_DATA(%(dCdW)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
WShape[0], WShape[1], WShape[2], WShape[3], WShape[4],
outputHeight,outputWidth,outputDur,
batchSize, outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
dcdhs3,dcdhs2,dcdhs1,dcdhs4,dcdhs0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}} // extra scope for fail
///////////// < /code generated by GpuConvGrad3D >
"""
return strutls.render_string(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
//
convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
int img_len, int img_wid, int img_dur,
int dCdW_len, int dCdW_wid, int dCdW_dur,
int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
int out_len, int out_wid, int out_dur,
int batchSize, int nkern, int nstack,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
int dCdW_stride_stack, int dCdW_stride_nkern)
{
int __shared__ kern_id, stack_id;
float __shared__ *d_img, *d_kern;
kern_id= blockIdx.x%nkern;
stack_id = blockIdx.x/nkern;
const int dCdW_row = blockIdx.y%ws1;
const int dCdW_col = blockIdx.y/ws1;
const int dCdW_frame=threadIdx.x;
img +=stack_id*img_stride_stack;
dCdH +=kern_id*dCdW_stride_stack;
float sum = 0.0f;
for(int i=0;i<batchSize;i++){
for(int p=0;p<out_len;p++){
for(int q=0;q<out_wid;q++){
for(int r=0;r<out_dur;r++){
sum += dCdH[i*dCdW_stride_nkern+p*dCdW_stride_row+q*dCdW_stride_col+r*dCdW_stride_frame] *
img[i*img_stride_batch+(dr*p+dCdW_row)*img_stride_row+(dc*q+dCdW_col)*img_stride_col+(dt*r+dCdW_frame)*img_stride_frame];
}
}
}
}
dCdW[kern_id*wsh1*wsh2*wsh3*wsh4+//the good batch
stack_id+//the output image
dCdW_row*wsh2*wsh3*wsh4+//the output row
dCdW_col*wsh3*wsh4 + //the output_col
dCdW_frame*wsh4] = sum;
}
/*
#block
for j in xrange(0,WShape[0]):
for z in xrange(0,WShape[1]):
for k in xrange(0,WShape[2]):
for l in xrange(0,WShape[3]):
#threads
for m in xrange(0,WShape[4]):
#thread
for i in xrange(0,batchSize):
for p in xrange(0,outputHeight):
for q in xrange(0,outputWidth):
for r in xrange(0,outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
*/
"""
return codeSource#renderString(codeSource,locals())
gpu_conv_grad3d = GpuConvGrad3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv_gradd(node):
    """Lift ConvGrad3D to the GPU when some input already lives there.

    Applies only to float32 graphs.
    """
    if isinstance(node.op, ConvGrad3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d, WShape,
                    as_cuda_ndarray_variable(dCdH)))]
import numpy as N
import theano.tensor as T
from util import strutil
import theano
from .ConvTransp3D import ConvTransp3D
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
class GpuConvTransp3D(theano.Op):
""" The gpu version of ConvTransp3D """
def __eq__(self,other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, W, b, d, H, RShape = None):
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
H_ = as_cuda_ndarray_variable(H)
if RShape:
RShape_ = T.as_tensor_variable(RShape)
else:
RShape_ = T.as_tensor_variable([-1,-1,-1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes):
W,b,d,H,RShape = node.inputs
W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def perform_(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape)
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (W, b, d, H, RShape), outputs, sub):
fail = sub['fail']
R = outputs[0]
codeSource = """
///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n");
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
%(fail)s
}
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ // for fail
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
{ // for fail
const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
%(fail)s
}
{ // for fail
const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
%(fail)s
}
{ // for fail
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "RShape must be a vector");
%(fail)s
}
if (%(RShape)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
{ // for fail
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
}
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError,"Could not allocate R");
%(fail)s;
}
}
cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
{ // for fail
bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
bool work_complete = false;
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];
if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_transp_rows_stack
dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
dim3 threads(videoDur);
HERE
int shared_size=0;
conv_transp_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
videoHeight, videoWidth, videoDur,
filterHeight, filterWidth, filterDur,
outputHeight, outputWidth, outputDur,
outputChannels, inputChannels,
dr,dc,dt,
hs3,hs2,hs1,hs4,hs0,
ws3,ws2,ws1,ws4,ws0,
CudaNdarray_HOST_STRIDES(%(b)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
out_contiguous,b_strided,outputDur);
%(fail)s
}
}}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D >
"""
return renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = videoDur
//grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
//
conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
int img_len, int img_wid, int img_dur,
int kern_len, int kern_wid, int kern_dur,
int H_len, int H_wid, int H_dur,
int nkern, int nstack,
int dr, int dc, int dt,
int H_stride_frame, int H_stride_col, int H_stride_row,
int H_stride_stack, int H_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern,
int bias_stride)
{
int __shared__ batch_id, stack_id;
float __shared__ *d_img, *d_kern;
batch_id= blockIdx.x/nstack;
stack_id = blockIdx.x - batch_id*nstack;
const int R_row = blockIdx.y/img_wid;
const int R_col = blockIdx.y%img_wid;
const int R_frame=threadIdx.x;
const int r = R_row;
const int c = R_col;
const int t = R_frame;
const int ftc = max(0, int(ceil(float(t-kern_dur +1 )/float(dt))));
const int fcc = max(0, int(ceil(float(c-kern_wid +1)/float(dc))));
int rc = max(0, int(ceil(float(r-kern_len+1)/float(dr))));
float sum = 0;
while(rc < H_len){
int rk = r - rc * dr;
if(rk < 0)
break;
int cc = fcc;
while( cc < H_wid){
int ck = c - cc * dc;
if(ck < 0)
break;
int tc = ftc;
while(tc < H_dur){
int tk = t - tc * dt;
if(tk < 0)
break;
//R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
for(int q=0;q<nkern;q++){
sum += kern[q*kern_stride_nkern+stack_id*kern_stride_stack+rk*kern_stride_row+ck*kern_stride_col+tk*kern_stride_frame]*
H[batch_id*H_stride_batch+q*H_stride_stack+rc*H_stride_row+cc*H_stride_col+tc*H_stride_frame];
}
tc += 1;
}
cc += 1;
}
rc += 1;
}
R[batch_id*nstack*img_len*img_wid*img_dur+//the good batch
stack_id+//the output image
R_row*img_wid*img_dur*nstack+//the output row
R_col*img_dur*nstack + //the output_col
R_frame*nstack] = sum + bias[stack_id*bias_stride];
}
"""
return codeSource
gpu_conv_transpd = GpuConvTransp3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv_transpd(node):
    """Lift ConvTransp3D to the GPU when some input already lives there.

    Applies only to float32 graphs.
    """
    if isinstance(node.op, ConvTransp3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W,b,d,H,Rshape = None):
    """Pure-python/numpy reference implementation of ConvTransp3D.

    Multiplies H by the transpose of the implicit convolution matrix defined
    by W and the strides d, and adds the per-channel bias b.

    :param W: weights, shape (outputChannels, inputChannels, filterHeight,
        filterWidth, filterDur)
    :param b: bias, shape (inputChannels,)
    :param d: strides (dr, dc, dt), all positive ints
    :param H: Conv3D output, shape (batchSize, outputChannels, outputHeight,
        outputWidth, outputDur)
    :param Rshape: optional (height, width, duration) of the reconstruction.
        May be larger than the minimal size (automatic padding when the input
        size wasn't a multiple of d); a leading -1 means "use minimal size".
    :return: R, shape (batchSize, inputChannels, videoHeight, videoWidth,
        videoDur), dtype of H.
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal video size the given output could have come from.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    # Honor an explicitly requested (>= minimal) reconstruction shape.
    if Rshape is not None and Rshape[0] != -1:
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = N.zeros((batchSize, inputChannels, videoHeight,
                 videoWidth, videoDur), dtype=H.dtype)
    # Every output location starts from the per-channel bias.
    R += b.reshape(1, inputChannels, 1, 1, 1)

    # R[i,j,r,c,t] = b_j + sum_{rc,rk | d o rc + rk = r} sum_{cc,ck | ...}
    #                sum_{tc,tk | ...} sum_k W[k,j,rk,ck,tk] * H[i,k,rc,cc,tc]
    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                for c in range(videoWidth):
                    for t in range(videoDur):
                        # First hidden coordinate whose filter footprint can
                        # still reach (r, c, t).
                        ftc = max(0, int(N.ceil(float(t - filterDur + 1) / float(dt))))
                        fcc = max(0, int(N.ceil(float(c - filterWidth + 1) / float(dc))))
                        rc = max(0, int(N.ceil(float(r - filterHeight + 1) / float(dr))))
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    # Dot over the output-channel axis.
                                    R[i, j, r, c, t] += N.dot(W[:, j, rk, ck, tk],
                                                              H[i, :, rc, cc, tc])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from ops.Conv3D import *
from ops.ConvGrad3D import *
import theano
from theano.tensor import basic as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from util import strutil
from theano import printing
from theano.tensor.blas_headers import blas_header_text
from theano.tensor.blas import ldflags
#Note: not a true convolution because we don't bother with flipping the kernel
#An op that takes a weight tensor W. a bias vector b, and a visible tensor V, produces a hidden unit tensor H
#Also parameterized by integer strides dr,dc,dt
#H[i,r,c,t,j] = video i within the minibatch, feature map j, location and time within feature map (r,c,t)
#W[j,k,l,m,z] = weights connecting H[i,r,c,t,j] to V[i,dr*r+k,dc*c+l,dt*t+m,z]
#b[j] = bias of feature map j
#V[i,r,c,t,j] = pixel at (r,c,t) within video featuremap j of video i within the minibatch
#i.e., H[i,j,r,c,t] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i,z, dr*r+k,dc*c+l,dt*t+m]
#The layouts of these variables are chosen to improve locality of reference.
#numpy seems to put the largest stride on axis 0 and decrease the stride from there. If we do convolution
#one filter at a time, one example at a time, then we want the largest strides to
#be over the examples. We want the smallest stride to be over the input channel because as we change
#the channel we re-visit the same location in the input.
#The smallest stride being over the input channel means that the weights need to be formatted with the input
#channel as the last index
#partial C / partial b_j = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j )
# = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * delta(k = j)
# = sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] )
#partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) *
# (partial sum_s sum_u sum_v sum_a W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *
# (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * V[i,dr*p+k,dc*q+l,dt*r+m,z]
#derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since dr, dc, dt are natural numbers.
class Conv3D(theano.Op):
    """ 3D "convolution" of multiple filters on a minibatch (does not flip the kernel, moves kernel with a user specified stride) """

    def __eq__(self,other):
        # The op is stateless, so equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return "Conv3D"

    def make_node(self, V, W, b, d):
        """
        :param V: Visible unit, input
        :param W: Weights, filter
        :param b: bias, shape == (W.shape[0],)
        :param d: strides when moving the filter over the input
        """
        V_ = T.as_tensor_variable(V)
        W_ = T.as_tensor_variable(W)
        b_ = T.as_tensor_variable(b)
        d_ = T.as_tensor_variable(d)
        # NOTE(review): infer_shape/perform put the output channel on the
        # *last* axis, (batch, row, col, time, channel), but this
        # broadcastable pattern carries W's leading broadcastable flag on
        # axis 1 -- looks inconsistent; confirm before relying on output
        # broadcasting.
        node = theano.Apply(self, inputs=[V_, W_,b_,d_], outputs = [ T.TensorType(V_.dtype, (V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
        return node

    def grad(self,inputs, output_gradients):
        """Symbolic gradients wrt (V, W, b); d is not differentiable."""
        V,W,b,d = inputs
        dCdH ,= output_gradients
        # TODO: make all of these ops support broadcasting of scalar b to
        # vector b and replace the zeros_like in all their grads.
        dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0,0,0,0,:]), d, dCdH, V.shape[1:4] )
        WShape = W.shape
        dCdW = ConvGrad3D.convGrad3D(V,d,WShape,dCdH)
        # b is added once per output location, so its gradient sums over
        # every axis except the channel axis (the last one).
        dCdb = T.sum(dCdH, axis=(0,1,2,3))
        dCdd = None #not differentiable, since d is not continuous
        return [ dCdV, dCdW, dCdb, dCdd ]

    def perform(self, node, inputs, output_storage):
        # Python fallback; computeH (module level) is the reference
        # implementation.
        V, W, b, d = inputs
        print "Conv3D python code"
        output_storage[0][0] = computeH(V,W,b,d)

    def infer_shape(self, node, input_shapes):
        # Output layout: (batch, row, col, time, output channel).
        V,W,b,d = node.inputs
        V_shape, W_shape, b_shape, d_shape = input_shapes
        dr = d[0]
        dc = d[1]
        dt = d[2]
        batch_size = V_shape[0]
        output_channels = W_shape[0]
        vidHeight = V_shape[1]
        filterHeight = W_shape[1]
        vidWidth = V_shape[2]
        filterWidth = W_shape[2]
        vidDur = V_shape[3]
        filterDur = W_shape[3]
        # Number of valid filter placements along each spatial/temporal axis.
        output_height = T.floor( (vidHeight - filterHeight) / dr )+1
        output_width = T.floor( (vidWidth - filterWidth) / dc )+1
        output_dur = T.floor( (vidDur - filterDur) / dt ) +1
        return [(batch_size, output_height, output_width, output_dur, output_channels )]

    def c_support_code(self):
        # BLAS prototypes for the gemv call in the generated C code.
        return blas_header_text()

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        # Treat all warnings in the generated code as errors.
        flags = ldflags(libs=False, flags=True)
        flags.append('-Werror')
        return flags

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
        """Generate C code computing H = Conv3D(V, W, b, d).

        (Python 2 tuple-unpacking signature kept as-is; required by the
        Theano C-code interface of this era.)
        """
        fail = sub['fail']
        H = outputs[0]
        # The generated code: argument validation, output allocation, then
        # either a BLAS gemv special case (many channels) or a plain loop.
        codeSource = """
///////////// < code generated by Conv3D >
//printf("\t\t\t\tConv3D c code\\n");
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector.");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector.");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given",%(d)s->dimensions[0]);
%(fail)s
}
//Read and check sizes of inputs
{ // exta scope so error handler jumps don't cause errors
const int batchSize = %(V)s->dimensions[0];
const int outputChannels = %(W)s->dimensions[0];
const int inputChannels = %(V)s->dimensions[4];
if (%(W)s->dimensions[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels);
%(fail)s
}
if (%(b)s->dimensions[0] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels);
%(fail)s
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = %(W)s->dimensions[1];
const int filterWidth = %(W)s->dimensions[2];
const int filterDur = %(W)s->dimensions[3];
const int vidHeight = %(V)s->dimensions[1];
const int vidWidth = %(V)s->dimensions[2];
const int vidDur = %(V)s->dimensions[3];\
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
%(fail)s
}
{ // extra scope so fail works
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
%(fail)s
}
{ // extra scope so fail works
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const long long outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] ||
%(H)s->dimensions[1]!=dims[1] ||
%(H)s->dimensions[2]!=dims[2] ||
%(H)s->dimensions[3]!=dims[3] ||
%(H)s->dimensions[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output.");
%(fail)s
}
}
{ // extra scope so fail works
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
const int ws0 = %(W)s->strides[0];
const int ws1 = %(W)s->strides[1];
const int ws2 = %(W)s->strides[2];
const int vs1 = %(V)s->strides[1];
const int ws4 = %(W)s->strides[4];
const int vs4 = %(V)s->strides[4];
const int ws3 = %(W)s->strides[3];
const int vs3 = %(V)s->strides[3];
const int vs2 = %(V)s->strides[2];
const int bs  = %(b)s->strides[0];
const int hs4 = %(H)s->strides[4];
// Compute H
//H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dr*r+k,dc*c+l,dt*t+m]
//TODO: add special cases
// ex: filterDur == 1 && batchSize == 1 && dt = 1  (for SFA)
// ex: inputChannels == 1 """
        # If the data types are not mixed, we can insert special case
        # optimizations based on BLAS (a gemv per output pixel over the
        # channel axes).
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if VV.dtype == WV.dtype and HV.dtype == VV.dtype:
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
                raise Exception('Unrecognized dtype for convolution '+V.value.dtype)

            codeSource += """
if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
{
std::cout << "lots of channels special case code" << std::endl;
#define blas_type dtype_ ## %(V)s
const blas_type  constant_one = 1.0;
char N = 'T';
int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0));
int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4));
int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4));
//special case code for the "lots of channels" case
//uses a BLAS matrix vector multiply to compute the contribute for
//all channels of an input pixel to all channels of an output pixel
//simultaneously
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0;  r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
long long Hposj = Hpos;
for (int j = 0; j < outputChannels; j++) {
// H[i,r,c,t,j] = b[j]
ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos);
Hposj += hs4;
bPos += bs;
}
dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
//H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])
//note: changing the weights so that outputChannels and inputChannels were the last two rather than
//the first and last elements did not speed this up, even for extremely large input sizes
%(gemv)s(&N, & inputChannels, & outputChannels,
&constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e,
& ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one,
writePos,& hs4e);
Wpos += ws3;
Vpos += vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + %(W)s->strides[1];
Vpos = Vposk + %(V)s->strides[1];
} //close k
Hpos = Hpost + %(H)s->strides[3];
Vpos = Vpost +  vs3 * dt;
} //close t
Hpos = Hposc + %(H)s->strides[2];
Vpos = Vposc +  vs2 * dc;
} //close c
Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r
Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0];
} //closes i
} //closes "lots of channels" special case code
else
"""

        # General (non-BLAS) fallback loop; stands alone when dtypes were
        # mixed, otherwise binds to the ``else`` emitted above.
        codeSource += """
{
//General case code
//std::cout << "general case code" << std::endl;
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0;  r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
for (int j = 0; j < outputChannels; j++) {
long long Hposj = Hpos;
long long Vposj = Vpos;
int Wposj = Wpos;
// H[i,r,c,t,j] = b[j]
dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);
writePos = ELEM_AT(%(b)s,bPos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
int Wposm = Wpos;
long long Vposm = Vpos;
for (int z = 0; z < inputChannels; z++) {
//H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
Wpos += ws4;
Vpos += vs4;
} // close z
Wpos = Wposm + ws3;
Vpos = Vposm + vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + %(W)s->strides[1];
Vpos = Vposk + %(V)s->strides[1];
} //close k
bPos += bs;
Wpos = Wposj + ws0;
Hpos = Hposj +  hs4;
Vpos = Vposj;
//std::cout << "incremented Wpos by " << ws0 << std::endl;
//std::cout << "incremented Hpos by " << hs4 << std::endl;
} //close j
Hpos = Hpost + %(H)s->strides[3];
Vpos = Vpost +  vs3 * dt;
} //close t
Hpos = Hposc + %(H)s->strides[2];
Vpos = Vposc +  vs2 * dc;
} //close c
Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r
Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0];
} //closes i
} //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by Conv3D >
"""
        # Substitute %(V)s, %(fail)s, %(gemv)s, ... from the local namespace.
        return strutil.renderString(codeSource,locals())
# Module-level singleton instance of the op (Conv3D is stateless, so one
# shared instance suffices).  The previous ``global conv3D`` statement was a
# no-op at module scope and has been removed.
conv3D = Conv3D()
def computeH(V,W,b,d):
    """Pure-python/numpy reference implementation of the Conv3D op.

    H[i,x,y,t,j] = b[j] + sum_k sum_l sum_m sum_z
                   W[j,k,l,m,z] * V[i, dx*x+k, dy*y+l, dt*t+m, z]

    :param V: input videos, shape (batchSize, vidHeight, vidWidth, vidDur,
        inputChannels)
    :param W: filters, shape (outputChannels, filterHeight, filterWidth,
        filterDur, inputChannels)
    :param b: bias, shape (outputChannels,)
    :param d: strides (drow, dcol, dtime), all positive ints
    :return: H, shape (batchSize, outputHeight, outputWidth, outputDur,
        outputChannels), dtype of V.
    """
    assert len(W.shape) == 5
    assert len(V.shape) == 5
    assert len(b.shape) == 1, b.shape
    assert len(d) == 3

    batchSize = V.shape[0]
    outputChannels = W.shape[0]
    inputChannels = V.shape[4]
    if W.shape[4] != inputChannels:
        raise Exception("W.shape[4] = "+str(W.shape[4])+" but inputChannels = "+str(inputChannels))

    filterHeight = W.shape[1]
    filterWidth = W.shape[2]
    filterDur = W.shape[3]
    vidHeight = V.shape[1]
    vidWidth = V.shape[2]
    vidDur = V.shape[3]
    assert vidHeight >= filterHeight
    assert vidWidth >= filterWidth
    assert vidDur >= filterDur

    dx,dy,dt = d
    assert dx > 0
    assert dy > 0
    assert dt > 0

    # Number of valid filter placements along each axis.
    outputHeight = int( (vidHeight - filterHeight) / dx ) + 1
    outputWidth = int( (vidWidth - filterWidth) / dy ) + 1
    outputDur = int( (vidDur - filterDur) / dt ) + 1

    H = N.zeros( (batchSize, outputHeight,
                  outputWidth, outputDur, outputChannels), dtype=V.dtype )

    for i in range(batchSize):
        for j in range(outputChannels):
            for x in range(outputHeight):
                for y in range(outputWidth):
                    for t in range(outputDur):
                        # W[j] and the V patch both have shape
                        # (filterHeight, filterWidth, filterDur,
                        #  inputChannels); the elementwise product-sum
                        # replaces the former 4-deep python accumulation
                        # loop with identical math.
                        patch = V[i,
                                  dx*x:dx*x+filterHeight,
                                  dy*y:dy*y+filterWidth,
                                  dt*t:dt*t+filterDur, :]
                        H[i,x,y,t,j] = b[j] + N.sum(W[j] * patch)
    return H
from . import ConvGrad3D
from . import ConvTransp3D
if cuda_available:
from . import GpuConv3D
import theano
from theano.tensor import basic as T
from theano.sandbox.cuda import cuda_available, cuda_enabled
from util import strutil
import numpy as N
class ConvGrad3D(theano.Op):
    """ Gradient of Conv3D with respect to W """

    def __eq__(self,other):
        # Stateless op: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, V, d, WShape, dCdH):
        """
        :param V: the Conv3D input videos
        :param d: strides (dr, dc, dt)
        :param WShape: shape of the weight tensor whose gradient is computed
        :param dCdH: gradient of the cost wrt the Conv3D output
        """
        V_ = T.as_tensor_variable(V)
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = T.as_tensor_variable(dCdH)
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], outputs = [ T.TensorType(V_.dtype, (False,False,False,False,False))() ] )

    def infer_shape(self, node, input_shapes):
        # Output has exactly the shape requested by the WShape input vector.
        V,d,W_shape, dCdH = node.inputs
        return [ ( W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4] ) ]

    def grad(self,inputs, output_gradients):
        """Gradients wrt (V, d, WShape, dCdH); d and WShape are not
        differentiable."""
        C,d, WShape, B = inputs
        dLdA ,= output_gradients
        z = T.zeros_like(C[0,0,0,0,:])
        dLdC = convTransp3D( dLdA, z, d, B, C.shape[1:4])
        dLdd = None #not differentiable, since d is not continuous
        dLdWShape = None #not differentiable, since d is not continuous
        dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d)
        return [ dLdC, dLdd, dLdWShape, dLdB ]

    def perform(self, node, inputs, output_storage):
        # Python reference implementation of the W gradient.
        V, d, WShape, dCdH = inputs
        print "ConvGradW3D python code"
        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
        outputFilters = dCdH.shape[4]
        outputHeight = dCdH.shape[1]
        outputWidth = dCdH.shape[2]
        outputDur = dCdH.shape[3]
        assert V.shape[0] == batchSize
        inputFilters = V.shape[4]
        inputHeight = V.shape[1]
        inputWidth = V.shape[2]
        inputDur = V.shape[3]
        dr, dc, dt = d
        dCdW = N.zeros(WShape, dtype=V.dtype)
        # dCdW[j,k,l,m,z] accumulates over the batch and all output positions.
        for k in xrange(0,WShape[1]):
            for l in xrange(0,WShape[2]):
                for m in xrange(0,WShape[3]):
                    for i in xrange(0,batchSize):
                        for p in xrange(0,outputHeight):
                            for q in xrange(0,outputWidth):
                                for r in xrange(0,outputDur):
                                    for j in xrange(0,WShape[0]):
                                        for z in xrange(0,WShape[4]):
                                            dCdW[j,k,l,m,z] += dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z]
        output_storage[0][0] = dCdW

    def c_compile_args(self):
        # Treat all warnings in the generated code as errors.
        flags = ['-Werror']
        return flags

    def c_code(self, node, nodename, (V,d,WShape,dCdH), outputs, sub):
        """Generate C code computing dCdW.

        (Python 2 tuple-unpacking signature kept as-is; required by the
        Theano C-code interface of this era.)
        """
        fail = sub['fail']
        dCdW = outputs[0]
        # Generated code: argument validation, output allocation, then a
        # straightforward 9-deep accumulation loop.
        codeSource = """
///////////// < code generated by ConvGradW3D >
//printf("\t\t\t\tConvGradW3D c code\\n");
//Check dimensionality of inputs
if (%(dCdH)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (%(WShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector.");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector.");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given",%(d)s->dimensions[0]);
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
//Read and check sizes of inputs
const int batchSize = %(V)s->dimensions[0];
if (%(WShape)s->dimensions[0] != 5)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape");
%(fail)s
}
if (!PyArray_ISCONTIGUOUS(%(WShape)s))
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels =  WShape[0];
const int inputChannels = %(V)s->dimensions[4];
if (WShape[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
%(fail)s
}
{ //extra scope so fail works
const int filterHeight = WShape[1];
const int filterWidth = WShape[2];
const int filterDur = WShape[3];
const int vidHeight = %(V)s->dimensions[1];
const int vidWidth = %(V)s->dimensions[2];
const int vidDur = %(V)s->dimensions[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
%(fail)s
}
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels tall",filterWidth,vidWidth);
%(fail)s
}
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Compute correct sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (%(dCdH)s->dimensions[0] != batchSize ||
%(dCdH)s->dimensions[4] != outputChannels ||
%(dCdH)s->dimensions[1] != outputHeight ||
%(dCdH)s->dimensions[2] != outputWidth ||
%(dCdH)s->dimensions[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, %(dCdH)s->dimensions[0], %(dCdH)s->dimensions[1], %(dCdH)s->dimensions[2] ,%(dCdH)s->dimensions[3], %(dCdH)s->dimensions[4] );
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s)  || %(dCdW)s->dimensions[0]!=dims[0] ||
%(dCdW)s->dimensions[1]!=dims[1] ||
%(dCdW)s->dimensions[2]!=dims[2] ||
%(dCdW)s->dimensions[3]!=dims[3] ||
%(dCdW)s->dimensions[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
%(fail)s
}
}
{ //extra scope so fail works
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
const int dhs3 = %(dCdH)s->strides[3];
const int dtvs3 = dt * %(V)s->strides[3];
// Compute dCdW
//TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
// dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j]  *  V[i,dr*p+k,dc*q+l,dt*r+m,z]
for (int j = 0; j < outputChannels; j++) {
for (int z = 0; z < inputChannels; z++) {
for (int k = 0; k < filterHeight; k++) {
for (int l = 0; l < filterWidth; l++) {
for (int m = 0; m < filterDur; m++) {
//printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);
dtype_%(dCdW)s & writePos =  ELEM5(%(dCdW)s, j,k,l,m,z);
writePos = 0;
for (int i = 0; i < batchSize; i++) {
for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) {
int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ;
int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] +  (dr * p+k) * %(V)s->strides[1] +  (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3];
for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3;
Vpos += dtvs3;
}
}
}
}
}
}
}
}
}
}}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D >
"""
        # Substitute %(V)s, %(fail)s, ... from the local namespace.
        return strutil.renderString(codeSource,locals())
# Module-level singleton used by Conv3D.grad and ConvTransp3D.grad.
convGrad3D = ConvGrad3D()
from Conv3D import conv3D
from ConvTransp3D import convTransp3D
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
from . import GpuConvGrad3D
import numpy as N
from theano.tensor import basic as T
from util import strutil
import theano
from theano.sandbox.cuda import cuda_available, cuda_enabled
class ConvTransp3D(theano.Op):
""" "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """
def __eq__(self,other):
    """Stateless op: two instances are equal iff they have the same type."""
    return type(other) == type(self)
def __hash__(self):
    """Hash consistently with __eq__: all instances of this op hash alike."""
    return hash(self.__class__)
def make_node(self, W, b, d, H, RShape = None):
    """
    :param W: Weights, filter
    :param b: bias, shape == (W.shape[0],)
    :param d: strides when moving the filter over the input
    :param H: The output of Conv3D
    :param RShape: optional (height, width, duration) for the reconstruction;
        when omitted a (-1, -1, -1) sentinel means "use the minimal size".
    """
    op_inputs = [T.as_tensor_variable(x) for x in (W, b, d, H)]
    if RShape:
        op_inputs.append(T.as_tensor_variable(RShape))
    else:
        # Sentinel understood by perform/c_code: infer the minimal video size.
        op_inputs.append(T.as_tensor_variable([-1,-1,-1]))
    out_type = T.TensorType(op_inputs[3].dtype, (False,False,False,False,False))
    return theano.Apply(self, inputs=op_inputs, outputs=[out_type()])
def c_compile_args(self):
    """Compile the generated C code with all warnings treated as errors."""
    return ['-Werror']
def infer_shape(self, node, input_shapes):
    """Output layout: (batch, input channels, height, width, duration).

    NOTE(review): the spatial sizes are read straight out of the RShape
    input variable; when RShape is the (-1,-1,-1) "minimal size" sentinel
    the inferred shape entries will be -1, which is wrong -- confirm callers
    always pass an explicit RShape when shape inference matters.
    """
    W,b,d,H,RShape = node.inputs
    W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
    return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def grad(self,inputs, output_gradients):
    """Gradients wrt (W, b, d, H, RShape); d and RShape are not
    differentiable.

    Bug fix: the last returned element used to be the *input* variable
    RShape itself rather than its gradient; the (correctly computed but
    previously unused) dCdRShape is returned instead.
    """
    W,b,d,H, RShape = inputs
    dCdR ,= output_gradients
    dCdH = conv3D( dCdR, W, T.zeros_like(H[0,0,0,0,:]), d)
    WShape = W.shape
    dCdW = convGrad3D(dCdR,d,WShape,H)
    # b is added at every reconstructed location, so sum over all axes
    # except the channel axis (the last one).
    dCdb = T.sum(dCdR,axis=(0,1,2,3))
    dCdd = None #not differentiable, since d is not continuous
    dCdRShape = None #not differentiable, since RShape is not continuous
    return [ dCdW, dCdb, dCdd, dCdH, dCdRShape ]
def perform(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tConvTransp3D python code"
output_storage[0][0] = computeR(W,b,d,H,RShape)
def c_code(self, node, nodename, (W, b, d, H, RShape), outputs, sub):
fail = sub['fail']
R = outputs[0]
codeSource = """
///////////// < code generated by ConvTransp3D >
//printf("\t\t\t\tConvTransp3D c code\\n");
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
%(fail)s
}
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0] );
%(fail)s
}
{ // for fail 1
int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
{ // for fail 2
const int batchSize = %(H)s->dimensions[0];
const int outputChannels = %(W)s->dimensions[0];
if (%(H)s->dimensions[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]);
%(fail)s
}
{ // for fail 3
const int inputChannels = %(W)s->dimensions[4];
if (%(b)s->dimensions[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", %(b)s->dimensions[0], inputChannels );
%(fail)s
}
{ // for fail 4
const int filterHeight = %(W)s->dimensions[1];
const int filterWidth = %(W)s->dimensions[2];
const int filterDur = %(W)s->dimensions[3];
const int outputHeight = %(H)s->dimensions[1];
const int outputWidth = %(H)s->dimensions[2];
const int outputDur = %(H)s->dimensions[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
%(fail)s
}
if (%(RShape)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2);
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
} //closes if RShape
{ // for fail 5
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] ||
%(R)s->dimensions[1]!=dims[1] ||
%(R)s->dimensions[2]!=dims[2] ||
%(R)s->dimensions[3]!=dims[3] ||
%(R)s->dimensions[4]!=dims[4])
{
Py_XDECREF(%(R)s);
%(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
%(fail)s
}
}
for (int i = 0; i < 3; i++)
if (%(R)s->strides[i] < %(R)s->strides[4])
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
%(fail)s
}
{ // for fail 6
memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
(videoHeight-1) * %(R)s->strides[1] +
(videoWidth-1) * %(R)s->strides[2] +
(videoDur-1) * %(R)s->strides[3]);
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data;
int rs4 = %(R)s->strides[4];
int ws0 = %(W)s->strides[0];
int ws4 = %(W)s->strides[4];
int hs4 = %(H)s->strides[4];
// Compute R
// R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
for (int i = 0; i < batchSize; i++) {
for (int r = 0; r < videoHeight; r++) {
const int frc = std::max(0.0, ceil(float(r-filterHeight+1)/float(dr)));
for (int c = 0; c < videoWidth; c++) {
const int fcc = std::max(0.0, ceil(float(c-filterWidth +1)/float(dc)));
for (int t = 0; t < videoDur; t++) {
const int ftc = std::max(0.0, ceil(float(t-filterDur +1) /float(dt)));
long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3];
long long Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
//ELEM5(%(R)s, i,r,c,t,j) = b[j];
ELEM_AT(%(R)s,Rpos) = b[j];
Rpos += rs4;
}
for (int rc = frc; rc < outputHeight; rc++) {
const int rk = r - rc * dr;
if (rk < 0) break;
for (int cc = fcc; cc < outputWidth; cc++) {
const int ck = c - cc * dc;
if (ck < 0) break;
for (int tc = ftc; tc < outputDur; tc++)
{
const int tk = t - tc * dt;
if (tk < 0) break;
int Wpos = rk * %(W)s->strides[1] + ck * %(W)s->strides[2] + tk * %(W)s->strides[3];
int Hpostc = i * %(H)s->strides[0] + rc * %(H)s->strides[1] + cc * %(H)s->strides[2] + tc * %(H)s->strides[3];
Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
int Wposj = Wpos;
dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
int Hpos = Hpostc;
for (int k = 0; k < outputChannels; k++) {
//TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
//writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
//writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
Wpos += ws0;
Hpos += hs4;
} //close the k loop
Rpos += rs4;
Wpos = Wposj + ws4;
} //close the j loop
} // close the tc loop
} //cc
} //rc
} //t
} //c
} //r
} //i
} //for fail 6
} //for fail 5
} //for fail 4
} //for fail 3
} //for fail 2
} // for fail 1
///////////// < /code generated by ConvTransp3D >
"""
return strutil.renderString(codeSource,locals())
# Module-level singleton instance of the ConvTransp3D op, for convenient reuse.
# NOTE(review): the ConvTransp3D class is presumably defined earlier in this
# file (above this chunk) -- confirm against the full source.
convTransp3D = ConvTransp3D()
# If the input size wasn't a multiple of D we may need to cause some automatic
# padding to get the right size of reconstruction (see the Rshape parameter).
def computeR(W, b, d, H, Rshape=None):
    """Pure-numpy reference implementation of the transpose 3D convolution.

    Reconstructs the "video" R from the forward-convolution output H:

        R[i,r,c,t,j] = b[j]
                       + sum over all placements (rc,cc,tc) and filter
                         offsets (rk,ck,tk) with
                         (rc*dr+rk, cc*dc+ck, tc*dt+tk) == (r,c,t)
                         of  dot(W[:, rk, ck, tk, j], H[i, rc, cc, tc, :])

    Parameters
    ----------
    W : 5d array, shape (outputChannels, filterHeight, filterWidth, filterDur, inputChannels)
        Filter bank of the forward convolution.
    b : 1d array, length inputChannels
        Bias added to every reconstructed voxel.
    d : length-3 sequence (dr, dc, dt) of positive ints
        Strides of the forward convolution (rows, columns, time).
    H : 5d array, shape (batchSize, outputHeight, outputWidth, outputDur, outputChannels)
        Forward-convolution output ("hidden" units).
    Rshape : optional length-3 sequence (height, width, duration)
        Requested physical shape of the reconstruction; it may only pad
        (enlarge) the shape implied by H, W and d, never crop it.
        A first entry of -1 means "use the implied shape".

    Returns
    -------
    R : 5d array, shape (batchSize, videoHeight, videoWidth, videoDur, inputChannels)
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, filterHeight, filterWidth, filterDur, inputChannels = W.shape
    batchSize, outputHeight, outputWidth, outputDur, outputChannelsAgain = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Physical reconstruction size implied by the output size, strides and
    # filter size.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    if Rshape is not None and Rshape[0] != -1:
        # An explicit shape may only pad the implied one.
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth, (Rshape[1], videoWidth)
        assert Rshape[2] >= videoDur, (Rshape[2], videoDur)
        videoHeight, videoWidth, videoDur = Rshape

    R = N.zeros((batchSize, videoHeight, videoWidth, videoDur, inputChannels),
                dtype=H.dtype)

    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                # First row placement whose filter footprint can reach row r
                # (hoisted: depends only on r, not on c or t).
                frc = max(0, int(N.ceil(float(r - filterHeight + 1) / float(dr))))
                for c in range(videoWidth):
                    fcc = max(0, int(N.ceil(float(c - filterWidth + 1) / float(dc))))
                    for t in range(videoDur):
                        ftc = max(0, int(N.ceil(float(t - filterDur + 1) / float(dt))))
                        R[i, r, c, t, j] = b[j]
                        rc = frc
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                # Placement has moved past row r.
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    R[i, r, c, t, j] += N.dot(W[:, rk, ck, tk, j],
                                                              H[i, rc, cc, tc, :])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from Conv3D import conv3D
from ConvGrad3D import convGrad3D
if cuda_available:
import GpuConvTransp3D
from nnet import *
from Conv3D import *
from ConvGrad3D import *
from ConvTransp3D import *
from sigm import softplus, sigmoid, sigmoid_inplace, scalar_sigmoid
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论