Commit 239b6d80, authored by Frédéric Bastien

Merge pull request #2117 from abergeron/dnn

Add support for subsampling and gradient to GpuDnnConv
......@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.h
global-include *.sh
global-include *.pkl
recursive-include docs
......
......@@ -193,7 +193,7 @@ def do_setup():
install_requires=['numpy>=1.5.0', 'scipy>=0.7.2'],
package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'ChangeLog'],
'*.h', 'ChangeLog'],
'theano.misc': ['*.sh']
},
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
import numpy
import theano
from theano.gof.type import *
from theano import Op, Apply
from theano.tensor import TensorType
from theano.gof.type import CDataType
# todo: test generic
class ProdOp(Op):
    """Test helper op: wrap its tensor input into an opaque CDataType pointer.

    The output CDataType is declared with ``py_decref`` as its free
    function, so the wrapped PyObject reference is released when the
    capsule/CObject is destroyed.
    """
    __props__ = ()

    def make_node(self, i):
        # Output is a 'void *' whose destructor decrefs the wrapped object.
        return Apply(self, [i], [CDataType('void *', 'py_decref')()])

    def c_support_code(self):
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Steal a new reference on the input and store it as a raw pointer.
        template = """
Py_XDECREF(%(out)s);
%(out)s = (void *)%(inp)s;
Py_INCREF(%(inp)s);
"""
        return template % {'out': outs[0], 'inp': inps[0]}

    def c_code_cache_version(self):
        return (0,)
class GetOp(Op):
    """Test helper op: unwrap a CDataType pointer back into a float32 vector.

    Inverse of ProdOp: it reinterprets the stored ``void *`` as a
    PyArrayObject and returns it as a 1-D float32 tensor.
    """
    __props__ = ()

    def make_node(self, c):
        return Apply(self, [c], [TensorType('float32', (False,))()])

    def c_support_code(self):
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Cast the opaque pointer back to an ndarray and own a reference.
        template = """
Py_XDECREF(%(out)s);
%(out)s = (PyArrayObject *)%(inp)s;
Py_INCREF(%(out)s);
"""
        return template % {'out': outs[0], 'inp': inps[0]}

    def c_code_cache_version(self):
        return (0,)
def test_cdata():
    """Round-trip a float32 vector through ProdOp and GetOp.

    ProdOp wraps the tensor into an opaque CDataType pointer and GetOp
    unwraps it, so the composed function must be the identity.
    """
    inp = TensorType('float32', (False,))()
    wrapped = ProdOp()(inp)
    unwrapped = GetOp()(wrapped)

    # This should be a passthrough function for vectors
    fn = theano.function([inp], unwrapped)

    data = numpy.random.randn(9).astype('float32')
    result = fn(data)
    assert (result == data).all()
......@@ -2,6 +2,8 @@
__docformat__ = "restructuredtext en"
from theano.compat import PY3
from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph
......@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject):
c_extract_code=self.c_extract(name, sub, check_input))
def c_cleanup(self, name, sub):
"""Optional: Return c code to clean up after `c_extract`.
"""Return c code to clean up after `c_extract`.
This returns C code that should deallocate whatever `c_extract`
allocated or decrease the reference counts. Do not decrease
......@@ -250,7 +252,7 @@ class PureType(object):
# If filter_inplace is defined, it will be called instead of
# filter() This is to allow reusing the old allocated memory. As
# of this writing this is used only when we transfer new data to a
# shared variable on the gpu.
# shared variable on the gpu.
#def filter_inplace(value, storage, strict=False, allow_downcast=None)
......@@ -470,3 +472,97 @@ class Generic(SingletonType):
return self.__class__.__name__
generic = Generic()
class CDataType(Type):
    """
    Represents opaque C data to be passed around. The intent is to
    ease passing arbitrary data between ops C code.

    At the Python level the only valid value is None; the real payload
    lives in a PyCapsule (Python 3) or PyCObject (Python 2) and is only
    touched from generated C code.
    """
    def __init__(self, ctype, freefunc=None):
        """
        Build a type made to represent a C pointer in theano.

        :param ctype: The type of the pointer (complete with the `*`)

        :param freefunc: a function to call to free the pointer. This
                         function must have a `void` return and take a
                         single pointer argument.
        """
        assert isinstance(ctype, basestring)
        self.ctype = ctype
        if freefunc is not None:
            assert isinstance(freefunc, basestring)
        # Always set the attribute (even when None) so __eq__/__hash__
        # never raise AttributeError.
        self.freefunc = freefunc

    def __eq__(self, other):
        # BUG FIX: the original used a comma instead of `and` after the
        # ctype comparison, which made this return a 2-tuple (always
        # truthy), so instances with different freefuncs compared equal.
        return (type(self) == type(other) and
                self.ctype == other.ctype and
                self.freefunc == other.freefunc)

    def __hash__(self):
        return hash((type(self), self.ctype, self.freefunc))

    def filter(self, data, strict=False, allow_downcast=None):
        # Only None is a legal Python-side value for this type.
        if data is not None:
            raise TypeError("only None is valid")

    def is_valid_value(self, a):
        return a is None

    def c_declare(self, name, sub, check_input=True):
        return """
        %(ctype)s %(name)s;
        """ % dict(ctype=self.ctype, name=name)

    def c_init(self, name, sub):
        return "%(name)s = NULL;" % dict(name=name)

    def c_extract(self, name, sub, check_input=True):
        # PyCapsule on Python 3, PyCObject on Python 2.
        if PY3:
            s = """
        %(name)s = (%(ctype)s)PyCapsule_GetPointer(py_%(name)s, NULL);
        if (%(name)s == NULL) %(fail)s
        """
        else:
            s = """
        %(name)s = (%(ctype)s)PyCObject_AsVoidPtr(py_%(name)s);
        """
        return s % dict(name=name, ctype=self.ctype, fail=sub['fail'])

    def c_sync(self, name, sub):
        freefunc = self.freefunc
        if freefunc is None:
            freefunc = "NULL"
        s = """
        Py_XDECREF(py_%(name)s);
        if (%(name)s == NULL) {
          py_%(name)s = Py_None;
          Py_INCREF(py_%(name)s);
        } else """
        if PY3:
            s += """{
          py_%(name)s = PyCapsule_New((void *)%(name)s, NULL,
                                      (void (*)(void *))%(freefunc)s);
        }"""
        else:
            s += """{
          py_%(name)s = PyCObject_FromVoidPtr((void *)%(name)s,
                                              (void (*)(void *))%(freefunc)s);
        }"""
        if self.freefunc is not None:
            # If the capsule could not be created, free the pointer
            # ourselves so it does not leak.
            s += """
        if (py_%(name)s == NULL) { %(freefunc)s(%(name)s); }
        """
        return s % dict(name=name, freefunc=freefunc)

    def c_cleanup(self, name, sub):
        # No need to do anything here since the CObject/Capsule will
        # free the data for us when released.
        return ""

    def c_code_cache_version(self):
        return (1,)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.ctype)
......@@ -2,32 +2,53 @@ import copy
import os
import theano
from theano import Apply
from theano import tensor
from theano import Apply, tensor
from theano.gof.type import CDataType
from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3
class GpuDnnConv(GpuOp):
__props__ = ('border_mode',)
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def __init__(self, border_mode):
self.border_mode = border_mode
class DnnBase(GpuOp):
"""
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def make_node(self, img, kern):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
def c_header_dirs(self):
return [os.path.dirname(__file__)]
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
def c_libraries(self):
return ['cudnn']
def c_support_code(self):
return """
cudnnHandle_t _handle = NULL;
"""
def c_init_code(self):
if PY3:
error_out = "NULL"
else:
error_out = ""
return ["""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err));
return %s;
}
}""" % (error_out,)]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
class GpuDnnConvDesc(GpuOp):
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
......@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp):
def c_libraries(self):
return ['cudnn']
def c_compiler(self):
return NVCC_compiler
def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
assert border_mode in ('valid', 'full')
self.border_mode = border_mode
assert len(subsample) == 2
self.subsample = subsample
assert conv_mode in ('conv', 'cross')
self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 and img_shape.type.dtype != numpy.int64:
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 and kern_shape.type.dtype != numpy.int64:
raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape],
[CDataType("cudnnConvolutionDescriptor_t")()])
def c_code(self, node, name, inputs, outputs, sub):
img_shape, kern_shape = inputs
desc, = outputs
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
if self.conv_mode == 'conv':
conv_flag = 'CUDNN_CONVOLUTION'
else:
conv_flag = 'CUDNN_CROSS_CORRELATION'
return """
{
cudnnStatus_t err;
int pad_h%(name)s;
int pad_w%(name)s;
if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err = cudnnSetConvolutionDescriptorEx(
%(desc)s,
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
*(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
pad_h%(name)s,
pad_w%(name)s,
%(subsx)d, %(subsy)d, 1, 1,
%(conv_flag)s
);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err));
%(fail)s
}
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
subsx=self.subsample[0], subsy=self.subsample[1])
def c_code_cache_version(self):
return (1,)
class GpuDnnConvBase(DnnBase):
__props__ = ()
def make_node(self, img, kern, desc):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
return Apply(self, [img, kern, desc],
[CudaNdarrayType(broadcastable)()])
def c_support_code_struct(self, node, struct_id):
return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id)
def c_init_code_struct(self, node, struct_id, sub):
return """
handle%(id)d = NULL;
cudnnStatus_t err%(id)d;
input%(id)d = NULL;
output%(id)d = NULL;
kerns%(id)d = NULL;
op%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d));
......@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail'])
def c_cleanup_code_struct(self, node, struct_id):
......@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU
cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub):
img, kern = inputs
out, = outputs
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
def c_set_tensor4d(self, var, desc, err, fail):
return """
cudnnStatus_t err%(name)s;
int pad_w%(name)s;
int pad_h%(name)s;
if (!CudaNdarray_is_c_contiguous(%(img)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%(fail)s
}
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
%(err)s = cudnnSetTensor4dDescriptorEx(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s));
cudnnGetErrorString(%(err)s));
%(fail)s
}
err%(name)s = cudnnSetFilterDescriptor(
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0],
CudaNdarray_HOST_DIMS(%(kerns)s)[1],
CudaNdarray_HOST_DIMS(%(kerns)s)[2],
CudaNdarray_HOST_DIMS(%(kerns)s)[3]
""" % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_filter(self, var, desc, err, fail):
return """
%(err)s = cudnnSetFilterDescriptor(
%(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s));
cudnnGetErrorString(%(err)s));
%(fail)s
}
if (%(bmode)d == 1) {
pad_h%(name)s = 0;
pad_w%(name)s = 0;
} else if (%(bmode)d == 0) {
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1;
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else {
PyErr_SetString(PyExc_ValueError, "bad border mode");
%(fail)s
}
err%(name)s = cudnnSetConvolutionDescriptor(
op%(id)d, input%(id)d, kerns%(id)d,
pad_h%(name)s,
pad_w%(name)s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
""" % dict(var=var, desc=desc, err=err, fail=fail)
def c_code(self, node, name, inputs, outputs, sub):
desc = inputs[2]
out, = outputs
checks = []
for v in inputs[:2]:
checks.append("""
if (!CudaNdarray_is_c_contiguous(%s)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
%s
}
""" % (v, sub['fail']))
sets = []
for p, v, d in zip(inputs[:2], self.conv_inputs, self.conv_types[:2]):
sets.append(getattr(self, 'c_set_'+d)(p, v + str(sub['struct_id']),
'err' + name, sub['fail']))
set_out = getattr(self, 'c_set_' + self.conv_types[2])(
out, self.conv_output + str(sub['struct_id']), 'err' + name,
sub['fail'])
return """
cudnnStatus_t err%(name)s;
%(checks)s
%(sets)s
{
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim(
%(desc)s, %(path)s,
&out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not get output sizes: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
// workaround for cudnn R1 bug
if (%(path)s == CUDNN_CONVOLUTION_WEIGHT_GRAD &&
(out_dims[0] != CudaNdarray_HOST_DIMS(%(input2)s)[1] ||
out_dims[1] != CudaNdarray_HOST_DIMS(%(input1)s)[1])) {
out_dims[0] = CudaNdarray_HOST_DIMS(%(input2)s)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(%(input1)s)[1];
// This is a horrible hack that is unfortulately necessary
int *dd = (int *)%(desc)s;
out_dims[2] = dd[5];
out_dims[3] = dd[6];
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
}
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
output%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(out)s)[0],
CudaNdarray_HOST_DIMS(%(out)s)[1],
CudaNdarray_HOST_DIMS(%(out)s)[2],
CudaNdarray_HOST_DIMS(%(out)s)[3],
CudaNdarray_HOST_STRIDES(%(out)s)[0],
CudaNdarray_HOST_STRIDES(%(out)s)[1],
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
%(set_out)s
err%(name)s = %(method)s(
_handle,
%(input1_desc)s, CudaNdarray_DEV_DATA(%(input1)s),
%(input2_desc)s, CudaNdarray_DEV_DATA(%(input2)s),
%(desc)s,
%(output_desc)s, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
......@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s));
%(fail)s
}
""" % dict(img=img, kerns=kern, out=out, bmode=bmode,
fail=sub['fail'], id=sub['struct_id'], name=name)
""" % dict(out=out, desc=desc, fail=sub['fail'], id=sub['struct_id'],
name=name, checks='\n'.join(checks), sets='\n'.join(sets),
set_out=set_out, input1=inputs[0], input2=inputs[1],
input1_desc=self.conv_inputs[0]+str(sub['struct_id']),
input2_desc=self.conv_inputs[1]+str(sub['struct_id']),
output_desc=self.conv_output+str(sub['struct_id']),
method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self):
return (4,)
return (7,)
class GpuDnnConv(GpuDnnConvBase):
    """Forward cuDNN convolution: inputs (img, kerns, desc) -> output."""
    conv_inputs = ('input', 'kerns')
    conv_output = 'output'
    conv_types = ('tensor4d', 'filter', 'tensor4d')
    conv_op = 'cudnnConvolutionForward'
    path_flag = 'CUDNN_CONVOLUTION_FWD'

    def grad(self, inp, grads):
        """Gradients w.r.t. img and kerns; the descriptor is disconnected."""
        img, kerns, desc = inp
        (top,) = grads
        top = gpu_contiguous(top)
        grad_img = GpuDnnConvGradI()(kerns, top, desc)
        grad_kerns = GpuDnnConvGradW()(img, top, desc)
        return grad_img, grad_kerns, theano.gradient.DisconnectedType()()

    def connection_pattern(self, node):
        # The gradient does not flow through the convolution descriptor.
        return [[1], [1], [0]]
class GpuDnnConvGradW(GpuDnnConvBase):
    """cuDNN gradient w.r.t. the filters: (input, top_grad, desc) -> kerns."""
    conv_inputs = ('input', 'output')
    conv_output = 'kerns'
    conv_types = ('tensor4d', 'tensor4d', 'filter')
    conv_op = 'cudnnConvolutionBackwardFilter'
    path_flag = 'CUDNN_CONVOLUTION_WEIGHT_GRAD'
class GpuDnnConvGradI(GpuDnnConvBase):
    """cuDNN gradient w.r.t. the image: (kerns, top_grad, desc) -> input."""
    conv_inputs = ('kerns', 'output')
    conv_output = 'input'
    conv_types = ('filter', 'tensor4d', 'tensor4d')
    conv_op = 'cudnnConvolutionBackwardData'
    path_flag = 'CUDNN_CONVOLUTION_DATA_GRAD'
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv'):
    """Build a cuDNN convolution of `img` by `kerns`.

    Both operands are made contiguous, a convolution descriptor is built
    from their symbolic shapes, and the forward op is applied.
    """
    c_img = gpu_contiguous(img)
    c_kerns = gpu_contiguous(kerns)
    desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                             conv_mode=conv_mode)
    desc = desc_op(c_img.shape, c_kerns.shape)
    return GpuDnnConv()(c_img, c_kerns, desc)
@local_optimizer([GpuConv])
def local_conv_dnn(node):
if isinstance(node.op, GpuConv):
if (node.op.subsample != (1, 1) or
node.op.border_mode not in ['full', 'valid']):
if node.op.border_mode not in ['full', 'valid']:
return
img, kern = node.inputs
border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img),
gpu_contiguous(kern))]
subsample = node.op.subsample
return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
......@@ -4,7 +4,7 @@ Tests for GPU convolution
import sys
import time
import unittest
import traceback
import numpy
......@@ -19,14 +19,14 @@ except ImportError:
import theano
from theano import tensor
from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng
from theano.tests.unittest_tools import seed_rng, assert_allclose
# Skip test if cuda is not available.
from theano.sandbox import cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv
from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvBase, dnn_conv
#needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
......@@ -48,7 +48,7 @@ if device_id is None:
enable_cuda=False,
test_driver=True)
device_id = theano.sandbox.cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
......@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
t2 = None
rval = True
try:
t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
gpuval2 = f(img, kern)
assert numpy.allclose(numpy.asarray(gpuval),
numpy.asarray(gpuval2))
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval)
if gpuval.shape != cpuval.shape:
print >> sys.stdout, "ERROR: shape mismatch",
print >> sys.stdout, gpuval.shape, cpuval.shape
rval = False
if rval:
rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
assert numpy.all(numpy.isfinite(gpuval)), gpuval
except NotImplementedError, e:
print >> sys.stdout, '_params_allgood Failed allclose', e
rval = False
t0 = time.time()
cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time()
i = cuda_tensor4()
k = cuda_tensor4()
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample,
version=version,
verbose=verbose,
kshp=compile_kshp)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
if cls is not None:
assert any([isinstance(node.op, cls)
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
gpuval = f(img, kern)
t2 = time.time()
for i in range(nb_iter):
gpuval2 = f(img, kern)
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval)
assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape, cpuval.shape)
assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
assert numpy.all(numpy.isfinite(gpuval)), gpuval
if (t2 is not None):
if mode == 'valid':
......@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
if not rval:
print >> sys.stdout, ('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version)
diff = cpuval - gpuval
diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval)
nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
print "max absolute diff:", (diffabs.max(), "avg abs diff:",
numpy.average(diffabs))
print "median abs diff:", (numpy.median(diffabs), "nb close:",
nb_close, "/", diff.size)
print "max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff))
if not rval and print_ != False:
if npy_img.shape[0] > 5:
print "img", npy_img[0]
print "kern", npy_kern[0]
print "gpu", gpuval[0][0]
print "cpu", cpuval[0][0]
print "diff", diff[0][0]
else:
print "img", npy_img
print "kern", npy_kern
print "gpu", gpuval
print "cpu", cpuval
print "diff", diff
return rval
def exec_conv(version, shapes, verbose, random, mode,
......@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode,
theano_mode=theano_mode, cls=None):
if verbose > 0:
_params_allgood_header()
nb_failed = 0
nb_tests = 0
failed_version = set()
failed_id = []
for ver in version:
for id, (ishape, kshape, subshape,
istride, kstride) in enumerate(shapes):
ret = False
try:
ret = _params_allgood(ishape,
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones,
theano_mode=theano_mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
failed_id.append(id)
nb_failed += 1
nb_tests += 1
if nb_failed > 0:
print "nb_failed", nb_failed, "on", nb_tests,
print "failed_version", failed_version, "failed_id", failed_id
assert nb_failed == 0, nb_failed
else:
print 'Executed', nb_tests, 'different shapes'
yield (_params_allgood, ishape, kshape, mode, subshape,
istride, kstride, ver, verbose, random, print_, id,
rtol, 1e-8, 0, ones, None, theano_mode, cls)
def get_basic_shapes():
......@@ -453,8 +379,9 @@ def test_valid_0_2():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_1_3_11_12():
......@@ -483,8 +410,9 @@ def test_valid_1_3_11_12():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_4():
......@@ -515,8 +443,9 @@ def test_valid_4():
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_5():
......@@ -532,7 +461,6 @@ def test_valid_5():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -545,10 +473,10 @@ def test_valid_5():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_7_8_13():
......@@ -567,7 +495,6 @@ def test_valid_7_8_13():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -580,10 +507,10 @@ def test_valid_7_8_13():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_9_10():
......@@ -599,7 +526,6 @@ def test_valid_9_10():
random = False
shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) +
......@@ -611,19 +537,16 @@ def test_valid_9_10():
if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5):
yield t
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng()
shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
verbose = 0
random = True
......@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
return exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls)
def test_valid():
_test_valid(None, version=[-2, -1, 6])
for t in _test_valid(None, version=[-2, -1, 6]):
yield t
def test_gemm_valid():
extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes)
for t in _test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes):
yield t
def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn"))
for t in _test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
......@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
]
# shapes=shapes[:277]
verbose = 0
random = True
shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
return exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls)
def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5])
for t in _test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5]):
yield t
def test_gemm_full():
_test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"))
for t in _test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm")):
yield t
def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn"))
for t in _test_full(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def test_subsample(conv_gemm=False):
def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]):
seed_rng()
shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
......@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False):
# We put only the version that implement the subsample to make the
# test faster.
version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2, -1]
verbose = 0
random = True
print_ = False
......@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False):
if ones:
random = False
if conv_gemm:
# Test the GpuCorrMM version
mode = theano_mode.including("conv_gemm")
cls = cuda.blas.BaseGpuCorrMM
# dummy version; not used by GpuCorrMM so one version is enough
version_valid = version_full = [-1]
else:
mode = theano_mode
cls = None
for t in exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones,
theano_mode=mode, cls=cls):
yield t
for t in exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones,
theano_mode=mode, cls=cls):
yield t
exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones,
theano_mode=mode, cls=cls)
exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones,
theano_mode=mode, cls=cls)
def test_subsample():
    """Generate subsample tests for the default (non-gemm) GPU convolution."""
    cases = _test_subsample(None, theano_mode,
                            version_valid=[-2, -1, 1, 3, 11, 12],
                            version_full=[-2, -1])
    for case in cases:
        yield case
def test_gemm_subsample():
    """Generate subsample tests for the GpuCorrMM (gemm) implementation.

    Fix: remove the stray ``test_subsample(conv_gemm=True)`` call -- a
    leftover of the pre-refactor API.  The refactored ``test_subsample``
    takes no arguments, so that call would raise TypeError as soon as this
    generator was iterated.
    """
    for t in _test_subsample(cuda.blas.BaseGpuCorrMM,
                             theano_mode.including("conv_gemm")):
        yield t
def test_dnn_subsample():
    """Generate subsample tests for the cuDNN convolution implementation."""
    dnn_mode = theano_mode.including('cudnn')
    for case in _test_subsample(GpuDnnConv, dnn_mode):
        yield case
# NOTE(review): the class body was elided by the diff hunk marker below;
# only the tail of what looks like a tearDown-style restoration of the
# module-level `theano_mode` is visible.  Kept byte-identical.
class TestConv2DGPU(unittest.TestCase):
......@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode = theano_mode_orig
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    """Compare one GpuCorrMM-family op against the CPU reference py_conv.

    ``direction`` selects the op under test: 'fprop' (GpuCorrMM),
    'bprop img' (GpuCorrMM_gradInputs) or 'bprop kern'
    (GpuCorrMM_gradWeights).  Raises through assert_allclose on mismatch.
    """
    img_shape = (bs, ch, rImg1, rImg2)
    kern_shape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    img_vals = theano._asarray(numpy.random.rand(*img_shape),
                               dtype='float32')
    kern_vals = theano._asarray(numpy.random.rand(*kern_shape),
                                dtype='float32')

    img_var = cuda_tensor4()
    kern_var = cuda_tensor4()

    if direction == 'fprop':
        cpuval = py_conv(img_vals, kern_vals, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        # correlation vs. convolution: flip the kernel for the GPU op
        gpuval = f(img_vals, kern_vals[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(img_vals, kern_vals, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        gpuval = f(kern_vals.transpose(1, 0, 2, 3), img_vals)
    elif direction == 'bprop kern':
        cpuval = py_conv(img_vals, kern_vals, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(img_var, kern_var)
        f = theano.function([img_var, kern_var], op, mode=theano_mode)
        gpuval = numpy.array(f(
            img_vals.transpose(1, 0, 2, 3),
            kern_vals.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1],
        )).transpose(1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
# NOTE(review): this is the pre-refactor, in-place version of
# test_gemm_directly (superseded by the `gemm_directly` helper plus the
# yielding `test_gemm_grads` below).  The paste has flattened its
# indentation, so it is kept byte-identical rather than restyled;
# reconstruct from version control before editing.
def test_gemm_directly():
for direction in ['fprop', 'bprop img', 'bprop kern']:
print 'Testing direction: ' + direction
for bs in range(1, 5):
for ch in range(1,4):
for nf in range(1,4):
for rImg1 in range(5, 9):
for rImg2 in range(5, 9):
for rFlt1 in range(2, 4):
for rFlt2 in range(2, 4):
# subsampling is only exercised for the forward pass
for subsx in range(1, 3) if direction == 'fprop' else [1]:
for subsy in range(1, 3) if direction == 'fprop' else [1]:
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                        subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
elif direction == 'bprop img':
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(border_mode='valid',
                                                   subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
elif direction == 'bprop kern':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(border_mode='valid',
                                                    subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f(npy_img.transpose(1, 0, 2, 3),
                       npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(1, 0, 2, 3)
if not numpy.allclose(cpuval, gpuval, rtol=1e-4):
print "Test failed for"
print "direction: ", direction
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
def test_gemm_grads():
    """Yield ``gemm_directly`` cases covering the GpuCorrMM gradients.

    For every shape combination, both backward directions are emitted with
    unit subsampling, then the forward pass is emitted for every
    (subsx, subsy) pair in {1, 2} x {1, 2}.
    """
    for bs in range(1, 5):
        for ch in range(1, 4):
            for nf in range(1, 4):
                for rImg1 in range(5, 9):
                    for rImg2 in range(5, 9):
                        for rFlt1 in range(2, 4):
                            for rFlt2 in range(2, 4):
                                base = (gemm_directly, bs, ch, nf,
                                        rImg1, rImg2, rFlt1, rFlt2)
                                yield base + (1, 1, 'bprop img')
                                yield base + (1, 1, 'bprop kern')
                                for sx in range(1, 3):
                                    for sy in range(1, 3):
                                        yield base + (sx, sy, 'fprop')
def gemm_op(mode, subsample):
    """Build a GpuCorrMM op emulating border_mode ``mode``.

    'full' convolution is expressed as a 'valid' correlation with full
    padding; any other mode uses no padding.
    """
    if mode == 'full':
        padding = 'full'
    else:
        padding = (0, 0)
    return theano.sandbox.cuda.blas.GpuCorrMM('valid', subsample, padding)
def dnn_op(mode, subsample):
    """Return a callable applying a cuDNN cross-correlation with ``mode``."""
    def apply_conv(img, kern):
        return dnn_conv(img, kern, border_mode=mode, conv_mode='cross',
                        subsample=subsample)
    return apply_conv
# Compare one GPU correlation implementation (built by `op`, e.g. gemm_op
# or dnn_op) against the CPU reference conv2d graph: forward output, first
# derivatives w.r.t. image and kernel, and -- when the reference supports
# it -- the mixed second derivatives.  The kernel is flipped for the
# reference so correlation and convolution agree.
# NOTE(review): body indentation appears flattened in this paste; code is
# kept byte-identical.
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
# TODO: also test custom pad values
corr_op = op(mode, subsample)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
# outputs interleave (tested, reference) pairs; the zip below walks them
# two at a time and truncates to the number of labels
outputs = [corr_op, conv_op,
           corr_op_di, conv_op_di,
           corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
                corr_op_dki, conv_op_dki])
except Exception:
# skip if the reference implementation can't do it
pass
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
                   ('top', 'dtop/dbottom', 'dtop/dweight',
                    'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert_allclose(a, b, rtol=1e-4)
# NOTE(review): corrupted diff merge.  The old in-place gradient test body
# (flattened indentation, Python 2 prints) and the new yielding version
# (the final `yield (conv_grad, ...)` loops) are both present, with a
# `......@@` hunk marker standing where lines were lost.  Kept
# byte-identical; reconstruct from version control before editing.
def test_conv_grads():
for mode in 'valid', 'full':
for bs in [1, 5]:
for ch in [4]:
......@@ -910,68 +912,11 @@ def test_gemm_grads():
for rImg2 in [2, 8]:
for rFlt1 in [1, 2]:
for rFlt2 in [1, 2]:
for subsx in [1, 2]:
for subsy in [1, 2] if subsx == 1 else [2]:
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
pad = 'full' if mode == 'full' else (0, 0)
# TODO: also test custom pad values
corr_op = theano.sandbox.cuda.blas.GpuCorrMM(
                                    'valid', subsample, pad)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
                             ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
           corr_op_di, conv_op_di,
           corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
except Exception:
# skip if the reference implementation can't do it
print ".",
else:
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
                corr_op_dki, conv_op_dki])
print ":",
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
                   ('top', 'dtop/dbottom', 'dtop/dweight',
                    'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
if (a.shape != b.shape) or not numpy.allclose(a, b, rtol=1e-4):
print "Test failed for", p
print "mode: ", mode
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
sys.stdout.flush()
# New yielding form: defer the actual work to conv_grad with either the
# gemm-based or the cuDNN-based op factory.
for subsample in (1, 1), (1, 2), (2, 2):
for op in [gemm_op, dnn_op]:
yield (conv_grad, mode, bs, ch, nf,
       rImg1, rImg2, rFlt1, rFlt2,
       subsample, op)
# NOTE(review): corrupted diff merge.  Both the old in-place exec_conv
# calls and the new loops that consume exec_conv as a generator (calling
# each yielded case as t[0](*t[1:])) are present, and the `......@@` hunk
# marker hides the shape tables.  Kept byte-identical; reconstruct from
# version control before editing.
def benchmark():
......@@ -1017,15 +962,15 @@ def benchmark():
,((2, 30,116,116), (20, 30, 9,9), (1, 1), (1, 1), (1, 1))#full conv_reference_full
]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1]
verbose = 1
random = True
exec_conv(version, shapes_valid, verbose, random, 'valid',
          print_=None, rtol=1e-3)
exec_conv(version, shapes_full, verbose, random, 'full')
for t in exec_conv(version, shapes_valid, verbose, random, 'valid',
                   print_=None, rtol=1e-3):
    t[0](*t[1:])
for t in exec_conv(version, shapes_full, verbose, random, 'full'):
    t[0](*t[1:])
def test_stack_rows_segfault_070312():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论