提交 239b6d80 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2117 from abergeron/dnn

Add support for subsampling and gradient to GpuDnnConv
...@@ -2,6 +2,7 @@ global-include *.txt ...@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c global-include *.c
global-include *.cu global-include *.cu
global-include *.cuh global-include *.cuh
global-include *.h
global-include *.sh global-include *.sh
global-include *.pkl global-include *.pkl
recursive-include docs recursive-include docs
......
...@@ -193,7 +193,7 @@ def do_setup(): ...@@ -193,7 +193,7 @@ def do_setup():
install_requires=['numpy>=1.5.0', 'scipy>=0.7.2'], install_requires=['numpy>=1.5.0', 'scipy>=0.7.2'],
package_data={ package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl', '': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'ChangeLog'], '*.h', 'ChangeLog'],
'theano.misc': ['*.sh'] 'theano.misc': ['*.sh']
}, },
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'], scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
import numpy
import theano
from theano.gof.type import * from theano import Op, Apply
from theano.tensor import TensorType
from theano.gof.type import CDataType
# todo: test generic # todo: test generic
class ProdOp(Op):
    """Wrap a tensor variable in an opaque ``CDataType`` handle.

    The C code stores the input PyObject pointer as an opaque
    ``void *`` and takes a new reference; the ``py_decref`` free
    function (declared in `c_support_code`) releases that reference
    when the handle is destroyed.
    """
    __props__ = ()

    def make_node(self, i):
        # Output is an opaque 'void *' that is freed with py_decref.
        return Apply(self, [i], [CDataType('void *', 'py_decref')()])

    def c_support_code(self):
        # Free function used by the CDataType output: drops the
        # reference taken in c_code below.
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Release any previous handle value, stash the input object as
        # an opaque pointer, and INCREF it so it stays alive while the
        # CData handle does.
        return """
Py_XDECREF(%(out)s);
%(out)s = (void *)%(inp)s;
Py_INCREF(%(inp)s);
""" % dict(out=outs[0], inp=inps[0])

    def c_code_cache_version(self):
        return (0,)
class GetOp(Op):
    """Unwrap a ``CDataType`` handle produced by `ProdOp` back into a
    float32 vector.

    The C code assumes the opaque pointer actually is a
    ``PyArrayObject *`` (which `ProdOp` guarantees) and returns a new
    reference to it.
    """
    __props__ = ()

    def make_node(self, c):
        return Apply(self, [c], [TensorType('float32', (False,))()])

    def c_support_code(self):
        # NOTE(review): py_decref is re-declared here (same as in
        # ProdOp) but never referenced by this op's C code; each op is
        # compiled separately so the duplicate is harmless.
        return """
void py_decref(void *p) {
  Py_XDECREF((PyObject *)p);
}
"""

    def c_code(self, node, name, inps, outs, sub):
        # Cast the opaque pointer back to an array and take a new
        # reference for the output.
        return """
Py_XDECREF(%(out)s);
%(out)s = (PyArrayObject *)%(inp)s;
Py_INCREF(%(out)s);
""" % dict(out=outs[0], inp=inps[0])

    def c_code_cache_version(self):
        return (0,)
def test_cdata():
    """Round-trip a float32 vector through an opaque CData handle.

    ProdOp wraps the tensor in a CDataType value and GetOp unwraps it
    again, so the compiled graph must act as the identity on vectors.
    """
    inp = TensorType('float32', (False,))()
    handle = ProdOp()(inp)
    unwrapped = GetOp()(handle)

    # This should be a passthrough function for vectors
    fn = theano.function([inp], unwrapped)

    data = numpy.random.randn(9).astype('float32')
    assert (fn(data) == data).all()
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
__docformat__ = "restructuredtext en" __docformat__ = "restructuredtext en"
from theano.compat import PY3
from theano.gof import utils from theano.gof import utils
from theano.gof.utils import MethodNotDefined, object2 from theano.gof.utils import MethodNotDefined, object2
from theano.gof import graph from theano.gof import graph
...@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject): ...@@ -158,7 +160,7 @@ class CLinkerType(CLinkerObject):
c_extract_code=self.c_extract(name, sub, check_input)) c_extract_code=self.c_extract(name, sub, check_input))
def c_cleanup(self, name, sub): def c_cleanup(self, name, sub):
"""Optional: Return c code to clean up after `c_extract`. """Return c code to clean up after `c_extract`.
This returns C code that should deallocate whatever `c_extract` This returns C code that should deallocate whatever `c_extract`
allocated or decrease the reference counts. Do not decrease allocated or decrease the reference counts. Do not decrease
...@@ -250,7 +252,7 @@ class PureType(object): ...@@ -250,7 +252,7 @@ class PureType(object):
# If filter_inplace is defined, it will be called instead of # If filter_inplace is defined, it will be called instead of
# filter() This is to allow reusing the old allocated memory. As # filter() This is to allow reusing the old allocated memory. As
# of this writing this is used only when we transfer new data to a # of this writing this is used only when we transfer new data to a
# shared variable on the gpu. # shared variable on the gpu.
#def filter_inplace(value, storage, strict=False, allow_downcast=None) #def filter_inplace(value, storage, strict=False, allow_downcast=None)
...@@ -470,3 +472,97 @@ class Generic(SingletonType): ...@@ -470,3 +472,97 @@ class Generic(SingletonType):
return self.__class__.__name__ return self.__class__.__name__
generic = Generic() generic = Generic()
class CDataType(Type):
    """
    Represents opaque C data to be passed around. The intent is to
    ease passing arbitrary data between ops C code.

    On the Python side the only valid value is ``None``; the actual
    pointer only ever lives in the generated C code, transported in a
    ``PyCapsule`` (Python 3) or ``PyCObject`` (Python 2).
    """

    def __init__(self, ctype, freefunc=None):
        """
        Build a type made to represent a C pointer in theano.

        :param ctype: The type of the pointer (complete with the `*`)

        :param freefunc: a function to call to free the pointer. This
                         function must have a `void` return and take a
                         single pointer argument.
        """
        assert isinstance(ctype, basestring)
        self.ctype = ctype
        if freefunc is not None:
            assert isinstance(freefunc, basestring)
        # Fix: always assign the attribute. The original only set it
        # inside the `if`, so __hash__ and c_sync raised AttributeError
        # whenever freefunc was left at its default of None.
        self.freefunc = freefunc

    def __eq__(self, other):
        # Fix: the original used a comma before the freefunc
        # comparison, returning a (bool, bool) tuple that is always
        # truthy -- so any two CDataType instances compared equal.
        return (type(self) == type(other) and
                self.ctype == other.ctype and
                self.freefunc == other.freefunc)

    def __hash__(self):
        return hash((type(self), self.ctype, self.freefunc))

    def filter(self, data, strict=False, allow_downcast=None):
        # No Python-side values are supported; only None is valid.
        if data is not None:
            raise TypeError("only None is valid")

    def is_valid_value(self, a):
        return a is None

    def c_declare(self, name, sub, check_input=True):
        return """
        %(ctype)s %(name)s;
        """ % dict(ctype=self.ctype, name=name)

    def c_init(self, name, sub):
        return "%(name)s = NULL;" % dict(name=name)

    def c_extract(self, name, sub, check_input=True):
        # The pointer travels in a PyCapsule on Python 3 and in a
        # PyCObject on Python 2.
        if PY3:
            s = """
        %(name)s = (%(ctype)s)PyCapsule_GetPointer(py_%(name)s, NULL);
        if (%(name)s == NULL) %(fail)s
        """
        else:
            s = """
        %(name)s = (%(ctype)s)PyCObject_AsVoidPtr(py_%(name)s);
        """
        return s % dict(name=name, ctype=self.ctype, fail=sub['fail'])

    def c_sync(self, name, sub):
        """Wrap the C pointer back into a Python object (or None)."""
        freefunc = self.freefunc
        if freefunc is None:
            freefunc = "NULL"
        s = """
        Py_XDECREF(py_%(name)s);
        if (%(name)s == NULL) {
          py_%(name)s = Py_None;
          Py_INCREF(py_%(name)s);
        } else """
        # NOTE(review): PyCapsule destructors take a PyObject *, not a
        # void *; casting a void(void *) function compiles but relies
        # on the platform calling convention tolerating it -- confirm.
        if PY3:
            s += """{
          py_%(name)s = PyCapsule_New((void *)%(name)s, NULL,
                                      (void (*)(void *))%(freefunc)s);
        }"""
        else:
            s += """{
          py_%(name)s = PyCObject_FromVoidPtr((void *)%(name)s,
                                              (void (*)(void *))%(freefunc)s);
        }"""
        # If wrapping failed we free the pointer ourselves so it does
        # not leak (only possible when a free function was provided).
        if self.freefunc is not None:
            s += """
        if (py_%(name)s == NULL) { %(freefunc)s(%(name)s); }
        """
        return s % dict(name=name, freefunc=freefunc)

    def c_cleanup(self, name, sub):
        # No need to do anything here since the CObject/Capsule will
        # free the data for us when released.
        return ""

    def c_code_cache_version(self):
        return (1,)

    def __str__(self):
        return "%s{%s}" % (self.__class__.__name__, self.ctype)
...@@ -2,32 +2,53 @@ import copy ...@@ -2,32 +2,53 @@ import copy
import os import os
import theano import theano
from theano import Apply from theano import Apply, tensor
from theano import tensor from theano.gof.type import CDataType
from theano.compat.six import StringIO from theano.compat.six import StringIO
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.sandbox.cuda.blas import GpuConv from theano.sandbox.cuda.blas import GpuConv
from theano.compat import PY3
class GpuDnnConv(GpuOp): from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
__props__ = ('border_mode',)
def __init__(self, border_mode): class DnnBase(GpuOp):
self.border_mode = border_mode """
Creates a handle for cudnn and pulls in the cudnn libraries and headers.
"""
def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h']
def make_node(self, img, kern): def c_header_dirs(self):
if img.type.ndim != 4: return [os.path.dirname(__file__)]
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = (img.type.broadcastable[0], def c_libraries(self):
kern.type.broadcastable[0], return ['cudnn']
False, False)
def c_support_code(self):
return """
cudnnHandle_t _handle = NULL;
"""
def c_init_code(self):
if PY3:
error_out = "NULL"
else:
error_out = ""
return ["""{
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err));
return %s;
}
}""" % (error_out,)]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
class GpuDnnConvDesc(GpuOp):
__props__ = ('border_mode', 'subsample', 'conv_mode')
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h'] return ['cudnn.h', 'cudnn_helper.h']
...@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp): ...@@ -38,28 +59,121 @@ class GpuDnnConv(GpuOp):
def c_libraries(self): def c_libraries(self):
return ['cudnn'] return ['cudnn']
def c_compiler(self):
return NVCC_compiler
    def __init__(self, border_mode, subsample=(1, 1), conv_mode='conv'):
        """Describe a cudnn convolution.

        :param border_mode: 'valid' (no padding) or 'full'
                            (pad by kernel size - 1)
        :param subsample: pair of subsampling (stride) factors,
                          (sub_h, sub_w)
        :param conv_mode: 'conv' for a true convolution, 'cross' for
                          cross-correlation
        """
        assert border_mode in ('valid', 'full')
        self.border_mode = border_mode
        assert len(subsample) == 2
        self.subsample = subsample
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode
def make_node(self, img_shape, kern_shape):
if img_shape.type.ndim != 1 and img_shape.type.dtype != numpy.int64:
raise TypeError('img must be 1D shape tensor')
if kern_shape.type.ndim != 1 and kern_shape.type.dtype != numpy.int64:
raise TypeError('kern must be 1D shape tensor')
return Apply(self, [img_shape, kern_shape],
[CDataType("cudnnConvolutionDescriptor_t")()])
    def c_code(self, node, name, inputs, outputs, sub):
        """Generate C code that creates and configures the descriptor.

        The image/kernel dimensions are read on the host from the two
        int64 shape vectors and passed to
        cudnnSetConvolutionDescriptorEx together with the padding,
        subsampling and convolution-mode settings of this op.
        """
        img_shape, kern_shape = inputs
        desc, = outputs

        # bmode: 1 -> 'valid' (no padding), 0 -> 'full' (pad by
        # kernel size - 1 in each spatial dimension).
        if self.border_mode == "valid":
            bmode = 1
        else:
            assert self.border_mode == "full"
            bmode = 0

        if self.conv_mode == 'conv':
            conv_flag = 'CUDNN_CONVOLUTION'
        else:
            conv_flag = 'CUDNN_CROSS_CORRELATION'

        return """
{
  cudnnStatus_t err;
  int pad_h%(name)s;
  int pad_w%(name)s;

  if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %%s", cudnnGetErrorString(err));
    %(fail)s
  }
  if (%(bmode)d == 1) {
    pad_h%(name)s = 0;
    pad_w%(name)s = 0;
  } else if (%(bmode)d == 0) {
    pad_h%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
    pad_w%(name)s = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
  } else {
    PyErr_SetString(PyExc_ValueError, "bad border mode");
    %(fail)s
  }
  err = cudnnSetConvolutionDescriptorEx(
  %(desc)s,
  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 0),
  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 1),
  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 2),
  *(npy_int64 *)PyArray_GETPTR1(%(img_shape)s, 3),
  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 0),
  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2),
  *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3),
  pad_h%(name)s,
  pad_w%(name)s,
  %(subsx)d, %(subsy)d, 1, 1,
  %(conv_flag)s
  );
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
    %(fail)s
  }
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
           bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
           subsx=self.subsample[0], subsy=self.subsample[1])

    def c_code_cache_version(self):
        return (1,)
class GpuDnnConvBase(DnnBase):
__props__ = ()
def make_node(self, img, kern, desc):
if img.type.ndim != 4:
raise TypeError('img must be 4D tensor')
if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor')
broadcastable = (img.type.broadcastable[0],
kern.type.broadcastable[0],
False, False)
return Apply(self, [img, kern, desc],
[CudaNdarrayType(broadcastable)()])
def c_support_code_struct(self, node, struct_id): def c_support_code_struct(self, node, struct_id):
return """ return """
cudnnHandle_t handle%(id)d;
cudnnTensor4dDescriptor_t input%(id)d; cudnnTensor4dDescriptor_t input%(id)d;
cudnnTensor4dDescriptor_t output%(id)d; cudnnTensor4dDescriptor_t output%(id)d;
cudnnFilterDescriptor_t kerns%(id)d; cudnnFilterDescriptor_t kerns%(id)d;
cudnnConvolutionDescriptor_t op%(id)d;
""" % dict(id=struct_id) """ % dict(id=struct_id)
def c_init_code_struct(self, node, struct_id, sub): def c_init_code_struct(self, node, struct_id, sub):
return """ return """
handle%(id)d = NULL; cudnnStatus_t err%(id)d;
input%(id)d = NULL; input%(id)d = NULL;
output%(id)d = NULL; output%(id)d = NULL;
kerns%(id)d = NULL; kerns%(id)d = NULL;
op%(id)d = NULL;
cudnnStatus_t err%(id)d;
if ((err%(id)d = cudnnCreate(&handle%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cudnn handle: %%s",
cudnnGetErrorString(err%(id)d));
%(fail)s
}
if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) { if ((err%(id)d = cudnnCreateTensor4dDescriptor(&input%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
"(inp): %%s", cudnnGetErrorString(err%(id)d)); "(inp): %%s", cudnnGetErrorString(err%(id)d));
...@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC ...@@ -75,11 +189,6 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
cudnnGetErrorString(err%(id)d)); cudnnGetErrorString(err%(id)d));
%(fail)s %(fail)s
} }
if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
"descriptor: %%s", cudnnGetErrorString(err%(id)d));
%(fail)s
}
""" % dict(id=struct_id, fail=sub['fail']) """ % dict(id=struct_id, fail=sub['fail'])
def c_cleanup_code_struct(self, node, struct_id): def c_cleanup_code_struct(self, node, struct_id):
...@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU ...@@ -87,123 +196,109 @@ if ((err%(id)d = cudnnCreateConvolutionDescriptor(&op%(id)d)) != CUDNN_STATUS_SU
cudnnDestroyTensor4dDescriptor(input%(id)d); cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d); cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d); cudnnDestroyFilterDescriptor(kerns%(id)d);
cudnnDestroyConvolutionDescriptor(op%(id)d);
cudnnDestroy(handle%(id)d);
""" % dict(id=struct_id) """ % dict(id=struct_id)
def c_code(self, node, name, inputs, outputs, sub): def c_set_tensor4d(self, var, desc, err, fail):
img, kern = inputs
out, = outputs
if self.border_mode == "valid":
bmode = 1
else:
assert self.border_mode == "full"
bmode = 0
return """ return """
cudnnStatus_t err%(name)s; %(err)s = cudnnSetTensor4dDescriptorEx(
int pad_w%(name)s; %(desc)s, CUDNN_DATA_FLOAT,
int pad_h%(name)s; CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
if (!CudaNdarray_is_c_contiguous(%(img)s)) { CudaNdarray_HOST_DIMS(%(var)s)[2],
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported."); CudaNdarray_HOST_DIMS(%(var)s)[3],
%(fail)s CudaNdarray_HOST_STRIDES(%(var)s)[0]?CudaNdarray_HOST_STRIDES(%(var)s)[0]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3]*CudaNdarray_HOST_DIMS(%(var)s)[1],
} CudaNdarray_HOST_STRIDES(%(var)s)[1]?CudaNdarray_HOST_STRIDES(%(var)s)[1]:CudaNdarray_HOST_DIMS(%(var)s)[2]*CudaNdarray_HOST_DIMS(%(var)s)[3],
CudaNdarray_HOST_STRIDES(%(var)s)[2]?CudaNdarray_HOST_STRIDES(%(var)s)[2]:CudaNdarray_HOST_DIMS(%(var)s)[3],
if (!CudaNdarray_is_c_contiguous(%(kerns)s)) { CudaNdarray_HOST_STRIDES(%(var)s)[3]?CudaNdarray_HOST_STRIDES(%(var)s)[3]:1
PyErr_SetString(PyExc_ValueError, "Only contiguous filters are supported.");
%(fail)s
}
err%(name)s = cudnnSetTensor4dDescriptorEx(
input%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(img)s)[0],
CudaNdarray_HOST_DIMS(%(img)s)[1],
CudaNdarray_HOST_DIMS(%(img)s)[2],
CudaNdarray_HOST_DIMS(%(img)s)[3],
CudaNdarray_HOST_STRIDES(%(img)s)[0],
CudaNdarray_HOST_STRIDES(%(img)s)[1],
CudaNdarray_HOST_STRIDES(%(img)s)[2],
CudaNdarray_HOST_STRIDES(%(img)s)[3]
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not set tensor4d descriptor: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(%(err)s));
%(fail)s %(fail)s
} }
err%(name)s = cudnnSetFilterDescriptor( """ % dict(var=var, err=err, desc=desc, fail=fail)
kerns%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(kerns)s)[0], def c_set_filter(self, var, desc, err, fail):
CudaNdarray_HOST_DIMS(%(kerns)s)[1], return """
CudaNdarray_HOST_DIMS(%(kerns)s)[2], %(err)s = cudnnSetFilterDescriptor(
CudaNdarray_HOST_DIMS(%(kerns)s)[3] %(desc)s, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(var)s)[0],
CudaNdarray_HOST_DIMS(%(var)s)[1],
CudaNdarray_HOST_DIMS(%(var)s)[2],
CudaNdarray_HOST_DIMS(%(var)s)[3]
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (%(err)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not set filter descriptor: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(%(err)s));
%(fail)s %(fail)s
} }
if (%(bmode)d == 1) { """ % dict(var=var, desc=desc, err=err, fail=fail)
pad_h%(name)s = 0;
pad_w%(name)s = 0; def c_code(self, node, name, inputs, outputs, sub):
} else if (%(bmode)d == 0) { desc = inputs[2]
pad_h%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[2] - 1; out, = outputs
pad_w%(name)s = CudaNdarray_HOST_DIMS(%(kerns)s)[3] - 1;
} else { checks = []
PyErr_SetString(PyExc_ValueError, "bad border mode"); for v in inputs[:2]:
%(fail)s checks.append("""
} if (!CudaNdarray_is_c_contiguous(%s)) {
err%(name)s = cudnnSetConvolutionDescriptor( PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
op%(id)d, input%(id)d, kerns%(id)d, %s
pad_h%(name)s,
pad_w%(name)s,
1, 1, 1, 1,
CUDNN_CONVOLUTION
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
} }
""" % (v, sub['fail']))
sets = []
for p, v, d in zip(inputs[:2], self.conv_inputs, self.conv_types[:2]):
sets.append(getattr(self, 'c_set_'+d)(p, v + str(sub['struct_id']),
'err' + name, sub['fail']))
set_out = getattr(self, 'c_set_' + self.conv_types[2])(
out, self.conv_output + str(sub['struct_id']), 'err' + name,
sub['fail'])
return """
cudnnStatus_t err%(name)s;
%(checks)s
%(sets)s
{ {
int out_dims[4]; int out_dims[4];
err%(name)s = cudnnGetOutputTensor4dDim( err%(name)s = cudnnGetOutputTensor4dDim(
op%(id)d, CUDNN_CONVOLUTION_FWD, %(desc)s, %(path)s,
&out_dims[0], &out_dims[1], &out_dims[0], &out_dims[1],
&out_dims[2], &out_dims[3] &out_dims[2], &out_dims[3]
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s", PyErr_Format(PyExc_RuntimeError, "could not get output sizes: %%s",
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) { // workaround for cudnn R1 bug
%(fail)s if (%(path)s == CUDNN_CONVOLUTION_WEIGHT_GRAD &&
(out_dims[0] != CudaNdarray_HOST_DIMS(%(input2)s)[1] ||
out_dims[1] != CudaNdarray_HOST_DIMS(%(input1)s)[1])) {
out_dims[0] = CudaNdarray_HOST_DIMS(%(input2)s)[1];
out_dims[1] = CudaNdarray_HOST_DIMS(%(input1)s)[1];
// This is a horrible hack that is unfortulately necessary
int *dd = (int *)%(desc)s;
out_dims[2] = dd[5];
out_dims[3] = dd[6];
}
if (CudaNdarray_prep_output(&%(out)s, 4, out_dims) != 0) {
%(fail)s
}
} }
}
err%(name)s = cudnnSetTensor4dDescriptorEx( %(set_out)s
output%(id)d, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(%(out)s)[0], err%(name)s = %(method)s(
CudaNdarray_HOST_DIMS(%(out)s)[1], _handle,
CudaNdarray_HOST_DIMS(%(out)s)[2], %(input1_desc)s, CudaNdarray_DEV_DATA(%(input1)s),
CudaNdarray_HOST_DIMS(%(out)s)[3], %(input2_desc)s, CudaNdarray_DEV_DATA(%(input2)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], %(desc)s,
CudaNdarray_HOST_STRIDES(%(out)s)[1], %(output_desc)s, CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[2],
CudaNdarray_HOST_STRIDES(%(out)s)[3]
);
if (err%(name)s != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set out descriptor: %%s",
cudnnGetErrorString(err%(name)s));
%(fail)s
}
err%(name)s = cudnnConvolutionForward(
handle%(id)d,
input%(id)d, CudaNdarray_DEV_DATA(%(img)s),
kerns%(id)d, CudaNdarray_DEV_DATA(%(kerns)s),
op%(id)d,
output%(id)d, CudaNdarray_DEV_DATA(%(out)s),
CUDNN_RESULT_NO_ACCUMULATE CUDNN_RESULT_NO_ACCUMULATE
); );
if (err%(name)s != CUDNN_STATUS_SUCCESS) { if (err%(name)s != CUDNN_STATUS_SUCCESS) {
...@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) { ...@@ -211,25 +306,77 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
cudnnGetErrorString(err%(name)s)); cudnnGetErrorString(err%(name)s));
%(fail)s %(fail)s
} }
""" % dict(img=img, kerns=kern, out=out, bmode=bmode, """ % dict(out=out, desc=desc, fail=sub['fail'], id=sub['struct_id'],
fail=sub['fail'], id=sub['struct_id'], name=name) name=name, checks='\n'.join(checks), sets='\n'.join(sets),
set_out=set_out, input1=inputs[0], input2=inputs[1],
input1_desc=self.conv_inputs[0]+str(sub['struct_id']),
input2_desc=self.conv_inputs[1]+str(sub['struct_id']),
output_desc=self.conv_output+str(sub['struct_id']),
method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (7,)
class GpuDnnConv(GpuDnnConvBase):
    """Forward cudnn convolution: (images, kernels, desc) -> output."""
    conv_inputs = 'input', 'kerns'
    conv_output = 'output'
    conv_types = 'tensor4d', 'filter', 'tensor4d'
    conv_op = 'cudnnConvolutionForward'
    path_flag = 'CUDNN_CONVOLUTION_FWD'

    def grad(self, inp, grads):
        # Gradients flow through the image and kernel inputs only;
        # the descriptor input is not differentiable.
        img, kerns, desc = inp
        top = gpu_contiguous(grads[0])
        d_img = GpuDnnConvGradI()(kerns, top, desc)
        d_kerns = GpuDnnConvGradW()(img, top, desc)
        d_desc = theano.gradient.DisconnectedType()()
        return d_img, d_kerns, d_desc

    def connection_pattern(self, node):
        # not connected to desc
        return [[1], [1], [0]]
class GpuDnnConvGradW(GpuDnnConvBase):
    """Gradient of the cudnn convolution with respect to the filters.

    Inputs are the forward images and the output gradient; the result
    has the shape of the kernels (hence the 'filter' output type).
    """
    conv_inputs = 'input', 'output',
    conv_output = 'kerns'
    conv_types = 'tensor4d', 'tensor4d', 'filter'
    path_flag = 'CUDNN_CONVOLUTION_WEIGHT_GRAD'
    conv_op = 'cudnnConvolutionBackwardFilter'
class GpuDnnConvGradI(GpuDnnConvBase):
    """Gradient of the cudnn convolution with respect to the images.

    Inputs are the kernels and the output gradient; the result has
    the shape of the forward images.
    """
    conv_inputs = 'kerns', 'output',
    conv_output = 'input'
    conv_types = 'filter', 'tensor4d', 'tensor4d'
    path_flag = 'CUDNN_CONVOLUTION_DATA_GRAD'
    conv_op = 'cudnnConvolutionBackwardData'
from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous, from theano.sandbox.cuda.opt import (local_optimizer, gpu_contiguous,
gpu_optimizer) gpu_optimizer)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv'):
    """Build a GPU convolution graph using cudnn.

    :param img: 4D images tensor
    :param kerns: 4D kernels tensor
    :param border_mode: 'valid' or 'full'
    :param subsample: (sub_h, sub_w) subsampling factors
    :param conv_mode: 'conv' or 'cross'
    """
    # cudnn requires contiguous inputs; the descriptor is built from
    # the symbolic shapes of the (contiguous) inputs.
    img_c = gpu_contiguous(img)
    kerns_c = gpu_contiguous(kerns)
    desc_op = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                             conv_mode=conv_mode)
    conv_desc = desc_op(img_c.shape, kerns_c.shape)
    return GpuDnnConv()(img_c, kerns_c, conv_desc)
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_dnn(node): def local_conv_dnn(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.subsample != (1, 1) or if node.op.border_mode not in ['full', 'valid']:
node.op.border_mode not in ['full', 'valid']):
return return
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
return [GpuDnnConv(border_mode)(gpu_contiguous(img), subsample = node.op.subsample
gpu_contiguous(kern))] return [dnn_conv(gpu_contiguous(img), gpu_contiguous(kern),
border_mode=border_mode, subsample=subsample)]
gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn') gpu_optimizer.register("conv_cudnn", local_conv_dnn, 'cudnn')
...@@ -4,7 +4,7 @@ Tests for GPU convolution ...@@ -4,7 +4,7 @@ Tests for GPU convolution
import sys import sys
import time import time
import unittest import unittest
import traceback
import numpy import numpy
...@@ -19,14 +19,14 @@ except ImportError: ...@@ -19,14 +19,14 @@ except ImportError:
import theano import theano
from theano import tensor from theano import tensor
from theano.gof.python25 import any from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng from theano.tests.unittest_tools import seed_rng, assert_allclose
# Skip test if cuda is not available. # Skip test if cuda is not available.
from theano.sandbox import cuda from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv from theano.sandbox.cuda.dnn import GpuDnnConv, GpuDnnConvBase, dnn_conv
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
...@@ -48,7 +48,7 @@ if device_id is None: ...@@ -48,7 +48,7 @@ if device_id is None:
enable_cuda=False, enable_cuda=False,
test_driver=True) test_driver=True)
device_id = theano.sandbox.cuda.use.device_number device_id = theano.sandbox.cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id) device_prop = cuda_ndarray.device_properties(device_id)
...@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -173,40 +173,30 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]] npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
t2 = None t2 = None
rval = True
try: t0 = time.time()
t0 = time.time() cpuval = py_conv(npy_img, npy_kern, mode, subsample)
cpuval = py_conv(npy_img, npy_kern, mode, subsample) t1 = time.time()
t1 = time.time() i = cuda_tensor4()
i = cuda_tensor4() k = cuda_tensor4()
k = cuda_tensor4() op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, subsample=subsample,
subsample=subsample, version=version,
version=version, verbose=verbose,
verbose=verbose, kshp=compile_kshp)(i, k)
kshp=compile_kshp)(i, k) f = theano.function([i, k], op, mode=theano_mode)
f = theano.function([i, k], op, mode=theano_mode) if cls is not None:
if cls is not None: assert any([isinstance(node.op, cls)
assert any([isinstance(node.op, cls) for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
for node in f.maker.fgraph.toposort()]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort()) gpuval = f(img, kern)
gpuval = f(img, kern) t2 = time.time()
t2 = time.time() for i in range(nb_iter):
for i in range(nb_iter): gpuval2 = f(img, kern)
gpuval2 = f(img, kern) assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
assert numpy.allclose(numpy.asarray(gpuval), gpuval = numpy.asarray(gpuval)
numpy.asarray(gpuval2)) assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape, cpuval.shape)
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all() assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
gpuval = numpy.asarray(gpuval) assert numpy.all(numpy.isfinite(gpuval)), gpuval
if gpuval.shape != cpuval.shape:
print >> sys.stdout, "ERROR: shape mismatch",
print >> sys.stdout, gpuval.shape, cpuval.shape
rval = False
if rval:
rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
assert numpy.all(numpy.isfinite(gpuval)), gpuval
except NotImplementedError, e:
print >> sys.stdout, '_params_allgood Failed allclose', e
rval = False
if (t2 is not None): if (t2 is not None):
if mode == 'valid': if mode == 'valid':
...@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -221,37 +211,6 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape), print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp, print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)) cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
if not rval:
print >> sys.stdout, ('test_' + mode + ' id=' + str(id) +
' FAILED for ishape, kshape, mode, subsample,' +
' img_stride, kern_stride, version', ishape,
kshape, mode, subsample, img_stride, kern_stride,
version)
diff = cpuval - gpuval
diffabs = numpy.absolute(diff)
pr_diff = diffabs / numpy.absolute(cpuval)
nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
print "max absolute diff:", (diffabs.max(), "avg abs diff:",
numpy.average(diffabs))
print "median abs diff:", (numpy.median(diffabs), "nb close:",
nb_close, "/", diff.size)
print "max relatif diff:", (pr_diff.max(), "avg rel diff:",
numpy.average(pr_diff))
if not rval and print_ != False:
if npy_img.shape[0] > 5:
print "img", npy_img[0]
print "kern", npy_kern[0]
print "gpu", gpuval[0][0]
print "cpu", cpuval[0][0]
print "diff", diff[0][0]
else:
print "img", npy_img
print "kern", npy_kern
print "gpu", gpuval
print "cpu", cpuval
print "diff", diff
return rval
def exec_conv(version, shapes, verbose, random, mode, def exec_conv(version, shapes, verbose, random, mode,
...@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -259,46 +218,13 @@ def exec_conv(version, shapes, verbose, random, mode,
theano_mode=theano_mode, cls=None): theano_mode=theano_mode, cls=None):
if verbose > 0: if verbose > 0:
_params_allgood_header() _params_allgood_header()
nb_failed = 0
nb_tests = 0
failed_version = set()
failed_id = []
for ver in version: for ver in version:
for id, (ishape, kshape, subshape, for id, (ishape, kshape, subshape,
istride, kstride) in enumerate(shapes): istride, kstride) in enumerate(shapes):
ret = False yield (_params_allgood, ishape, kshape, mode, subshape,
try: istride, kstride, ver, verbose, random, print_, id,
ret = _params_allgood(ishape, rtol, 1e-8, 0, ones, None, theano_mode, cls)
kshape,
mode,
subsample=subshape,
img_stride=istride,
kern_stride=kstride,
version=ver,
verbose=verbose,
random=random,
id=id,
print_=print_,
rtol=rtol,
ones=ones,
theano_mode=theano_mode,
cls=cls)
except Exception, e:
print ver, id, (ishape, kshape, subshape, istride, kstride)
print "Exception", type(e), e
pass
if not ret:
failed_version.add(ver)
failed_id.append(id)
nb_failed += 1
nb_tests += 1
if nb_failed > 0:
print "nb_failed", nb_failed, "on", nb_tests,
print "failed_version", failed_version, "failed_id", failed_id
assert nb_failed == 0, nb_failed
else:
print 'Executed', nb_tests, 'different shapes'
def get_basic_shapes(): def get_basic_shapes():
...@@ -453,8 +379,9 @@ def test_valid_0_2(): ...@@ -453,8 +379,9 @@ def test_valid_0_2():
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_1_3_11_12(): def test_valid_1_3_11_12():
...@@ -483,8 +410,9 @@ def test_valid_1_3_11_12(): ...@@ -483,8 +410,9 @@ def test_valid_1_3_11_12():
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_4(): def test_valid_4():
...@@ -515,8 +443,9 @@ def test_valid_4(): ...@@ -515,8 +443,9 @@ def test_valid_4():
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_5(): def test_valid_5():
...@@ -532,7 +461,6 @@ def test_valid_5(): ...@@ -532,7 +461,6 @@ def test_valid_5():
random = False random = False
shapes2 = [] shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
...@@ -545,10 +473,10 @@ def test_valid_5(): ...@@ -545,10 +473,10 @@ def test_valid_5():
if subshape == (1, 1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_7_8_13(): def test_valid_7_8_13():
...@@ -567,7 +495,6 @@ def test_valid_7_8_13(): ...@@ -567,7 +495,6 @@ def test_valid_7_8_13():
random = False random = False
shapes2 = [] shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
...@@ -580,10 +507,10 @@ def test_valid_7_8_13(): ...@@ -580,10 +507,10 @@ def test_valid_7_8_13():
if subshape == (1, 1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def test_valid_9_10(): def test_valid_9_10():
...@@ -599,7 +526,6 @@ def test_valid_9_10(): ...@@ -599,7 +526,6 @@ def test_valid_9_10():
random = False random = False
shapes2 = [] shapes2 = []
# print len(shapes)
for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) - oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
numpy.asarray(kshape[2:]) + numpy.asarray(kshape[2:]) +
...@@ -611,19 +537,16 @@ def test_valid_9_10(): ...@@ -611,19 +537,16 @@ def test_valid_9_10():
if subshape == (1, 1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
# print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', for t in exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5) print_=print_, ones=ones, rtol=1.1e-5):
yield t
def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]): def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
seed_rng() seed_rng()
shapes = get_valid_shapes() shapes = get_valid_shapes()
#shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version.
verbose = 0 verbose = 0
random = True random = True
...@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]): ...@@ -634,26 +557,30 @@ def _test_valid(cls, mode=None, extra_shapes=[], version=[-1]):
shapes += extra_shapes shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'valid', return exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5, print_=print_, ones=ones, rtol=1.1e-5,
theano_mode=mode, cls=cls) theano_mode=mode, cls=cls)
def test_valid(): def test_valid():
_test_valid(None, version=[-2, -1, 6]) for t in _test_valid(None, version=[-2, -1, 6]):
yield t
def test_gemm_valid(): def test_gemm_valid():
extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2)) extra_shapes = get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2)) extra_shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
_test_valid(cuda.blas.BaseGpuCorrMM, for t in _test_valid(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm"), mode=theano_mode.including("conv_gemm"),
extra_shapes=extra_shapes) extra_shapes=extra_shapes):
yield t
def test_dnn_valid(): def test_dnn_valid():
_test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")) for t in _test_valid(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
...@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): ...@@ -710,30 +637,32 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
, ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash , ((1,1,44800,1), (6,1,1,1), (1, 1), (1, 1), (1, 1))#This caused crash
] ]
# shapes=shapes[:277]
verbose = 0 verbose = 0
random = True random = True
shapes += extra_shapes shapes += extra_shapes
exec_conv(version, shapes, verbose, random, 'full', return exec_conv(version, shapes, verbose, random, 'full',
theano_mode=mode, cls=cls) theano_mode=mode, cls=cls)
def test_full(): def test_full():
_test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5]) for t in _test_full(None, version=[-2, -1, 0, 1, 2, 3, 4, 5]):
yield t
def test_gemm_full(): def test_gemm_full():
_test_full(cuda.blas.BaseGpuCorrMM, for t in _test_full(cuda.blas.BaseGpuCorrMM,
mode=theano_mode.including("conv_gemm")) mode=theano_mode.including("conv_gemm")):
yield t
def test_dnn_full(): def test_dnn_full():
_test_full(GpuDnnConv, mode=theano_mode.including("cudnn")) for t in _test_full(GpuDnnConv, mode=theano_mode.including("cudnn")):
yield t
def test_subsample(conv_gemm=False): def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]):
seed_rng() seed_rng()
shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)), shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)), ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
...@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False): ...@@ -748,8 +677,6 @@ def test_subsample(conv_gemm=False):
# We put only the version that implement the subsample to make the # We put only the version that implement the subsample to make the
# test faster. # test faster.
version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2, -1]
verbose = 0 verbose = 0
random = True random = True
print_ = False print_ = False
...@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False): ...@@ -757,26 +684,32 @@ def test_subsample(conv_gemm=False):
if ones: if ones:
random = False random = False
if conv_gemm: for t in exec_conv(version_valid, shapes, verbose, random, 'valid',
# Test the GpuCorrMM version print_=print_, ones=ones,
mode = theano_mode.including("conv_gemm") theano_mode=mode, cls=cls):
cls = cuda.blas.BaseGpuCorrMM yield t
# dummy version; not used by GpuCorrMM so one version is enough for t in exec_conv(version_full, shapes, verbose, random, 'full',
version_valid = version_full = [-1] print_=print_, ones=ones,
else: theano_mode=mode, cls=cls):
mode = theano_mode yield t
cls = None
exec_conv(version_valid, shapes, verbose, random, 'valid',
print_=print_, ones=ones, def test_subsample():
theano_mode=mode, cls=cls) for t in _test_subsample(None, theano_mode,
exec_conv(version_full, shapes, verbose, random, 'full', version_valid=[-2, -1, 1, 3, 11, 12],
print_=print_, ones=ones, version_full=[-2, -1]):
theano_mode=mode, cls=cls) yield t
def test_gemm_subsample(): def test_gemm_subsample():
test_subsample(conv_gemm=True) for t in _test_subsample(cuda.blas.BaseGpuCorrMM,
theano_mode.including("conv_gemm")):
yield t
def test_dnn_subsample():
for t in _test_subsample(GpuDnnConv, theano_mode.including('cudnn')):
yield t
class TestConv2DGPU(unittest.TestCase): class TestConv2DGPU(unittest.TestCase):
...@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -850,58 +783,127 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode = theano_mode_orig theano_mode = theano_mode_orig
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
direction):
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2)
subsample = (subsx, subsy)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
if direction == 'fprop':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
elif direction == 'bprop img':
cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
elif direction == 'bprop kern':
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f(
npy_img.transpose(1, 0, 2, 3),
npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(
1, 0, 2, 3)
assert_allclose(cpuval, gpuval, rtol=1e-4)
def test_gemm_directly(): def test_gemm_directly():
for direction in ['fprop', 'bprop img', 'bprop kern']: for bs in range(1, 5):
print 'Testing direction: ' + direction for ch in range(1,4):
for bs in range(1, 5): for nf in range(1,4):
for ch in range(1,4): for rImg1 in range(5, 9):
for nf in range(1,4): for rImg2 in range(5, 9):
for rImg1 in range(5, 9): for rFlt1 in range(2, 4):
for rImg2 in range(5, 9): for rFlt2 in range(2, 4):
for rFlt1 in range(2, 4): for direction in ['bprop img', 'bprop kern']:
for rFlt2 in range(2, 4): yield (gemm_directly, bs, ch, nf, rImg1,
for subsx in range(1, 3) if direction == 'fprop' else [1]: rImg2, rFlt1, rFlt2, 1, 1,
for subsy in range(1, 3) if direction == 'fprop' else [1]: direction)
ishape = (bs, ch, rImg1, rImg2)
kshape = (nf, ch, rFlt1, rFlt2) for subsx in range(1, 3):
subsample = (subsx, subsy) for subsy in range(1, 3):
yield (gemm_directly, bs, ch, nf,
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') rImg1, rImg2, rFlt1, rFlt2,
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') subsx, subsy, 'fprop')
i = cuda_tensor4()
k = cuda_tensor4() def gemm_op(mode, subsample):
pad = 'full' if mode == 'full' else (0, 0)
if direction == 'fprop': return theano.sandbox.cuda.blas.GpuCorrMM('valid', subsample, pad)
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
subsample=subsample)(i, k) def dnn_op(mode, subsample):
f = theano.function([i, k], op, mode=theano_mode) def f(img, kern):
gpuval = f(npy_img, npy_kern[:,:,::-1,::-1]) return dnn_conv(img, kern, border_mode=mode, conv_mode='cross',
elif direction == 'bprop img': subsample=subsample)
cpuval = py_conv(npy_img, npy_kern, 'full', subsample) return f
op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(border_mode='valid',
subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode) def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img) ishape = (bs, ch, rImg1, rImg2)
elif direction == 'bprop kern': kshape = (nf, ch, rFlt1, rFlt2)
cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(border_mode='valid', npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
subsample=subsample)(i, k) npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f(npy_img.transpose(1, 0, 2, 3), i = cuda_tensor4()
npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(1, 0, 2, 3) k = cuda_tensor4()
if not numpy.allclose(cpuval, gpuval, rtol=1e-4): # TODO: also test custom pad values
print "Test failed for" corr_op = op(mode, subsample)(i, k)
print "direction: ", direction # try to compile reference implementation without shape,
print "ishape: ", ishape # so we don't have to compile hundreds of versions
print "kshape: ", kshape conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
print "subsample: ", subsample border_mode=mode, subsample=subsample)
assert False try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
def test_gemm_grads(): except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
corr_op_di, conv_op_di,
corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
corr_op_dki, conv_op_dki])
except Exception:
# skip if the reference implementation can't do it
pass
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
assert_allclose(a, b, rtol=1e-4)
def test_conv_grads():
for mode in 'valid', 'full': for mode in 'valid', 'full':
for bs in [1, 5]: for bs in [1, 5]:
for ch in [4]: for ch in [4]:
...@@ -910,68 +912,11 @@ def test_gemm_grads(): ...@@ -910,68 +912,11 @@ def test_gemm_grads():
for rImg2 in [2, 8]: for rImg2 in [2, 8]:
for rFlt1 in [1, 2]: for rFlt1 in [1, 2]:
for rFlt2 in [1, 2]: for rFlt2 in [1, 2]:
for subsx in [1, 2]: for subsample in (1, 1), (1, 2), (2, 2):
for subsy in [1, 2] if subsx == 1 else [2]: for op in [gemm_op, dnn_op]:
ishape = (bs, ch, rImg1, rImg2) yield (conv_grad, mode, bs, ch, nf,
kshape = (nf, ch, rFlt1, rFlt2) rImg1, rImg2, rFlt1, rFlt2,
subsample = (subsx, subsy) subsample, op)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
i = cuda_tensor4()
k = cuda_tensor4()
pad = 'full' if mode == 'full' else (0, 0)
# TODO: also test custom pad values
corr_op = theano.sandbox.cuda.blas.GpuCorrMM(
'valid', subsample, pad)(i, k)
# try to compile reference implementation without shape,
# so we don't have to compile hundreds of versions
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
border_mode=mode, subsample=subsample)
try:
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
except Exception:
# compile with shape information only when needed
conv_op = tensor.nnet.conv2d(i, k[:,:,::-1,::-1],
ishape, kshape, mode, subsample)
conv_op_di = theano.grad(conv_op.sum(), i)
conv_op_dk = theano.grad(conv_op.sum(), k)
corr_op_di = theano.grad(corr_op.sum(), i)
corr_op_dk = theano.grad(corr_op.sum(), k)
outputs = [corr_op, conv_op,
corr_op_di, conv_op_di,
corr_op_dk, conv_op_dk]
try:
conv_op_dik = theano.grad(conv_op_di.sum(), k)
conv_op_dki = theano.grad(conv_op_dk.sum(), i)
except Exception:
# skip if the reference implementation can't do it
print ".",
else:
corr_op_dik = theano.grad(corr_op_di.sum(), k)
corr_op_dki = theano.grad(corr_op_dk.sum(), i)
outputs.extend([corr_op_dik, conv_op_dik,
corr_op_dki, conv_op_dki])
print ":",
f = theano.function([i, k], outputs, mode=theano_mode)
allvals = f(npy_img, npy_kern)
for a, b, p in zip(allvals[::2], allvals[1::2],
('top', 'dtop/dbottom', 'dtop/dweight',
'dtop/dbottom/dweight', 'dtop/dweight/dbottom')):
if (a.shape != b.shape) or not numpy.allclose(a, b, rtol=1e-4):
print "Test failed for", p
print "mode: ", mode
print "ishape: ", ishape
print "kshape: ", kshape
print "subsample: ", subsample
assert False
sys.stdout.flush()
def benchmark(): def benchmark():
...@@ -1017,15 +962,15 @@ def benchmark(): ...@@ -1017,15 +962,15 @@ def benchmark():
,((2, 30,116,116), (20, 30, 9,9), (1, 1), (1, 1), (1, 1))#full conv_reference_full ,((2, 30,116,116), (20, 30, 9,9), (1, 1), (1, 1), (1, 1))#full conv_reference_full
] ]
# shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:]
version = [-1] version = [-1]
verbose = 1 verbose = 1
random = True random = True
exec_conv(version, shapes_valid, verbose, random, 'valid', for t in exec_conv(version, shapes_valid, verbose, random, 'valid',
print_=None, rtol=1e-3) print_=None, rtol=1e-3):
exec_conv(version, shapes_full, verbose, random, 'full') t[0](*t[1:])
for t in exec_conv(version, shapes_full, verbose, random, 'full'):
t[0](*t[1:])
def test_stack_rows_segfault_070312(): def test_stack_rows_segfault_070312():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论