提交 92c62153 authored 作者: abergeron's avatar abergeron

Merge pull request #2551 from kelvinxu/gpu_continuous_check

Check negative strides in gpu_contiguous
...@@ -3407,6 +3407,76 @@ class GpuAlloc(GpuOp): ...@@ -3407,6 +3407,76 @@ class GpuAlloc(GpuOp):
gpu_alloc = GpuAlloc() gpu_alloc = GpuAlloc()
class CopyOnNegativeStrides(GpuOp):
    """
    Pass the input through unless one of its strides is negative.

    When every stride is non-negative the output is the input object
    itself.  When at least one stride is negative the output is a copy
    of the input (a c contiguous copy).
    """
    # Output 0 can be the very same object as input 0 (see `perform`
    # and the "strides_all_positive" branch of `c_code`).
    view_map = {0: [0]}
    check_input = False
    __props__ = ()

    def grad(self, inputs, dout):
        """Forward the single output gradient, converted to a cuda variable."""
        # Both sequences are expected to hold exactly one element; the
        # tuple unpacking raises otherwise.
        (_,) = inputs
        (out_grad,) = dout
        return [as_cuda_ndarray_variable(out_grad)]

    def make_node(self, input):
        """Build the Apply node; the output has the same type as the input."""
        gpu_input = as_cuda_ndarray_variable(input)
        return Apply(self, [gpu_input], [gpu_input.type()])

    def perform(self, node, inp, out):
        """Python implementation: copy only if some stride is negative."""
        src = inp[0]
        needs_copy = any(stride < 0 for stride in src.strides)
        out[0][0] = src.copy() if needs_copy else src

    def c_code(self, node, name, inp, out, sub):
        """
        Return the C source implementing this op.

        The generated code aliases the output to the input when no
        stride is negative.  Otherwise it either allocates a fresh copy
        (output missing, wrong shape or not c contiguous) or copies the
        input into the existing output buffer.
        """
        (src,) = inp
        (dst,) = out
        # Keys match the %(...)s placeholders used in the C templates.
        subs = {'input': src, 'z': dst, 'fail': sub['fail']}

        head = """
{
bool strides_all_positive = true;
for (int i = 0; i < CudaNdarray_NDIM(%(input)s); i++){
if (CudaNdarray_HOST_STRIDES(%(input)s)[i] < 0){
strides_all_positive = false;
break;
}
}
if (strides_all_positive){
Py_XDECREF(%(z)s);
%(z)s = %(input)s;
Py_INCREF(%(z)s);
} else if ((NULL == %(z)s)""" % subs

        # One shape comparison per dimension of the input type.
        dim_checks = [
            "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % dict(subs, i=axis)
            for axis in xrange(node.inputs[0].type.ndim)
        ]

        tail = """
|| !CudaNdarray_is_c_contiguous(%(z)s))
{
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
if (!%(z)s)
{
%(fail)s;
}
}else if(CudaNdarray_CopyFromCudaNdarray(%(z)s,%(input)s)){
%(fail)s;
}
}
""" % subs

        return head + "".join(dim_checks) + tail

    def c_code_cache_version(self):
        """Version tuple for the generated C code."""
        return (0,)
# Shared module-level instance of CopyOnNegativeStrides; call it on a
# variable, e.g. `cp_on_negative_strides(x)`.
cp_on_negative_strides = CopyOnNegativeStrides()
class GpuContiguous(GpuOp): class GpuContiguous(GpuOp):
""" """
Always return a c contiguous output. Copy the input only if it is Always return a c contiguous output. Copy the input only if it is
......
...@@ -14,7 +14,8 @@ from theano.tensor.basic import ShapeError ...@@ -14,7 +14,8 @@ from theano.tensor.basic import ShapeError
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu) gpu_contiguous, HostFromGpu,
cp_on_negative_strides)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax from theano.sandbox.cuda.nnet import GpuSoftmax
...@@ -394,7 +395,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -394,7 +395,7 @@ class GpuDnnConv(DnnBase, COp):
img, kerns, desc = inp img, kerns, desc = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = cp_on_negative_strides(top)
d_img = GpuDnnConvGradI()(kerns, top, desc, d_img = GpuDnnConvGradI()(kerns, top, desc,
img.shape[2], img.shape[3]) img.shape[2], img.shape[3])
...@@ -520,7 +521,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -520,7 +521,7 @@ class GpuDnnConvGradI(DnnBase, COp):
kerns, top, desc, h, w = inp kerns, top, desc, h, w = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = cp_on_negative_strides(img)
d_kerns = GpuDnnConvGradW()(img, top, desc, d_kerns = GpuDnnConvGradW()(img, top, desc,
kerns.shape[2], kerns.shape[3]) kerns.shape[2], kerns.shape[3])
...@@ -630,7 +631,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -630,7 +631,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3) return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
img = gpu_contiguous(img) # cp_on_negative_strides will return a gpu_contiguous copy
# if the img contains negative strides
img = cp_on_negative_strides(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape) conv_mode=conv_mode)(img.shape, kerns.shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论