提交 904a523d authored 作者: Kelvin Xu's avatar Kelvin Xu 提交者: Kelvin Xu

added copy on negative strides

上级 013bc89d
......@@ -3407,6 +3407,76 @@ class GpuAlloc(GpuOp):
# Module-level instance of the GpuAlloc op.
gpu_alloc = GpuAlloc()
class CopyOnNegativeStrides(GpuOp):
    """
    Pass the input through unless it has a negative stride.

    If any stride of the input is negative, the output is a
    c contiguous copy of it; otherwise the output is the input itself.
    """
    # Output 0 can be the very same object as input 0 (the case where no
    # stride is negative), so it is declared as a view of that input.
    view_map = {0: [0]}
    check_input = False
    # The op has no configuration attributes.
    __props__ = ()

    def grad(self, inputs, dout):
        """
        Return the gradient with respect to the input.

        The single output gradient is converted with
        as_cuda_ndarray_variable and returned unchanged otherwise.
        """
        g_out, = dout
        return [as_cuda_ndarray_variable(g_out)]

    def make_node(self, input):
        """
        Build the Apply node for `input`.

        The input is converted with as_cuda_ndarray_variable; the single
        output is a fresh variable of the converted input's type.
        """
        gpu_input = as_cuda_ndarray_variable(input)
        return Apply(self, [gpu_input], [gpu_input.type()])

    def perform(self, node, inp, out):
        """
        Python implementation.

        Stores inp[0] in out[0][0], replacing it by inp[0].copy() first
        when at least one of its strides is negative.
        """
        value = inp[0]
        if any(stride < 0 for stride in value.strides):
            value = value.copy()
        out[0][0] = value

    def c_code(self, node, name, inp, out, sub):
        """
        Return the C implementation as a string.

        The generated code scans the host strides of the input:

        - no negative stride: the output is made to point at the input
          (taking a new reference);
        - otherwise, when the current output is NULL, differs from the
          input in some dimension, or is not c contiguous: the output is
          replaced by CudaNdarray_Copy(input);
        - otherwise the input is copied into the existing output with
          CudaNdarray_CopyFromCudaNdarray.

        Failures run the `sub['fail']` code.
        """
        src, = inp
        dst, = out
        # Explicit substitution table instead of `% locals()`; the keys
        # are the placeholder names used by the C templates below.
        subs = {'input': src, 'z': dst, 'fail': sub['fail']}

        head = """
{
bool strides_all_positive = true;
for (int i = 0; i < CudaNdarray_NDIM(%(input)s); i++){
if (CudaNdarray_HOST_STRIDES(%(input)s)[i] < 0){
strides_all_positive = false;
break;
}
}
if (strides_all_positive){
Py_XDECREF(%(z)s);
%(z)s = %(input)s;
Py_INCREF(%(z)s);
} else if ((NULL == %(z)s)""" % subs

        # One shape comparison per dimension, so an existing output is
        # only reused when its shape matches the input's.
        dim_check = ("\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s]"
                     " != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])")
        dim_checks = "".join(
            dim_check % dict(subs, i=dim)
            for dim in xrange(node.inputs[0].type.ndim))

        tail = """
|| !CudaNdarray_is_c_contiguous(%(z)s))
{
Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
if (!%(z)s)
{
%(fail)s;
}
}else if(CudaNdarray_CopyFromCudaNdarray(%(z)s,%(input)s)){
%(fail)s;
}
}
""" % subs

        return head + dim_checks + tail

    def c_code_cache_version(self):
        """Return the version tuple of the generated C code."""
        return (0,)
# Shared instance of the op; applied to a variable as
# cp_on_negative_strides(x).
cp_on_negative_strides = CopyOnNegativeStrides()
class GpuContiguous(GpuOp):
"""
Always return a c contiguous output. Copy the input only if it is
......
......@@ -14,7 +14,8 @@ from theano.tensor.basic import ShapeError
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu)
gpu_contiguous, HostFromGpu,
cp_on_negative_strides)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import GpuSoftmax
......@@ -630,7 +631,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3)
# Standard case: We use GpuDnnConv with suitable padding.
img = gpu_contiguous(img)
# cp_on_negative_strides returns a C-contiguous copy of img only when
# img has negative strides; otherwise img is passed through as is.
img = cp_on_negative_strides(img)
kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)(img.shape, kerns.shape)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论