提交 c4e6bded authored 作者: nouiz's avatar nouiz

Merge pull request #1187 from lamblin/fix_check_prealloc_out

Make check_preallocated_output work on fct outputs
......@@ -1609,7 +1609,11 @@ class _Linker(gof.link.LocalLinker):
active_order = self.schedule(fgraph) # an ordering of just the active nodes
active_order_set = set(active_order)
no_recycling = self.no_recycling
# Disable no_recycling, in order to be able to use
# check_preallocated_output even on the output of the function.
# no_recycling in individual thunks does not really matter, since
# the function's outputs will always be freshly allocated.
no_recycling = []
input_storage, output_storage, storage_map = link.map_storage(
fgraph, order, input_storage_, output_storage_)
......@@ -1704,11 +1708,14 @@ class _Linker(gof.link.LocalLinker):
_logger.warn("We won't check the perform function of node '%s' but we will check its make_thunk function" % node)
thunks_py[-1] = thunk
if no_recycling is True:
no_recycling = storage_map.values()
no_recycling = utils.difference(no_recycling, input_storage)
# Use self.no_recycling (that was passed in accept()) to always
# use new memory storage when it is needed, in particular for the
# function's outputs. no_recycling_map will be used in f() below.
if self.no_recycling is True:
no_recycling_map = storage_map.values()
no_recycling_map = utils.difference(no_recycling_map, input_storage)
else:
no_recycling = [storage_map[r] for r in no_recycling
no_recycling_map = [storage_map[r] for r in self.no_recycling
if r not in fgraph.inputs]
# Precompute some things for storage pre-allocation
......@@ -1729,7 +1736,7 @@ class _Linker(gof.link.LocalLinker):
_logger.debug("starting a DebugMode call")
_logger.debug("self.maker.mode.check_preallocated_output: %s",
self.maker.mode.check_preallocated_output)
for x in no_recycling:
for x in no_recycling_map:
x[0] = None
# nest all this in try-finally to put storage *back* into
......
......@@ -709,7 +709,7 @@ class Test_preallocated_output(unittest.TestCase):
a = theano.tensor.fmatrix('a')
b = theano.tensor.fmatrix('b')
z = BrokenCImplementationAdd()(a, b)
# Needed so that z is not the output of the graph
# In this test, we do not want z to be an output of the graph.
out = theano.tensor.dot(z, numpy.eye(7))
a_val = self.rng.randn(7, 7).astype('float32')
......@@ -730,7 +730,39 @@ class Test_preallocated_output(unittest.TestCase):
check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode)
if theano.config.cxx:
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
else:
# The python code of this op is good.
f(a_val, b_val)
def test_f_contiguous_out(self):
# Same test as test_f_contiguous, but check that it works
# even if z _is_ the output of the graph
a = theano.tensor.fmatrix('a')
b = theano.tensor.fmatrix('b')
out = BrokenCImplementationAdd()(a, b)
a_val = self.rng.randn(7, 7).astype('float32')
b_val = self.rng.randn(7, 7).astype('float32')
# Should work
mode = debugmode.DebugMode(
check_preallocated_output=['c_contiguous'])
f = theano.function([a, b], out, mode=mode)
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should raise an Exception, since the output buffer is
# used incorrectly.
mode = debugmode.DebugMode(
check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode)
if theano.config.cxx:
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
else:
......
......@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (%(memset_0)s)
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
......@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
return (5,)
return (7,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
......@@ -748,7 +748,7 @@ class GpuDownsampleFactorMax(GpuOp):
#def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return (5)
return (6)
def c_code(self, node, nodename, inp, out, sub):
x, = inp
......@@ -849,6 +849,9 @@ class GpuDownsampleFactorMax(GpuOp):
float *z, int zS0, int zS1, int zS2, int zS3)
{
float cur_max, cur_x;
// Cast threadIdx.x into a signed int, to avoid problems with
// indexing with negative offsets.
int tx = threadIdx.x;
for(int block_x_idx = blockIdx.x;
block_x_idx < D0 * D1;
block_x_idx += gridDim.x){
......@@ -865,7 +868,7 @@ class GpuDownsampleFactorMax(GpuOp):
{
__syncthreads();
// load the current row of the image into shared memory
for (int j = threadIdx.x; j < xD3; j += blockDim.x)
for (int j = tx; j < xD3; j += blockDim.x)
{
xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3];
}
......@@ -873,7 +876,7 @@ class GpuDownsampleFactorMax(GpuOp):
// initialize our max if this is the
// first row we're loading
cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
cur_max = (r2 == 0) ? xbuf[tx*pf3] : cur_max;
// do a mini-reduction over the pf3 relevant elements
// in the current row
......@@ -882,7 +885,7 @@ class GpuDownsampleFactorMax(GpuOp):
{
for (int k = 0; k < pf3; ++k)
{
cur_x = xbuf[threadIdx.x*pf3+k];
cur_x = xbuf[tx*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max;
}
}
......@@ -890,17 +893,16 @@ class GpuDownsampleFactorMax(GpuOp):
{
for (int k = 0; k < pf3; ++k)
{
if (threadIdx.x*pf3 + k < xD3)
if (tx*pf3 + k < xD3)
{
cur_x = xbuf[threadIdx.x*pf3+k];
cur_x = xbuf[tx*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max;
}
}
}
}
//store the result to global memory
z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
z[i0*zS0 + i1*zS1 + i2*zS2 + tx*zS3] = cur_max;
}
}
""" % locals()
......@@ -931,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
return Apply(self, [x, z, gz], [x.type()])
def c_code_cache_version(self):
return (6,)
return (7,)
def c_code(self, node, nodename, inp, out, sub):
x, z, gz = inp
......@@ -999,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
CudaNdarray_HOST_STRIDES(%(gz)s)[1],
CudaNdarray_HOST_STRIDES(%(gz)s)[2],
CudaNdarray_HOST_STRIDES(%(gz)s)[3],
CudaNdarray_DEV_DATA(%(gx)s));
CudaNdarray_DEV_DATA(%(gx)s),
CudaNdarray_HOST_STRIDES(%(gx)s)[0],
CudaNdarray_HOST_STRIDES(%(gx)s)[1],
CudaNdarray_HOST_STRIDES(%(gx)s)[2],
CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
......@@ -1037,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
const float * x, int xS0, int xS1, int xS2, int xS3,
const float * z, int zS0, int zS1, int zS2, int zS3,
const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
float *gx)
float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
{
// D0: number of image rows
// D1: number of image cols
......@@ -1048,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// various .S. variables are strides
float cur_max, cur_x, my_z, my_gz;
// Cast threadIdx.x into a signed int, to avoid problems with
// indexing with negative offsets.
int tx = threadIdx.x;
for(int i0 = blockIdx.x;
i0 < D0;
i0 += gridDim.x){
......@@ -1056,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
// (as needed to cover all x rows)
int i2 = blockIdx.y;
int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1
int x_col = tx; // col wrt x, ranges from 0 to xD3 - 1
int z_col = x_col/ds1; // z_col corresponding to this x_col
......@@ -1073,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
if(blockDim.x != xD3)
{
x_col = threadIdx.x + col_iter * blockDim.x;
x_col = tx + col_iter * blockDim.x;
z_col = x_col/ds1;
}
......@@ -1108,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// gx[image_row][image_col][x_row][x_col]
// = (my_z == x[image_row][image_col][
// x_row][x_col]) ? my_gz : 0.0f;
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
x_row*xD3 + x_col]
gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
x_col*xS3]) ? my_gz : 0.0f;
}
//gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
// x_row*xD3 + x_col] = -999;
}
}
......
......@@ -3241,14 +3241,19 @@ static __global__ void k_copy_4d(const int N1,
// These must be made int instead of unsigned int due to a bug in nvcc
int bx = blockIdx.x;
int by = blockIdx.y;
// N1 and N2 are kept in case a future implementation needs to
// loop on the first two dimensions if there are not enough blocks
for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
for (int i = bx; i < N1; i += gridDim.x)
{
for (int i = threadIdx.x; i < N3; i += (int) blockDim.x)
for (int j = by; j < N2; j += gridDim.y)
{
y[bx * sy1 + by * sy2 + i * sy3 + j * sy4] =
x[bx * sx1 + by * sx2 + i * sx3 + j * sx4];
for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
{
for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
{
y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
}
}
}
}
}
......@@ -3380,8 +3385,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
// The blocks implement the looping over the first two axes so
// this needs to be (N1, N2)
dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0],
(unsigned int) CudaNdarray_HOST_DIMS(self)[1]);
dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
NUM_VECTOR_OP_BLOCKS),
std::min(CudaNdarray_HOST_DIMS(self)[1],
NUM_VECTOR_OP_BLOCKS));
// For the threads, just make as many as possible
dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
......
......@@ -445,14 +445,14 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)s;
}
} else {
if(!CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) {
if(CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) {
PyErr_SetString(PyExc_ValueError,
"DeepCopyOp: the copy failed into already allocated space!");
%(fail)s;
}
}
""",
version=2)
version=3)
# THIS WORKS But CudaNdarray instances don't compare equal to one
......
......@@ -128,7 +128,7 @@ class MultinomialFromUniform(Op):
if unis.shape[0] != pvals.shape[0]:
raise ValueError("unis.shape[0] != pvals.shape[0]",
unis.shape[0], pvals.shape[0])
if not z[0] or z[0].shape != pvals.shape:
if z[0] is None or z[0].shape != pvals.shape:
z[0] = numpy.zeros(pvals.shape, dtype=node.outputs[0].dtype)
nb_multi = pvals.shape[0]
......
......@@ -965,7 +965,7 @@ class ConvOp(OpenMPOp):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>']
def c_code_cache_version(self):
return (9, self.openmp)
return (10, self.openmp)
def c_support_code(self):
return """
......@@ -1343,14 +1343,24 @@ if (typenum != typenum_f) {
%(fail)s;
}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if (!img2d)
{
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
)
{
{Py_XDECREF(%(z)s);}
......@@ -1370,19 +1380,11 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] *
PyArray_DIMS(%(z)s)[2] *
PyArray_DIMS(%(z)s)[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] *
PyArray_DIMS(%(z)s)[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s))
if (!PyArray_ISCONTIGUOUS(%(z)s))
{
PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
%(fail)s;
}
for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
......@@ -1862,14 +1864,24 @@ typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s;
if (!filtersflipped) %(fail)s;
if (!img2d)
{
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
)
{
{Py_XDECREF(%(z)s);}
......@@ -1889,10 +1901,11 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s;
//assertions
if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] *PyArray_DIMS(%(z)s)[2] *PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] * PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s;
if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s)) %(fail)s;
if (!PyArray_ISCONTIGUOUS(%(z)s))
{
PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
%(fail)s;
}
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论