提交 3dc94e90 authored 作者: abalkin's avatar abalkin

Merge branch 'master' into take-op-c-code-clean

...@@ -87,6 +87,7 @@ def main(): ...@@ -87,6 +87,7 @@ def main():
if time_prof_args or batch_args: if time_prof_args or batch_args:
from theano.tests import run_tests_in_batch from theano.tests import run_tests_in_batch
return run_tests_in_batch.main( return run_tests_in_batch.main(
theano_nose=os.path.realpath(__file__),
batch_size=batch_size, batch_size=batch_size,
time_profile=bool(time_prof_args), time_profile=bool(time_prof_args),
display_batch_output=display_batch_output) display_batch_output=display_batch_output)
......
...@@ -1609,7 +1609,11 @@ class _Linker(gof.link.LocalLinker): ...@@ -1609,7 +1609,11 @@ class _Linker(gof.link.LocalLinker):
active_order = self.schedule(fgraph) # an ordering of just the active nodes active_order = self.schedule(fgraph) # an ordering of just the active nodes
active_order_set = set(active_order) active_order_set = set(active_order)
no_recycling = self.no_recycling # Disable no_recycling, in order to be able to use
# check_preallocated_output even on the output of the function.
# no_recycling in individual thunks does not really matter, since
# the function's outputs will always be freshly allocated.
no_recycling = []
input_storage, output_storage, storage_map = link.map_storage( input_storage, output_storage, storage_map = link.map_storage(
fgraph, order, input_storage_, output_storage_) fgraph, order, input_storage_, output_storage_)
...@@ -1704,11 +1708,14 @@ class _Linker(gof.link.LocalLinker): ...@@ -1704,11 +1708,14 @@ class _Linker(gof.link.LocalLinker):
_logger.warn("We won't check the perform function of node '%s' but we will check its make_thunk function" % node) _logger.warn("We won't check the perform function of node '%s' but we will check its make_thunk function" % node)
thunks_py[-1] = thunk thunks_py[-1] = thunk
if no_recycling is True: # Use self.no_recycling (that was passed in accept()) to always
no_recycling = storage_map.values() # use new memory storage when it is needed, in particular for the
no_recycling = utils.difference(no_recycling, input_storage) # function's outputs. no_recycling_map will be used in f() below.
if self.no_recycling is True:
no_recycling_map = storage_map.values()
no_recycling_map = utils.difference(no_recycling_map, input_storage)
else: else:
no_recycling = [storage_map[r] for r in no_recycling no_recycling_map = [storage_map[r] for r in self.no_recycling
if r not in fgraph.inputs] if r not in fgraph.inputs]
# Precompute some things for storage pre-allocation # Precompute some things for storage pre-allocation
...@@ -1729,7 +1736,7 @@ class _Linker(gof.link.LocalLinker): ...@@ -1729,7 +1736,7 @@ class _Linker(gof.link.LocalLinker):
_logger.debug("starting a DebugMode call") _logger.debug("starting a DebugMode call")
_logger.debug("self.maker.mode.check_preallocated_output: %s", _logger.debug("self.maker.mode.check_preallocated_output: %s",
self.maker.mode.check_preallocated_output) self.maker.mode.check_preallocated_output)
for x in no_recycling: for x in no_recycling_map:
x[0] = None x[0] = None
# nest all this in try-finally to put storage *back* into # nest all this in try-finally to put storage *back* into
......
...@@ -709,7 +709,7 @@ class Test_preallocated_output(unittest.TestCase): ...@@ -709,7 +709,7 @@ class Test_preallocated_output(unittest.TestCase):
a = theano.tensor.fmatrix('a') a = theano.tensor.fmatrix('a')
b = theano.tensor.fmatrix('b') b = theano.tensor.fmatrix('b')
z = BrokenCImplementationAdd()(a, b) z = BrokenCImplementationAdd()(a, b)
# Needed so that z is not the output of the graph # In this test, we do not want z to be an output of the graph.
out = theano.tensor.dot(z, numpy.eye(7)) out = theano.tensor.dot(z, numpy.eye(7))
a_val = self.rng.randn(7, 7).astype('float32') a_val = self.rng.randn(7, 7).astype('float32')
...@@ -730,7 +730,39 @@ class Test_preallocated_output(unittest.TestCase): ...@@ -730,7 +730,39 @@ class Test_preallocated_output(unittest.TestCase):
check_preallocated_output=['f_contiguous']) check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode) f = theano.function([a, b], out, mode=mode)
if theano.config.cxx:
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
else:
# The python code of this op is good.
f(a_val, b_val)
def test_f_contiguous_out(self):
# Same test as test_f_contiguous, but check that it works
# even if z _is_ the output of the graph
a = theano.tensor.fmatrix('a')
b = theano.tensor.fmatrix('b')
out = BrokenCImplementationAdd()(a, b)
a_val = self.rng.randn(7, 7).astype('float32')
b_val = self.rng.randn(7, 7).astype('float32')
# Should work
mode = debugmode.DebugMode(
check_preallocated_output=['c_contiguous'])
f = theano.function([a, b], out, mode=mode)
out_val = f(a_val, b_val)
#print 'out_val =', out_val
#print out_val.strides
# Should raise an Exception, since the output buffer is
# used incorrectly.
mode = debugmode.DebugMode(
check_preallocated_output=['f_contiguous'])
f = theano.function([a, b], out, mode=mode)
if theano.config.cxx: if theano.config.cxx:
self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val) self.assertRaises(debugmode.BadThunkOutput, f, a_val, b_val)
else: else:
......
...@@ -298,7 +298,7 @@ AddConfigVar('warn.ignore_bug_before', ...@@ -298,7 +298,7 @@ AddConfigVar('warn.ignore_bug_before',
"bugs found after that version. " "bugs found after that version. "
"Warning for specific bugs can be configured with specific " "Warning for specific bugs can be configured with specific "
"[warn] flags."), "[warn] flags."),
EnumStr('None', 'all', '0.3', '0.4', '0.4.1', '0.5', '0.6', EnumStr('0.5', 'None', 'all', '0.3', '0.4', '0.4.1', '0.6',
allow_override=False), allow_override=False),
in_c_key=False) in_c_key=False)
......
...@@ -891,10 +891,11 @@ class ModuleCache(object): ...@@ -891,10 +891,11 @@ class ModuleCache(object):
hash_key = hash(key) hash_key = hash(key)
key_data = None key_data = None
# We have never seen this key before. # We have never seen this key before.
# Acquire lock before creating things in the compile cache,
# to avoid that other processes remove the compile dir while it # We acquire the lock later, only if we were able to
# is still empty. # generate C code. Otherwise, we would take the lock for ops
compilelock.get_lock() # that have only a perform().
lock_taken = False
# This try/finally block ensures that the lock is released once we # This try/finally block ensures that the lock is released once we
# are done writing in the cache file or after raising an exception. # are done writing in the cache file or after raising an exception.
try: try:
...@@ -918,6 +919,13 @@ class ModuleCache(object): ...@@ -918,6 +919,13 @@ class ModuleCache(object):
# The first compilation step is to yield the source code. # The first compilation step is to yield the source code.
src_code = compile_steps.next() src_code = compile_steps.next()
module_hash = get_module_hash(src_code, key) module_hash = get_module_hash(src_code, key)
# The op has c_code, so take the lock.
compilelock.get_lock()
lock_taken = True
assert os.path.exists(location), (
"The directory just created shouldn't be deleted!")
if module_hash in self.module_hash_to_key_data: if module_hash in self.module_hash_to_key_data:
_logger.debug("Duplicated module! Will re-use the " _logger.debug("Duplicated module! Will re-use the "
"previous one") "previous one")
...@@ -1039,7 +1047,7 @@ class ModuleCache(object): ...@@ -1039,7 +1047,7 @@ class ModuleCache(object):
finally: finally:
# Release lock if needed. # Release lock if needed.
if not keep_lock: if not keep_lock and lock_taken:
compilelock.release_lock() compilelock.release_lock()
# Update map from key to module name for all keys associated to # Update map from key to module name for all keys associated to
......
...@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp): ...@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp):
%(fail)s; %(fail)s;
} }
} }
if (%(memset_0)s) if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{ {
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0, if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4)) CudaNdarray_SIZE(%(out)s) * 4))
...@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp): ...@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs] return [None for i in inputs]
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (7,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
for client in node.outputs[0].clients: for client in node.outputs[0].clients:
...@@ -2803,6 +2803,13 @@ class GpuContiguous(GpuOp): ...@@ -2803,6 +2803,13 @@ class GpuContiguous(GpuOp):
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def grad(self, inputs, dout):
x, = inputs
dout, = dout
return [dout]
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
...@@ -2824,7 +2831,8 @@ class GpuContiguous(GpuOp): ...@@ -2824,7 +2831,8 @@ class GpuContiguous(GpuOp):
} else if ((NULL == %(z)s)""" % locals() } else if ((NULL == %(z)s)""" % locals()
for i in xrange(len(node.inputs[0].type.broadcastable)): for i in xrange(len(node.inputs[0].type.broadcastable)):
str += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % locals() str += "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])" % locals()
str += """) str += """
|| !CudaNdarray_is_c_contiguous(%(z)s))
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s); %(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
...@@ -2840,7 +2848,7 @@ class GpuContiguous(GpuOp): ...@@ -2840,7 +2848,7 @@ class GpuContiguous(GpuOp):
return str return str
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
gpu_contiguous = GpuContiguous() gpu_contiguous = GpuContiguous()
......
...@@ -748,7 +748,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -748,7 +748,7 @@ class GpuDownsampleFactorMax(GpuOp):
#def perform(self, node, input_storage, output_storage): #def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') #raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return (5) return (6)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
...@@ -849,6 +849,9 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -849,6 +849,9 @@ class GpuDownsampleFactorMax(GpuOp):
float *z, int zS0, int zS1, int zS2, int zS3) float *z, int zS0, int zS1, int zS2, int zS3)
{ {
float cur_max, cur_x; float cur_max, cur_x;
// Cast threadIdx.x into a signed int, to avoid problems with
// indexing with negative offsets.
int tx = threadIdx.x;
for(int block_x_idx = blockIdx.x; for(int block_x_idx = blockIdx.x;
block_x_idx < D0 * D1; block_x_idx < D0 * D1;
block_x_idx += gridDim.x){ block_x_idx += gridDim.x){
...@@ -865,7 +868,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -865,7 +868,7 @@ class GpuDownsampleFactorMax(GpuOp):
{ {
__syncthreads(); __syncthreads();
// load the current row of the image into shared memory // load the current row of the image into shared memory
for (int j = threadIdx.x; j < xD3; j += blockDim.x) for (int j = tx; j < xD3; j += blockDim.x)
{ {
xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3]; xbuf[j] = x[i0*xS0 + i1*xS1 + (i2*pf2+r2)*xS2 + j*xS3];
} }
...@@ -873,7 +876,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -873,7 +876,7 @@ class GpuDownsampleFactorMax(GpuOp):
// initialize our max if this is the // initialize our max if this is the
// first row we're loading // first row we're loading
cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max; cur_max = (r2 == 0) ? xbuf[tx*pf3] : cur_max;
// do a mini-reduction over the pf3 relevant elements // do a mini-reduction over the pf3 relevant elements
// in the current row // in the current row
...@@ -882,7 +885,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -882,7 +885,7 @@ class GpuDownsampleFactorMax(GpuOp):
{ {
for (int k = 0; k < pf3; ++k) for (int k = 0; k < pf3; ++k)
{ {
cur_x = xbuf[threadIdx.x*pf3+k]; cur_x = xbuf[tx*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max; cur_max = (cur_x > cur_max) ? cur_x : cur_max;
} }
} }
...@@ -890,17 +893,16 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -890,17 +893,16 @@ class GpuDownsampleFactorMax(GpuOp):
{ {
for (int k = 0; k < pf3; ++k) for (int k = 0; k < pf3; ++k)
{ {
if (threadIdx.x*pf3 + k < xD3) if (tx*pf3 + k < xD3)
{ {
cur_x = xbuf[threadIdx.x*pf3+k]; cur_x = xbuf[tx*pf3+k];
cur_max = (cur_x > cur_max) ? cur_x : cur_max; cur_max = (cur_x > cur_max) ? cur_x : cur_max;
} }
} }
} }
} }
//store the result to global memory z[i0*zS0 + i1*zS1 + i2*zS2 + tx*zS3] = cur_max;
z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
} }
} }
""" % locals() """ % locals()
...@@ -931,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -931,7 +933,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
return Apply(self, [x, z, gz], [x.type()]) return Apply(self, [x, z, gz], [x.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, z, gz = inp x, z, gz = inp
...@@ -999,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -999,7 +1001,11 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
CudaNdarray_HOST_STRIDES(%(gz)s)[1], CudaNdarray_HOST_STRIDES(%(gz)s)[1],
CudaNdarray_HOST_STRIDES(%(gz)s)[2], CudaNdarray_HOST_STRIDES(%(gz)s)[2],
CudaNdarray_HOST_STRIDES(%(gz)s)[3], CudaNdarray_HOST_STRIDES(%(gz)s)[3],
CudaNdarray_DEV_DATA(%(gx)s)); CudaNdarray_DEV_DATA(%(gx)s),
CudaNdarray_HOST_STRIDES(%(gx)s)[0],
CudaNdarray_HOST_STRIDES(%(gx)s)[1],
CudaNdarray_HOST_STRIDES(%(gx)s)[2],
CudaNdarray_HOST_STRIDES(%(gx)s)[3]);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
...@@ -1037,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1037,7 +1043,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
const float * x, int xS0, int xS1, int xS2, int xS3, const float * x, int xS0, int xS1, int xS2, int xS3,
const float * z, int zS0, int zS1, int zS2, int zS3, const float * z, int zS0, int zS1, int zS2, int zS3,
const float * gz, int gzS0, int gzS1, int gzS2, int gzS3, const float * gz, int gzS0, int gzS1, int gzS2, int gzS3,
float *gx) float *gx, int gxS0, int gxS1, int gxS2, int gxS3)
{ {
// D0: number of image rows // D0: number of image rows
// D1: number of image cols // D1: number of image cols
...@@ -1048,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1048,6 +1054,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// various .S. variables are strides // various .S. variables are strides
float cur_max, cur_x, my_z, my_gz; float cur_max, cur_x, my_z, my_gz;
// Cast threadIdx.x into a signed int, to avoid problems with
// indexing with negative offsets.
int tx = threadIdx.x;
for(int i0 = blockIdx.x; for(int i0 = blockIdx.x;
i0 < D0; i0 < D0;
i0 += gridDim.x){ i0 += gridDim.x){
...@@ -1056,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1056,7 +1066,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2 // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
// (as needed to cover all x rows) // (as needed to cover all x rows)
int i2 = blockIdx.y; int i2 = blockIdx.y;
int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1 int x_col = tx; // col wrt x, ranges from 0 to xD3 - 1
int z_col = x_col/ds1; // z_col corresponding to this x_col int z_col = x_col/ds1; // z_col corresponding to this x_col
...@@ -1073,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1073,7 +1083,7 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
if(blockDim.x != xD3) if(blockDim.x != xD3)
{ {
x_col = threadIdx.x + col_iter * blockDim.x; x_col = tx + col_iter * blockDim.x;
z_col = x_col/ds1; z_col = x_col/ds1;
} }
...@@ -1108,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1108,13 +1118,10 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
// gx[image_row][image_col][x_row][x_col] // gx[image_row][image_col][x_row][x_col]
// = (my_z == x[image_row][image_col][ // = (my_z == x[image_row][image_col][
// x_row][x_col]) ? my_gz : 0.0f; // x_row][x_col]) ? my_gz : 0.0f;
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + gx[i0*gxS0 + i1*gxS1 + x_row*gxS2 + x_col*gxS3]
x_row*xD3 + x_col]
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
x_col*xS3]) ? my_gz : 0.0f; x_col*xS3]) ? my_gz : 0.0f;
} }
//gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
// x_row*xD3 + x_col] = -999;
} }
} }
......
...@@ -3241,14 +3241,19 @@ static __global__ void k_copy_4d(const int N1, ...@@ -3241,14 +3241,19 @@ static __global__ void k_copy_4d(const int N1,
// These must be made int instead of unsigned int due to a bug in nvcc // These must be made int instead of unsigned int due to a bug in nvcc
int bx = blockIdx.x; int bx = blockIdx.x;
int by = blockIdx.y; int by = blockIdx.y;
// N1 and N2 are kept in case a future implementation needs to
// loop on the first two dimensions if there are not enough blocks for (int i = bx; i < N1; i += gridDim.x)
for (int j = threadIdx.y; j < (int) N4; j += (int) blockDim.y)
{ {
for (int i = threadIdx.x; i < N3; i += (int) blockDim.x) for (int j = by; j < N2; j += gridDim.y)
{ {
y[bx * sy1 + by * sy2 + i * sy3 + j * sy4] = for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
x[bx * sx1 + by * sx2 + i * sx3 + j * sx4]; {
for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
{
y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
}
}
} }
} }
} }
...@@ -3380,8 +3385,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self, ...@@ -3380,8 +3385,10 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
// The blocks implement the looping over the first two axes so // The blocks implement the looping over the first two axes so
// this needs to be (N1, N2) // this needs to be (N1, N2)
dim3 n_blocks( (unsigned int) CudaNdarray_HOST_DIMS(self)[0], dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
(unsigned int) CudaNdarray_HOST_DIMS(self)[1]); NUM_VECTOR_OP_BLOCKS),
std::min(CudaNdarray_HOST_DIMS(self)[1],
NUM_VECTOR_OP_BLOCKS));
// For the threads, just make as many as possible // For the threads, just make as many as possible
dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2], dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
(unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK), (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
#else #else
#define DllExport __declspec( dllimport ) #define DllExport __declspec( dllimport )
#endif #endif
#else #define ALWAYS_INLINE
#else //else _WIN32
#define DllExport #define DllExport
#define ALWAYS_INLINE __attribute__((always_inline))
#endif #endif
typedef float real; typedef float real;
...@@ -134,7 +136,7 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self); ...@@ -134,7 +136,7 @@ CudaNdarray_HOST_STRIDES(const CudaNdarray * self);
DllExport const int * DllExport const int *
CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self); CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self);
DllExport inline void __attribute__((always_inline)) DllExport inline void ALWAYS_INLINE
cnda_mark_dev_structure_dirty(CudaNdarray * self) cnda_mark_dev_structure_dirty(CudaNdarray * self)
{ {
self->dev_structure_fresh = 0; self->dev_structure_fresh = 0;
...@@ -155,7 +157,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2); ...@@ -155,7 +157,7 @@ CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2);
* *
* Does not sync structure to device. * Does not sync structure to device.
*/ */
DllExport inline void __attribute__((always_inline)) DllExport inline void ALWAYS_INLINE
CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
{ {
if ((idx >= self->nd) || (idx < 0) || (d < 0)) if ((idx >= self->nd) || (idx < 0) || (d < 0))
...@@ -173,7 +175,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d) ...@@ -173,7 +175,7 @@ CudaNdarray_set_dim(CudaNdarray * self, int idx, int d)
} }
DllExport inline void __attribute__((always_inline)) DllExport inline void ALWAYS_INLINE
CudaNdarray_set_stride(CudaNdarray * self, int idx, int s) CudaNdarray_set_stride(CudaNdarray * self, int idx, int s)
{ {
if ((idx >= self->nd) || (idx < 0)) if ((idx >= self->nd) || (idx < 0))
...@@ -232,7 +234,7 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd); ...@@ -232,7 +234,7 @@ DllExport PyObject * CudaNdarray_new_nd(const int nd);
* Note: This does not allocate storage for data, or free * Note: This does not allocate storage for data, or free
* pre-existing storage. * pre-existing storage.
*/ */
DllExport inline int __attribute__((always_inline)) DllExport inline int ALWAYS_INLINE
CudaNdarray_set_nd(CudaNdarray * self, const int nd) CudaNdarray_set_nd(CudaNdarray * self, const int nd)
{ {
if (nd != self->nd) if (nd != self->nd)
...@@ -434,7 +436,7 @@ CudaNdarray_ZEROS(int n, int * dims); ...@@ -434,7 +436,7 @@ CudaNdarray_ZEROS(int n, int * dims);
/** /**
* True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1] * True iff the strides look like [dim[nd-2], dim[nd-3], ... , dim[0], 1]
*/ */
DllExport inline bool __attribute__((always_inline)) DllExport inline bool ALWAYS_INLINE
CudaNdarray_is_c_contiguous(const CudaNdarray * self) CudaNdarray_is_c_contiguous(const CudaNdarray * self)
{ {
bool c_contiguous = true; bool c_contiguous = true;
......
...@@ -445,14 +445,14 @@ theano.compile.register_deep_copy_op_c_code( ...@@ -445,14 +445,14 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)s; %(fail)s;
} }
} else { } else {
if(!CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) { if(CudaNdarray_CopyFromCudaNdarray(%(oname)s, %(iname)s)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"DeepCopyOp: the copy failed into already allocated space!"); "DeepCopyOp: the copy failed into already allocated space!");
%(fail)s; %(fail)s;
} }
} }
""", """,
version=2) version=3)
# THIS WORKS But CudaNdarray instances don't compare equal to one # THIS WORKS But CudaNdarray instances don't compare equal to one
......
...@@ -128,7 +128,7 @@ class MultinomialFromUniform(Op): ...@@ -128,7 +128,7 @@ class MultinomialFromUniform(Op):
if unis.shape[0] != pvals.shape[0]: if unis.shape[0] != pvals.shape[0]:
raise ValueError("unis.shape[0] != pvals.shape[0]", raise ValueError("unis.shape[0] != pvals.shape[0]",
unis.shape[0], pvals.shape[0]) unis.shape[0], pvals.shape[0])
if not z[0] or z[0].shape != pvals.shape: if z[0] is None or z[0].shape != pvals.shape:
z[0] = numpy.zeros(pvals.shape, dtype=node.outputs[0].dtype) z[0] = numpy.zeros(pvals.shape, dtype=node.outputs[0].dtype)
nb_multi = pvals.shape[0] nb_multi = pvals.shape[0]
......
...@@ -696,7 +696,7 @@ def test_random_state_transfer(): ...@@ -696,7 +696,7 @@ def test_random_state_transfer():
""" """
Test that random state can be transferred from one theano graph to another. Test that random state can be transferred from one theano graph to another.
""" """
class Graph(): class Graph:
def __init__(self, seed=123): def __init__(self, seed=123):
self.rng = MRG_RandomStreams(seed) self.rng = MRG_RandomStreams(seed)
self.y = self.rng.uniform(size=(1,)) self.y = self.rng.uniform(size=(1,))
......
...@@ -3331,8 +3331,8 @@ class T_Scan(unittest.TestCase): ...@@ -3331,8 +3331,8 @@ class T_Scan(unittest.TestCase):
outputs_info=[tensor.zeros_like(A)]) outputs_info=[tensor.zeros_like(A)])
f = theano.function([A,B], S.owner.inputs[0][-1]) f = theano.function([A,B], S.owner.inputs[0][-1])
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
vA = rng.uniform(size=(5,5)) vA = rng.uniform(size=(5, 5)).astype(theano.config.floatX)
vB = rng.uniform(size=(5,5)) vB = rng.uniform(size=(5, 5)).astype(theano.config.floatX)
assert numpy.allclose(f(vA, vB), numpy.dot(vA.T, vB)) assert numpy.allclose(f(vA, vB), numpy.dot(vA.T, vB))
......
...@@ -1615,7 +1615,7 @@ def local_gemm_to_ger(node): ...@@ -1615,7 +1615,7 @@ def local_gemm_to_ger(node):
yv = y.dimshuffle(1) yv = y.dimshuffle(1)
try: try:
bval = T.get_scalar_constant_value(b) bval = T.get_scalar_constant_value(b)
except TypeError: except T.NotScalarConstantError:
# b isn't a constant, GEMM is doing useful pre-scaling # b isn't a constant, GEMM is doing useful pre-scaling
return return
......
...@@ -965,7 +965,7 @@ class ConvOp(OpenMPOp): ...@@ -965,7 +965,7 @@ class ConvOp(OpenMPOp):
return ['<numpy/noprefix.h>', '<iostream>', '<sstream>'] return ['<numpy/noprefix.h>', '<iostream>', '<sstream>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (9, self.openmp) return (10, self.openmp)
def c_support_code(self): def c_support_code(self):
return """ return """
...@@ -1343,14 +1343,24 @@ if (typenum != typenum_f) { ...@@ -1343,14 +1343,24 @@ if (typenum != typenum_f) {
%(fail)s; %(fail)s;
} }
if (!img2d) %(fail)s; if (!img2d)
if (!filtersflipped) %(fail)s; {
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s) if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4 || *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s) ||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s) ||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0]) ||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1]) ||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
) )
{ {
{Py_XDECREF(%(z)s);} {Py_XDECREF(%(z)s);}
...@@ -1370,19 +1380,11 @@ Os[0]=%(self_outshp0)s; ...@@ -1370,19 +1380,11 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s; Os[1]=%(self_outshp1)s;
//assertions //assertions
if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] * if (!PyArray_ISCONTIGUOUS(%(z)s))
PyArray_DIMS(%(z)s)[2] * {
PyArray_DIMS(%(z)s)[3] * PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] *
PyArray_DIMS(%(z)s)[3] *
(npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s))
%(fail)s;
if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s))
%(fail)s; %(fail)s;
}
for(int b=0;b< %(self_bsize)s;b++){ for(int b=0;b< %(self_bsize)s;b++){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern++){
...@@ -1862,14 +1864,24 @@ typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0); ...@@ -1862,14 +1864,24 @@ typenum_f = PyArray_ObjectType((PyObject*)%(filtersflipped)s, 0);
if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;} if (typenum < 0) {PyErr_SetString(PyExc_ValueError, "Invalid type"); %(fail)s;}
if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;} if (typenum != typenum_f) {PyErr_SetString(PyExc_ValueError, "Input types must match"); %(fail)s;}
if (!img2d) %(fail)s; if (!img2d)
if (!filtersflipped) %(fail)s; {
PyErr_SetString(PyExc_AssertionError, "!img2d");
%(fail)s;
}
if (!filtersflipped)
{
PyErr_SetString(PyExc_AssertionError, "!filtersflipped");
%(fail)s;
}
if ((!%(z)s) if ((!%(z)s)
|| *PyArray_DIMS(%(z)s)!=4 || *PyArray_DIMS(%(z)s)!=4
||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s) ||(PyArray_DIMS(%(z)s)[0] != %(self_bsize)s)
||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s) ||(PyArray_DIMS(%(z)s)[1] != %(self_nkern)s)
||(PyArray_DIMS(%(z)s)[2] != dim_zz[0]) ||(PyArray_DIMS(%(z)s)[2] != dim_zz[0])
|| (PyArray_DIMS(%(z)s)[3] != dim_zz[1]) ||(PyArray_DIMS(%(z)s)[3] != dim_zz[1])
||!PyArray_ISCONTIGUOUS(%(z)s)
) )
{ {
{Py_XDECREF(%(z)s);} {Py_XDECREF(%(z)s);}
...@@ -1889,10 +1901,11 @@ Os[0]=%(self_outshp0)s; ...@@ -1889,10 +1901,11 @@ Os[0]=%(self_outshp0)s;
Os[1]=%(self_outshp1)s; Os[1]=%(self_outshp1)s;
//assertions //assertions
if (PyArray_STRIDES(%(z)s)[0] != PyArray_DIMS(%(z)s)[1] *PyArray_DIMS(%(z)s)[2] *PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; if (!PyArray_ISCONTIGUOUS(%(z)s))
if (PyArray_STRIDES(%(z)s)[1] != PyArray_DIMS(%(z)s)[2] * PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; {
if (PyArray_STRIDES(%(z)s)[2] != PyArray_DIMS(%(z)s)[3] * (npy_intp)sizeof(%(type)s)) %(fail)s; PyErr_SetString(PyExc_AssertionError, "Output (%(z)s) not contiguous");
if (PyArray_STRIDES(%(z)s)[3] != (npy_intp)sizeof(%(type)s)) %(fail)s; %(fail)s;
}
for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){ for(int b=0;b< %(self_bsize)s ;b+=%(unroll_bsize)s){
for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){ for(int n_kern=0;n_kern<%(self_nkern)s;n_kern+=%(unroll_ksize)s){
......
...@@ -1185,6 +1185,9 @@ def local_subtensor_make_vector(node): ...@@ -1185,6 +1185,9 @@ def local_subtensor_make_vector(node):
# if it is a constant we can do something with it # if it is a constant we can do something with it
try: try:
v = get_scalar_constant_value(idx) v = get_scalar_constant_value(idx)
if isinstance(v, numpy.integer):
# Python 2.4 wants to index only with Python integers
v = int(v)
return [x.owner.inputs[v]] return [x.owner.inputs[v]]
except NotScalarConstantError: except NotScalarConstantError:
pass pass
......
...@@ -417,8 +417,8 @@ def makeTester(name, op, expected, checks=None, good=None, bad_build=None, ...@@ -417,8 +417,8 @@ def makeTester(name, op, expected, checks=None, good=None, bad_build=None,
def rand(*shape): def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype=config.floatX) r = numpy.random.rand(*shape) * 2 - 1
return r * 2 - 1 return numpy.asarray(r, dtype=config.floatX)
def rand_nonzero(shape, eps=3e-4): def rand_nonzero(shape, eps=3e-4):
...@@ -4270,8 +4270,9 @@ class t_dot(unittest.TestCase): ...@@ -4270,8 +4270,9 @@ class t_dot(unittest.TestCase):
return type(x), x.dtype, x.shape return type(x), x.dtype, x.shape
nz = numpy.dot(x, y) nz = numpy.dot(x, y)
tz = eval_outputs([dot(as_tensor_variable(x), as_tensor_variable(y))]) tz = eval_outputs([dot(as_tensor_variable(x), as_tensor_variable(y))])
self.assertTrue(tz.dtype == nz.dtype) self.assertTrue(tz.dtype == nz.dtype,
self.assertTrue(tz.shape == nz.shape) (tz.dtype, tz.dtype.num, nz.dtype, nz.dtype.num))
self.assertTrue(tz.shape == nz.shape, (tz.shape, nz.shape))
self.assertTrue(_approx_eq(nz, tz)) self.assertTrue(_approx_eq(nz, tz))
def test_Op_dims(self): def test_Op_dims(self):
...@@ -4300,19 +4301,19 @@ class t_dot(unittest.TestCase): ...@@ -4300,19 +4301,19 @@ class t_dot(unittest.TestCase):
self.assertRaises(TypeError, _dot, d3, d3) self.assertRaises(TypeError, _dot, d3, d3)
def test_dot_0d_0d(self): def test_dot_0d_0d(self):
self.cmp_dot(1.1, 2.2) self.cmp_dot(rand(), rand())
def test_dot_0d_1d(self): def test_dot_0d_1d(self):
self.cmp_dot(1.1, rand(5)) self.cmp_dot(rand(), rand(5))
def test_dot_0d_2d(self): def test_dot_0d_2d(self):
self.cmp_dot(3.0, rand(6,7)) self.cmp_dot(rand(), rand(6,7))
def test_dot_0d_3d(self): def test_dot_0d_3d(self):
self.cmp_dot(3.0, rand(8,6,7)) self.cmp_dot(rand(), rand(8,6,7))
def test_dot_1d_0d(self): def test_dot_1d_0d(self):
self.cmp_dot(rand(5), 1.1 ) self.cmp_dot(rand(5), rand())
def test_dot_1d_1d(self): def test_dot_1d_1d(self):
self.cmp_dot(rand(5), rand(5)) self.cmp_dot(rand(5), rand(5))
...@@ -4344,7 +4345,7 @@ class t_dot(unittest.TestCase): ...@@ -4344,7 +4345,7 @@ class t_dot(unittest.TestCase):
self.cmp_dot(rand(6), rand(8,6,7)) self.cmp_dot(rand(6), rand(8,6,7))
def test_dot_2d_0d(self): def test_dot_2d_0d(self):
self.cmp_dot(rand(5,6), 1.0) self.cmp_dot(rand(5,6), rand())
def test_dot_2d_1d(self): def test_dot_2d_1d(self):
self.cmp_dot(rand(5, 6), rand(6)) self.cmp_dot(rand(5, 6), rand(6))
...@@ -4380,7 +4381,7 @@ class t_dot(unittest.TestCase): ...@@ -4380,7 +4381,7 @@ class t_dot(unittest.TestCase):
self.cmp_dot(rand(5,6), rand(8,6,7)) self.cmp_dot(rand(5,6), rand(8,6,7))
def test_dot_3d_0d(self): def test_dot_3d_0d(self):
self.cmp_dot(rand(4,5,6), 1.0) self.cmp_dot(rand(4,5,6), rand())
def test_dot_3d_1d(self): def test_dot_3d_1d(self):
self.cmp_dot(rand(4,5,6), rand(6)) self.cmp_dot(rand(4,5,6), rand(6))
......
...@@ -5,6 +5,7 @@ import sys ...@@ -5,6 +5,7 @@ import sys
import theano.tensor as T import theano.tensor as T
from theano import tensor from theano import tensor
from theano.gof.python25 import product as itertools_product from theano.gof.python25 import product as itertools_product
from theano.gof.python25 import any
from theano.printing import pp from theano.printing import pp
import numpy import numpy
...@@ -857,7 +858,6 @@ def test_dot22(): ...@@ -857,7 +858,6 @@ def test_dot22():
assert _dot22 in [x.op for x in topo], (dtype1, dtype2) assert _dot22 in [x.op for x in topo], (dtype1, dtype2)
else: else:
check = [isinstance(x.op, T.Dot) for x in topo] check = [isinstance(x.op, T.Dot) for x in topo]
from theano.gof.python25 import any
assert any(check), (dtype1, dtype2) assert any(check), (dtype1, dtype2)
rng = numpy.random.RandomState(unittest_tools.fetch_seed()) rng = numpy.random.RandomState(unittest_tools.fetch_seed())
...@@ -1603,6 +1603,13 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin): ...@@ -1603,6 +1603,13 @@ class TestGer(TestCase, unittest_tools.TestOptimizationMixin):
self.A, self.a, self.x.dimshuffle(0, 'x'), self.A, self.a, self.x.dimshuffle(0, 'x'),
self.y.dimshuffle('x', 0), self.b(1.5)).owner) self.y.dimshuffle('x', 0), self.b(1.5)).owner)
def test_b_nonconst_does_not_triggers_ger(self):
""" test local_gemm_to_ger opt"""
assert not T.blas.local_gemm_to_ger.transform(
gemm_no_inplace(
self.A, self.a, self.x.dimshuffle(0, 'x'),
self.y.dimshuffle('x', 0), self.a).owner)
def test_outer(self): def test_outer(self):
f = self.function([self.x, self.y], T.outer(self.x, self.y)) f = self.function([self.x, self.y], T.outer(self.x, self.y))
self.assertFunctionContains(f, self.ger_destructive) self.assertFunctionContains(f, self.ger_destructive)
......
...@@ -101,7 +101,7 @@ def main(stdout=None, stderr=None, argv=None, theano_nose=None, ...@@ -101,7 +101,7 @@ def main(stdout=None, stderr=None, argv=None, theano_nose=None,
theano_nose = path theano_nose = path
break break
if theano_nose is None: if theano_nose is None:
raise Exception("Not able to find theano_nose") raise Exception("Not able to find theano-nose")
if batch_size is None: if batch_size is None:
batch_size = 100 batch_size = 100
stdout_backup = sys.stdout stdout_backup = sys.stdout
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论