提交 e16f2ec7 authored 作者: James Bergstra's avatar James Bergstra

merge

......@@ -6,6 +6,9 @@
#include "cuda_ndarray.cuh"
// If true, when there is a GPU malloc or free error, we print the size of memory currently allocated on the device.
#define COMPUTE_GPU_MEM_USED false
/////////////////////////
// Alloc and Free
/////////////////////////
......@@ -18,17 +21,40 @@
*
*/
int _outstanding_mallocs[] = {0,0};
#if COMPUTE_GPU_MEM_USED
int _allocated_size = 0;
const int TABLE_SIZE = 10000;
struct table_struct{
void* ptr;
int size;
};
table_struct _alloc_size_table[TABLE_SIZE];
#endif
void * device_malloc(size_t size)
{
    // Allocate `size` bytes of device memory via cudaMalloc.
    // Returns the device pointer, or NULL with a Python MemoryError set on failure.
    void * rval = NULL;
    cudaError_t err = cudaMalloc(&rval, size);
    if (cudaSuccess != err)
    {
#if COMPUTE_GPU_MEM_USED
        fprintf(stderr, "Error allocating %li bytes of device memory (%s). %d already allocated\n", (long)size, cudaGetErrorString(err),_allocated_size);
#else
        fprintf(stderr, "Error allocating %li bytes of device memory (%s).\n", (long)size, cudaGetErrorString(err));
#endif
        PyErr_Format(PyExc_MemoryError, "error allocating %li bytes of device memory (%s)", (long)size, cudaGetErrorString(err));
        return NULL;
    }
    _outstanding_mallocs[0] += (rval != NULL);
#if COMPUTE_GPU_MEM_USED
    // Record the allocation so device_free() can subtract its size later.
    int recorded = 0;
    for (int i = 0; i < TABLE_SIZE; i++)
    {
        if (NULL == _alloc_size_table[i].ptr)
        {
            _alloc_size_table[i].ptr = rval;
            // NOTE: the table stores `int`; very large size_t allocations truncate here.
            _alloc_size_table[i].size = size;
            recorded = 1;
            break;
        }
    }
    if (recorded)
    {
        _allocated_size += size;
    }
    else
    {
        // Bug fix: previously _allocated_size was incremented even when the
        // table was full, so device_free() could never subtract that size and
        // the counter drifted upward forever. Keep the counter consistent
        // with the table contents and warn instead.
        fprintf(stderr, "Allocation table full (%d entries); %li bytes not tracked.\n",
                TABLE_SIZE, (long)size);
    }
#endif
    return rval;
}
int device_free(void *ptr)
{
    // Free device memory previously obtained from device_malloc().
    // Returns 0 on success, -1 with a Python MemoryError set on failure.
    cudaError_t err = cudaFree(ptr);
    if (cudaSuccess != err)
    {
#if COMPUTE_GPU_MEM_USED
        // Typo fix: "byte already allocated" -> "bytes already allocated".
        fprintf(stderr, "Error freeing device pointer %p (%s). %d bytes already allocated\n", ptr, cudaGetErrorString(err), _allocated_size);
#else
        fprintf(stderr, "Error freeing device pointer %p (%s).\n", ptr, cudaGetErrorString(err));
#endif
        PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p (%s)", ptr, cudaGetErrorString(err));
        return -1;
    }
    _outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED
    // Find the matching table entry and subtract its recorded size.
    int i = 0;
    for (; i < TABLE_SIZE; i++)
    {
        if (_alloc_size_table[i].ptr == ptr)
        {
            _allocated_size -= _alloc_size_table[i].size;
            _alloc_size_table[i].ptr = 0;
            _alloc_size_table[i].size = 0;
            break;
        }
    }
    if (i == TABLE_SIZE)
        // Diagnostic fix: was printf("Unallocated unknow size!\n") — garbled
        // wording and sent to stdout unlike every other diagnostic here.
        fprintf(stderr, "Freed a device pointer of unknown (untracked) size!\n");
#endif
    return 0;
}
static PyObject *
......@@ -1868,7 +1911,12 @@ initcuda_ndarray(void)
Py_INCREF(&CudaNdarrayType);
PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
#if COMPUTE_GPU_MEM_USED
for(int i=0;i<TABLE_SIZE;i++){
_alloc_size_table[i].ptr=NULL;
_alloc_size_table[i].size=0;
}
#endif
// cublasInit();
//if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
//{
......
......@@ -278,12 +278,22 @@ def local_gpu_reshape(node):
if host_input.owner and isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op
x, shp = host_input.owner.inputs
return [GpuReshape(rshp.ndim)(gpu_from_host(x), shp)]
gpu_reshape = GpuReshape(rshp.ndim)(gpu_from_host(x), shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
#This can happen because GpuReshape always returns False for every broadcastable dim, while Reshape does not.
#Even if we did the same thing, the constant optimization could still cause a mismatch.
gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable)
return [gpu_reshape]
if isinstance(node.op, tensor.Reshape):
x, shp = node.inputs
if x.owner and x.owner.op == host_from_gpu:
gpu_x, = x.owner.inputs
return [host_from_gpu(GpuReshape(node.op.ndim)(gpu_x, shp))]
gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
#This can happen because GpuReshape always returns False for every broadcastable dim, while Reshape does not.
#Even if we did the same thing, the constant optimization could still cause a mismatch.
gpu_reshape = theano.tensor.patternbroadcast(gpu_reshape,node.outputs[0].broadcastable)
return [host_from_gpu(gpu_reshape)]
return False
@register_opt()
......
......@@ -3,11 +3,12 @@ from theano import Op, Apply
import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt as register_gpu_opt
class Images2Neibs(Op):
def __eq__(self, other):
......@@ -17,7 +18,7 @@ class Images2Neibs(Op):
def make_node(self, ten4, neib_shape):
ten4 = T.as_tensor_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape)
return Apply(self, [ten4, neib_shape], [T.matrix()])
return Apply(self, [ten4, neib_shape], [ten4.type()])
def grad(self, (pvals, unis), (gz,)):
return [None, None]
......@@ -163,7 +164,7 @@ class GpuImages2Neibs(Images2Neibs):
# raise TypeError('unis must be cudandarray', neib_shape)
#print 'neib_shape type and dtype', type(neib_shape), neib_shape.dtype
return Apply(self, [ten4, neib_shape], [CudaNdarrayType(broadcastable=(False,)*2)()])
return Apply(self, [ten4, neib_shape], [ten4.type()])
def c_code_cache_version(self):
return ()
......@@ -360,6 +361,7 @@ gpu_images2neibs = GpuImages2Neibs()
def use_gpu_images2neibs(node):
if node.op == images2neibs:
return [host_from_gpu(gpu_images2neibs(*[gpu_from_host(node.inputs[0]),node.inputs[1]]))]
if theano.config.device.startswith('gpu'):
register_specialize(use_gpu_images2neibs)
if cuda_available:
register_gpu_opt()(use_gpu_images2neibs)
from numpy import *
import numpy
import theano
from theano import shared, function
import theano.tensor as T
from neighbours import images2neibs, neibs2images
from neighbours import images2neibs, neibs2images, GpuImages2Neibs
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
mode = theano.config.mode
if mode=="FAST_COMPILE":
mode='FAST_RUN'
if theano.config.mode=='FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def neibs_test():
def test_neibs():
shape = (100,40,18,18)
images = shared(arange(prod(shape), dtype='float32').reshape(shape))
images = shared(numpy.arange(numpy.prod(shape)).reshape(shape))
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
f = function([], images2neibs(images, neib_shape), mode=mode)
f = function([], images2neibs(images, neib_shape), mode=mode_without_gpu)
#print images.value
neibs = f()
#print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode)
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_without_gpu)
#print g()
assert allclose(images.value,g())
assert numpy.allclose(images.value,g())
# NOTE(review): this scraped diff lost the original indentation; all code
# tokens below are kept byte-identical, comments only are added.
def test_neibs_gpu():
# GPU variant of test_neibs: checks that the images2neibs graph is
# compiled with the GpuImages2Neibs op and round-trips through
# neibs2images.
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
shape = (100,40,18,18)
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
# neib_shape (2,2): extract non-overlapping 2x2 neighbourhoods.
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32'))
from theano.sandbox.cuda.basic_ops import gpu_from_host
f = function([], images2neibs(images,neib_shape),
mode=mode_with_gpu)
# Verify the GPU optimizer actually substituted the GPU op.
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
#print images.value
# Expected neighbourhoods for the first 18x18 image plane (first 16 rows
# of the output only) — presumably row-major 2x2 tiles; TODO confirm.
res1=[[[[ 0., 1., 4., 5.],
[ 2., 3., 6., 7.],
[ 8., 9., 12., 13.],
[ 10., 11., 14., 15.],
[ 16., 17., 20., 21.],
[ 18., 19., 22., 23.],
[ 24., 25., 28., 29.],
[ 26., 27., 30., 31.],
[ 32., 33., 36., 37.],
[ 34., 35., 38., 39.],
[ 40., 41., 44., 45.],
[ 42., 43., 46., 47.],
[ 48., 49., 52., 53.],
[ 50., 51., 54., 55.],
[ 56., 57., 60., 61.],
[ 58., 59., 62., 63.]]]]
neibs = numpy.asarray(f())
# BUG(review): the result of allclose is discarded — this line asserts
# nothing. An `assert` is presumably missing; also confirm res1's shape
# actually matches (or broadcasts against) neibs before adding it.
numpy.allclose(neibs,res1)
#print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
# NOTE(review): this re-checks f's graph, not g's — presumably
# g.maker.env was intended here; confirm before changing.
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
#print numpy.asarray(g())
assert numpy.allclose(images.value,g())
neibs_test()
if __name__ == '__main__':
test_neibs_gpu()
test_neibs()
......@@ -953,7 +953,11 @@ class Mod(BinaryScalarOp):
def impl(self, x, y):
return x % y
def c_code_cache_version(self):
return (4,)
return (5,)
def c_support_code(self):
#We use a macro because Python treats % as a special character in format strings.
return "#define THEANO_MACRO_MOD(x,y) (x % y)"
def c_code(self, node, name, (x, y), (z, ), sub):
"""
......@@ -962,10 +966,10 @@ class Mod(BinaryScalarOp):
#raise NotImplementedError("Unlike Python, C's modulo returns negative modulo on negative dividend (to implement)")
t = node.inputs[0].type.upcast(*[ i.type for i in node.inputs[1:]])
if t in int_types or t in ['uint8','int8','uint16','int16','uint32','int32','uint64','int64']:
x_mod_y = "(%(x)s %% %(y)s)"%locals()
x_mod_ymm = "(-%(x)s %% -%(y)s)"%locals()
x_mod_ypm = "(%(x)s %% -%(y)s)"%locals()
x_mod_ymp = "(-%(x)s %% %(y)s)"%locals()
x_mod_y = "THEANO_MACRO_MOD(%(x)s, %(y)s)"%locals()
x_mod_ymm = "THEANO_MACRO_MOD(-%(x)s, -%(y)s)"%locals()
x_mod_ypm = "THEANO_MACRO_MOD(%(x)s, -%(y)s)"%locals()
x_mod_ymp = "THEANO_MACRO_MOD(-%(x)s, %(y)s)"%locals()
elif t in float_types or t in ['float32','float64']:
x_mod_y = "fmod(%(x)s,%(y)s)"%locals()
x_mod_ymm = "fmod(-%(x)s,-%(y)s)"%locals()
......@@ -1706,8 +1710,6 @@ class Composite(ScalarOp):
[subd[output] for output in node.outputs],
dict(fail = "%(fail)s",
id = "%%(id)s_%i" % j))
if any([isinstance(x.op,Mod) for x in env.toposort()]):
s = s.replace('% ','%% ')
_c_code += s
_c_code += "\n"
_c_code += "}\n"
......@@ -1773,6 +1775,15 @@ class Composite(ScalarOp):
def c_code_cache_version(self):
return (1,)+tuple([x.op.c_code_cache_version() for x in self.env.toposort()])
def c_support_code(self):
str = ""
for node in self.env.toposort():
try:
str += node.op.c_support_code()+"\n"
except gof.utils.MethodNotDefined:
pass
return str
def __eq__(self, other):
if self is other: return True
if not isinstance(other, self.__class__): return False
......
......@@ -68,10 +68,6 @@ def hash_listsDictsTuples(x):
return hash_value
## TODO
###################################
## Implement specific function calls : map, reduce, generate
def map(fn, sequences, non_sequences = [],
truncate_gradient = -1, go_backwards = False,
mode = None, name = None):
......@@ -83,15 +79,15 @@ def map(fn, sequences, non_sequences = [],
:param sequences: list of arrays over which map should
iterate (see scan for more info)
:param non_sequences: list of other arguments of `fn` over which
:param non_sequences: list of other arguments of `fn` over which
map shouldn't iterate (see scan for more info)
:param truncate_gradient: see scan for more info
:param go_backwards: if map should also inverse the order in the arrays
see scan for more info
:param go_backwards: set to true if you want map to start at the end of the
provided arrays in ``sequences`` going towards 0 (back in time)
:param mode: see scan
:param mode: see scan
:param name: see scan
"""
......@@ -108,15 +104,17 @@ def reduce(fn, sequences, outputs_info, non_sequences = [], go_backwards = False
sequences ( see scan `fn` for more info)
:param outputs_info: information about outputs (mainly the initial state
of each )
of each, but other options are available ), see scan for more
info
:param sequences: list of arrays over which reduce should
iterate (see scan for more info)
:param non_sequences: list of other arguments of `fn` over which
reduce shouldn't iterate (see scan for more info)
:param go_backwards: if reduce should also inverse the order in the arrays
see scan for more info
:param go_backwards: set to true if you want map to start at the end of the
provided arrays in ``sequences`` going towards 0 (back in time)
:param mode: see scan
:param name: see scan
......@@ -241,8 +239,8 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
If you are using shared variables over which you do not want to iterate,
you do not need to provide them as arguments to ``fn``, though you can if you
wish so. The function should return the outputs after each step plus the updates for
any of the shared variables. You can either return only outputs or only
wish so. The function should return the outputs after each step plus the updates
for any of the shared variables. You can either return only outputs or only
updates. If you have both outputs and updates the function should return
them as a tuple : (outputs, updates) or (updates, outputs).
......@@ -281,9 +279,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
list of ints (only negative .. since you can not use future values of outputs),
with the same meaning as for ``sequences`` (see above).
* ``inplace`` -- theano variable pointing to one of the input sequences; this
flag tells scan that the output should be computed in the memory spaced occupied
flag tells scan that the output should be computed in the memory space occupied
by that input sequence. Note that scan will only do this if allowed by the
rest of your computational graph and if you are not using past taps of the input.
rest of your computational graph and if you are not using past taps of the
input.
* ``return_steps`` how many steps to return from your output. If not given, or
0 scan will return all steps, otherwise it will return the last ``return_steps``.
Note that if you set this to something else then 0, scan will try to be smart
......@@ -298,8 +297,8 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
notation, when ``t = 0``, we would need values for ``y[-1]``, ``y[-2]``
and ``y[-4]``. These values are provided by the initial state of ``y``,
which should have same number of dimension as ``y``, where the first
dimension should be large enough to cover all past values, which in this
case is 4. If ``init_y`` is the variable containing the initial state
dimension should be large enough to cover all the required past values, which in
this case is 4. If ``init_y`` is the variable containing the initial state
of ``y``, then ``init_y[0]`` corresponds to ``y[-4]``, ``init_y[1]``
corresponds to ``y[-3]``, ``init_y[2]`` corresponds to ``y[-2]``,
``init_y[3]`` corresponds to ``y[-1]``. The default behaviour of scan is
......@@ -313,13 +312,13 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
of -1
* if you wrap an output in a dictionary but you do not provide any initial state,
it assumes that you are not using any form of taps
* if you provide a ``None`` scan assumes that you will not use any taps for this
output (this would be the case for map )
* if you provide a ``None`` instead of a variable or a dictionary scan assumes
that you will not use any taps for this output (this would be the case for map)
If you did not provide any information for your outputs, scan will assume by default
that you are not using any taps for any of the outputs. If you provide information for
just a subset of outputs, scan will not know to which outputs these information
corresponds and will raise an error.
If you did not provide any information for your outputs, scan will assume by
default that you are not using any taps for any of the outputs. If you provide
information for just a subset of outputs, scan will not know to which outputs
these correspond and will raise an error.
:param non_sequences:
Parameters over which scan should not iterate. These parameters are
......@@ -332,18 +331,20 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
the input sequences. If the value is 0, the outputs will have 0 rows. If the
value is negative, scan will run backwards (or if the flag go_backwards is
already set to true it will run forward in time). If n_steps is not provided,
or evaluetes to None, inf or nan, scan will figure out the maximal amount of steps it can
take and do that.
or evaluates to None, inf or nan, scan will figure out the maximal amount of
steps it can run given the input sequences and do that.
:param truncate_gradient:
Number of steps to use in truncated BPTT. If you compute gradients
through a scan op, they are computed using backpropagation through time.
By providing a different value then -1, you choose to use truncated BPTT
instead of classical BPTT, where you only do ``truncate_gradient``
number of steps. (NOT YET IMPLEMENTED)
number of steps.
:param go_backwards:
Flag indicating if you should go backwards through the sequences
Flag indicating if you should go backwards through the sequences ( if you
think as the sequences being indexed by time, this would mean go backwards
in time)
:param name:
The name of the theano function compiled by the Scan op. It will show in the
......
......@@ -2676,6 +2676,21 @@ def test_mod():
):
assert fn(a,b) == a%b, (a,)
def test_mod_compile():
    """
    Regression test: build a graph whose Elemwise contains nested Composites
    such as::

        Elemwise{Composite{Composite{Composite{Composite{mod,EQ},Switch},mul},add}}

    The generated C code for this graph failed to compile as of 30 June 2010
    (the ``%`` operator of ``mod`` was mangled by Python ``%``-substitution);
    the fix landed in the same commit. Successfully compiling the function is
    the whole test — no numerical assertion is needed.
    """
    x = tensor.vector()
    y = tensor.vector()
    # 3 % x.shape[0] forces an integer Mod node inside the Composite.
    out = tensor.switch(tensor.eq(3 % x.shape[0], 0), y, y[:-1])
    # Compilation itself is the assertion; `f` is intentionally unused.
    f = theano.function([x, y], out)
if __name__ == '__main__':
if 1:
unittest.main()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论