提交 0f89d7fa authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged (resolved conflict in theano/tensor/opt.py, keeping repo version)

...@@ -42,6 +42,16 @@ default values. ...@@ -42,6 +42,16 @@ default values.
have a default value of ``False``. The third argument must be called have a default value of ``False``. The third argument must be called
``allow_downcast`` and must have a default value of ``None``. ``allow_downcast`` and must have a default value of ``None``.
.. method:: filter_inplace(value, storage, strict=False, allow_downcast=None)
If ``filter_inplace`` is defined, it will be called instead of
``filter()``. This allows reusing the previously allocated memory. As
of this writing, this is used only when we transfer new data to a
shared variable on the GPU.
``storage`` will be the old value, i.e. the old numpy array,
CudaNdarray, ...
.. method:: is_valid_value(value) .. method:: is_valid_value(value)
Returns True iff the value is compatible with the Type. If Returns True iff the value is compatible with the Type. If
......
...@@ -520,8 +520,14 @@ class Function(object): ...@@ -520,8 +520,14 @@ class Function(object):
allow_downcast=s.allow_downcast) allow_downcast=s.allow_downcast)
except Exception, e: except Exception, e:
e.args = tuple(list(e.args)+["Bad input argument at index %d" % i]) function_name="theano function"
if self.name:
function_name += 'with name "'+self.name+'" '
#end if
e.args = tuple(list(e.args)+["Bad input argument to "+function_name+" at index %d" % i])
raise raise
#end except
#end if
s.provided += 1 s.provided += 1
i+=1 i+=1
......
...@@ -1100,17 +1100,15 @@ def _execute(cthunk, init_tasks, tasks, error_storage): ...@@ -1100,17 +1100,15 @@ def _execute(cthunk, init_tasks, tasks, error_storage):
trace = () trace = ()
try: try:
exc_type, _exc_value, exc_trace = error_storage exc_type, _exc_value, exc_trace = error_storage
if hasattr(task, "outputs"):
exc_value = exc_type(_exc_value, task, task.outputs)
else:
exc_value = exc_type(_exc_value, task) exc_value = exc_type(_exc_value, task)
exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared exc_value.__thunk_trace__ = trace # this can be used to retrieve the location the Op was declared
except: except:
print >> sys.stderr, 'ERROR retrieving error_storage', error_storage print >> sys.stderr, 'ERROR retrieving error_storage', error_storage
raise raise
#TODO-- someone who understands how these exceptions work, please put this info into the exception message itself
# (exc_value.message seems to be ignored)
print "while computing "+str(task.outputs)
raise exc_type, exc_value, exc_trace raise exc_type, exc_value, exc_trace
execute.cthunk = cthunk execute.cthunk = cthunk
return execute return execute
......
...@@ -98,6 +98,7 @@ class Env(utils.object2): ...@@ -98,6 +98,7 @@ class Env(utils.object2):
for f in features: for f in features:
self.extend(f) self.extend(f)
self.extend(toolbox.ReplaceValidate())
for input in self.inputs: for input in self.inputs:
if input.owner is not None: if input.owner is not None:
......
...@@ -92,7 +92,9 @@ class FromFunctionOptimizer(Optimizer): ...@@ -92,7 +92,9 @@ class FromFunctionOptimizer(Optimizer):
def __init__(self, fn): def __init__(self, fn):
self.apply = fn self.apply = fn
def add_requirements(self, env): def add_requirements(self, env):
env.extend(toolbox.ReplaceValidate()) # Added by default
#env.extend(toolbox.ReplaceValidate())
pass
def print_summary(self, stream=sys.stdout, level=0): def print_summary(self, stream=sys.stdout, level=0):
print >> stream, "%s%s id=%i" %(' '*level, print >> stream, "%s%s id=%i" %(' '*level,
...@@ -252,7 +254,9 @@ class MergeOptimizer(Optimizer): ...@@ -252,7 +254,9 @@ class MergeOptimizer(Optimizer):
self.skip_const_merge = skip_const_merge self.skip_const_merge = skip_const_merge
def add_requirements(self, env): def add_requirements(self, env):
env.extend(toolbox.ReplaceValidate()) # Added by default
#env.extend(toolbox.ReplaceValidate())
pass
def apply_constant_merge(self, env): def apply_constant_merge(self, env):
seen_constants = set() seen_constants = set()
...@@ -421,7 +425,9 @@ class LocalOptimizer(object): ...@@ -421,7 +425,9 @@ class LocalOptimizer(object):
def add_requirements(self, env): def add_requirements(self, env):
"""If this local optimization wants to add some requirements to the env, """If this local optimization wants to add some requirements to the env,
This is the place to do it.""" This is the place to do it."""
env.extend(toolbox.ReplaceValidate()) # Added by default
#env.extend(toolbox.ReplaceValidate())
pass
def print_summary(self, stream=sys.stdout, level=0): def print_summary(self, stream=sys.stdout, level=0):
print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self)) print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self))
...@@ -908,7 +914,8 @@ class NavigatorOptimizer(Optimizer): ...@@ -908,7 +914,8 @@ class NavigatorOptimizer(Optimizer):
def add_requirements(self, env): def add_requirements(self, env):
super(NavigatorOptimizer, self).add_requirements(env) super(NavigatorOptimizer, self).add_requirements(env)
env.extend(toolbox.ReplaceValidate()) # Added by default
#env.extend(toolbox.ReplaceValidate())
if self.local_opt: if self.local_opt:
self.local_opt.add_requirements(env) self.local_opt.add_requirements(env)
...@@ -989,7 +996,7 @@ class OpKeyOptimizer(NavigatorOptimizer): ...@@ -989,7 +996,7 @@ class OpKeyOptimizer(NavigatorOptimizer):
""" """
Requires the following features: Requires the following features:
- NodeFinder - NodeFinder
- ReplaceValidate - ReplaceValidate(Added by default)
""" """
super(OpKeyOptimizer, self).add_requirements(env) super(OpKeyOptimizer, self).add_requirements(env)
env.extend(toolbox.NodeFinder()) env.extend(toolbox.NodeFinder())
......
...@@ -224,6 +224,13 @@ class PureType(object): ...@@ -224,6 +224,13 @@ class PureType(object):
""" """
raise MethodNotDefined("filter", type(self), self.__class__.__name__) raise MethodNotDefined("filter", type(self), self.__class__.__name__)
# If filter_inplace is defined, it will be called instead of
# filter(). This allows reusing the previously allocated memory. As
# of this writing, this is used only when we transfer new data to a
# shared variable on the GPU.
#def filter_inplace(value, storage, strict=False, allow_downcast=None)
def is_valid_value(self, a): def is_valid_value(self, a):
"""Required: Return True for any python object `a` that would be a legal value for a Variable of this Type""" """Required: Return True for any python object `a` that would be a legal value for a Variable of this Type"""
try: try:
......
...@@ -363,7 +363,7 @@ class GpuConv(Op): ...@@ -363,7 +363,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,13) # raise this whenever modifying any of the support_code_files return (0,14) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files # REMEMBER TO RAISE c_code_cache_version when changing any of these files
......
...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out, ...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out,
* *
* nkern: the number of kernel, used to compute the output image to store the result * nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load. * nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows (1 for normal convolution)
* dy: patch stride cols (1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out, ...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out,
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more then 1 output pixel, but use more registers. * template split: if true, each thread generate more then 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
......
...@@ -80,7 +80,7 @@ def test_gemm(): ...@@ -80,7 +80,7 @@ def test_gemm():
c = tensor.fmatrix('c') c = tensor.fmatrix('c')
f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu) f = pfunc([b,c], [], updates=[(a, tensor.dot(a,b) + tensor.exp(c))], mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace for node in f.maker.env.toposort()]) assert any([node.op == tcn.blas.gpu_gemm_no_inplace for node in f.maker.env.toposort()])
a0 = a.get_value() * 1.0 a0 = a.get_value() * 1.0
print a0 print a0
......
...@@ -282,8 +282,7 @@ def get_valid_shapes(): ...@@ -282,8 +282,7 @@ def get_valid_shapes():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -502,8 +501,7 @@ def test_full(): ...@@ -502,8 +501,7 @@ def test_full():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -553,21 +551,31 @@ def test_full(): ...@@ -553,21 +551,31 @@ def test_full():
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
((1, 1, 1, 1), (1, 1, 1, 1), (1,1)) ((1, 1, 1, 1), (1, 1, 1, 1), (1,1), (1,1), (1,1))
, ((1, 1, 1, 1), (1, 1, 1, 1), (2,2)) , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
all_good = True shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
_params_allgood_header() #We put only the version that implement the subsample to make the test faster.
for ishape, kshape, ds in shapes: version_valid = [-2,-1,1,3,11,12]
if not _params_allgood(ishape, kshape, 'full', subsample=ds): version_full = [-2,-1]
all_good = False verbose = 0
if not _params_allgood(ishape, kshape, 'valid', subsample=ds): random = True
all_good = False print_ = False
assert all_good ones = False
if ones:
random = False
#test
random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
......
...@@ -104,11 +104,11 @@ def scan( fn ...@@ -104,11 +104,11 @@ def scan( fn
.. code-block:: python .. code-block:: python
scan(fn, sequences = [ dict( Sequence1, taps = [-3,2,-1]) scan(fn, sequences = [ dict(input= Sequence1, taps = [-3,2,-1])
, Sequence2 , Sequence2
, dict( Sequence3, taps = 3) ] , dict(input = Sequence3, taps = 3) ]
, outputs_info = [ dict( Output1, taps = [-3,-5]) , outputs_info = [ dict(initial = Output1, taps = [-3,-5])
, dict( Output2, taps = None) , dict(initial = Output2, taps = None)
, Output3 ] , Output3 ]
, non_sequences = [ Argument1, Argument 2]) , non_sequences = [ Argument1, Argument 2])
...@@ -371,7 +371,7 @@ def scan( fn ...@@ -371,7 +371,7 @@ def scan( fn
# ^ explicitly provided a None for taps # ^ explicitly provided a None for taps
warning (' Output %s ( index %d) has a initial state ' warning (' Output %s ( index %d) has a initial state '
' but taps is explicitly set to None ' % ( ' but taps is explicitly set to None ' % (
outs_info[i]['initial'].name getattr(outs_info[i]['initial'],'name','None')
, i) ) , i) )
outs_info[i]['taps'] = [-1] outs_info[i]['taps'] = [-1]
else: else:
...@@ -416,12 +416,10 @@ def scan( fn ...@@ -416,12 +416,10 @@ def scan( fn
nw_slice = seq['input'][0].type() nw_slice = seq['input'][0].type()
actual_slice = seq['input'][k-mintap] actual_slice = seq['input'][k-mintap]
if not hasattr(seq['input'],'name'):
raise TypeError('Expected object with a "name" field, got '+str(seq)+"['input'] = "+str(seq['input']))
# Add names to slices for debugging and pretty printing .. # Add names to slices for debugging and pretty printing ..
# that is if the input already has a name # that is if the input already has a name
if seq['input'].name: if getattr(seq['input'],'name', None) is not None:
if k > 0: if k > 0:
nw_name = seq['input'].name + '[t+%d]'%k nw_name = seq['input'].name + '[t+%d]'%k
elif k == 0: elif k == 0:
...@@ -481,7 +479,7 @@ def scan( fn ...@@ -481,7 +479,7 @@ def scan( fn
# Add names -- it helps a lot when debugging # Add names -- it helps a lot when debugging
for (nw_seq, seq) in zip(scan_seqs, seqs): for (nw_seq, seq) in zip(scan_seqs, seqs):
if seq['input'].name: if getattr(seq['input'],'name', None) is not None:
nw_seq.name = seq['input'].name + '[%d:]'%k nw_seq.name = seq['input'].name + '[%d:]'%k
# Conventions : # Conventions :
...@@ -534,7 +532,7 @@ def scan( fn ...@@ -534,7 +532,7 @@ def scan( fn
actual_arg = init_out['initial'] actual_arg = init_out['initial']
arg = safe_new(init_out['initial']) arg = safe_new(init_out['initial'])
if init_out['initial'].name: if getattr(init_out['initial'],'name', None) is not None:
arg.name = init_out['initial'].name+'[t-1]' arg.name = init_out['initial'].name+'[t-1]'
# We need now to allocate space for storing the output and copy # We need now to allocate space for storing the output and copy
# the initial state over. We do this using the expand function # the initial state over. We do this using the expand function
...@@ -579,7 +577,7 @@ def scan( fn ...@@ -579,7 +577,7 @@ def scan( fn
nw_slice = init_out['initial'][0].type() nw_slice = init_out['initial'][0].type()
# give it a name or debugging and pretty printing # give it a name or debugging and pretty printing
if init_out['initial'].name: if getattr(init_out['initial'],'name', None) is not None:
if k > 0: if k > 0:
nw_slice.name = ( init_out['initial'].name + nw_slice.name = ( init_out['initial'].name +
'[t+%d]'%k ) '[t+%d]'%k )
...@@ -746,7 +744,7 @@ def scan( fn ...@@ -746,7 +744,7 @@ def scan( fn
for input in dummy_f.maker.expanded_inputs: for input in dummy_f.maker.expanded_inputs:
if isinstance(input.variable, SharedVariable) and input.update: if isinstance(input.variable, SharedVariable) and input.update:
new_var = safe_new(input.variable) new_var = safe_new(input.variable)
if input.variable.name: if getattr(input.variable,'name', None) is not None:
new_var.name = input.variable.name + '_copy' new_var.name = input.variable.name + '_copy'
shared_inner_inputs.append( new_var ) shared_inner_inputs.append( new_var )
shared_scan_inputs.append( input.variable ) shared_scan_inputs.append( input.variable )
...@@ -777,7 +775,7 @@ def scan( fn ...@@ -777,7 +775,7 @@ def scan( fn
## Step 5.6 all shared variables with no update rules ## Step 5.6 all shared variables with no update rules
def new_variable( v ): def new_variable( v ):
new_v = safe_new(v) new_v = safe_new(v)
if v.name: if getattr(v,'name', None) is not None:
new_v.name = v.name + '_copy' new_v.name = v.name + '_copy'
return new_v return new_v
other_inner_args += [ new_variable(arg) for arg in non_seqs other_inner_args += [ new_variable(arg) for arg in non_seqs
......
...@@ -226,10 +226,10 @@ class Scan(Op): ...@@ -226,10 +226,10 @@ class Scan(Op):
for idx in xrange(self.n_seqs): for idx in xrange(self.n_seqs):
if inputs[1+idx].dtype != self.inputs[idx].dtype: if inputs[1+idx].dtype != self.inputs[idx].dtype:
raise ValueError(err_msg1%( 'Sequence' raise ValueError(err_msg1%( 'Sequence'
, inputs[1+idx].name , str(inputs[1+idx])
, idx , idx
, inputs[1+idx].dtype , inputs[1+idx].dtype
, self.inputs[idx].name , str(self.inputs[idx])
, self.inputs[idx].dtype) ) , self.inputs[idx].dtype) )
# Check that this 3 things have the same dtype for mit_mot: # Check that this 3 things have the same dtype for mit_mot:
...@@ -246,10 +246,10 @@ class Scan(Op): ...@@ -246,10 +246,10 @@ class Scan(Op):
for k in self.tap_array[index-start]: for k in self.tap_array[index-start]:
if inputs[index].dtype != self.inputs[index_i].dtype: if inputs[index].dtype != self.inputs[index_i].dtype:
raise ValueError(err_msg1%( 'Initial state' raise ValueError(err_msg1%( 'Initial state'
, inputs[index].name , str(inputs[index])
, index , index
, inputs[index].dtype , inputs[index].dtype
, self.inputs[index_i].name , str(self.inputs[index_i])
, self.inputs[index_i].dtype) ) , self.inputs[index_i].dtype) )
index_i += 1 index_i += 1
for k in self.mit_mot_out_slices[index-start]: for k in self.mit_mot_out_slices[index-start]:
...@@ -266,14 +266,14 @@ class Scan(Op): ...@@ -266,14 +266,14 @@ class Scan(Op):
for k in self.tap_array[index-start]: for k in self.tap_array[index-start]:
if inputs[index].dtype != self.inputs[index_i].dtype: if inputs[index].dtype != self.inputs[index_i].dtype:
raise ValueError(err_msg1%( 'Initial state' raise ValueError(err_msg1%( 'Initial state'
, inputs[index].name , str(inputs[index])
, index , index
, inputs[index].dtype , inputs[index].dtype
, self.inputs[index_i].name , str(self.inputs[index_i])
, self.inputs[index_i].dtype) ) , self.inputs[index_i].dtype) )
index_i += 1 index_i += 1
if inputs[index].dtype != self.outputs[index_o].dtype: if inputs[index].dtype != self.outputs[index_o].dtype:
raise ValueError(err_msg2%( inputs[index].name raise ValueError(err_msg2%( str(inputs[index])
, index , index
, inputs[index].dtype , inputs[index].dtype
, self.outputs[index_o].dtype) ) , self.outputs[index_o].dtype) )
...@@ -287,7 +287,7 @@ class Scan(Op): ...@@ -287,7 +287,7 @@ class Scan(Op):
while index < end: while index < end:
if (hasattr(inputs[index],'dtype') and if (hasattr(inputs[index],'dtype') and
inputs[index].dtype != self.outputs[index_o].dtype): inputs[index].dtype != self.outputs[index_o].dtype):
raise ValueError(err_msg2%( inputs[index].name raise ValueError(err_msg2%( str(inputs[index])
, index , index
, inputs[index].dtype , inputs[index].dtype
, self.outputs[index_o].dtype) ) , self.outputs[index_o].dtype) )
...@@ -610,11 +610,13 @@ class Scan(Op): ...@@ -610,11 +610,13 @@ class Scan(Op):
t_call = time.time() - t0_call t_call = time.time() - t0_call
if hasattr(self.fn.maker.mode,'fct_call_time'): if hasattr(self.fn.maker.mode,'fct_call_time'):
self.fn.maker.mode.fct_call_time[self.fn] += t_call self.fn.maker.mode.fct_call_time[self.fn] += t_fn
self.fn.maker.mode.fct_call[self.fn] += n_steps self.fn.maker.mode.fct_call[self.fn] += n_steps
self.fn.maker.mode.call_time += t_call self.fn.maker.mode.call_time += t_fn
self.fn.maker.mode.fn_time += t_fn self.fn.maker.mode.fn_time += t_fn
self.t_call = t_call
self.t_fn = t_fn
### Infer Shape ### Infer Shape
...@@ -792,7 +794,7 @@ class Scan(Op): ...@@ -792,7 +794,7 @@ class Scan(Op):
g_out_slices.append(g_outs_no_shared[dx][0]) g_out_slices.append(g_outs_no_shared[dx][0])
else: else:
g_out_slices.append(None) g_out_slices.append(None)
if out.name: if getattr(out,'name',None) is not None:
inner_g_out.name = 'g_'+out.name inner_g_out.name = 'g_'+out.name
else: else:
inner_g_out.name = 'g_'+str(dx) inner_g_out.name = 'g_'+str(dx)
...@@ -870,7 +872,7 @@ class Scan(Op): ...@@ -870,7 +872,7 @@ class Scan(Op):
nw_seq = seq[dim_offset +k -mintap: -(maxtap -k)] nw_seq = seq[dim_offset +k -mintap: -(maxtap -k)]
else: else:
nw_seq = seq[dim_offset +k -mintap: ] nw_seq = seq[dim_offset +k -mintap: ]
if seq.name: if getattr(seq,'name', None) is not None:
nw_seq.name = seq.name + '[%d:]'%k nw_seq.name = seq.name + '[%d:]'%k
scan_seqs.append(nw_seq) scan_seqs.append(nw_seq)
......
...@@ -118,10 +118,9 @@ def reduce( fn ...@@ -118,10 +118,9 @@ def reduce( fn
if not isinstance(out_info, dict): if not isinstance(out_info, dict):
# Specifies that it should return only the last step. # Specifies that it should return only the last step.
outs_info[i] = dict( outs_info[i] = dict(
initial = out_info, return_steps = 1, store_steps = 1) initial = out_info, return_steps = 1)
else: else:
# Specifies that it should return only the last step. # Specifies that it should return only the last step.
outs_info[i]['store_steps'] = 1
outs_info[i]['return_steps'] = 1 outs_info[i]['return_steps'] = 1
# NOTE : If the user asks for more then the last step, # NOTE : If the user asks for more then the last step,
# it means he does not understand ``reduce``. We could # it means he does not understand ``reduce``. We could
...@@ -131,7 +130,7 @@ def reduce( fn ...@@ -131,7 +130,7 @@ def reduce( fn
, outputs_info = outs_info , outputs_info = outs_info
, non_sequences = non_sequences , non_sequences = non_sequences
, go_backwards = go_backwards , go_backwards = go_backwards
, truncate_gradient = 1 , truncate_gradient = -1
, mode = mode , mode = mode
, name = name ) , name = name )
......
...@@ -2036,6 +2036,71 @@ class T_Scan(unittest.TestCase): ...@@ -2036,6 +2036,71 @@ class T_Scan(unittest.TestCase):
f2_vals = f2(x_val) f2_vals = f2(x_val)
assert numpy.allclose(f_vals, f2_vals) assert numpy.allclose(f_vals, f2_vals)
def test_reduce_memory_consumption(self):
    """Check the gradient of a ``theano.reduce`` accumulation.

    Builds a reduction that adds every element of a 10-element shared
    vector onto a zero scalar, then asserts that the gradient of the
    result with respect to the vector is all ones.
    """
    float_type = theano.config.floatX
    samples = numpy.random.uniform(size=(10,))
    x_shared = theano.shared(numpy.asarray(samples, dtype=float_type))
    zero = theano.tensor.constant(numpy.asarray(0., dtype=float_type))

    def accumulate(v, acc):
        return acc + v

    total, _updates = theano.reduce(accumulate, x_shared, zero)

    # Disabled check on the storage used by the scan node, kept for
    # reference:
    #   f1 = theano.function([], total)
    #   scan_node = [n for n in f1.maker.env.toposort()
    #                if n.op.__class__.__name__ == 'Scan'][0]
    #   assert scan_node.inputs[2].value.shape == ()
    # NOTE(review): the original note says this "can actually" be done
    # because things are hidden by the infershape mechanism; presumably
    # "cannot" was meant, since the check is disabled -- confirm.

    grad_x = theano.tensor.grad(total, x_shared)
    grad_fn = theano.function([], grad_x)
    assert numpy.allclose(grad_fn(), numpy.ones((10,)))
def test_foldl_memory_consumption(self):
    """Check the gradient of a ``theano.foldl`` accumulation.

    Builds a left fold that adds every element of a 10-element shared
    vector onto a zero scalar, then asserts that the gradient of the
    result with respect to the vector is all ones.
    """
    float_type = theano.config.floatX
    samples = numpy.random.uniform(size=(10,))
    x_shared = theano.shared(numpy.asarray(samples, dtype=float_type))
    zero = theano.tensor.constant(numpy.asarray(0., dtype=float_type))

    def accumulate(v, acc):
        return acc + v

    total, _updates = theano.foldl(accumulate, x_shared, zero)

    # Disabled check on the storage used by the scan node, kept for
    # reference:
    #   f1 = theano.function([], total)
    #   scan_node = [n for n in f1.maker.env.toposort()
    #                if n.op.__class__.__name__ == 'Scan'][0]
    #   assert scan_node.inputs[2].value.shape == ()
    # NOTE(review): the original note says this "can actually" be done
    # because things are hidden by the infershape mechanism; presumably
    # "cannot" was meant, since the check is disabled -- confirm.

    grad_x = theano.tensor.grad(total, x_shared)
    grad_fn = theano.function([], grad_x)
    assert numpy.allclose(grad_fn(), numpy.ones((10,)))
def test_foldr_memory_consumption(self):
    """Check the gradient of a ``theano.foldr`` accumulation.

    Builds a right fold that adds every element of a 10-element shared
    vector onto a zero scalar, then asserts that the gradient of the
    result with respect to the vector is all ones.
    """
    float_type = theano.config.floatX
    samples = numpy.random.uniform(size=(10,))
    x_shared = theano.shared(numpy.asarray(samples, dtype=float_type))
    zero = theano.tensor.constant(numpy.asarray(0., dtype=float_type))

    def accumulate(v, acc):
        return acc + v

    total, _updates = theano.foldr(accumulate, x_shared, zero)

    # Disabled check on the storage used by the scan node, kept for
    # reference:
    #   f1 = theano.function([], total)
    #   scan_node = [n for n in f1.maker.env.toposort()
    #                if n.op.__class__.__name__ == 'Scan'][0]
    #   assert scan_node.inputs[2].value.shape == ()
    # NOTE(review): the original note says this "can actually" be done
    # because things are hidden by the infershape mechanism; presumably
    # "cannot" was meant, since the check is disabled -- confirm.

    grad_x = theano.tensor.grad(total, x_shared)
    grad_fn = theano.function([], grad_x)
    assert numpy.allclose(grad_fn(), numpy.ones((10,)))
if __name__ == '__main__': if __name__ == '__main__':
#''' #'''
print ' Use nosetests to run these tests ' print ' Use nosetests to run these tests '
......
...@@ -1056,6 +1056,9 @@ class CAReduce(Op): ...@@ -1056,6 +1056,9 @@ class CAReduce(Op):
scal_name = 'maximum' scal_name = 'maximum'
if input.type.dtype in ["float32","float64"]: if input.type.dtype in ["float32","float64"]:
identity = "-__builtin_inf()" identity = "-__builtin_inf()"
elif input.type.dtype.startswith("uint"):
# numpy 1.5.1 doesn't define NPY_MIN_UINT*
identity = "0"
else: else:
identity = "NPY_MIN_"+str(input.type.dtype).upper() identity = "NPY_MIN_"+str(input.type.dtype).upper()
if self.scalar_op == scalar.minimum: if self.scalar_op == scalar.minimum:
......
...@@ -9,7 +9,7 @@ _logger = logging.getLogger('theano.tensor.opt') ...@@ -9,7 +9,7 @@ _logger = logging.getLogger('theano.tensor.opt')
import operator import operator
import itertools import itertools
import sys import sys
import traceback
import numpy import numpy
import numpy as N #guys... please don't do this in the library :( import numpy as N #guys... please don't do this in the library :(
...@@ -567,16 +567,18 @@ class ShapeFeature(object): ...@@ -567,16 +567,18 @@ class ShapeFeature(object):
sometimes Theano constants?? That would be confusing. sometimes Theano constants?? That would be confusing.
""" """
def shape_i(self, i):
def op_deco(r): def shape_ir(self, i, r):
#TODO: Write a doc string for this method
if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]: if hasattr(r.type,"broadcastable") and r.type.broadcastable[i]:
return self.lscalar_one return self.lscalar_one
else: else:
return Shape_i(i)(r) return Shape_i(i).make_node(r).outputs[0]
return op_deco
def shape_tuple(self, r): def shape_tuple(self, r):
return tuple([self.shape_i(i)(r) for i in xrange(r.ndim)]) #TODO: Write a doc string for this method
return tuple([self.shape_ir(i,r) for i in xrange(r.ndim)])
def default_infer_shape(self, node, i_shapes): def default_infer_shape(self, node, i_shapes):
rval = [] rval = []
...@@ -690,10 +692,13 @@ class ShapeFeature(object): ...@@ -690,10 +692,13 @@ class ShapeFeature(object):
def on_attach(self, env): def on_attach(self, env):
assert not hasattr(env, 'shape_feature') assert not hasattr(env, 'shape_feature')
env.shape_feature = self env.shape_feature = self
self.shape_of = {} # Variable -> tuple(scalars) or None (All tensor vars map to tuple) # Must be local to the object as otherwise we reuse the same
self.scheduled = {} # Variable -> # variable for multiple env!
self.lscalar_one = T.constant(1, dtype='int64') self.lscalar_one = T.constant(1, dtype='int64')
assert self.lscalar_one.type == T.lscalar assert self.lscalar_one.type == T.lscalar
self.shape_of = {} # Variable -> tuple(scalars) or None (All tensor vars map to tuple)
self.scheduled = {} # Variable ->
for node in env.toposort(): for node in env.toposort():
self.on_import(env, node) self.on_import(env, node)
...@@ -725,12 +730,10 @@ class ShapeFeature(object): ...@@ -725,12 +730,10 @@ class ShapeFeature(object):
'supported, and one should now use tensor.ShapeError ' 'supported, and one should now use tensor.ShapeError '
'instead. The original exception message is: %s' % e) 'instead. The original exception message is: %s' % e)
except Exception, e: except Exception, e:
_logger.error('Failed to infer_shape from Op %s (i_shapes=%s): %s %s'% (node.op, _logger.error('Failed to infer_shape from Op %s.\nInput shapes:%s\nException encountered during infer_shape: %s\nException message: %s\nTraceback: %s'% (node.op,
[self.shape_of[r] for r in node.inputs], [self.shape_of[r] for r in node.inputs],
type(e), str(e))) type(e), str(e), traceback.format_exc()))
# We raise the exception to make sure the user knows something bad o_shapes = self.default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
# is going on.
raise
# this is packed information # this is packed information
# an element of o_shapes is either None or a tuple # an element of o_shapes is either None or a tuple
......
...@@ -195,7 +195,8 @@ class test_CAReduce(unittest.TestCase): ...@@ -195,7 +195,8 @@ class test_CAReduce(unittest.TestCase):
if tosum is None: tosum = range(len(xsh)) if tosum is None: tosum = range(len(xsh))
f = copy(linker).accept(Env([x], [e])).make_function() f = copy(linker).accept(Env([x], [e])).make_function()
xv = numpy.asarray(numpy.random.rand(*xsh)) xv = numpy.asarray(numpy.random.rand(*xsh))
if dtype.startswith('float'):
if not "int" in dtype:
xv = numpy.asarray(xv,dtype=dtype) xv = numpy.asarray(xv,dtype=dtype)
else: else:
xv = numpy.asarray(xv<0.5,dtype=dtype) xv = numpy.asarray(xv<0.5,dtype=dtype)
...@@ -245,7 +246,8 @@ class test_CAReduce(unittest.TestCase): ...@@ -245,7 +246,8 @@ class test_CAReduce(unittest.TestCase):
raise Exception("Test for CAReduce with scalar_op %s not implemented"%str(scalar_op)) raise Exception("Test for CAReduce with scalar_op %s not implemented"%str(scalar_op))
if scalar_op in [maximum,minimum] and numpy_raised: if scalar_op in [maximum,minimum] and numpy_raised:
try: try:
f(xv) out = f(xv)
assert out.dtype == dtype
except ValueError: except ValueError:
pass pass
else: else:
...@@ -254,7 +256,7 @@ class test_CAReduce(unittest.TestCase): ...@@ -254,7 +256,7 @@ class test_CAReduce(unittest.TestCase):
#numpy.{all,any} return bool type. #numpy.{all,any} return bool type.
if scalar_op in [and_, or_]: if scalar_op in [and_, or_]:
zv = numpy.asarray(zv, dtype=dtype) zv = numpy.asarray(zv, dtype=dtype)
self.assertTrue((numpy.abs(f(xv) - zv) < 1e-10).all()) self.assertTrue(numpy.allclose(f(xv), zv))
#test CAReduce.infer_shape #test CAReduce.infer_shape
...@@ -268,22 +270,27 @@ class test_CAReduce(unittest.TestCase): ...@@ -268,22 +270,27 @@ class test_CAReduce(unittest.TestCase):
assert all(f(xv)== zv.shape) assert all(f(xv)== zv.shape)
def test_perform(self): def test_perform(self):
self.with_linker(gof.PerformLinker(), add) for dtype in ["floatX", "complex64", "complex128", "int8", "uint8"]:
self.with_linker(gof.PerformLinker(), mul) self.with_linker(gof.PerformLinker(), add, dtype=dtype)
self.with_linker(gof.PerformLinker(), maximum) self.with_linker(gof.PerformLinker(), mul, dtype=dtype)
self.with_linker(gof.PerformLinker(), minimum) self.with_linker(gof.PerformLinker(), maximum, dtype=dtype)
self.with_linker(gof.PerformLinker(), or_, dtype='int8') self.with_linker(gof.PerformLinker(), minimum, dtype=dtype)
self.with_linker(gof.PerformLinker(), and_, dtype='int8') for dtype in ["int8", "uint8"]:
self.with_linker(gof.PerformLinker(), xor, dtype='int8') self.with_linker(gof.PerformLinker(), or_, dtype=dtype)
self.with_linker(gof.PerformLinker(), and_, dtype=dtype)
self.with_linker(gof.PerformLinker(), xor, dtype=dtype)
def test_c(self): def test_c(self):
self.with_linker(gof.CLinker(), add) for dtype in ["floatX", "complex64", "complex128", "int8", "uint8"]:
self.with_linker(gof.CLinker(), mul) self.with_linker(gof.CLinker(), add, dtype=dtype)
self.with_linker(gof.CLinker(), maximum) self.with_linker(gof.CLinker(), mul, dtype=dtype)
self.with_linker(gof.CLinker(), minimum) for dtype in ["floatX", "int8", "uint8"]:
self.with_linker(gof.CLinker(), or_, dtype='int8') self.with_linker(gof.CLinker(), minimum, dtype=dtype)
self.with_linker(gof.CLinker(), and_, dtype='int8') self.with_linker(gof.CLinker(), maximum, dtype=dtype)
self.with_linker(gof.CLinker(), xor, dtype='int8') for dtype in ["int8", "uint8"]:
self.with_linker(gof.CLinker(), or_, dtype=dtype)
self.with_linker(gof.CLinker(), and_, dtype=dtype)
self.with_linker(gof.CLinker(), xor, dtype=dtype)
class test_Prod(unittest.TestCase): class test_Prod(unittest.TestCase):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论