提交 925a4eb6 authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merged

...@@ -37,8 +37,9 @@ class ProfileMode(Mode): ...@@ -37,8 +37,9 @@ class ProfileMode(Mode):
fct_call_time = {}#time passed inside theano fct call including op time. fct_call_time = {}#time passed inside theano fct call including op time.
fct_call = {} fct_call = {}
message="" message=""
outputs_size={}
self.__setstate__((linker, optimizer, apply_time, op_cimpl, self.__setstate__((linker, optimizer, apply_time, op_cimpl,
compile_time, fct_call_time, fct_call, message)) compile_time, fct_call_time, fct_call, message, outputs_size))
def function_maker(self, i,o,m, *args, **kwargs): def function_maker(self, i,o,m, *args, **kwargs):
"""Return an instance of `Profiler_Maker` which init the count""" """Return an instance of `Profiler_Maker` which init the count"""
...@@ -51,10 +52,11 @@ class ProfileMode(Mode): ...@@ -51,10 +52,11 @@ class ProfileMode(Mode):
def __getstate__(self): def __getstate__(self):
#print "__getstate__",self.provided_linker,self.provided_optimizer #print "__getstate__",self.provided_linker,self.provided_optimizer
return (self.provided_linker, self.provided_optimizer, self.apply_time, return (self.provided_linker, self.provided_optimizer, self.apply_time,
self.op_cimpl, self.compile_time, self.fct_call_time, self.fct_call, self.message) self.op_cimpl, self.compile_time, self.fct_call_time,
self.fct_call, self.message, self.outputs_size)
def __setstate__(self, (linker, optimizer, apply_time, op_cimpl, def __setstate__(self, (linker, optimizer, apply_time, op_cimpl,
compile_time, fct_call_time, fct_call, message)): compile_time, fct_call_time, fct_call, message, outputs_size)):
self.apply_time = apply_time self.apply_time = apply_time
self.op_cimpl = op_cimpl self.op_cimpl = op_cimpl
...@@ -64,8 +66,11 @@ class ProfileMode(Mode): ...@@ -64,8 +66,11 @@ class ProfileMode(Mode):
self.call_time = 0 self.call_time = 0
self.fn_time = 0 self.fn_time = 0
self.message = "" self.message = ""
self.outputs_size = outputs_size
def profile_thunk(i, node, th): def profile_thunk(i, node, th):
""" Profile only the execution time
"""
if hasattr(th, 'cthunk'): if hasattr(th, 'cthunk'):
t0 = time.time() t0 = time.time()
failure = run_cthunk(th.cthunk) failure = run_cthunk(th.cthunk)
...@@ -82,12 +87,50 @@ class ProfileMode(Mode): ...@@ -82,12 +87,50 @@ class ProfileMode(Mode):
apply_time[(i,node)] += dt apply_time[(i,node)] += dt
def profile_thunk2(i, node, th):
    """ Profile the execution time and the memory size.

    Runs the thunk `th`, measures the elapsed wall-clock time and records
    the size in bytes of each of its outputs.

    :param i: index used together with `node` as the `apply_time` key.
    :param node: the apply node being profiled; also the `outputs_size` key.
    :param th: the thunk to run. If it has a `cthunk` attribute it is run
        through `run_cthunk`, otherwise it is called directly. Each entry
        `o` of `th.outputs` must expose `o[0].size` and `o[0].dtype`.

    Side effects (on names taken from the enclosing scope):
      - `outputs_size[node]` is set to the list of output sizes in bytes.
      - the elapsed time is added to `apply_time[(i, node)]`
        (the key is presumably initialised by the caller -- TODO confirm).

    :raises RuntimeError: if `run_cthunk` reports a failure.
    :raises Exception: if the byte width of an output dtype cannot be
        deduced from its name.
    """
    if hasattr(th, 'cthunk'):
        t0 = time.time()
        failure = run_cthunk(th.cthunk)
        dt = time.time() - t0
        if failure:
            raise RuntimeError(('A C Op raised an exception. PROFILE_MODE cannot'
                                ' tell you what it was though. Use a standard mode such as'
                                ' FAST_RUN_NOGC to correct the problem.'))
    else:
        t0 = time.time()
        th()
        dt = time.time() - t0

    # (dtype name suffix, bytes per element).  The order matters: the
    # suffixes must be tried longest first, otherwise a 128 bit dtype
    # ('complex128', 'float128') is caught by the '8' test and counted as
    # 1 byte per element instead of 16.
    suffix_nbytes = (('128', 16), ('64', 8), ('32', 4), ('16', 2), ('8', 1))

    size = []
    for o in th.outputs:
        s = o[0].size
        #can't use o[0].dtype.itemsize as dtype is a str for CudaNdarray
        dtype = str(o[0].dtype)
        for suffix, nbytes in suffix_nbytes:
            if dtype.endswith(suffix):
                s *= nbytes
                break
        else:
            # No known bit-width suffix: refuse to guess.
            raise Exception("Can't determine the memory size of dtype", o[0].dtype)
        size.append(s)
    outputs_size[node] = size
    apply_time[(i, node)] += dt
self.provided_linker = linker self.provided_linker = linker
self.provided_optimizer = optimizer self.provided_optimizer = optimizer
if isinstance(linker, str) or linker is None: if isinstance(linker, str) or linker is None:
linker = predefined_linkers[linker] linker = predefined_linkers[linker]
linker = WrapLinker([linker], profile_thunk) linker = WrapLinker([linker], profile_thunk2)
self.linker = linker self.linker = linker
if isinstance(optimizer, str) or optimizer is None: if isinstance(optimizer, str) or optimizer is None:
...@@ -116,9 +159,10 @@ class ProfileMode(Mode): ...@@ -116,9 +159,10 @@ class ProfileMode(Mode):
apply_time = self.apply_time apply_time = self.apply_time
op_cimpl = self.op_cimpl op_cimpl = self.op_cimpl
message = self.message message = self.message
outputs_size = self.outputs_size
self.print_summary_("print_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print, n_ops_to_print) n_apply_to_print, n_ops_to_print)
...@@ -154,15 +198,16 @@ class ProfileMode(Mode): ...@@ -154,15 +198,16 @@ class ProfileMode(Mode):
apply_time = diff_dict(self.apply_time, other.apply_time) apply_time = diff_dict(self.apply_time, other.apply_time)
op_cimpl = self.op_cimpl and other.op_cimpl op_cimpl = self.op_cimpl and other.op_cimpl
message = self.message message = self.message
outputs_size = diff_dict(self.outputs_size,other.outputs_size)
self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call, self.print_summary_("print_diff_summary", compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=n_apply_to_print, n_apply_to_print=n_apply_to_print,
n_ops_to_print=n_ops_to_print, print_apply=False) n_ops_to_print=n_ops_to_print, print_apply=False)
@staticmethod @staticmethod
def print_summary_(fct_name, compile_time, fct_call_time, fct_call, def print_summary_(fct_name, compile_time, fct_call_time, fct_call,
apply_time, op_cimpl, message, apply_time, op_cimpl, message, outputs_size,
n_apply_to_print=15, n_ops_to_print=20, print_apply=True): n_apply_to_print=15, n_ops_to_print=20, print_apply=True):
""" """
do the actual printing of print_summary and print_diff_summary. do the actual printing of print_summary and print_diff_summary.
...@@ -240,7 +285,7 @@ class ProfileMode(Mode): ...@@ -240,7 +285,7 @@ class ProfileMode(Mode):
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, op_flops.get(a,-1), nb_call, nb_apply, a) print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %7.1f %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, op_flops.get(a,-1), nb_call, nb_apply, a)
else: else:
print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_apply, a) print ' %4.1f%% %5.1f%% %5.3fs %5.3fs %.2es %s %5d %2d %s' % (f, ftot, t, tot, t/nb_call, msg, nb_call, nb_apply, a)
print ' ... (remaining %i Ops account for %6.2f%%(%.2fs) of the runtime)'\ print ' ... (remaining %i Apply account for %6.2f%%(%.2fs) of the runtime)'\
%(max(0, len(otimes)-n_ops_to_print), %(max(0, len(otimes)-n_ops_to_print),
sum(f for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:]), sum(f for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:]),
sum(t for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:])) sum(t for f, t, a, ci, nb_call, nb_op in otimes[n_ops_to_print:]))
...@@ -312,8 +357,6 @@ class ProfileMode(Mode): ...@@ -312,8 +357,6 @@ class ProfileMode(Mode):
print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100) print 'Other time since import %.3fs %.1f%%'%(other_time,other_time/total_time*100)
print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call) print '%i Theano fct call, %.3fs per call'%(total_fct_call, time_per_call)
#imported here to break circular dependency...
from theano.tensor.basic import as_tensor_variable
print print
print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code." print "List of apply that don't have float64 as input but have float64 in outputs. Usefull to know if we forgot some cast when using floatX=float32 or gpu code."
print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>' print '<Apply> <Apply position> <fct name> <inputs type> <outputs type>'
...@@ -348,6 +391,80 @@ class ProfileMode(Mode): ...@@ -348,6 +391,80 @@ class ProfileMode(Mode):
if hasattr(i.type, 'dtype') and i.type.dtype=='float64': if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
print fct.name, i.name, i.type, i print fct.name, i.name, i.type, i
if outputs_size:
fct_memory={}#env->dict(node->(outputs size))
var_mem = {}
for node,val in outputs_size.items():
fct_memory.setdefault(node.env,{})
fct_memory[node.env][node]=val
for out,v in zip(node.outputs,val):
var_mem[out]=v
print
print "Profile of Theano functions memory:"
for env,nodes_mem in fct_memory.iteritems():
print "Theano fct:", [fct for fct in fct_call.keys() if fct.maker.env is env][0].name
size_sum=sum([sum(val) for key,val in nodes_mem.iteritems()])
print " Max without gc, inplace and view (KB)",size_sum/1024
node_memory_size = 0
node_memory_saved_by_view = 0
node_memory_saved_by_inplace = 0
running_memory_size = 0
running_max_memory_size = 0
post_thunk_old_storage = []
items = nodes_mem.items()
items.sort(key=lambda a: a[1])
items.reverse()
order = env.toposort()
computed, last_user = gof.link.gc_helper(order)
for node in order:
post_thunk_old_storage.append([ input_idx
for input_idx,input in enumerate(node.inputs)
if (input in computed) and (input not in env.outputs) and node == last_user[input]])
for node,val in items[:n_apply_to_print]:
dmap = getattr(node.op,'destroy_map',None)
vmap = getattr(node.op,'view_map',None)
for idx,v in enumerate(val):
if dmap and idx in dmap:#TODO check the op returned a view
node_memory_saved_by_inplace += v
elif vmap and idx in vmap:#TODO check the op returned a view
node_memory_saved_by_view += v
else:
node_memory_size += v
running_memory_size += v
if running_memory_size > running_max_memory_size:
running_max_memory_size = running_memory_size
old_storage = post_thunk_old_storage[order.index(node)]
for old_s in old_storage:
running_memory_size -= var_mem[node.inputs[old_s]]
pass
pass
print " Max FAST_RUN_NO_GC (KB)", node_memory_size/1024
print " Max FAST_RUN (KB)", running_max_memory_size/1024
print " Memory saved by view (KB)", node_memory_saved_by_view/1024
print " Memory saved by inplace (KB)", node_memory_saved_by_inplace/1024
print " Memory saved by GC (KB)", (node_memory_size-running_max_memory_size)/1024
n_apply_to_print+=10#TODO remove this line
print " <Sum apply outputs (bytes)> <Apply outputs memory size(bytes)> <created/inplace/view> <Apply node>"
print " <created/inplace/view> is taked from the op declaration, not the op exeuction. Use DebugMode to have warning about inplace/view declaration being respected."
for key,val in items[:n_apply_to_print]:
code = ['c']*len(node.outputs)
for out,inp in getattr(key.op,'destroy_map',{}).iteritems():
code[out] = "i"
for out,inp in getattr(key.op,'view_map',{}).iteritems():
code[out] = "v"
print ' %9dB %s %s %s' % (sum(val), str(val), ' '.join(code), key)
print ' ... (remaining %i Apply account for %.2f%%(%.2fs) of the runtime)'\
%(max(0, len(nodes_mem)-n_ops_to_print),
sum(sum(val) for key, val in items[n_ops_to_print:]),
sum(sum(val) for key, val in items[n_ops_to_print:])/size_sum)
print print
print "We guess some tips to make your code faster. If you think of new one, suggest them on the mailing list. Test them before use as they are not guaranted to always give a speed up." print "We guess some tips to make your code faster. If you think of new one, suggest them on the mailing list. Test them before use as they are not guaranted to always give a speed up."
from theano import tensor as T from theano import tensor as T
......
...@@ -421,7 +421,7 @@ class GpuDownsampleFactorMax(Op): ...@@ -421,7 +421,7 @@ class GpuDownsampleFactorMax(Op):
#def perform(self, node, input_storage, output_storage): #def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') #raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return () return (1)
def c_code(self, node, nodename, (x,), (z,), sub): def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail'] fail = sub['fail']
ds0, ds1 = self.ds ds0, ds1 = self.ds
......
...@@ -521,6 +521,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -521,6 +521,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
if (1 && (version==6||version==-1) && if (1 && (version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
......
import sys, time import sys, time
import numpy import numpy
from nose.plugins.skip import SkipTest
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
import theano import theano
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -38,9 +46,23 @@ def py_conv_full_numpy(img, kern): ...@@ -38,9 +46,23 @@ def py_conv_full_numpy(img, kern):
pad_cols = 2*(kern.shape[3]-1) + img.shape[3] pad_cols = 2*(kern.shape[3]-1) + img.shape[3]
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype) padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype)
padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img
return py_conv_valid(padded_img, kern) return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample):
    """
    Reference convolution used to check the GPU results.

    Delegates to the scipy based implementation when scipy could be
    imported (it is the faster one). Otherwise falls back to the pure
    numpy implementations for the 'valid' and 'full' modes and applies
    the subsampling on the last two dimensions of their result.
    """
    if imported_scipy_convolve2d:
        return py_conv_scipy(img, kern, mode, subsample)

    # numpy fallback: compute the dense result first, subsample afterwards.
    if mode == 'valid':
        dense = py_conv_valid_numpy(img, kern)
    elif mode == 'full':
        dense = py_conv_full_numpy(img, kern)
    else:
        raise Exception("Can't execute this kernel.")
    return dense[:, :, ::subsample[0], ::subsample[1]]
def py_conv_scipy(img, kern, mode, subsample): def py_conv_scipy(img, kern, mode, subsample):
from scipy.signal import convolve2d
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
if mode == 'valid': if mode == 'valid':
outshp = (img.shape[0], kern.shape[0], outshp = (img.shape[0], kern.shape[0],
...@@ -89,7 +111,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker ...@@ -89,7 +111,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
rval = True rval = True
try: try:
t0 = time.time() t0 = time.time()
cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample) cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time() t1 = time.time()
i = cuda_tensor4() i = cuda_tensor4()
k = cuda_tensor4() k = cuda_tensor4()
...@@ -550,7 +572,7 @@ def _test_dummy(): ...@@ -550,7 +572,7 @@ def _test_dummy():
rval = True rval = True
t0 = time.time() t0 = time.time()
cpuval = py_conv_scipy(npy_img, npy_kern, mode, subsample) cpuval = py_conv(npy_img, npy_kern, mode, subsample)
t1 = time.time() t1 = time.time()
gpuval = cuda_ndarray.conv(img, kern, mode, subsample) gpuval = cuda_ndarray.conv(img, kern, mode, subsample)
t2 = time.time() t2 = time.time()
......
...@@ -252,13 +252,13 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -252,13 +252,13 @@ class GpuImages2Neibs(Images2Neibs):
dtype=ten4.type.dtype)()]) dtype=ten4.type.dtype)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return () return (6,)
return (2,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
if self.mode=="valid": mode = self.mode
return """ return """
static __global__ void k_multi_warp_%(nodename)s( //a version that use less register but don't work in all case.
static __global__ void k_multi_warp_less_%(nodename)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
const int height, const int height,
...@@ -274,8 +274,10 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -274,8 +274,10 @@ class GpuImages2Neibs(Images2Neibs):
float * global_out float * global_out
) )
{ {
const int wrap_centered_idx_shift_x = c/2;
for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){ const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d; const int b = tblock%%grid_d;
int left = tblock/grid_d; int left = tblock/grid_d;
const int a = left%%grid_c; const int a = left%%grid_c;
...@@ -289,12 +291,23 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -289,12 +291,23 @@ class GpuImages2Neibs(Images2Neibs):
if(a>grid_c)continue; if(a>grid_c)continue;
if(b>grid_d)continue; if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n)); int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
for (int i = 0; i < c; i++) // loop over c int i = threadIdx.y; // loop over c
{ {
int ten4_2 = i + a * step_x; int ten4_2 = i + a * step_x;
for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d if("%(mode)s"=="wrap_centered"){
ten4_2 -= wrap_centered_idx_shift_x;
if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
int j = threadIdx.x; // loop over d
{ {
int ten4_3 = j + b * step_y; int ten4_3 = j + b * step_y;
if("%(mode)s"=="wrap_centered"){
ten4_3 -= wrap_centered_idx_shift_y;
if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n)); //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
...@@ -307,9 +320,6 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -307,9 +320,6 @@ class GpuImages2Neibs(Images2Neibs):
} }
} }
""" % locals()
if self.mode=="wrap_centered":
return """
static __global__ void k_multi_warp_%(nodename)s( static __global__ void k_multi_warp_%(nodename)s(
const int nb_batch, const int nb_batch,
const int nb_stack, const int nb_stack,
...@@ -329,7 +339,7 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -329,7 +339,7 @@ class GpuImages2Neibs(Images2Neibs):
const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_x = c/2;
const int wrap_centered_idx_shift_y = d/2; const int wrap_centered_idx_shift_y = d/2;
for(int tblock = blockIdx.x;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x){ for(int tblock = blockIdx.x*blockDim.z+threadIdx.z;tblock<nb_batch*nb_stack*grid_c*grid_d;tblock+=gridDim.x*blockDim.z){
const int b = tblock%%grid_d; const int b = tblock%%grid_d;
int left = tblock/grid_d; int left = tblock/grid_d;
const int a = left%%grid_c; const int a = left%%grid_c;
...@@ -343,19 +353,23 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -343,19 +353,23 @@ class GpuImages2Neibs(Images2Neibs):
if(a>grid_c)continue; if(a>grid_c)continue;
if(b>grid_d)continue; if(b>grid_d)continue;
int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n)); int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
for (int i = 0; i < c; i++) // loop over c for (int i = threadIdx.y; i < c; i+=blockDim.y) // loop over c
{ {
int ten4_2 = i + a * step_x; int ten4_2 = i + a * step_x;
ten4_2 -= wrap_centered_idx_shift_x; if("%(mode)s"=="wrap_centered"){
if ( ten4_2 < 0 ) ten4_2 += height; ten4_2 -= wrap_centered_idx_shift_x;
else if (ten4_2 >= height) ten4_2 -= height; if ( ten4_2 < 0 ) ten4_2 += height;
else if (ten4_2 >= height) ten4_2 -= height;
}
for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d for (int j = threadIdx.x; j < d; j+=blockDim.x) // loop over d
{ {
int ten4_3 = j + b * step_y; int ten4_3 = j + b * step_y;
ten4_3 -= wrap_centered_idx_shift_y; if("%(mode)s"=="wrap_centered"){
if ( ten4_3 < 0 ) ten4_3 += width; ten4_3 -= wrap_centered_idx_shift_y;
else if (ten4_3 >= width) ten4_3 -= width; if ( ten4_3 < 0 ) ten4_3 += width;
else if (ten4_3 >= width) ten4_3 -= width;
}
//int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n)); //int ten4_idx = ten4_3 + width*(ten4_2 + height*(s +nb_stack*n));
//int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n)); //int ten4_idx = stride3*ten4_3 + stride2*(ten4_2 + stride1*(s + stride0*n));
int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;
...@@ -370,7 +384,6 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -370,7 +384,6 @@ class GpuImages2Neibs(Images2Neibs):
""" % locals() """ % locals()
def c_code(self, node, name, (ten4, neib_shape, neib_step), (z,), sub): def c_code(self, node, name, (ten4, neib_shape, neib_step), (z,), sub):
fail = sub['fail'] fail = sub['fail']
mode = self.mode mode = self.mode
...@@ -473,17 +486,36 @@ class GpuImages2Neibs(Images2Neibs): ...@@ -473,17 +486,36 @@ class GpuImages2Neibs(Images2Neibs):
const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0); const npy_intp step_x = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 0);
const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1); const npy_intp step_y = (npy_intp) *(dtype_%(neib_step)s*) PyArray_GETPTR1(%(neib_step)s, 1);
dim3 n_threads(d,c,1);
//Their is a max of 512 threads per blocks
while(n_threads.x*n_threads.y>512 && n_threads.y>1)n_threads.y--;
while(n_threads.x*n_threads.y>512 && n_threads.x>1)n_threads.x--;
//Make bigger block to have better memory access pattern and a higher core utilisation.
//for smaller patch size
while(c*d*(n_threads.z+1) < 128 && n_threads.z<64 && n_threads.z<CudaNdarray_HOST_DIMS(%(z)s)[0]){
n_threads.z++;
}
int nb_block; int nb_block;
if (nb_batch %% 32 == 0) if (CudaNdarray_HOST_DIMS(%(z)s)[0] %% n_threads.z == 0)
nb_block = nb_batch/32; nb_block = CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z;
else else
nb_block = (int)((float)nb_batch/32. + 1.); nb_block = (CudaNdarray_HOST_DIMS(%(z)s)[0] / n_threads.z) + 1;
dim3 n_blocks(std::min(32*1024,nb_block));
dim3 n_blocks(std::min(32*1024,CudaNdarray_HOST_DIMS(%(z)s)[0]),1,1);
dim3 n_threads(32,1,1);
int n_shared = 0; int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>( void (*f)(int, int, int ,int,
int, int, int ,int,
int, int,
int, int, int, int,
float*, float*);
if(n_threads.x==d && n_threads.y==c){
f = k_multi_warp_less_%(name)s;
}else{
f = k_multi_warp_%(name)s;
}
f<<<n_blocks, n_threads, n_shared>>>(
nb_batch, nb_batch,
nb_stack, nb_stack,
height, width, height, width,
......
...@@ -278,26 +278,30 @@ def test_neibs_wrap_centered_step_manual(): ...@@ -278,26 +278,30 @@ def test_neibs_wrap_centered_step_manual():
def test_neibs_gpu(): def test_neibs_gpu():
if cuda.cuda_available == False: if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
for shape, pshape in [((100,40,18,18),(2,2)),
shape = (100,40,18,18) ((100,40,6,18),(3,2)),
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape)) ((10,40,66,66),(33,33)),
neib_shape = T.as_tensor_variable((2,2))#(array((2,2), dtype='float32')) ((10,40,68,66),(34,33))
]:
from theano.sandbox.cuda.basic_ops import gpu_from_host
images = shared(numpy.arange(numpy.prod(shape), dtype='float32').reshape(shape))
f = function([], images2neibs(images,neib_shape), neib_shape = T.as_tensor_variable(pshape)
mode=mode_with_gpu)
f_gpu = function([], images2neibs(images,neib_shape), from theano.sandbox.cuda.basic_ops import gpu_from_host
mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()]) f = function([], images2neibs(images,neib_shape),
#print images.value mode=mode_with_gpu)
neibs = numpy.asarray(f_gpu()) f_gpu = function([], images2neibs(images,neib_shape),
assert numpy.allclose(neibs,f()) mode=mode_with_gpu)
#print neibs assert any([isinstance(node.op,GpuImages2Neibs) for node in f_gpu.maker.env.toposort()])
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu) #print images.value
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()]) neibs = numpy.asarray(f_gpu())
#print numpy.asarray(g()) assert numpy.allclose(neibs,f())
assert numpy.allclose(images.value,g()) #print neibs
g = function([], neibs2images(neibs, neib_shape, images.shape), mode=mode_with_gpu)
assert any([isinstance(node.op,GpuImages2Neibs) for node in f.maker.env.toposort()])
#print numpy.asarray(g())
assert numpy.allclose(images.value,g())
def speed_neibs(): def speed_neibs():
......
...@@ -12,7 +12,7 @@ import numpy, theano ...@@ -12,7 +12,7 @@ import numpy, theano
#from copy import copy as python_copy #from copy import copy as python_copy
from theano import gof, shared from theano import gof, shared
from theano.gof import Variable, Op, utils, Type, Constant, Value from theano.gof import Variable, Op, Type, Constant, Value
from theano.tensor.tsor_apply import Apply from theano.tensor.tsor_apply import Apply
from theano import gradient from theano import gradient
...@@ -21,7 +21,7 @@ import elemwise ...@@ -21,7 +21,7 @@ import elemwise
from theano import scalar as scal from theano import scalar as scal
from theano.gof.python25 import partial, any, all from theano.gof.python25 import partial, any, all
from theano import compile, printing from theano import compile, printing
from theano.printing import pprint, Print from theano.printing import pprint
### set up the external interface ### set up the external interface
from elemwise import Elemwise, DimShuffle, CAReduce, Sum from elemwise import Elemwise, DimShuffle, CAReduce, Sum
......
...@@ -18,6 +18,16 @@ from theano import gof, Op, tensor, config ...@@ -18,6 +18,16 @@ from theano import gof, Op, tensor, config
from theano.tensor.tsor_apply import Apply from theano.tensor.tsor_apply import Apply
from theano.gof.python25 import any from theano.gof.python25 import any
imported_scipy_signal = False
try:
# TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imported_scipy_signal = True
except ImportError:
pass
_logger=logging.getLogger("theano.signal.conv") _logger=logging.getLogger("theano.signal.conv")
def _debug(*msg): def _debug(*msg):
_logger.debug(' '.join([ str(x) for x in msg])) _logger.debug(' '.join([ str(x) for x in msg]))
...@@ -547,9 +557,12 @@ class ConvOp(Op): ...@@ -547,9 +557,12 @@ class ConvOp(Op):
""" """
By default if len(img2d.shape)==3, we By default if len(img2d.shape)==3, we
""" """
if not imported_scipy_signal:
raise theano.gof.utils.MethodNotDefined(
"c_headers", type(self), self.__class__.__name__,
"Need the python package for scipy.signal to be installed for the python implementation. You can use the C implementation instead.")
# TODO: move these back out to global scope when they no longer cause an atexit error # TODO: move these back out to global scope when they no longer cause an atexit error
from scipy.signal.signaltools import _valfrommode, _bvalfromboundary
from scipy.signal.sigtools import _convolve2d
imshp = self.imshp imshp = self.imshp
if imshp is None or any([x is None for x in imshp]): if imshp is None or any([x is None for x in imshp]):
imshp = tuple(img2d.shape[1:]) imshp = tuple(img2d.shape[1:])
...@@ -584,8 +597,6 @@ class ConvOp(Op): ...@@ -584,8 +597,6 @@ class ConvOp(Op):
z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp, z[0] = numpy.zeros((bsize,)+(nkern,)+fulloutshp,
dtype=img2d.dtype) dtype=img2d.dtype)
zz=z[0] zz=z[0]
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
stacklen = imshp[0] stacklen = imshp[0]
...@@ -616,6 +627,9 @@ class ConvOp(Op): ...@@ -616,6 +627,9 @@ class ConvOp(Op):
filtersflipped = buf filtersflipped = buf
del buf, rstride, cstride del buf, rstride, cstride
val = _valfrommode(self.out_mode)
bval = _bvalfromboundary('fill')
for b in range(bsize): for b in range(bsize):
for n in range(nkern): for n in range(nkern):
zz[b,n,...].fill(0) zz[b,n,...].fill(0)
...@@ -623,6 +637,25 @@ class ConvOp(Op): ...@@ -623,6 +637,25 @@ class ConvOp(Op):
zz[b,n,...] += _convolve2d(\ zz[b,n,...] += _convolve2d(\
img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0) img2d[b,im0,...], filtersflipped[n,im0,...],1,val, bval, 0)
if False:
if False and self.out_mode=="full":
img2d2 = numpy.zeros((bsize,stacklen,
imshp[1]+2*kshp[0]-2,
imshp[2]+2*kshp[1]-2))
img2d2[:,:,kshp[0]-1:kshp[0]-1+imshp[1],
kshp[1]-1:kshp[1]-1+imshp[2]] = img2d
img2d = img2d2
#N_image_shape = image_data.shape
for b in range(bsize):
for n in range(nkern):
zz[b,n,...].fill(0)
for im0 in range(stacklen):
for row in range(0,zz.shape[2],self.dx):
for col in range(0,zz.shape[3],self.dy):
zz[b,n,row,col] += (img2d[b,im0,row:row+kshp[0],col:col+kshp[1]]*\
filtersflipped[n,im0,::-1,::-1]).sum()
#We copy it to remove the Stride mismatch warning from DEBUG_MODE. #We copy it to remove the Stride mismatch warning from DEBUG_MODE.
#The copy make that we return an object with the same stride as the c version. #The copy make that we return an object with the same stride as the c version.
#The copy don't affect the performence during our experience as in that case we #The copy don't affect the performence during our experience as in that case we
......
import sys, time, unittest import sys, time, unittest
import numpy import numpy
from scipy import signal
import theano import theano
import theano.tensor as T import theano.tensor as T
...@@ -60,6 +59,7 @@ class TestConv2D(unittest.TestCase): ...@@ -60,6 +59,7 @@ class TestConv2D(unittest.TestCase):
############# REFERENCE IMPLEMENTATION ############ ############# REFERENCE IMPLEMENTATION ############
s = 1. s = 1.
orig_image_data = image_data
if border_mode is not 'full': s = -1. if border_mode is not 'full': s = -1.
out_shape2d = numpy.array(N_image_shape[-2:]) +\ out_shape2d = numpy.array(N_image_shape[-2:]) +\
s*numpy.array(N_filter_shape[-2:]) - s s*numpy.array(N_filter_shape[-2:]) - s
...@@ -68,26 +68,41 @@ class TestConv2D(unittest.TestCase): ...@@ -68,26 +68,41 @@ class TestConv2D(unittest.TestCase):
ref_output = numpy.zeros(out_shape) ref_output = numpy.zeros(out_shape)
# loop over output feature maps # loop over output feature maps
for k in range(N_filter_shape[0]): ref_output.fill(0)
# loop over input feature maps if border_mode=='full':
for l in range(N_filter_shape[1]): image_data2 = numpy.zeros((N_image_shape[0],N_image_shape[1],
N_image_shape[2]+2*N_filter_shape[2]-2,
filter2d = filter_data[k,l,:,:] N_image_shape[3]+2*N_filter_shape[3]-2))
image_data2[:,:,N_filter_shape[2]-1:N_filter_shape[2]-1+N_image_shape[2],
# loop over mini-batches N_filter_shape[3]-1:N_filter_shape[3]-1+N_image_shape[3]] = image_data
for b in range(N_image_shape[0]): image_data = image_data2
image2d = image_data[b,l,:,:] N_image_shape = image_data.shape
output2d = signal.convolve2d(image2d, filter2d, border_mode) for bb in range(N_image_shape[0]):
for nn in range(N_filter_shape[0]):
ref_output[b,k,:,:] +=\ for im0 in range(N_image_shape[1]):
output2d[::subsample[0],::subsample[1]] filter2d = filter_data[nn,im0,:,:]
image2d = image_data[bb,im0,:,:]
for row in range(ref_output.shape[2]):
irow = row * subsample[0]#image row
for col in range(ref_output.shape[3]):
icol = col * subsample[1]#image col
ref_output[bb,nn,row,col] += (image2d[irow:irow+N_filter_shape[2],
icol:icol+N_filter_shape[3]]*filter2d[::-1,::-1]
).sum()
self.failUnless(_allclose(theano_output, ref_output)) self.failUnless(_allclose(theano_output, ref_output))
############# TEST GRADIENT ############ ############# TEST GRADIENT ############
if verify_grad: if verify_grad:
utt.verify_grad(sym_conv2d, [image_data, filter_data]) utt.verify_grad(sym_conv2d, [orig_image_data, filter_data])
def test_basic1(self):
"""
Tests that basic convolutions work for odd and even dimensions of image and filter
shapes, as well as rectangular images and filters.
"""
self.validate((2,2,3,3), (2,2,2,2), 'valid', verify_grad=False)
def test_basic(self): def test_basic(self):
""" """
......
import sys, time, unittest import sys, time, unittest
import numpy import numpy
from scipy import signal
import theano import theano
import theano.tensor as T import theano.tensor as T
...@@ -59,7 +58,13 @@ class TestSignalConv2D(unittest.TestCase): ...@@ -59,7 +58,13 @@ class TestSignalConv2D(unittest.TestCase):
image2d = image_data3d[b,:,:] image2d = image_data3d[b,:,:]
filter2d = filter_data3d[k,:,:] filter2d = filter_data3d[k,:,:]
output2d = signal.convolve2d(image2d, filter2d, 'valid') output2d = numpy.zeros(ref_output.shape)
for row in range(ref_output.shape[0]):
for col in range(ref_output.shape[1]):
output2d[row,col] += (image2d[row:row+filter2d.shape[0],
col:col+filter2d.shape[1]]*filter2d[::-1,::-1]
).sum()
self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d)) self.failUnless(_allclose(theano_output4d[b,k,:,:], output2d))
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论