Commit a2fd617c, authored by James Bergstra

merging new GEMM optimization code

......@@ -8,9 +8,11 @@ AddConfigVar('floatX',
EnumStr('float64', 'float32'),
)
#gpu mean let the driver select the gpu. Needed in case of gpu in exclusive mode.
#gpuX mean use the gpu number X.
AddConfigVar('device',
"Default device for computations",
EnumStr('cpu', *['gpu%i'%i for i in range(4)])
EnumStr('cpu', 'gpu',*['gpu%i'%i for i in range(4)])
)
# keep the default mode.optimizer==config.optimizer and mode.linker==config.linker!
......
......@@ -629,7 +629,6 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
python_inc = distutils.sysconfig.get_python_inc()
libname = os.path.basename(python_inc)
#DSE Patch 1 for supporting OSX frameworks; add -framework Python
if sys.platform=='darwin' :
preargs.extend(['-undefined','dynamic_lookup'])
......@@ -639,8 +638,16 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
if python_inc.count('Python.framework')>0 and config.cmodule.mac_framework_link:
preargs.extend(['-framework','Python'])
workdir = location
# sometimes, the linker cannot find -lpython so we need to tell it
# explicitly where it is located
# this returns somepath/lib/python2.x
python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
standard_lib=1)
python_lib = os.path.dirname(python_lib)
if python_lib not in lib_dirs:
lib_dirs.append(python_lib)
workdir = location
cppfilename = os.path.join(location, 'mod.cpp')
cppfile = file(cppfilename, 'w')
......
......@@ -88,7 +88,7 @@ class Print(Op):
if callable(temp):
pmsg = temp()
else:
psmg = temp
pmsg = temp
print self.message, attr,'=', pmsg
#backport
#print self.message, attr,'=', temp() if callable(temp) else temp
......@@ -441,12 +441,8 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
g.add_node(pd.Node(varstr,color='grey'))
elif var.name or not compact:
g.add_edge(pd.Edge(astr,varstr))
else:
#no name, so we don't make a var ellipse
for client in var.clients:
edge = pd.Edge(astr,apply_name(client[0]))
g.add_edge(edge)
g.set_simplify(True)
# else:
#don't add egde here as it is already added from the inputs.
g.write_png(outfile, prog='dot')
print 'The output file is available at',outfile
......
......@@ -112,7 +112,9 @@ if cuda_available:
def use(device):
global cuda_enabled, enabled_cuda
if device.startswith('gpu'):
if device == 'gpu':
pass
elif device.startswith('gpu'):
device = int(device[3:])
elif device == 'cpu':
device = -1
......@@ -120,13 +122,17 @@ def use(device):
raise ValueError("Invalid device identifier", device)
if use.device_number is None:
# No successful call to use() has been made yet
if device<0:
if device != 'gpu' and device<0:
return
if device in [None,""]:
device=0
device=int(device)
try:
if device !='gpu':
gpu_init(device)
else:
#warning To let people see that the gpu will be used.
_logger.warn("We let the driver select the gpu device to use")
handle_shared_float32(True)
use.device_number = device
cuda_enabled = True
......
......@@ -162,16 +162,19 @@ class GpuConv(Op):
and self.logical_img_hw == other.logical_img_hw \
and self.logical_kern_hw == other.logical_kern_hw \
and self.logical_kern_align_top == other.logical_kern_align_top \
and self.version == other.version
and self.version == other.version \
and self.verbose == other.verbose
def __hash__(self):
    """Hash consistent with __eq__: combine every attribute that
    participates in equality, including ``version`` and ``verbose``.

    The merge left both the old ``^ self.version`` line (with no line
    continuation) and the new lines in place, which is a syntax error
    and a duplicated term; only the new form is kept here.

    NOTE: ``self.version`` / ``self.verbose`` are XOR-ed raw rather
    than hashed because hash(-1) == -2 and hash(-2) == -2 in Python,
    which would collapse distinct versions onto the same hash.
    """
    return hash(type(self)) \
        ^ hash(self.border_mode) \
        ^ hash(self.subsample) \
        ^ hash(self.logical_img_hw) \
        ^ hash(self.logical_kern_hw) \
        ^ hash(self.logical_kern_align_top) \
        ^ self.version \
        ^ self.verbose
def __str__(self):
return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
......@@ -200,7 +203,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
    """Version tag used to invalidate previously compiled C modules.

    Bumped to (0, 5) for the new kernel code.  The stale pre-merge
    ``return (0,4)`` executed first and made the new value dead code;
    it is removed here.
    """
    return (0, 5)
def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
......@@ -307,7 +307,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
else f = conv_rows_stack<kern_wid, true>;
CONV_ROWS_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
......@@ -379,7 +379,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
......
......@@ -2,6 +2,7 @@ import sys, os, subprocess, logging
from theano.gof.cmodule import (std_libs, std_lib_dirs, std_include_dirs, dlimport,
get_lib_extension)
from theano import config
import distutils
_logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
_logger.setLevel(logging.WARN)
......@@ -68,6 +69,15 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
if cuda_root:
lib_dirs.append(os.path.join(cuda_root, 'lib'))
# sometimes, the linker cannot find -lpython so we need to tell it
# explicitly where it is located
# this returns somepath/lib/python2.x
python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
standard_lib=1)
python_lib = os.path.dirname(python_lib)
if python_lib not in lib_dirs:
lib_dirs.append(python_lib)
cppfilename = os.path.join(location, 'mod.cu')
cppfile = file(cppfilename, 'w')
......
......@@ -14,7 +14,7 @@ import numpy
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_enabled == False:
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn
......@@ -23,6 +23,13 @@ import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
    """Uniform [0, 1) random array of the given shape, cast to float32."""
    values = numpy.random.rand(*shape)
    return theano._asarray(values, dtype='float32')
def my_randn(*shape):
    """Standard-normal random array of the given shape, cast to float32."""
    values = numpy.random.randn(*shape)
    return theano._asarray(values, dtype='float32')
def my_zeros(*shape):
    """Zeros array built via numpy.zeros(*shape), cast to float32."""
    zeros = numpy.zeros(*shape)
    return theano._asarray(zeros, dtype='float32')
def get_mode(use_gpu):
ret = theano.compile.get_default_mode()
if isinstance(ret, theano.compile.ProfileMode):
......@@ -44,15 +51,15 @@ def print_diff_mode(a,b):
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_iter=100):
if use_gpu:
w = tcn.shared_constructor(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(numpy.zeros(n_hid), 'b')
v = tcn.shared_constructor(numpy.zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(numpy.zeros(n_out), 'c')
w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(my_zeros(n_hid), 'b')
v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(my_zeros(n_out), 'c')
else:
w = shared(theano._asarray(0.01*(numpy.random.rand(n_in,n_hid)-0.5), dtype='float32'), 'w')
b = shared(theano._asarray(numpy.zeros(n_hid), dtype='float32'), 'b')
v = shared(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = shared(my_zeros(n_hid), 'b')
v = shared(my_zeros((n_hid, n_out)), 'c')
c = shared(my_zeros(n_out), 'c')
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
......@@ -75,8 +82,8 @@ def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_iter=100):
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = theano._asarray(numpy.random.rand(n_batch, n_in), dtype='float32')
yval = theano._asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
xval = my_rand(n_batch, n_in)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
t0 = time.time()
......@@ -123,10 +130,10 @@ def run_conv_nnet1(use_gpu):
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w')
b = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b')
v = shared_fn(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
b = shared_fn(my_zeros((n_kern,)), 'b')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
......@@ -152,8 +159,8 @@ def run_conv_nnet1(use_gpu):
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = theano._asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
xval = my_rand(*shape_img)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(10):
......@@ -204,12 +211,12 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
b0 = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b0')
w1 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
b1 = shared_fn(theano._asarray(numpy.zeros((n_kern1,)), dtype='float32'), 'b1')
v = shared_fn(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
......@@ -238,8 +245,8 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = theano._asarray(numpy.random.rand(n_batch,n_out), dtype='float32')#int32 make all 0...
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)#int32 make all 0...
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
......@@ -284,12 +291,12 @@ def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_iter,
n_out = 10
w0 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
b0 = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b0')
w1 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
b1 = shared_fn(theano._asarray(numpy.zeros((n_kern1,)), dtype='float32'), 'b1')
v = shared_fn(theano._asarray(0.01*numpy.random.randn(n_hid, n_out), dtype='float32'), 'v')
c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
c = shared_fn(my_zeros(n_out), 'c')
print 'ALLOCATING ARCH: w0 shape', w0.value.shape
print 'ALLOCATING ARCH: w1 shape', w1.value.shape
......@@ -330,11 +337,11 @@ def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_iter,
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
yval = theano._asarray(numpy.random.rand(n_batch,n_out), dtype='float32')
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)
lr = theano._asarray(0.01, dtype='float32')
rvals=numpy.zeros(n_iter)
rvals=my_zeros(n_iter)
t0 = time.time()
for i in xrange(n_iter):
rvals[i] = train(xval, yval, lr)[0]
......
......@@ -945,8 +945,6 @@ def local_dot22_to_dot22scalar(node):
#we take the first _dot22 found. TODO check others!
dot22_idx = i_dot22.index(True)
d = node.inputs[dot22_idx]
i_scalar = [_as_scalar(x) for x in node.inputs]
if not any(i_scalar) and not any([x.owner and x.owner.op ==T.mul for x in node.inputs]):
#no scalar in input and no multiplication
......@@ -983,15 +981,11 @@ def local_dot22_to_dot22scalar(node):
if scalar_idx<0:
info('Not optimizing dot22 with inputs', node.inputs, [x.type for x in node.inputs], 'as the type of the scalar can\'t be upcasted to the matrix type')
return False
assert scalar_idx<len(node.inputs)
s = node.inputs[scalar_idx]
o = copy.copy(node.inputs)
o.remove(d)
o.remove(s)
if len(o)==0:
return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], s)]
else:
......
......@@ -24,6 +24,8 @@ from theano import compile #to register the optimizer built by this file
from theano.gof.python25 import any, all
from theano.gof.opt import Optimizer
from theano.gof import toolbox, DestroyHandler
# Utilities
def out2in(*local_opts):
......@@ -395,6 +397,13 @@ class ShapeFeature(object):
else:
self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
def init_r(self, r):
    """Register a shape entry for ``r`` if none is known yet.

    Variables without a shape_tuple (presumably non-TensorType — TODO
    confirm) raise AttributeError and are recorded with shape None.
    """
    if r in self.shape_of:
        return
    try:
        self.set_shape(r, self.shape_tuple(r))
    except AttributeError:
        self.set_shape(r, None)
def make_vector_shape(self, r):
    """Build a symbolic vector out of the recorded shape entries of ``r``."""
    shape_entries = self.shape_of[r]
    return make_vector(*shape_entries)
#
......@@ -421,11 +430,7 @@ class ShapeFeature(object):
for i, r in enumerate(node.inputs):
# make sure we have shapes for the inputs
if r not in self.shape_of:
try:
self.set_shape(r, self.shape_tuple(r))
except AttributeError:
self.set_shape(r, None ) # not a TensorType variable
self.init_r(r)
try:
shape_infer = node.op.infer_shape
......@@ -453,7 +458,7 @@ class ShapeFeature(object):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
self.init_r(new_r)
# change_input happens in two cases:
# 1) we are trying to get rid of r, or
# 2) we are putting things back after a failed transaction.
......@@ -1160,7 +1165,8 @@ register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')
@gof.local_optimizer([T.neg])
def local_neg_to_mul(node):
    """Canonicalize ``-x`` into ``(-1) * x`` without changing dtype.

    A plain Python ``-1`` would let ``T.mul`` upcast the result, so the
    constant is wrapped in a numpy scalar of the input's dtype.  The
    stale pre-merge ``return [T.mul(-1, node.inputs[0])]`` executed
    first and made the dtype-preserving version unreachable; it is
    removed here.
    """
    if node.op == T.neg:
        minus_one = numpy.array(-1, dtype=node.inputs[0].dtype)
        return [T.mul(minus_one, node.inputs[0])]
register_canonicalize(local_neg_to_mul)
@register_specialize
......
......@@ -212,7 +212,7 @@ class DownsampleFactorMax(Op):
""" % locals()
def c_code_cache_version(self):
    """Cache version for the generated C code of DownsampleFactorMax.

    The stale pre-merge ``return ()`` (meaning "never cache") executed
    first and made the new value dead code; it is removed so compiled
    modules are cached under version (0, 1).
    """
    return (0, 1)
class DownsampleFactorMaxGrad(Op):
......@@ -349,4 +349,4 @@ class DownsampleFactorMaxGrad(Op):
""" %locals()
def c_code_cache_version(self):
    """Cache version for the generated C code of DownsampleFactorMaxGrad.

    The stale pre-merge ``return ()`` (meaning "never cache") executed
    first and made the new value dead code; it is removed so compiled
    modules are cached under version (0, 1).
    """
    return (0, 1)
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment