merging new GEMM optimization code

a2fd617c · James Bergstra · abd9bef4 · 5d229740 · a2fd617c · a2fd617c
--- a/theano/configdefaults.py
+++ b/theano/configdefaults.py
@@ -8,9 +8,11 @@ AddConfigVar('floatX',
        EnumStr('float64', 'float32'), 
        )

+# 'gpu' means let the driver select the GPU; needed when the GPU is in exclusive mode.
+# 'gpuX' means use GPU number X.
 AddConfigVar('device',
        "Default device for computations",
-        EnumStr('cpu', *['gpu%i'%i for i in range(4)])
+        EnumStr('cpu', 'gpu',*['gpu%i'%i for i in range(4)])
        )

 # keep the default mode.optimizer==config.optimizer and mode.linker==config.linker!

--- a/theano/gof/cmodule.py
+++ b/theano/gof/cmodule.py
@@ -629,7 +629,6 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
        python_inc = distutils.sysconfig.get_python_inc()
        libname = os.path.basename(python_inc)

-
    #DSE Patch 1 for supporting OSX frameworks; add -framework Python 
    if sys.platform=='darwin' :
        preargs.extend(['-undefined','dynamic_lookup'])
@@ -639,8 +638,16 @@ def gcc_module_compile_str(module_name, src_code, location=None, include_dirs=[]
        if python_inc.count('Python.framework')>0 and config.cmodule.mac_framework_link:
            preargs.extend(['-framework','Python'])

-    workdir = location
+    # Sometimes the linker cannot find -lpython, so we need to tell it
+    # explicitly where it is located.
+    # get_python_lib() returns somepath/lib/python2.x; its parent directory is what we add.
+    python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
+                    standard_lib=1) 
+    python_lib = os.path.dirname(python_lib)
+    if python_lib not in lib_dirs:
+	lib_dirs.append(python_lib)

+    workdir = location

    cppfilename = os.path.join(location, 'mod.cpp')
    cppfile = file(cppfilename, 'w')

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -88,7 +88,7 @@ class Print(Op):
            if callable(temp):
              pmsg = temp()
            else:
-              psmg = temp
+              pmsg = temp
            print self.message, attr,'=', pmsg
            #backport
            #print self.message, attr,'=', temp() if callable(temp) else temp
@@ -441,12 +441,8 @@ def pydotprint(fct, outfile=os.path.join(config.compiledir,'theano.pydotprint.pn
                g.add_node(pd.Node(varstr,color='grey'))
            elif var.name or not compact:
                g.add_edge(pd.Edge(astr,varstr))
-            else:
-                #no name, so we don't make a var ellipse
-                for client in var.clients:
-                    edge = pd.Edge(astr,apply_name(client[0]))
-                    g.add_edge(edge)
-    g.set_simplify(True)
+#            else:
+            # don't add an edge here, as it is already added from the inputs.
    g.write_png(outfile, prog='dot')

    print 'The output file is available at',outfile

--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -112,7 +112,9 @@ if cuda_available:

 def use(device):
    global cuda_enabled, enabled_cuda
-    if device.startswith('gpu'):
+    if device == 'gpu':
+        pass
+    elif device.startswith('gpu'):
        device = int(device[3:])
    elif device == 'cpu':
        device = -1
@@ -120,13 +122,17 @@ def use(device):
        raise ValueError("Invalid device identifier", device)
    if use.device_number is None:
        # No successful call to use() has been made yet
-        if device<0:
+        if device != 'gpu' and device<0:
            return
        if device in [None,""]:
            device=0
-        device=int(device)
        try:
-            gpu_init(device)
+            if device !='gpu':
+                gpu_init(device)
+            else:
+                # Emit a warning so that users can see that the GPU will be used.
+                _logger.warn("We let the driver select the gpu device to use")
+                
            handle_shared_float32(True)
            use.device_number = device
            cuda_enabled = True

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -162,16 +162,19 @@ class GpuConv(Op):
            and self.logical_img_hw == other.logical_img_hw \
            and self.logical_kern_hw == other.logical_kern_hw \
            and self.logical_kern_align_top == other.logical_kern_align_top \
-            and self.version == other.version
+            and self.version == other.version \
+            and self.verbose == other.verbose

    def __hash__(self):
+        # don't use hash(self.version) as hash(-1)==-2 and hash(-2)==-2 in python! 
        return hash(type(self)) \
            ^ hash(self.border_mode) \
            ^ hash(self.subsample) \
            ^ hash(self.logical_img_hw) \
            ^ hash(self.logical_kern_hw) \
            ^ hash(self.logical_kern_align_top) \
-            ^ self.version# don't use hash as hash(-1)==-2 and hash(-2)==-2 in python!
+            ^ self.version \
+            ^ self.verbose
    
    def __str__(self):
        return '%s{%s, %s, %s, %s, %s}' %(self.__class__.__name__,
@@ -200,7 +203,7 @@ class GpuConv(Op):
        return ['cuda_ndarray.cuh','<stdio.h>']

    def c_code_cache_version(self):
-        return (0,4)
+        return (0,5)

    def c_support_code_apply(self, node, nodename):
        return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -307,7 +307,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,

 #define CONV_ROWS_STACK_SPECIAL(kern_wid) \
 	if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
-	else f = conv_rows_stack<kern_wid, true>;\
+	else f = conv_rows_stack<kern_wid, true>;
 	CONV_ROWS_STACK_SPECIAL(THEANO_KERN_WID);

 	f<<< grid, threads, shared_size >>>
@@ -379,7 +379,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
 	if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
 	else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
 	else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
-	else f = conv_rows_stack2<kern_wid, true, false>;\
+	else f = conv_rows_stack2<kern_wid, true, false>;
+
 	CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);

 	f<<< grid, threads, shared_size >>>

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -2,6 +2,7 @@ import sys, os, subprocess, logging
 from theano.gof.cmodule import (std_libs, std_lib_dirs, std_include_dirs, dlimport,
    get_lib_extension)
 from theano import config
+import distutils

 _logger=logging.getLogger("theano.sandbox.cuda.nvcc_compiler")
 _logger.setLevel(logging.WARN)
@@ -68,6 +69,15 @@ def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[
    if cuda_root:
        lib_dirs.append(os.path.join(cuda_root, 'lib'))

+    # Sometimes the linker cannot find -lpython, so we need to tell it
+    # explicitly where it is located.
+    # get_python_lib() returns somepath/lib/python2.x; its parent directory is what we add.
+    python_lib = distutils.sysconfig.get_python_lib(plat_specific=1, \
+                    standard_lib=1)
+    python_lib = os.path.dirname(python_lib)
+    if python_lib not in lib_dirs:
+        lib_dirs.append(python_lib)
+
    cppfilename = os.path.join(location, 'mod.cu')
    cppfile = file(cppfilename, 'w')


--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -14,7 +14,7 @@ import numpy
 # Skip test if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 import theano.sandbox.cuda as cuda_ndarray
-if cuda_ndarray.cuda_enabled == False:
+if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

 import theano.sandbox.cuda as tcn
@@ -23,6 +23,13 @@ import logging
 logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)


+def my_rand(*shape):  # test helper: numpy.random.rand(*shape) passed through theano._asarray with dtype='float32'
+    return theano._asarray(numpy.random.rand(*shape),dtype='float32')
+def my_randn(*shape):  # test helper: numpy.random.randn(*shape) passed through theano._asarray with dtype='float32'
+    return theano._asarray(numpy.random.randn(*shape),dtype='float32')
+def my_zeros(*shape):  # test helper: float32 zeros. NOTE: args are unpacked into numpy.zeros, so pass the shape as ONE argument (int or tuple), as the callers visible in this patch do; a second positional argument would be taken as numpy.zeros' dtype
+    return theano._asarray(numpy.zeros(*shape),dtype='float32')
+
 def get_mode(use_gpu):
    ret = theano.compile.get_default_mode()
    if isinstance(ret, theano.compile.ProfileMode):
@@ -44,15 +51,15 @@ def print_diff_mode(a,b):
 def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_iter=100):

    if use_gpu:
-        w = tcn.shared_constructor(0.01*(numpy.random.rand(n_in,n_hid)-0.5), 'w')
-        b = tcn.shared_constructor(numpy.zeros(n_hid), 'b')
-        v = tcn.shared_constructor(numpy.zeros((n_hid, n_out)), 'c')
-        c = tcn.shared_constructor(numpy.zeros(n_out), 'c')
+        w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
+        b = tcn.shared_constructor(my_zeros(n_hid), 'b')
+        v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
+        c = tcn.shared_constructor(my_zeros(n_out), 'c')
    else:
-        w = shared(theano._asarray(0.01*(numpy.random.rand(n_in,n_hid)-0.5), dtype='float32'), 'w')
-        b = shared(theano._asarray(numpy.zeros(n_hid), dtype='float32'), 'b')
-        v = shared(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
-        c = shared(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+        w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
+        b = shared(my_zeros(n_hid), 'b')
+        v = shared(my_zeros((n_hid, n_out)), 'c')
+        c = shared(my_zeros(n_out), 'c')

    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
@@ -75,8 +82,8 @@ def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_iter=100):
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

-    xval = theano._asarray(numpy.random.rand(n_batch, n_in), dtype='float32')
-    yval = theano._asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
+    xval = my_rand(n_batch, n_in)
+    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    t0 = time.time()
@@ -123,10 +130,10 @@ def run_conv_nnet1(use_gpu):
    n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
    n_out = 10

-    w = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w')
-    b = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b')
-    v = shared_fn(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
-    c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+    w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
+    b = shared_fn(my_zeros((n_kern,)), 'b')
+    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
+    c = shared_fn(my_zeros(n_out), 'c')

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
@@ -152,8 +159,8 @@ def run_conv_nnet1(use_gpu):
 #    for i, n in enumerate(train.maker.env.toposort()):
 #        print i, n

-    xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
-    yval = theano._asarray(numpy.random.rand(n_batch, n_out), dtype='float32')
+    xval = my_rand(*shape_img)
+    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    for i in xrange(10):
@@ -204,12 +211,12 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
    n_out = 10

-    w0 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
-    b0 = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b0')
-    w1 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
-    b1 = shared_fn(theano._asarray(numpy.zeros((n_kern1,)), dtype='float32'), 'b1')
-    v = shared_fn(theano._asarray(numpy.zeros((n_hid, n_out)), dtype='float32'), 'c')
-    c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
+    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
+    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
+    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
+    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
+    c = shared_fn(my_zeros(n_out), 'c')

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
@@ -238,8 +245,8 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
 #    for i, n in enumerate(train.maker.env.toposort()):
 #        print i, n

-    xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
-    yval = theano._asarray(numpy.random.rand(n_batch,n_out), dtype='float32')#int32 make all 0...
+    xval = my_rand(*shape_img)
+    yval = my_rand(n_batch,n_out)#int32 make all 0...
    lr = theano._asarray(0.01, dtype='float32')
    for i in xrange(n_train):
        rval = train(xval, yval, lr)
@@ -284,12 +291,12 @@ def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_iter,
    n_out = 10


-    w0 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern)-0.5), dtype='float32'), 'w0')
-    b0 = shared_fn(theano._asarray(numpy.zeros((n_kern,)), dtype='float32'), 'b0')
-    w1 = shared_fn(theano._asarray(0.01*(numpy.random.rand(*shape_kern1)-0.5), dtype='float32'), 'w1')
-    b1 = shared_fn(theano._asarray(numpy.zeros((n_kern1,)), dtype='float32'), 'b1')
-    v = shared_fn(theano._asarray(0.01*numpy.random.randn(n_hid, n_out), dtype='float32'), 'v')
-    c = shared_fn(theano._asarray(numpy.zeros(n_out), dtype='float32'), 'c')
+    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
+    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
+    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
+    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
+    v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
+    c = shared_fn(my_zeros(n_out), 'c')

    print 'ALLOCATING ARCH: w0 shape', w0.value.shape
    print 'ALLOCATING ARCH: w1 shape', w1.value.shape
@@ -330,11 +337,11 @@ def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_iter,
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

-    xval = theano._asarray(numpy.random.rand(*shape_img), dtype='float32')
-    yval = theano._asarray(numpy.random.rand(n_batch,n_out), dtype='float32')
+    xval = my_rand(*shape_img)
+    yval = my_rand(n_batch,n_out)
    lr = theano._asarray(0.01, dtype='float32')

-    rvals=numpy.zeros(n_iter)
+    rvals=my_zeros(n_iter)
    t0 = time.time()
    for i in xrange(n_iter):
        rvals[i] = train(xval, yval, lr)[0]

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -1064,7 +1064,7 @@ class TensorValue(Value, _tensor_py_operators):
 Tensor = TensorType

 #QUESTION: why are we doing this!?
-elemwise.as_tensor_variable = as_tensor_variable    
+elemwise.as_tensor_variable = as_tensor_variable
 elemwise.TensorType = TensorType
 elemwise.TensorVariable = TensorVariable
 elemwise.TensorConstant = TensorConstant

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -945,8 +945,6 @@ def local_dot22_to_dot22scalar(node):
    #we take the first _dot22 found. TODO check others!
    dot22_idx = i_dot22.index(True)
    d = node.inputs[dot22_idx]
-
-
    i_scalar = [_as_scalar(x) for x in node.inputs]
    if not any(i_scalar) and not any([x.owner and x.owner.op ==T.mul for x in node.inputs]):
        #no scalar in input and no multiplication
@@ -983,15 +981,11 @@ def local_dot22_to_dot22scalar(node):
    if scalar_idx<0:
        info('Not optimizing dot22 with inputs', node.inputs, [x.type for x in node.inputs], 'as the type of the scalar can\'t be upcasted to the matrix type')
        return False
-        
    assert scalar_idx<len(node.inputs)
-        
    s = node.inputs[scalar_idx]
    o = copy.copy(node.inputs)
    o.remove(d)
    o.remove(s)
-
-    
    if len(o)==0:
        return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], s)]
    else:

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -24,6 +24,8 @@ from theano import compile  #to register the optimizer built by this file
 from theano.gof.python25 import any, all
 from theano.gof.opt import Optimizer
 from theano.gof import toolbox, DestroyHandler
+
+
 # Utilities

 def out2in(*local_opts):
@@ -395,6 +397,13 @@ class ShapeFeature(object):
        else:
            self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])

+    def init_r(self,r):  # make sure r has an entry in self.shape_of; does nothing if one already exists
+        if r not in self.shape_of:
+            try:
+                self.set_shape(r, self.shape_tuple(r))
+            except AttributeError:  # the inline code this helper replaces labels this case "not a TensorType variable"
+                self.set_shape(r,None)
+
    def make_vector_shape(self, r):
        return make_vector(*self.shape_of[r])
    #
@@ -421,11 +430,7 @@ class ShapeFeature(object):

        for i, r in enumerate(node.inputs):
            # make sure we have shapes for the inputs
-            if r not in self.shape_of:
-                try:
-                    self.set_shape(r, self.shape_tuple(r))
-                except AttributeError:
-                    self.set_shape(r, None ) # not a TensorType variable
+            self.init_r(r)

        try:
            shape_infer = node.op.infer_shape
@@ -453,7 +458,7 @@ class ShapeFeature(object):
        # TODO:
        # This tells us that r and new_r must have the same shape
        # if we didn't know that the shapes are related, now we do.
-
+        self.init_r(new_r)
        # change_input happens in two cases:
        # 1) we are trying to get rid of r, or
        # 2) we are putting things back after a failed transaction.
@@ -1160,7 +1165,8 @@ register_canonicalize(local_mul_canonizer, name = 'local_mul_canonizer')
 @gof.local_optimizer([T.neg])
 def local_neg_to_mul(node):
    if node.op == T.neg:
-        return [T.mul(-1, node.inputs[0])]
+        return [T.mul(numpy.array(-1, dtype = node.inputs[0].dtype), 
+            node.inputs[0])]
 register_canonicalize(local_neg_to_mul)

 @register_specialize

--- a/theano/tensor/signal/downsample.py
+++ b/theano/tensor/signal/downsample.py
@@ -212,7 +212,7 @@ class DownsampleFactorMax(Op):
        """ % locals()

    def c_code_cache_version(self):
-        return ()
+        return (0,1)


 class DownsampleFactorMaxGrad(Op):
@@ -349,4 +349,4 @@ class DownsampleFactorMaxGrad(Op):
        """ %locals()

    def c_code_cache_version(self):
-        return ()
+        return (0,1)