提交 be0902fb authored 作者: Razvan Pascanu's avatar Razvan Pascanu

merge

To install the package, use:
To install the package, see this page:
python setup.py build
python setup.py test
python setup.py install
http://deeplearning.net/software/theano/install.html#install
For the documentation, see the project website:
http://pylearn.org/theano/
http://deeplearning.net/software/theano/
We recommend you look at the documentation on the website, since it
will be more current than the documentation included with the package.
......
......@@ -333,6 +333,13 @@ but this has not been tested yet.
cp libblas.dll /mingw/lib
mv libblas.dll /mingw/bin
- Edit (or create) your ``$HOME/.theanorc`` and add the following section:
.. code-block:: bash
[blas]
ldflags = -lblas
- Install `Mercurial <http://mercurial.selenic.com/downloads/>`__
(you can use the regular Windows release, you do not need TortoiseHg).
......
......@@ -936,7 +936,7 @@ class _Linker(gof.link.LocalLinker):
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
if self.maker.mode.check_py_code:
if self.maker.mode.check_py_code or thunks_c[-1] is None:
p = node.op.perform
thunk = (lambda p = p, i = node_input_storage, o = node_output_storage, n =
node: p(n, [x[0] for x in i], o))
......@@ -1455,7 +1455,7 @@ class DebugMode(Mode):
check_py_code = config.DebugMode.check_py
"""
Should we evaluate (and check) the `perform` implementations?
Should we evaluate (and check) the `perform` implementations? Always checked if no `c_code`.
"""
check_isfinite = config.DebugMode.check_finite
......
......@@ -33,7 +33,7 @@ run_cthunk(PyObject *self, PyObject *args)
return NULL;
}
void * ptr_addr = PyCObject_AsVoidPtr(py_cthunk);
int (*fn)(void*) = reinterpret_cast<int (*)(void*)>(ptr_addr);
int (*fn)(void*) = (int (*)(void*))(ptr_addr);
void* it = PyCObject_GetDesc(py_cthunk);
int failure = fn(it);
......
......@@ -12,10 +12,10 @@ import theano.tensor.signal.downsample as downsample
import numpy
raise SkipTest('SKIP TO MAKE THE BUILDBOT DON\'T CRASH. THEIR IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
......
......@@ -10,6 +10,7 @@ import numpy
from theano import Op, Apply, shared, config
from theano.tensor import raw_random, TensorType, as_tensor_variable, get_vector_length, cast, opt
from theano.tensor import zeros_like, sqrt, log, sin, cos, join
from theano.compile import optdb
from theano.gof import local_optimizer
......@@ -650,6 +651,49 @@ class MRG_RandomStreams(object):
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, dtype=config.floatX):
    """Return a symbolic sample from N(avg, std**2) via Box-Muller.

    :param size: shape of the returned sample (typically a tuple);
        if truthy, the flat sample vector is reshaped to it.
    :param avg: mean of the distribution.
    :param std: standard deviation of the distribution.
    :param ndim: unused here; presumably kept for interface
        compatibility with other random streams -- TODO confirm.
    :param dtype: dtype of the uniform draws and of the result.
    """
    # We need an even number of ]0,1[ samples. Then we split them
    # in two halves. First half becomes our U1's for Box-Muller,
    # second half our U2's. See Wikipedia page:
    # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform
    n_samples = self.n_streams(size)
    evened = False
    if n_samples % 2 == 1:
        # Pad to an even count; the extra sample is dropped below.
        n_samples += 1
        evened = True
    flattened = self.uniform(size=(n_samples,), dtype=dtype)

    # Explicit floor division: identical to `/` on Python 2 ints,
    # and still a valid integer index under Python 3 semantics.
    U1 = flattened[:n_samples // 2]
    U2 = flattened[n_samples // 2:]

    # TensorVariable does not support item assignment, so the two
    # Box-Muller halves are computed separately and joined.
    sqrt_ln_U1 = sqrt(-2.0 * log(U1))
    first_half = sqrt_ln_U1 * cos(2.0 * numpy.pi * U2)
    second_half = sqrt_ln_U1 * sin(2.0 * numpy.pi * U2)
    normal_samples = join(0, first_half, second_half)

    if evened:
        # Drop the padding sample added above to even the count.
        final_samples = normal_samples[:-1]
    else:
        final_samples = normal_samples

    final_samples = avg + std * final_samples
    if size:
        final_samples = final_samples.reshape(size)
    return final_samples
@local_optimizer([None])
def mrg_random_make_inplace(node):
op = node.op
......@@ -734,3 +778,78 @@ def test_rng0():
basictest(ff, 1000, prefix='numpy')
def test_normal0():
    """Check mean/std of MRG normal samples on CPU, GPU, and against numpy streams.

    Python 2 test driver: draws `steps` batches from each generator and
    verifies the running mean and average per-batch std against targets.
    """
    def basictest(f, steps, target_avg, target_std, prefix=""):
        # NOTE(review): reads `sample_size` from the enclosing scope at
        # call time (late binding), so it uses whatever was assigned last.
        dt = 0.0
        avg_std = 0.0
        for i in xrange(steps):
            t0 = time.time()
            ival = f()
            dt += time.time() - t0
            ival = numpy.asarray(ival)
            if i == 0:
                mean = numpy.array(ival, copy=True)
                avg_std = numpy.std(ival)
            else:
                # Incremental (running) average of the sample mean and std.
                alpha = 1.0 / (1+i)
                mean = alpha * ival + (1-alpha)*mean
                avg_std = alpha * numpy.std(ival) + (1-alpha)*avg_std
        print prefix, 'mean', numpy.mean(mean)
        assert abs(numpy.mean(mean) - target_avg) < .01, 'bad mean?'
        print prefix, 'std', avg_std
        assert abs(avg_std - target_std) < .01, 'bad std?'
        print prefix, 'time', dt
        print prefix, 'elements', steps*sample_size[0]*sample_size[1]
        print prefix, 'samples/sec', steps*sample_size[0]*sample_size[1] / dt
    # Even total number of samples: exercises the no-padding path of normal().
    sample_size = (999,100)
    print ''
    print 'ON CPU:'
    R = MRG_RandomStreams(234, use_cuda=False)
    n = R.normal(size=sample_size, avg=-5.0, std=2.0)
    f = theano.function([], n)
    theano.printing.debugprint(f)
    print 'random?[:10]\n', f()[0,0:10]
    basictest(f, 50, -5.0, 2.0, prefix='mrg ')
    sys.stdout.flush()
    # now with odd number of samples
    sample_size = (999,99)
    print ''
    print 'ON GPU:'
    R = MRG_RandomStreams(234, use_cuda=True)
    n = R.normal(size=sample_size, avg=-5.0, std=2.0, dtype='float32')
    assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
    f = theano.function([], theano.Out(
        theano.sandbox.cuda.basic_ops.gpu_from_host(n),
        borrow=True))
    theano.printing.debugprint(f)
    print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
    basictest(f, 50, -5.0, 2.0, prefix='gpu mrg ')
    sys.stdout.flush()
    print ''
    print 'ON CPU w NUMPY:'
    RR = theano.tensor.shared_randomstreams.RandomStreams(234)
    nn = RR.normal(size=sample_size, avg=-5.0, std=2.0)
    ff = theano.function([], nn)
    basictest(ff, 50, -5.0, 2.0, prefix='numpy ')
#if __name__ == '__main__':
# # with: export THEANO_FLAGS=device=gpu0,floatX=float32
# test_normal0()
......@@ -1414,12 +1414,16 @@ class Composite(ScalarOp):
name = "V%%(id)s_tmp%i" % i
subd[output] = name
_c_code += "%s %s;\n" % (output.type.dtype_specs()[1], name)
_c_code += node.op.c_code(node,
s = node.op.c_code(node,
"%(name)s",
[subd[input] for input in node.inputs],
[subd[output] for output in node.outputs],
dict(fail = "%(fail)s",
id = "%%(id)s_%i" % j))
if any([isinstance(x.op,Mod) for x in env.toposort()]):
s = s.replace('% ','%% ')
_c_code += s
_c_code += "\n"
_c_code += "}\n"
......@@ -1481,6 +1485,9 @@ class Composite(ScalarOp):
return self._c_code % d
def c_code_cache_version(self):
    # Cache key: our own version first, then the version tuple of every
    # op in the composite graph, so changing any sub-op's generated C
    # code invalidates the compiled-module cache.
    sub_versions = [node.op.c_code_cache_version()
                    for node in self.env.toposort()]
    return (1,) + tuple(sub_versions)
def __eq__(self, other):
if self is other: return True
if not isinstance(other, self.__class__): return False
......
......@@ -100,18 +100,24 @@ class GemmRelated(Op):
#ifndef MOD
#define MOD %
#endif
static double time_time() // a time function like time.time()
{
struct timeval tv;
gettimeofday(&tv, 0);
return (double) tv.tv_sec + (double) tv.tv_usec / 1000000.0;
}
"""
return blas_header_text() + mod_str
def c_headers(self):
# std.cout doesn't require the '%' symbol to print stuff...
# so it works much better with python's string-substitution stuff.
return ['<iostream>']
return ['<iostream>', '<time.h>', '<sys/time.h>']
def c_libraries(self):
    # Libraries to link the generated C code against. Delegates to the
    # module-level ldflags() helper -- presumably the configured BLAS
    # link flags; confirm against the blas module's definition.
    return ldflags()
def c_code_cache_version(self):
return (0,0,1)
# code_cache_version is built by subclasses from
# build_gemm_version
def c_compile_args(self):
    # Extra compiler arguments: ask ldflags() for the flag entries only
    # (libs=False, flags=True), since the library names are already
    # supplied separately by c_libraries().
    return ldflags(libs=False, flags=True)
......@@ -247,6 +253,7 @@ class GemmRelated(Op):
char T = 'T';
int Nz0 = Nz[0], Nz1 = Nz[1], Nx1 = Nx[1];
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//double t0 = time_time();
switch(unit)
{
case 0x000: sgemm_(&N, &N, &Nz1, &Nz0, &Nx1, &a, y, &sy_0, x, &sx_0, &b, z, &sz_0); break;
......@@ -259,6 +266,7 @@ class GemmRelated(Op):
case 0x111: sgemm_(&N, &N, &Nz0, &Nz1, &Nx1, &a, x, &sx_1, y, &sy_1, &b, z, &sz_1); break;
default: PyErr_SetString(PyExc_ValueError, "some matrix has no unit stride"); %(fail)s;
};
//fprintf(stderr, "Calling sgemm %%i %%i %%i %%i took %%f\\n", unit, Nz1, Nz0, Nx1, time_time() - t0);
"""
case_double = """
......@@ -278,6 +286,7 @@ class GemmRelated(Op):
char T = 'T';
int Nz0 = Nz[0], Nz1 = Nz[1], Nx1 = Nx[1];
//std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
//double t0 = time_time();
switch(unit)
{
case 0x000: dgemm_(&N, &N, &Nz1, &Nz0, &Nx1, &a, y, &sy_0, x, &sx_0, &b, z, &sz_0); break;
......@@ -290,6 +299,7 @@ class GemmRelated(Op):
case 0x111: dgemm_(&N, &N, &Nz0, &Nz1, &Nx1, &a, x, &sx_1, y, &sy_1, &b, z, &sz_1); break;
default: PyErr_SetString(PyExc_ValueError, "some matrix has no unit stride"); %(fail)s;
};
//fprintf(stderr, "Calling dgemm %%i %%i %%i %%i took %%f\\n", unit, Nz1, Nz0, Nx1, time_time()- t0);
"""
end_switch_typenum = """
......@@ -319,7 +329,7 @@ class GemmRelated(Op):
self.end_switch_typenum), '')
def build_gemm_version(self):
return (2,)
return (4,)
class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation):
......@@ -442,6 +452,7 @@ class Gemm(GemmRelated):
dims[0] = %(_z)s->dimensions[0];
dims[1] = %(_z)s->dimensions[1];
%(_zout)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, type_num_%(_z)s);
//fprintf(stderr, "Gemm Allocating %%i %%i\\n", dims[0], dims[1]);
if(!%(_zout)s) {
PyErr_SetString(PyExc_MemoryError, "failed to alloc gemm_no_inplace output");
%(fail)s
......@@ -515,7 +526,11 @@ class Gemm(GemmRelated):
return full_code
def c_code_cache_version(self):
return (3,) + self.build_gemm_version()
gv = self.build_gemm_version()
if gv:
return (3,) + gv
else:
return gv
gemm_inplace = Gemm(inplace=True)
gemm_no_inplace = Gemm(inplace=False)
......@@ -817,6 +832,7 @@ class Dot22(GemmRelated):
dims[0] = %(_x)s->dimensions[0];
dims[1] = %(_y)s->dimensions[1];
%(_zout)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, type_num_%(_x)s);
//fprintf(stderr, "Dot Allocating %%i %%i\\n", dims[0], dims[1]);
if(!%(_zout)s) {
PyErr_SetString(PyExc_MemoryError, "failed to alloc dot22 output");
%(fail)s
......@@ -841,7 +857,11 @@ class Dot22(GemmRelated):
full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code
def c_code_cache_version(self):
return (1,) + self.build_gemm_version()
gv = self.build_gemm_version()
if gv:
return (1,) + gv
else:
return gv
_dot22 = Dot22()
......@@ -947,7 +967,11 @@ class Dot22Scalar(GemmRelated):
full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code
def c_code_cache_version(self):
return (2,) + self.build_gemm_version()
gv = self.build_gemm_version()
if gv:
return (2,) + gv
else:
return gv
_dot22scalar = Dot22Scalar()
......
"""
Contains an op for convolving input images with a set of filters. This was
Contains an Op for convolving input images with a set of filters. This was
developed especially for Convolutional Neural Networks.
For related ops, including downsampling and subsampling, see
tensor.signal and tensor.signal.downsample.
See especially conv2d().
"""
__docformat__ = "restructuredtext en"
......@@ -764,7 +769,6 @@ using namespace std;
d["self_dx"]=self.dx
d["self_dy"]=self.dy
d["mode"]=self.out_mode.upper()
d["mode"]=self.out_mode.upper()
d["affectation"]="="
if all_shape:
d["self_bsize"]=self.bsize
......@@ -910,7 +914,7 @@ if(%(filtersflipped)s->nd==3){
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
std::stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
......@@ -1145,7 +1149,7 @@ if(%(filtersflipped)s->nd==3){
kerns_dim[1]=%(filtersflipped)s->dimensions[1];
kerns_dim[0]=%(filtersflipped)s->dimensions[0];
}else{
std:stringstream temp;
std::stringstream temp;
temp << "nddim="<<%(filtersflipped)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
......@@ -1377,7 +1381,7 @@ if(%(img2d)s->nd==2){
img2d_dim[1]=%(img2d)s->dimensions[1];
img2d_dim[0]=%(img2d)s->dimensions[0];
}else {
std:stringstream temp;
std::stringstream temp;
temp << "nddim="<<%(img2d)s->nd;
std::string param = temp.str();
PyErr_SetString(PyExc_ValueError,
......
......@@ -143,16 +143,18 @@ def speed_multilayer_conv():
validate=False# we don't validate the result to have it much faster!
verbose=1
unroll_batch = [1,2,4,5,10,20]
unroll_kern = [1,2,4,5,10,20]
unroll_batch = [1,4,5]
unroll_kern = [1,4,5]
unroll_batch = [1,2,3,4,5,10]#15, 30, 60 always much slower
unroll_kern = [1,2,3,4,5,10]#15, 30, 60 always much slower
#unroll_batch = [1,4,5]
#unroll_kern = [1,4,5]
#unroll_batch = [1,4]
#unroll_kern = [1,4]
unroll_patch = [True, False]
bsize = 20 # batch size
bsize = 60 # batch size
imshp_start = (1,48,48)# a non-square shape to test more corner cases.
kshps = ([11,12],[12,11])# non-square shapes to test more corner cases.
nkerns = [20,20] # per output pixel
nkerns = [60,60] # per output pixel
ssizes = [(1,1),]#(1,1)]#(2,2) bugged
convmodes = ['valid','full']
do_convolve2=False
......@@ -212,8 +214,10 @@ def speed_multilayer_conv():
best=N.asarray(best)
worst=N.asarray(worst)
print "timing for unrolled version"
print t_b_k
print t
print "unroll_batch/unroll_kern valid_mode full_mode"
for n_b in range(len(unroll_batch)):
for n_k in range(len(unroll_kern)):
print unroll_batch[n_b],"/",unroll_kern[n_k], " ",t[n_b,n_k]
t_detail=t
t = t.sum(axis=2)
print "max %.3fs"%t.max(), "max param(batch unloop size/kernel unloop size)", t_b_k[t.argmax()]
......
......@@ -88,10 +88,10 @@ class TestConv2D(unittest.TestCase):
Tests that basic convolutions work for odd and even dimensions of image and filter
shapes, as well as rectangular images and filters.
"""
self.validate((3,2,8,8), (4,2,5,5), 'valid')
self.validate((3,2,8,8), (4,2,5,5), 'valid', verify_grad=False)
self.validate((3,2,7,5), (5,2,2,3), 'valid')
self.validate((3,2,7,5), (5,2,3,2), 'valid')
self.validate((3,2,8,8), (4,2,5,5), 'full')
self.validate((3,2,7,5), (5,2,3,2), 'valid', verify_grad=False)
self.validate((3,2,8,8), (4,2,5,5), 'full', verify_grad=False)
self.validate((3,2,7,5), (5,2,2,3), 'full')
# test filter same size as input
......@@ -105,7 +105,7 @@ class TestConv2D(unittest.TestCase):
"""
self.validate((3,2,7,5), (5,2,2,3), 'valid', unroll_patch=False)
self.validate((3,2,7,5), (5,2,2,3), 'full', unroll_patch=False)
self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False)
self.validate((3,2,3,3), (4,2,3,3), 'valid', unroll_patch=False, verify_grad=False)
def test_unroll_special(self):
"""
......@@ -175,7 +175,17 @@ class TestConv2D(unittest.TestCase):
"""
try:
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dmatrix())
# should never reach here
self.fail()
except:
pass
try:
self.validate((3,2,8,8), (4,2,5,5), 'valid', filters = T.dvector())
# should never reach here
self.fail()
except:
pass
try:
self.validate((3,2,8,8), (4,2,5,5), 'valid', input = T.dtensor3())
# should never reach here
self.fail()
......
......@@ -224,7 +224,12 @@ class MakeVector(T.Op):
def __str__(self):
    # Ops of this kind are displayed simply by their class name
    # (e.g. "MakeVector").
    class_name = self.__class__.__name__
    return class_name
def perform(self, node, inputs, (out,)):
out[0] = theano._asarray(inputs, dtype=node.outputs[0].dtype)
# not calling theano._asarray as optimization
if out[0] is None:
out[0] = theano._asarray(inputs, dtype=node.outputs[0].dtype)
else:
# assume that out has correct dtype. there is no cheap way to check
out[0][...] = inputs
make_vector = MakeVector()
......@@ -262,7 +267,10 @@ class Shape_i(T.Op):
raise TypeError('x has too few dimensions for Shape_i', (x, self.i))
return T.Apply(self, [x], [T.lscalar()])
def perform(self, node, (x, ), (out, )):
out[0] = theano._asarray(x.shape[self.i], dtype = 'int64')
if out[0] is None:
out[0] = theano._asarray(x.shape[self.i], dtype='int64')
else:
out[0][...] = x.shape[self.i]
def grad(self, (x,), (gz,)):
return [None]
......
......@@ -603,8 +603,12 @@ def test_dot22scalar():
#currently the canonizer doesn't always merge all Muls together...
#that forces the optimizer to make a recursive search, which it doesn't do now.
#but it does do it for 1 level of recursion.
# assert _dot22scalar in [x.op for x in topo]
# assert len(topo)==2
# assert _dot22scalar in [x.op for x in topo]
# assert len(topo)==2
### Fred,
### What are you talking about?
### -James (March 28 2010)
f(av,bv,cv)
f = theano.function([a,b,c],c * a*0.2*T.dot(a,b),mode=m2)
topo = f.maker.env.toposort()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论