Commit 03375db6 authored by Pascal Lamblin

merge

......@@ -9,7 +9,7 @@ arrays efficiently. Theano features:
* **tight integration with numpy** -- Use `numpy.ndarray` in Theano-compiled functions.
* **near-transparent use of a GPU** -- Accelerate data-intensive calculations [JAN 2010].
* **symbolic differentiation** -- Let Theano do your derivatives.
* **speed and stability optimizations** -- Write ``log(1+exp(x))`` and get the right answer.
* **speed and stability optimizations** -- Get the right answer for ``log(1+x)`` even when ``x`` is really tiny.
* **dynamic C code generation** -- Evaluate expressions faster.
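The stability bullets above can be illustrated outside Theano with plain Python floats; this sketch uses ``math.log1p`` as the numerically stable form of ``log(1+x)`` that the rewrite effectively targets:

```python
import math

# log(1 + x) computed naively loses all precision for tiny x:
x = 1e-18
naive = math.log(1.0 + x)   # 1.0 + 1e-18 rounds to exactly 1.0, so log gives 0.0
stable = math.log1p(x)      # log1p evaluates log(1 + x) accurately near x == 0
```

The same idea underlies the ``log(1+exp(x))`` rewrite: Theano replaces the literal expression with a formulation that does not overflow or lose precision.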
Theano has been powering large-scale computationally intensive scientific investigations
......
......@@ -5,8 +5,7 @@ Optimizations
==============
Theano applies many kinds of graph optimizations, with different objectives:
* simplifying and standardizing the form of the expression graph
(e.g. :term:`merge`, :term:`add canonicalization<add canonicalization>`),
* simplifying and standardizing the form of the expression graph (e.g. :term:`merge`, :term:`add canonicalization`),
* reducing the maximum memory footprint (e.g. :term:`inplace_elemwise`),
* increasing execution speed (e.g. :term:`constant folding`).
......@@ -34,7 +33,6 @@ Optimization FAST_RUN FAST_COMPILE
:term:`merge` x x
:term:`constant folding<constant folding>` x
:term:`shape promotion<shape promotion>` x
:term:`fill promotion <fill promotion>` x
:term:`fill cut<fill cut>` x
:term:`inc_subtensor srlz.<inc_subtensor serialization>` x
:term:`reshape_chain` x
......@@ -75,33 +73,65 @@ Optimization FAST_RUN FAST_COMPILE
When all the inputs to an expression are constant, then the expression
can be pre-computed at compile-time.
See ***TODO***
See :func:`opt.constant_folding`
shape promotion
See ***TODO***
Theano often knows how to infer the shape of an output from the shapes
of its inputs. Without this optimization, it would have to compute
things (e.g. ``log(x)``) just to find out their shape!
fill promotion
See ***TODO***
See :func:`opt.local_shape_lift_*`
fill cut
See ***TODO***
`Fill(a,b)` means to make a tensor of the shape of `a`, full of the value `b`.
Often when fills are used with elementwise operations (e.g. ``f``) they are
unnecessary:
* ``f(fill(a,b), c) -> f(b, c)``
* ``f(fill(a, b), fill(c, d), e) -> fill(a, fill(c, f(b, d, e)))``
See :func:`opt.local_fill_cut`, :func:`opt.local_fill_sink`
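The first rewrite above can be sketched in numpy, using ``np.full_like`` as a stand-in for `fill`; broadcasting makes the materialized fill redundant:

```python
import numpy as np

def fill(a, b):
    # make a tensor of the shape of `a`, full of the value `b`
    return np.full_like(a, b)

a = np.arange(6.0).reshape(2, 3)
c = 2.0 * np.ones((2, 3))

before = np.add(fill(a, 5.0), c)   # f(fill(a, b), c): materializes the fill
after  = np.add(5.0, c)            # f(b, c): broadcasting makes the fill unnecessary
```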
inc_subtensor serialization
***TODO***
Incrementing a small subregion of a large tensor can be done quickly
using an inplace operation, but if two increments are being done on
the same large tensor, then only one of them can be done inplace.
This optimization reorders such graphs so that all increments can be
done inplace.
``inc_subtensor(a,b,idx) + inc_subtensor(a,c,idx) -> inc_subtensor(inc_subtensor(a,b,idx),c,idx)``
See :func:`local_IncSubtensor_serialize`
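A numpy sketch of the rewrite, with a hypothetical pure-op ``inc_subtensor`` helper. With a zero base tensor (the common gradient-accumulation case) the two forms agree:

```python
import numpy as np

def inc_subtensor(a, b, idx):
    out = a.copy()    # pure op: copy the base, then increment the region
    out[idx] += b
    return out

a = np.zeros(8)
b = np.ones(3)
c = np.full(3, 2.0)
idx = slice(2, 5)

# two increments of the same base, summed (neither can run in place) ...
before = inc_subtensor(a, b, idx) + inc_subtensor(a, c, idx)
# ... serialized into a chain, so every increment may run in place
after = inc_subtensor(inc_subtensor(a, b, idx), c, idx)
```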
reshape_chain
This optimizes graphs like ``reshape(reshape(x, shape1), shape2)`` -> ``reshape(x, shape2)``
See also ***TODO***
See :func:`local_reshape_chain`
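The equivalence is easy to check in numpy: only the final shape matters, so the inner reshape can be dropped.

```python
import numpy as np

x = np.arange(24.0)
before = x.reshape(2, 12).reshape(4, 6)  # reshape(reshape(x, shape1), shape2)
after  = x.reshape(4, 6)                 # collapsed into a single reshape
```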
constant elimination
***TODO***
constant elimination
Many constants indicate special cases, such as ``pow(x,1) -> x``.
Theano recognizes many of these special cases.
See :func:`local_mul_specialize`
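The ``pow(x,1) -> x`` case mentioned above, checked in numpy:

```python
import numpy as np

x = np.array([-2.0, 0.5, 3.0])
before = x ** 1       # pow(x, 1): a needless power
after  = x            # the rewritten graph returns x directly
```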
add canonicalization
***TODO***
Rearrange expressions of additions and subtractions to a canonical
form:
.. math::
(a+b+c+\dots) - (z+x+y+\dots)
See :class:`Canonizer`, :attr:`local_add_canonizer`
mul canonicalization
***TODO***
Rearrange expressions of multiplication and division to a canonical
form:
.. math::
\frac{a * b * c * \dots}{z * x * y * \dots}
See :class:`Canonizer`, :attr:`local_mul_canonizer`
dot22
This simple optimization replaces dot(matrix, matrix) with a special
......@@ -109,31 +139,35 @@ Optimization FAST_RUN FAST_COMPILE
implemented with a call to GEMM, and sometimes replaced entirely by
the :term:`gemm` optimization.
See also, ***TODO***.
See :func:`local_dot_to_dot22`
sparse_dot
***TODO***
Theano has a sparse matrix multiplication algorithm that is faster in
many cases than scipy's (for dense matrix output). This optimization
swaps scipy's algorithm for ours.
See :func:`local_structured_dot`
sum_scalar_mul
This optimizes graphs like ``sum(scalar * tensor)`` -> ``scalar * sum(tensor)``
See ***TODO***
See :func:`local_sum_mul_by_scalar`
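This rewrite trades one multiply per element for a single multiply after the reduction; a numpy sketch:

```python
import numpy as np

s = 3.0
t = np.arange(12.0).reshape(3, 4)
before = np.sum(s * t)   # one multiply per element, then the sum
after  = s * np.sum(t)   # a single multiply after the reduction
```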
neg_neg
Composition of two negatives can be cancelled out.
See ***TODO***
See :func:`local_neg_neg`
neg_div_neg
Matching negatives in the numerator and denominator can both be removed.
See ***TODO***
See :func:`local_neg_div_neg`
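Both sign rewrites, checked in numpy:

```python
import numpy as np

x = np.array([1.0, -2.0, 3.0])
y = np.array([4.0, 5.0, -8.0])

double_neg = -(-x)          # neg_neg: the two negations cancel
ratio      = (-x) / (-y)    # neg_div_neg: matching signs drop out
```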
add specialization
This optimization simplifies expressions involving the addition of
zero.
See ***TODO***
See :func:`local_add_specialize`
mul specialization
Several special cases of mul() exist, and this optimization tries to
......@@ -142,7 +176,7 @@ Optimization FAST_RUN FAST_COMPILE
* ``mul(x,0)`` -> ``zeros_like(x)``
* ``mul(x, -1)`` -> ``neg(x)``
See ***TODO***
See :func:`local_mul_specialize`
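The two cases above, checked in numpy (with finite values: ``x * 0`` would differ from ``zeros_like(x)`` for ``inf`` or ``nan`` entries):

```python
import numpy as np

x = np.array([1.5, -2.0, 7.0])

times_zero = x * 0       # mul(x, 0)  -> zeros_like(x)
times_m1   = x * -1      # mul(x, -1) -> neg(x)
```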
pow specialization
Several special cases of pow() exist, and this optimization tries to
......@@ -151,14 +185,15 @@ Optimization FAST_RUN FAST_COMPILE
* ``pow(x,0)`` -> ``ones_like(x)``
* ``pow(x, -0.5)`` -> ``inv(sqrt(x))``
See also ***TODO***
See :func:`local_pow_specialize`
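The two cases above, checked in numpy:

```python
import numpy as np

x = np.array([1.0, 4.0, 16.0])

p0  = x ** 0       # pow(x, 0)    -> ones_like(x)
pm5 = x ** -0.5    # pow(x, -0.5) -> inv(sqrt(x))
```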
inplace_setsubtensor
In order to be a pure Op, setsubtensor must copy its entire input, and
modify just the subtensor in question (possibly a single element). It
is much more efficient to modify that element inplace.
See ***TODO***
See :func:`local_inplace_setsubtensor`
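A numpy sketch of the pure op versus the in-place version it is rewritten to (the ``set_subtensor`` helper here is illustrative, not the library's code):

```python
import numpy as np

def set_subtensor(a, idx, value):
    out = a.copy()     # pure op: copy the entire input ...
    out[idx] = value   # ... just to modify a single element
    return out

a = np.zeros(1000)
pure = set_subtensor(a, 3, 7.0)

# in-place version: when `a` is not needed afterwards, skip the copy
a[3] = 7.0
```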
gemm
Numerical libraries such as MKL and ATLAS implement the BLAS-level-3
......@@ -170,7 +205,7 @@ Optimization FAST_RUN FAST_COMPILE
expressions into one or more instances of this motif, and replace them
each with a single `Gemm` Op.
See ***TODO***
See :class:`GemmOptimizer`
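The motif the optimizer matches can be written out in numpy. Note numpy dispatches ``dot`` to BLAS but does not fuse the scaling and accumulation the way a single ``gemm`` call does; this only shows the pattern being recognized:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((3, 4))
Y = rng.standard_normal((4, 5))
Z = rng.standard_normal((3, 5))
alpha, beta = 0.5, 2.0

# the motif GemmOptimizer searches for; BLAS gemm computes
# alpha*X@Y + beta*Z in a single fused call
gemm_like = beta * Z + alpha * np.dot(X, Y)
```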
inplace_elemwise
When one of the inputs to an elementwise expression has the same type
......@@ -178,17 +213,23 @@ Optimization FAST_RUN FAST_COMPILE
the elemwise expression is evaluated, then we can reuse the storage of
the input to store the output.
See ***TODO***
See :func:`insert_inplace_optimizer`
inplace_random
Typically when a graph uses random numbers, the RandomState is stored
in a shared variable, used once per call, and updated after each function
call. In this common case, it makes sense to update the random number generator in-place.
See ***TODO***
See :func:`random_make_inplace`
elemwise fusion
This optimization compresses subgraphs of computationally cheap
elementwise operations into a single Op that does the whole job in a
single pass over the inputs (like loop fusion). This is a win when
transfer from main memory to the CPU (or from graphics memory to the
GPU) is a bottleneck.
elemwise fusion
See ***TODO***
See :class:`FusionOptimizer`
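A pure-Python sketch of what fusion buys: the unfused form makes one pass (and one temporary) per elementwise op, while the fused form does the whole computation in a single pass:

```python
import math

xs = [0.1 * i for i in range(8)]

# unfused: three passes over the data, two temporary lists
t1 = [math.sin(x) for x in xs]
t2 = [v * v for v in t1]
unfused = [v + 1.0 for v in t2]

# fused: a single pass computes sin(x)*sin(x) + 1 per element
fused = [math.sin(x) * math.sin(x) + 1.0 for x in xs]
```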
GPU transfer
The current strategy for choosing which expressions to evaluate on the
......@@ -200,15 +241,16 @@ Optimization FAST_RUN FAST_COMPILE
copying the output of a Op with a GPU implementation to the GPU,
then we substitute the GPU version for the CPU version. In this way, if all goes well,
this procedure will result in a graph with the following form:
1. copy non-shared inputs to GPU
2. carry out most/all computations on the GPU
3. copy output back to CPU
1. copy non-shared inputs to GPU
2. carry out most/all computations on the GPU
3. copy output back to CPU
When using a GPU, :func:`shared()` will default to GPU storage for
'float32' ndarray arguments, and these shared variables act as seeds
for the greedy algorithm.
See ***TODO***
See :func:`theano.sandbox.cuda.opt.*`.
......@@ -63,6 +63,16 @@ import gof
import floatX
floatX.set_floatX()
import config
#if THEANO_GPU is not defined: don't automatically import cuda
#if THEANO_GPU is defined to something other than "": automatically import cuda
# it will init cuda automatically unless THEANO_GPU is -1 or CPU
#if cuda.use() is called and THEANO_GPU is not defined, or is defined to "": init to device 0
#if THEANO_GPU is defined to "-1" or "CPU": automatically import cuda, but don't init it
if config.THEANO_GPU not in [None,""]:
import theano.sandbox.cuda
## import scalar_opt
import subprocess as _subprocess
......
......@@ -30,7 +30,7 @@ THEANO_BLAS_LDFLAGS = os.getenv('THEANO_BLAS_LDFLAGS','-lblas')
#for gpu
CUDA_ROOT = os.getenv('CUDA_ROOT')
THEANO_GPU = os.getenv("THEANO_GPU",0)
THEANO_GPU = os.getenv("THEANO_GPU")
THEANO_DEFAULT_MODE = os.getenv('THEANO_DEFAULT_MODE','FAST_RUN')
......
......@@ -14,33 +14,49 @@ def getFilterOutShp(inshp, kshp, (dx,dy)=(1,1), mode='valid'):
N.array([dx,dy], dtype='float')))
def conv(border_mode, subsample=(1,1), imshp=None, kshp=None, **kargs):
def conv2d(input, filters, border_mode='valid', subsample=(1,1),
image_shape=None, filter_shape=None, **kargs):
"""
This function returns an instantiated ConvOp, but gives better names to some parameters.
We do this instead of changing the ConvOp interface, so that existing code
does not have to change.
:type input: symbolic 4D tensor
:param input: tensor containing mini-batch of input feature maps
:type filters: symbolic 4D tensor
:param filters: tensor containing filters for convolutional neural net
:type border_mode: string
:param border_mode: 'valid' (only apply the kernel over complete patches of the image)
or 'full' (pad the image with zeros and apply the kernel over every complete and partial patch of the image)
:type subsample: tuple of len 2
:param subsample: how many pixels we move in the (row, col) directions of the image when moving to the next patch
:type imshp: tuple of len 4
:param imshp: (batch size, stack size, nb row, nb col)
:type kshp: tuple of len 4
:param kshp: (nb kernel, stack size, nb row, nb col)
:type image_shape: tuple of len 4
:param image_shape: (batch size, stack size, nb row, nb col)
:type filter_shape: tuple of len 4
:param filter_shape: (nb kernel, stack size, nb row, nb col)
"""
if imshp is not None and kshp is not None:
assert imshp[1]==kshp[1]
nkern = kshp[0]
bsize = imshp[0]
kshp = kshp[:2]
if image_shape and filter_shape:
assert image_shape[1]==filter_shape[1]
if filter_shape is not None:
nkern = filter_shape[0]
kshp = filter_shape[2:]
else:
nkern, kshp = None, None
if image_shape is not None:
bsize = image_shape[0]
imshp = image_shape[1:]
else:
nkern, bsize = None, None
return ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs)
bsize, imshp = None, None
op = ConvOp(output_mode=border_mode, dx=subsample[0], dy=subsample[1],
imshp=imshp, kshp=kshp, nkern=nkern, bsize=bsize,**kargs)
return op(input, filters)
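The relationship between the image/kernel shapes and the convolution output shape (what ``getFilterOutShp`` computes) can be sketched as follows; this helper is a hypothetical restatement of the usual formula, not the library's code:

```python
import math

def conv_out_shape(in_hw, k_hw, subsample=(1, 1), border_mode='valid'):
    # 'valid': the kernel only visits complete patches -> i - k + 1 positions
    # 'full':  the image is zero-padded              -> i + k - 1 positions
    # subsampling by d keeps every d-th position (ceil division)
    out = []
    for i, k, d in zip(in_hw, k_hw, subsample):
        span = i - k + 1 if border_mode == 'valid' else i + k - 1
        out.append(int(math.ceil(span / d)))
    return tuple(out)
```

For example, a 28x28 image with a 5x5 kernel gives a 24x24 'valid' output and a 32x32 'full' output.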
class ConvOp(Op):
"""
......@@ -551,7 +567,7 @@ using namespace std;
if self.kshp_logical_top_aligned:
d["self_kshp_logical_offset_r"] = 0
d["self_kshp_logical_offset_c"] = 0
else:
elif self.imshp != self.imshp_logical or self.kshp != self.kshp_logical:
rstride = d["self_kshp_logical_stride_r"]
cstride = d["self_kshp_logical_stride_c"]
d["self_kshp_logical_offset_r"] = (self.kshp_logical[0] - (self.kshp[0]*rstride) - 1+rstride) % rstride
......
......@@ -61,7 +61,6 @@ except ImportError:
set_cuda_disabled()
if enable_cuda:
print __file__
cuda_path=os.path.split(old_file)[0]
code = open(os.path.join(cuda_path, "type_support.cu")).read()
......@@ -115,6 +114,8 @@ def use(device=config.THEANO_GPU):
# No successful call to use() has been made yet
if device=="-1" or device=="CPU":
return
if device in [None,""]:
device=0
device=int(device)
try:
cuda_ndarray.gpu_init(device)
......@@ -142,3 +143,6 @@ def handle_shared_float32(tf):
else:
raise NotImplementedError('removing our handler')
if enable_cuda and config.THEANO_GPU not in [None, ""]:
use()
......@@ -140,13 +140,15 @@ class GpuConv(Op):
#TODO: reconsider this... since shapes are not given in constructor,
# maybe a multiplier + offset is a more appropriate way of passing this logical
# grid
self.logical_img_hw = tuple(logical_img_hw)
logical_img_hw = tuple(logical_img_hw)
self.logical_img_hw = logical_img_hw
if logical_kern_hw is not None:
h,w = logical_kern_hw
#TODO: reconsider this... since shapes are not given in constructor,
# maybe a multiplier + offset is a more appropriate way of passing this logical
# grid
self.logical_kern_hw = tuple(logical_kern_hw)
logical_kern_hw = tuple(logical_kern_hw)
self.logical_kern_hw = logical_kern_hw
self.logical_kern_align_top = logical_kern_align_top
self.version=version
self.verbose=verbose
......@@ -195,6 +197,8 @@ class GpuConv(Op):
version=self.version,
verbose=self.verbose)
def c_support_code_apply(self, node, nodename):
if self.logical_img_hw is None or self.logical_kern_hw is None:
return super(GpuConv,self).c_support_code_apply(node, nodename)
img_wid = self.logical_img_hw[1]
img_len = self.logical_img_hw[0]
......@@ -588,6 +592,8 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
subsample_cols=self.subsample[1]
version=self.version
verbose=self.verbose
if self.logical_img_hw is None or self.logical_kern_hw is None:
return super(GpuConv,self).c_code(node,nodename,(img, kern), (out,),sub)
#todo assert out is ccontiguous
img_wid = self.logical_img_hw[1]
img_len = self.logical_img_hw[0]
......
......@@ -339,9 +339,12 @@ def local_gpu_conv(node):
conv(host_from_gpu) -> host_from_gpu(gpu_conv)
"""
def GpuConvOp_from_ConvOp(op):
logical_img_hw=None
if op.imshp_logical is not None:
logical_img_hw=op.imshp_logical[1:3]
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=op.imshp_logical[1:3],
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
version=op.version,
......
......@@ -446,13 +446,13 @@ def test_lenet_64(): # ???
float_atol=5e-4, check_isfinite=True, version=version)
def test_lenet_108(): # NORB
cmp_run_conv_nnet2_classif(23485, 108, 7, 10, n_iter=5,
cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_iter=4,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
check_isfinite=True, version=version, float_atol=7e-2)
def test_lenet_256(): # ImageNet
cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_iter=3,
cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_iter=5,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
......
......@@ -24,6 +24,8 @@ from elemwise import \
import sharedvar # adds shared-variable constructors
import nnet # used for softmax, sigmoid, etc.
......