merge

b58fbb85 · Frederic Bastien · b8e1f760 · 5f2822a8 · b58fbb85 · b58fbb85
--- a/doc/library/scan.txt
+++ b/doc/library/scan.txt
@@ -32,7 +32,7 @@ The equivalent Theano code would be

  # Symbolic description of the result
  result,updates = theano.scan(fn = lambda x_tm1,A: x_tm1*A,\
-                       info_outputs = T.ones_like(A),\
+                       outputs_info = T.ones_like(A),\
                       non_sequences  = A, \
                       n_steps        = k)

@@ -112,7 +112,7 @@ the Theano variables needed we construct our RNN as follows :

   ([x_vals, y_vals],updates) = theano.scan(fn = oneStep, \
                                sequences    = dict(input = u, taps= [-4,-0]), \
-                                info_outputs = [dict(initial = x0, taps = [-3,-1]),y0], \
+                                outputs_info = [dict(initial = x0, taps = [-3,-1]),y0], \
                                non_sequences  = [W,W_in_1,W_in_2,W_feedback, W_out])
        # for second input y, scan adds -1 in output_taps by default

@@ -155,7 +155,7 @@ the following:

 sample = theano.tensor.vector()

- values, updates = theano.scan( OneStep, info_outputs = sample, n_steps = 10 )
+ values, updates = theano.scan( OneStep, outputs_info = sample, n_steps = 10 )

 gibbs10 = theano.function([sample], values[-1], updates = updates)


--- a/theano/compile/profilemode.py
+++ b/theano/compile/profilemode.py
@@ -340,7 +340,7 @@ class ProfileMode(Mode):
            print "<fct name> <input name> <input type> <str input>"
            for fct in fct_call.keys():
                for i in fct.input_storage:
-                    if i.type.dtype=='float64':
+                    if hasattr(i.type, 'dtype') and i.type.dtype=='float64':
                        print fct.name, i.name, i.type, i

 register_mode('PROFILE_MODE',ProfileMode())

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -344,6 +344,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
 	!work_complete) //conv_rows_stack2

    {
+      // version 9: we preload the full kernel.
+      // version 10: we load only a few rows at a time.
 	int nb_row=1;
 	int max_threads=512;
 	int version_back = version;

--- a/theano/sandbox/cuda/nvcc_compiler.py
+++ b/theano/sandbox/cuda/nvcc_compiler.py
@@ -29,11 +29,17 @@ def debug(*args):
    _logger.debug("DEBUG: "+' '.join(str(a) for a in args))

 nvcc_path = 'nvcc'
+nvcc_version = None
 def is_nvcc_available():
    """Return True iff the nvcc compiler is found."""
    try:
-        subprocess.call(['nvcc', '--version'], stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE)
+        p = subprocess.Popen(['nvcc', '--version'], stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+        p.wait()
+        s = p.stdout.readlines()[-1].split(',')[1].strip().split()
+        assert s[0]=='release'
+        global nvcc_version
+        nvcc_version = s[1]
        return True
    except:
        #try to find nvcc into cuda.root
@@ -43,7 +49,7 @@ def is_nvcc_available():
            nvcc_path = p
            return True
        else: return False
-is_nvcc_available()#to set nvcc_path correctly.
+is_nvcc_available()#to set nvcc_path correctly and get the version

 def nvcc_module_compile_str(module_name, src_code, location=None, include_dirs=[], lib_dirs=[], libs=[],
        preargs=[]):

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -78,10 +78,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ker
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    #we take the stride after the transfert as we make c_contiguous data on the GPU.
-    img=img[:,:,::img_stride[0],::img_stride[1]]
-    kern=kern[:,:,::kern_stride[0],::kern_stride[1]]
-    npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]]
-    npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]]
+    if img_stride!=(1,1):
+        img=img[:,:,::img_stride[0],::img_stride[1]]
+        npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]]
+    if kern_stride!=(1,1):
+        kern=kern[:,:,::kern_stride[0],::kern_stride[1]]
+        npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]]

    t2 = None
    rval = True
@@ -265,12 +267,12 @@ def get_valid_shapes():
            , ((60,20,12,12),(30,20,5,5), (1,1), (1,1), (1,1))#test_lenet_28 2 layers
            , ((60,30,8,8),(20,30,5,5), (1,1), (1,1), (1,1))#test_lenet_28 bprop 1 full
            , ((20,60,12,12),(30,60,8,8), (1,1), (1,1), (1,1))#test_lenet_28 bprop 2 valid
-            , ((1,60,28,28),(20,60,24,24), (1,1), (1,1), (1,1))#test_lenet_28 bprop 2 valid
+#            , ((1,60,28,28),(20,60,24,24), (1,1), (1,1), (1,1))#test_lenet_28 bprop 2 valid
            , ((10,1,64,64),(20,1,7,7), (1,1), (1,1), (1,1))#test_lenet_64 1 layers
            , ((10,20,29,29),(30,20,7,7), (1,1), (1,1), (1,1))#test_lenet_64 2 layers
            , ((10,30,23,23),(20,30,7,7), (1,1), (1,1), (1,1))#test_lenet_64 full
-            , ((20,10,29,29),(30,10,23,23), (1,1), (1,1), (1,1))#test_lenet_64 bprop 1
-            , ((1,10,64,64),(20,10,58,58), (1,1), (1,1), (1,1))#test_lenet_64 bprop 2
+#            , ((20,10,29,29),(30,10,23,23), (1,1), (1,1), (1,1))#test_lenet_64 bprop 1
+#            , ((1,10,64,64),(20,10,58,58), (1,1), (1,1), (1,1))#test_lenet_64 bprop 2
            ]
    return shapes


--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -49,7 +49,7 @@ MASK12 = numpy.int32(511)       #2^9 - 1
 MASK13 = numpy.int32(16777215)  #2^24 - 1
 MASK2 = numpy.int32(65535)      #2^16 - 1
 MULT2 = numpy.int32(21069)
-NORM = 4.656612873077392578125e-10;
+NORM = 4.656612873077392578125e-10; #1./2^31

 A1p0 = numpy.asarray([[0, 4194304, 129], [1, 0, 0], [0, 1, 0]])
 A2p0 = numpy.asarray([[32768, 0, 32769], [1, 0, 0], [0, 1, 0]])
@@ -593,6 +593,7 @@ class MRG_RandomStreams(object):
        return rval

    def n_streams(self, size):
+        # TODO: a smart way of choosing the number of streams
        if isinstance(size, (tuple, list)):
            r = 1
            for s in size:
@@ -601,12 +602,7 @@ class MRG_RandomStreams(object):
                return r/6 # chosen as fastest for rbm_benchmark
            else:
                return r
-        try:
-            rval =  int(size)
-            assert rval > 0
-            return rval
-        except:
-            pass
+
        print >> sys.stderr, "MRG_RandomStreams Can't determine #streams from size (%s), guessing 30*256"%str(size)
        return 30*256

@@ -616,7 +612,7 @@ class MRG_RandomStreams(object):
        node_rstate.default_update = new_rstate
        return sample

-    def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX):
+    def uniform(self, size=None, low=0.0, high=1.0, ndim=None, dtype=config.floatX, nstreams=None):
        """
        Sample a tensor of given size whose element from a uniform
        distribution between low and high.
@@ -625,8 +621,10 @@ class MRG_RandomStreams(object):
        ndim may be a plain integer to supplement the missing
        information.
        """
+        if nstreams is None:
+            nstreams = self.n_streams(size)
        if self.use_cuda and dtype=='float32':
-            rstates = self.get_substream_rstates(self.n_streams(size))
+            rstates = self.get_substream_rstates(nstreams)
            rstates = rstates.flatten()
            # HACK - we use fact that int32 and float32 have same size to 
            # sneak ints into the CudaNdarray type.
@@ -643,11 +641,11 @@ class MRG_RandomStreams(object):
            u = self.pretty_return(node_rstate, 
                    *GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
        else:
-            node_rstate = shared(self.get_substream_rstates(self.n_streams(size)))
-            u = self.pretty_return(node_rstate, 
+            node_rstate = shared(self.get_substream_rstates(nstreams))
+            u = self.pretty_return(node_rstate,
                    *mrg_uniform.new(node_rstate, ndim, dtype, size))
        r = u * (high-low) + low
-        
+
        if u.type.broadcastable != r.type.broadcastable:
            raise NotImplementedError( 'Increase the size to match the broadcasting pattern of `low` and `high` arguments')
        return  r
@@ -664,7 +662,7 @@ class MRG_RandomStreams(object):
        # second half our U2's. See Wikipedia page:
        # http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform

-        n_samples = self.n_streams(size)
+        n_samples = numpy.prod(size)
        evened = False
           
        if n_samples % 2 == 1:
@@ -684,8 +682,8 @@ class MRG_RandomStreams(object):
        #normal_samples[n_samples/2:] = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)

        # so trying this instead
-        first_half = sqrt_ln_U1 * cos(2.0*numpy.pi*U2)
-        second_half = sqrt_ln_U1 * sin(2.0*numpy.pi*U2)
+        first_half = sqrt_ln_U1 * cos(2.0*cast(numpy.pi,dtype)*U2)
+        second_half = sqrt_ln_U1 * sin(2.0*cast(numpy.pi,dtype)*U2)
        normal_samples = join(0, first_half, second_half)

        final_samples = None

--- a/theano/sandbox/samples_MRG31k3p_12_7_5.txt
+++ b/theano/sandbox/samples_MRG31k3p_12_7_5.txt
+0.7353244530968368
+0.6142074400559068
+0.11007806099951267
+0.6487741703167558
+0.36619443260133266
+0.2585685825906694
+0.9489980279468
+0.4309556516818702
+0.12257590936496854
+0.9760319022461772
+0.6940806899219751
+0.18046841165050864
+0.003993193618953228
+0.5351603352464736
+0.02472442388534546
+0.7705746139399707
+0.8138928869739175
+0.9650539481081069
+0.24507411010563374
+0.35767574002966285
+0.4939101580530405
+0.9027785388752818
+0.27498403564095497
+0.03848231676965952
+0.3081609820947051
+0.9062023567967117
+0.009030417073518038
+0.7953705741092563
+0.5061718439683318
+0.5975547162815928
+0.5435514179989696
+0.330895590595901
+0.49919482320547104
+0.9409166998229921
+0.8276205519214272
+0.5180770065635443
+0.2319392478093505
+0.36197659047320485
+0.11120751267299056
+0.5018561617471278
+0.47852187464013696
+0.7188052111305296
+0.3030327311716974
+0.6756376498378813
+0.03624899685382843
+0.34987151669338346
+0.031225718092173338
+0.06772322440519929
+0.06820952938869596
+0.9987128847278655
+0.08330700965598226
+0.9731874465942383
+0.6345655219629407
+0.7169904578477144
+0.5793502484448254
+0.7396790678612888
+0.9926023166626692
+0.7522463691420853
+0.6768838302232325
+0.3253784184344113
+0.05375300580635667
+0.4912636987864971
+0.6485021142289042
+0.3043024237267673
+0.24868384934961796
+0.8166692252270877
+0.5274319797754288
+0.31434731651097536
+0.9961257497780025
+0.3549888739362359
+0.8423425843939185
+0.21591948671266437
+0.8698299624957144
+0.17033040337264538
+0.22816143138334155
+0.11795765580609441
+0.7024209997616708
+0.15607220400124788
+0.5493582566268742
+0.5827712984755635
+0.8592293248511851
+0.785309090744704
+0.6115233600139618
+0.019046304281800985
+0.2573754615150392
+0.03130705002695322
+0.6572857238352299
+0.2033171127550304
+0.5058645992539823
+0.15793190989643335
+0.6273676953278482
+0.7285307059064507
+0.265245848800987
+0.6073522809892893
+0.3896624594926834
+0.27189663611352444
+0.705508322454989
+0.12823439668864012
+0.39648046158254147
+0.6584051586687565
+0.07818163838237524
+0.33628708589822054
+0.20613654889166355
+0.4277639244683087
+0.5401185592636466
+0.07513022050261497
+0.4920963351614773
+0.18214095244184136
+0.3235122123733163
+0.29958881670609117
+0.7304665613919497
+0.05146520072594285
+0.2471711952239275
+0.8797005712985992
+0.5029069227166474
+0.526974250562489
+0.15968210343271494
+0.4696163134649396
+0.17607332626357675
+0.362843859475106
+0.7626461815088987
+0.960180682130158
+0.2536660563200712
+0.710880630183965
+0.28728525526821613
+0.78940424695611
+0.5242114691063762
+0.8314367309212685
+0.5898511232808232
+0.015212591737508774
+0.4944482510909438
+0.06396882887929678
+0.519745257217437
+0.3558214954100549
+0.04566589882597327
+0.8368005948141217
+0.979805170558393
+0.7622401369735599
+0.2578657674603164
+0.5378834479488432
+0.9926298237405717
+0.4013678622432053
+0.510077933780849
+0.018817965406924486
+0.21481098141521215
+0.5357040031813085
+0.8512061606161296
+0.009026535786688328
+0.27302876580506563
+0.21162108704447746
+0.5273029855452478
+0.1086404686793685
+0.14079083362594247
+0.14331109775230289
+0.8190496540628374
+0.3947252375073731
+0.28109811525791883
+0.4066850380040705
+0.9154577874578536
+0.8929708409123123
+0.13500721845775843
+0.6328344400972128
+0.5668322211131454
+0.5448646773584187
+0.5418433886952698
+0.1141617177054286
+0.15885689994320273
+0.3867143443785608
+0.5574855520389974
+0.9173167692497373
+0.22908265376463532
+0.2047420055605471
+0.05979115655645728
+0.44121386017650366
+0.9507057839073241
+0.15352962678298354
+0.23290937673300505
+0.46427791472524405
+8.519855327904224E-4
+0.7947354763746262
+0.6385304923169315
+0.8696001935750246
+0.6022149357013404
+0.02299323584884405
+0.5036068987101316
+0.7541037476621568
+0.9995524706318974
+0.5888469088822603
+0.3318097642622888
+0.32492663664743304
+0.6643895329907537
+0.3656829949468374
+0.4912424306385219
+0.1900841724127531
+0.5945985522121191
+0.5709856003522873
+0.35780346347019076
+0.388774358201772
+0.9446004652418196
+0.14594348100945354
+0.6250799335539341
+0.5504232128150761
+0.16380576323717833
+0.7428167965263128
+0.5522975320927799
+0.655389194842428
+0.47579632699489594
+0.29743909696117043
+0.6319712968543172
+0.8178138644434512
+0.2785301594994962
+0.46813122322782874
+0.2898342702537775
+0.3287009159103036
+0.12909299414604902
+0.5859099281951785
+0.1891166502609849
+0.14497734932228923
+0.5543341124430299
+0.11846801871433854
+0.8499364419840276
+0.6603211951442063
+0.35630465345457196
+0.9680569358170033
+0.6639338186942041
+0.24408268369734287
+0.030771974939852953
+0.17226932244375348
+0.7909302446059883
+0.4327161009423435
+0.6732332338578999
+0.0849734228104353
+0.7278832173906267
+0.5536605608649552
+0.7091806619428098
+0.01754110073670745
+0.8406045655719936
+0.4815619965083897
+0.0535086034797132
+0.9874794147908688
+0.07097038673236966
+0.023544831201434135
+0.42413365049287677
+0.2970325672067702
+0.48028060607612133
+0.1990663455799222
+0.6099434774369001
+0.5050413520075381
+0.7814605687744915
+0.2650358658283949
+0.5148864723742008
+0.7807142282836139
+0.0976667134091258
+0.1516015767119825
+0.6566055505536497
+0.3946392172947526
+0.8052488421089947
+0.2964451564475894
+0.07394864456728101
+0.6961450576782227
+0.01576960226520896
+0.3434433783404529
+0.08799878368154168
+0.785557022318244
+0.7494717631489038
+0.45548726338893175
+0.7672475459985435
+0.5134695749729872
+0.7000438082031906
+0.49818582693114877
+0.4293400440365076
+0.9961911663413048
+0.016769078094512224
+0.013044610153883696
+0.8661804771982133
+0.7819683295674622
+0.33438047766685486
+0.966121535282582
+0.7259743176400661
+0.9887824659235775
+0.9494950002990663
+0.037431647535413504
+0.8268285538069904
+0.7355263698846102
+0.3120658891275525
+0.3588241692632437
+0.471130283549428
+0.7047113911248744
+0.980073744431138
+0.6762627908028662
+0.869295812677592
+0.9070576094090939
+0.7852784115821123
+0.16342713963240385
+0.06330870278179646
+0.6165989111177623
+0.342802997212857
+0.8414176292717457
+0.6921333004720509
+0.2594374935142696
+0.4386491202749312
+0.555369642097503
+0.3660965468734503
+0.6484139142557979
+0.9005299550481141
+0.25335891311988235
+0.23852926725521684
+0.9044205779209733
+0.8694673446007073
+0.46783560374751687
+0.34727911837399006
+0.19556640228256583
+0.8798208390362561
+0.3131108647212386
+0.6312824171036482
+0.5722001581452787
+0.9441223978064954
+0.7707183314487338
+0.17464511329308152
+0.08897313429042697
+0.5044040409848094
+0.5735817537643015
+0.4467783076688647
+0.19051036844030023
+0.4578995378687978
+0.6395204453729093
+0.460110604763031
+0.576092894654721
+0.7038368303328753
+0.5555814192630351
+0.4171535111963749
+0.8905360852368176
+0.12811446748673916
+0.6814800254069269
+0.8502416326664388
+0.12028768053278327
+0.16715052351355553
+0.3563938206061721
+0.049810963682830334
+0.27328392397612333
+0.2407418810762465
+0.6631906591355801
+0.674483266659081
+0.10489491606131196
+0.04698043642565608
+0.0812066881917417
+0.312124056275934
+0.6798701109364629
+0.7286937129683793
+0.9784366562962532
+0.5650205011479557
+0.833059043623507
+0.8976074242964387
+0.9441233519464731
+0.6146679543890059
+0.9019614770077169
+0.5529476394876838
+0.7665416682139039
+0.39598167687654495
+0.26307358546182513
+0.14862705068662763
+0.9521124185994267
+0.17644333699718118
+0.7684473628178239
+0.4274347145110369
+0.6102834036573768
+0.9328651092946529
+0.058630190789699554
+0.04729347629472613
+0.9597438890486956
+0.6761234584264457
+0.21832499839365482
+0.20707347383722663
+0.7274158899672329
+0.9477886455133557
+0.7821800266392529
+0.07305240212008357
+0.40399201214313507
+0.22684293938800693
+0.053185423370450735
+0.330069282092154
+0.6862794999033213
+0.7821815954521298
+0.22617859859019518
+0.8118352359160781
+0.015444065444171429
+0.6732339109294116
+0.9980663135647774
+0.8833195753395557
+0.21191661106422544
+0.32638366147875786
+0.5747208022512496
+0.07515769777819514
+0.02952938713133335
+0.4980746121145785
+0.8762881984002888
+0.17386484891176224
+0.10696181375533342
+0.5474299816414714
+0.016154434997588396
+0.6960771018639207
+0.47133891424164176
+0.9015861176885664
+0.782880718819797
+0.6602211343124509
+0.6578835439868271
+0.6049443730153143
+0.17169494135305285
+0.9915955001488328
+0.10519243823364377
+0.37815978936851025
+0.20879409136250615
+0.45666090911254287
+0.6456936108879745
+0.684759714640677
+0.8762755445204675
+0.8020628895610571
+0.1663151141256094
+0.31246642768383026
+0.18852565623819828
--- a/theano/sandbox/test_rng_mrg.py
+++ b/theano/sandbox/test_rng_mrg.py
--- a/theano/scan.py
+++ b/theano/scan.py
@@ -366,7 +366,6 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
             updates rules for all shared variables used in the scan
             operation; this dictionary should be pass to ``theano.function``
    """
-
    # General observation : this code is executed only once, at creation 
    # of the computational graph, so we don't yet need to be smart about 
    # anything ( to speed things up)
@@ -404,7 +403,6 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
    # compute number of sequences and number of outputs
    n_seqs = len(seqs)
    n_outs = len(outs_info)
-
    # initialize the inplace map, sequences map and 
    # outputs map
    ''' Details:
@@ -629,10 +627,27 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
    # remove shared variables from the non sequences list
    # such that we can compile the function ( the user has the option to add them when writing
    # scan, because in some situations this might make the code more readable)
+    # Also duplicate the list of non-sequence arguments so that it contains copies of the
+    # non-shared inputs (this fixes the case where one of these inputs has a default update
+    # attached to it that belongs to some shared random stream).
+    #
+    # Note: in that case, scan assumes that you do not want to draw new numbers at every call (you
+    #       would have made the internal function do that explicitly if you wanted to) but rather to
+    #       use that initial draw as a matrix of values.
+    new_non_seqs = []
    notshared_other_args = []
+    notshared_other_args_copies = []
    for non_seq in non_seqs:
        if not isinstance(non_seq, SharedVariable):
+            if n_fixed_steps not in [-1,1]:
+                non_seq_copy = non_seq.type()
+            else:
+                non_seq_copy = non_seq
            notshared_other_args += [non_seq]
+            notshared_other_args_copies += [non_seq_copy]
+            new_non_seqs += [non_seq_copy]
+        else:
+            new_non_seqs += [non_seq]

    # add only the not shared variables to the arguments of the dummy
    # function [ a function should not get shared variables as input ]
@@ -640,10 +655,10 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
    for arg in args:
        if not isinstance(arg, SharedVariable):
            dummy_args += [arg]
-    dummy_args += notshared_other_args
+    dummy_args += notshared_other_args_copies
    # arguments for the lambda expression that gives us the output
    # of the inner function
-    args += non_seqs
+    args += new_non_seqs

    # when we apply the lambda expression we get a mixture of update rules
    # and outputs that needs to be separated
@@ -704,14 +719,6 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
    # make the compilation as fast as possible by not applying any optimization
    # or conversion to C [ note this region is not important for performance
    # so we can do stuff as unoptimal as we wish ]
-    '''
-    Why did I use gof.graph.inputs to pick the inputs here ??
-    
-    dummy_f = function(filter(lambda x: isinstance(x,gof.Variable) and \
-            not isinstance(x,SharedVariable) and not isinstance(x,gof.Constant), \
-            reversed(gof.graph.inputs(dummy_args))), outputs, updates = updates, mode = \
-                 compile.mode.Mode(linker = 'py', optimizer = None) )
-    '''
    if n_fixed_steps in [-1,1]:
        ''' We do have a special case here, namely is so might happen that
        whatever we have in dummy_args is not sufficient to compile the 
@@ -726,6 +733,7 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
            not isinstance(x,SharedVariable) and not isinstance(x,gof.Constant), \
            gof.graph.inputs(dummy_args)), outputs, updates = updates, mode = compile.mode.Mode(linker='py',optimizer=None))
    else:
+
        dummy_f = function(filter(lambda x: isinstance(x, gof.Variable) and \
            not isinstance(x,SharedVariable) and not isinstance(x,gof.Constant), \
            dummy_args), outputs, updates = updates, mode = compile.mode.Mode(linker='py',optimizer=None))
@@ -859,22 +867,8 @@ def scan(fn, sequences=[], outputs_info=[], non_sequences=[],
    else:
        # If we do not actually need scan
        for pos, inner_out in enumerate(inner_fn_outs):
-            # check if we are suppose to return just the last step
-            # we treat this case differently because the tensor we return
-            # in this case is different (it has one dimension less)
-            if return_steps.has_key(pos):
-                if return_steps[pos] != 1:
-                    # if we return more then one step, we need to add 
-                    # one more dimension to our output and make it 
-                    # unbroadcastable
-                    inner_fn_outs[pos] = tensor.unbroadcast(
-                            tensor.shape_padleft(inner_out),0)
-            else:
-                # same if we do not have any information about how many
-                # steps we should return (to read return everything in this
-                # case
-                inner_fn_outs[pos] = tensor.unbroadcast( 
-                        tensor.shape_padleft(inner_out),0)
+            if isinstance(inner_out.type, tensor.TensorType):
+                inner_fn_outs[pos] = tensor.unbroadcast( tensor.shape_padleft(inner_out),0)
        values = inner_fn_outs



--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -83,93 +83,6 @@ def conv2d(input, filters, image_shape=None, filter_shape=None,

    return op(input, filters)

-def conv2d_offset(input, filters, image_shape=None, filter_shape=None,
-                border_mode='valid', subsample=(1,1), offsets = [], **kargs):
-    """
-    Build a graph for nnet convolutions with subsampling and offsetting.
-
-    Refer to conv2d for a general explanation of the context.
-
-    *Subsampling* means, we only compute convolutions at a fraction of the 
-    sites. *Offsetting* convolutions in the context of subsampling means not
-    computing all of them at the same sites (starting in upper left corner).
-    This function allows offsets to be used.
-
-    Most parameters are shared with conv2d, except:
-    :param offsets: list of 2-tuples that specify sites wrt the subsampling
-    scheme. The filters are split evenly accross the different sites. For 
-    example with subsample (2,2) we can use the sites [(0,0), (0,1), (1,0),
-    (1,1)]. An empty list is interpreted as all offsets.
-    """
-    # There should be subsampling for offsets to make any sense.
-    if not (subsample[0] > 1 or subsample[1] > 1):
-        raise ValueError('conv2d_offset requires subsampling.')
-
-    # Haven't thought about this case.
-    if numpy.any(numpy.array(subsample) > filter_shape[2:]):
-        raise ValueError('conv2d_offset subsample greater than filter shape. Not supported?')
-
-    # No offsets specified is interpreted as all offsets.
-    if len(offsets) == 0:
-        offsets = []
-        for i in range(subsample[0]):
-            for j in range(subsample[1]):
-              offsets.append((i,j))
-
-    # Find the largest offsets in both image dimensions. Used to determine the
-    # size of the image used.
-    max_offset = list(offsets[0])
-
-    for offset in offsets:
-        if offset[0] > max_offset[0]:
-            max_offset[0] = offset[0]
-        if offset[1] > max_offset[1]:
-            max_offset[1] = offset[1]
-
-    if not (max_offset[0] < subsample[0] and max_offset[1] < subsample[1]):
-        raise ValueError('conv2d_offset: invalid offset sites.')
-
-    # Determine the reduced size of input so all feature maps get an input of 
-    # the same size.
-    sub_image_shape = list(image_shape)
-    sub_image_shape[2] -= max_offset[0]
-    sub_image_shape[3] -= max_offset[1]
-
-    # Determine number of filters per offset position.
-    if (filter_shape[0] % len(offsets)) != 0:
-        print 'nfilts ', filter_shape[0], ' noffsets ', len(offsets)
-        raise ValueError('conv2d_offset: invalid number of filters wrt offsets.')
-    n_filters = filter_shape[0] / len(offsets) 
-    sub_filter_shape = list(filter_shape)
-    sub_filter_shape[0] = n_filters
-
-    # Call conv2d at each offset using same fraction of kernels.
-    outputs = []
-    for i, offset in enumerate(offsets):
-
-        # Crop the input so all offsets get an input of the same size.
-        sub_input = input[:, :, offset[0]:sub_image_shape[2] + offset[0], 
-                      offset[1]:sub_image_shape[3] + offset[1]]
-
-        # Grab part of the filters.
-        sub_filters = filters[i*n_filters:(i+1)*n_filters]
-
-        out = conv2d(sub_input, sub_filters, sub_image_shape, sub_filter_shape,
-                    border_mode=border_mode, subsample=subsample, **kargs)
-
-        outputs.append(out)
-
-    # Join the outputs on the leading axis.
-    if len(outputs) > 1:
-        output = tensor.join(1, *outputs)
-    else:
-        output = outputs[0] 
-
-    outshp = ConvOp.getOutputShape(sub_image_shape[2:], filter_shape[2:], subsample, border_mode)
-
-    return [output, outshp]
-
-

 class ConvOp(Op):
    """

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1888,6 +1888,27 @@ def local_log1p(node):
                else:
                    return _fill_chain(T.log1p(T.add(*nonconsts)), scalar_inputs)

+#TODO: in canonicalize, change log10 and log2 -> log
+@register_stabilize
+@gof.local_optimizer([T.log])
+def local_log_add(node):
+    # Stabilize log(exp(x) + exp(y)) so that the exps cannot overflow.
+    # With m = maximum(x, y), one of (x - m), (y - m) is 0 and the other is <= 0:
+    #   log(exp(x) + exp(y)) = m + log(1 + exp((x - m) + (y - m)))
+    #                        = m + log1p(exp((x - m) + (y - m)))
+    # Summing the shifted exponents is only valid for exactly two terms, and
+    # maximum is applied to exactly two arguments, so an add with any other
+    # number of inputs falls through without a replacement being returned.
+    #TODO: generalize to more than two exp terms (log-sum-exp).
+    if node.op == T.log:
+        z = node.inputs[0]
+        if z.owner and z.owner.op == T.add:
+            zi = z.owner.inputs
+            pre_exp = [ x.owner.inputs[0] for x in zi if x.owner and x.owner.op == T.exp]
+            if len(zi) == 2 and len(pre_exp) == 2:
+                # both arguments to add are exp(<something>)
+                max_pre = T.maximum(*pre_exp)
+                return [max_pre + T.log1p(T.exp(T.add(*[p - max_pre for p in pre_exp])))]

 def add_calculate(num, denum, aslist = False, out_type=None):
    #TODO: make sure that this function and mul_calculate are similar

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -930,6 +930,36 @@ def test_log1p():
        theano.printing.debugprint(f)
        assert [node.op for node in f.maker.env.toposort()] == [T.log1p]

+def test_log_add():
+    m = theano.config.mode
+    if m == 'FAST_COMPILE':
+        m = 'FAST_RUN'
+    m = compile.mode.get_mode(m)
+    m = m.excluding('fusion')
+    # check some basic cases
+    x = dvector()
+    y = dvector()
+    f = function([x,y], T.log(T.exp(x) + T.exp(y)), mode=m)
+
+    theano.printing.debugprint( f)
+    print f([10000], [10000])  # causes overflow if handled incorrectly
+    assert numpy.allclose(f([10000], [10000]), 10000+numpy.log1p(1))
+
+
+    # test that it also works with more than two args, (this currently fails)
+    x = dvector()
+    y = dvector()
+    f = function([x,y], T.log(T.exp(x) + T.exp(y) + T.exp(x-y) + T.exp(x+y)), mode=m)
+    theano.printing.debugprint( f)
+
+
+    print f([10000], [10000])  # causes overflow if handled incorrectly
+    assert numpy.allclose(f([10000], [10000]), 20000)
+
+    #TODO: test that the optimization works in the presence of broadcasting.
+
+    #TODO: (write and) test that the optimization works with Sum in addition to working with Add.
+
 class test_local_subtensor_unary(unittest.TestCase):

    def test0(self):

--- a/theano/tests/test_scan.py
+++ b/theano/tests/test_scan.py
@@ -922,7 +922,24 @@ class T_Scan(unittest.TestCase):
        assert len(analytic_grad[0]) == 3


+    def test_draw_as_input_to_scan(self):
+        trng = theano.tensor.shared_randomstreams.RandomStreams(123)

+        x = theano.tensor.matrix('x')
+        y = trng.binomial(size = x.shape, p = x)
+        z,updates = theano.scan(lambda a:a, non_sequences=y, n_steps=2)
+
+        f = theano.function([x],[y,z], updates = updates)
+
+        rng = numpy.random.RandomState(utt.fetch_seed())
+        nx = rng.uniform( size = (10,10) )
+        ny1,nz1 = f(nx)
+        ny2,nz2 = f(nx)
+
+
+        assert numpy.allclose([ny1,ny1], nz1)
+        assert numpy.allclose([ny2,ny2], nz2)
+        assert not numpy.allclose(ny1,ny2)

 if __name__ == '__main__':
    unittest.main()