Commit 85447abe authored by James Bergstra

merge

......@@ -52,6 +52,10 @@ Community
* Register and post to `theano-dev`_ if you want to talk to the developers.
* Register and post to `theano-announce`_ if you want to be kept informed of important changes in Theano (low volume).
* Register and post to `theano-buildbot`_ if you want to receive our daily buildbot email.
* We try to stay organized with `Theano's Trac <http://trac-hg.assembla.com/theano/report/1>`__
* Come visit us in Montreal! Most of the developers are students in the LISA_ group at the `University of Montreal`_.
......@@ -77,6 +81,8 @@ Community
.. _theano-dev: http://groups.google.com/group/theano-dev
.. _theano-users: http://groups.google.com/group/theano-users
.. _theano-announce: http://groups.google.com/group/theano-announce
.. _theano-buildbot: http://groups.google.com/group/theano-buildbot
.. _tickets: http://pylearn.org/theano/trac/query?status=accepted&status=assigned&status=new&status=reopened&group=milestone&max=200&col=id&col=summary&col=status&col=owner&col=type&col=priority&col=component&col=time&report=9&order=priority
.. _LISA: http://www.iro.umontreal.ca/~lisa
......
......@@ -188,7 +188,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
def make_node(self, dy, sm, y_idx):
return Apply(self, [dy, sm, y_idx],[sm.type()])
def c_code_cache_version(self):
return (2,)
return (3,)
#return ()
def c_code(self, node, nodename, (dnll, sm, y_idx), (dx,), sub):
fail = sub['fail']
......@@ -229,7 +229,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (Op):
kCrossEntropySoftmax1HotWithBiasDx_%(nodename)s
<<<
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1]
std::min(CudaNdarray_HOST_DIMS(%(dx)s)[1],256)
>>>(
CudaNdarray_HOST_DIMS(%(dx)s)[0],
CudaNdarray_HOST_DIMS(%(dx)s)[1],
......@@ -303,7 +303,7 @@ class GpuSoftmax (Op):
return shape
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,), (z,), sub):
fail = sub['fail']
return """
......@@ -330,7 +330,7 @@ class GpuSoftmax (Op):
kSoftmax_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -362,11 +362,14 @@ class GpuSoftmax (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
......@@ -386,7 +389,7 @@ class GpuSoftmaxWithBias (Op):
return [shape[0]]
def c_code_cache_version(self):
#return ()
return (1,) + inline_softmax.code_version
return (2,) + inline_softmax.code_version
def c_code(self, node, nodename, (x,b), (z,), sub):
fail = sub['fail']
......@@ -425,7 +428,7 @@ class GpuSoftmaxWithBias (Op):
kSoftmaxWithBias_%(nodename)s
<<<
// todo: cap these at the card limits, implement loops in kernel
CudaNdarray_HOST_DIMS(%(x)s)[0],
std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],32*1024),
CudaNdarray_HOST_DIMS(%(x)s)[1],
CudaNdarray_HOST_DIMS(%(x)s)[1] * 2 * sizeof(float)
>>>(
......@@ -461,10 +464,14 @@ class GpuSoftmaxWithBias (Op):
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"buf[threadIdx.x] = x[blockIdx.x * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIdx.x * N + threadIdx.x] = buf[threadIdx.x]"
"for (int blockIDX = blockIdx.x; blockIDX < M; blockIDX += gridDim.x){",
"buf[threadIdx.x] = x[blockIDX * sx0 + threadIdx.x * sx1]",
"buf[threadIdx.x] += b[threadIdx.x * sb0]",
"buf2[threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
"sm[blockIDX * N + threadIdx.x] = buf[threadIdx.x]",
"__syncthreads()",
"}",
])
#for (int i = blockIdx.x; i < N; i += gridDim.x)
# Tests/benchmarks for theano.sandbox.cuda nnet ops: MLP and convnet
# training on random data, comparing CPU vs GPU results and timings.
import sys, time
import theano
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy

# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
# NOTE(review): this unconditional raise disables the entire module at
# import time; remove it once the referenced alloc/memory-leak bug is fixed.
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')

import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

import theano.sandbox.cuda as tcn

import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
    """Uniform [0, 1) random float32 array of the given shape."""
    data = numpy.random.rand(*shape)
    return theano._asarray(data, dtype='float32')
def my_randn(*shape):
    """Standard-normal random float32 array of the given shape."""
    data = numpy.random.randn(*shape)
    return theano._asarray(data, dtype='float32')
def my_zeros(*shape):
    """All-zero float32 array; arguments are forwarded to numpy.zeros."""
    data = numpy.zeros(*shape)
    return theano._asarray(data, dtype='float32')
def get_mode(use_gpu):
    """Return the default compilation mode, adjusted for GPU use.

    When profiling is active, a fresh ProfileMode is created so timings
    from different calls are not mixed together.
    """
    mode = theano.compile.get_default_mode()
    if isinstance(mode, theano.compile.ProfileMode):
        mode = theano.compile.ProfileMode()
    return mode.including('gpu') if use_gpu else mode.excluding('gpu')
def print_mode(mode):
    """Print the profiling summary when `mode` is a ProfileMode; else no-op."""
    # `is not None` is the idiomatic identity test (avoids custom __ne__).
    if mode is not None and isinstance(mode, theano.compile.ProfileMode):
        mode.print_summary()
def print_diff_mode(a, b):
    """Print the profiling difference a-vs-b when both are ProfileModes."""
    # `is not None` is the idiomatic identity test (avoids custom __ne__).
    if a is not None and isinstance(a, theano.compile.ProfileMode) \
            and isinstance(b, theano.compile.ProfileMode):
        a.print_diff_summary(b)
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
    """Train a 2-layer tanh MLP on fixed random data; return (losses, seconds).

    Plain gradient descent on a squared-error loss for `n_train` steps,
    always on the same random batch.  Parameters live on the GPU when
    `use_gpu` is True, otherwise as regular shared variables.
    """
    if config.mode=='DEBUG_MODE': n_train=1

    if use_gpu:
        w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
        b = tcn.shared_constructor(my_zeros(n_hid), 'b')
        # NOTE(review): v is labelled 'c' (same as c below); the display
        # name is likely meant to be 'v'.  Does not affect computation.
        v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
        c = tcn.shared_constructor(my_zeros(n_out), 'c')
    else:
        w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
        b = shared(my_zeros(n_hid), 'b')
        v = shared(my_zeros((n_hid, n_out)), 'c')
        c = shared(my_zeros(n_out), 'c')

    x = tensor.fmatrix('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    hid = tensor.tanh(tensor.dot(x, w)+b)
    out = tensor.tanh(tensor.dot(hid, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    if 0: print 'loss type', loss.type

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    if 0:
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

    xval = my_rand(n_batch, n_in)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    # Time the training loop only (graph compilation excluded).
    t0 = time.time()
    rval = []
    for i in xrange(n_train):
        rval.append(train(xval, yval, lr))
    dt = time.time() - t0

    print_mode(mode)
    return numpy.asarray(rval), dt
def test_run_nnet():
    """Compare CPU vs GPU MLP training losses over a grid of layer sizes."""
    for n_in in 1024, 2048, 4096:
        for n_hid in 1024, 2048, 4096:
            # Re-seed before each run so CPU and GPU see identical
            # parameters and data.
            numpy.random.seed(23456)
            rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
            numpy.random.seed(23456)
            rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
            #print "cpu:", rval_cpu
            #print "gpu:", rval_gpu
            print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
            print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
            assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
    """Medium-size MLP training run, CPU only (smoke test)."""
    numpy.random.seed(23456)
    result = run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
    """Small MLP training run, CPU only (smoke test, many iterations)."""
    numpy.random.seed(23456)
    result = run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
    """Train a 1-conv-layer + 1-dense-layer net for 10 steps; return final loss."""
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared
    n_batch = 16
    n_kern = 20
    shape_img = (n_batch, 1, 32, 32)
    shape_kern = (n_kern, 1, 5, 5)
    n_train=10
    if config.mode=='DEBUG_MODE': n_train=1

    # The 'valid' convolution output size fixes the flattened hidden width.
    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
    n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
    n_out = 10

    w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
    b = shared_fn(my_zeros((n_kern,)), 'b')
    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
    c = shared_fn(my_zeros(n_out), 'c')

    # broadcastable=(0,1,0,0): only the single channel axis broadcasts.
    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op.set_flops()

    hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
    hid_flat = hid.reshape((n_batch, n_hid))
    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    print 'loss type', loss.type

    params = [w, b, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    #    for i, n in enumerate(train.maker.env.toposort()):
    #        print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch, n_out)
    lr = theano._asarray(0.01, dtype='float32')

    for i in xrange(n_train):
        rval = train(xval, yval, lr)
    print 'training done'
    print_mode(mode)
    return rval
def test_conv_nnet1():
    """Check that CPU and GPU conv-net training give matching losses."""
    numpy.random.seed(23456)
    cpu_loss = run_conv_nnet1(False)
    numpy.random.seed(23456)
    gpu_loss = run_conv_nnet1(True)
    assert numpy.allclose(cpu_loss, gpu_loss, rtol=1e-4, atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
    """Train a 2-conv-layer LeNet-like net for 30 steps; return the final loss."""
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared
    # Cumulative rounding error affects this comparison of results, so we
    # lower the tolerance.
    # TODO: why do the last two examples see a lower error?  Are we converging?
    # n_train=10, n_batch=3,  n_kern=1,  n_kern1=1,  error seen of 1e-9
    # n_train=10, n_batch=3,  n_kern=10, n_kern1=1,  error seen of -1.27777e-06
    # n_train=10, n_batch=3,  n_kern=10, n_kern1=10, error seen of -6.91377e-05
    # n_train=10, n_batch=30, n_kern=10, n_kern1=10, error seen of -0.00185963
    # n_train=10, n_batch=60, n_kern=10, n_kern1=10, error seen of -5.26905e-05
    # n_train=30, n_batch=60, n_kern=10, n_kern1=10, error seen of -3.8147e-06
    # n_train=30, n_batch=60, n_kern=20, n_kern1=10, error seen of 6.82771e-05
    # n_train=30, n_batch=60, n_kern=20, n_kern1=30, error seen of 0.000231534
    n_batch = 60
    shape_img = (n_batch, 1, 32, 32)
    n_kern = 20
    shape_kern = (n_kern, 1, 5, 5)
    n_kern1 = 10
    shape_kern1 = (n_kern1, n_kern, 5, 5)
    n_train=30
    if config.mode=='DEBUG_MODE': n_train=1

    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
    # The second conv layer sees the first layer's output downsampled by 2.
    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
    n_out = 10

    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
    v = shared_fn(my_zeros((n_hid, n_out)), 'c')
    c = shared_fn(my_zeros(n_out), 'c')

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
    conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
    conv_op.set_flops()
    conv_op1.set_flops()

    hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
    # [:,:,::2,::2] implements the 2x2 downsampling between conv layers.
    hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.tanh(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(0.5 * (out-y)**2 * lr)
    print 'loss type', loss.type

    params = [w0, b0, w1, b1, v, c]
    gparams = tensor.grad(loss, params)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    #    for i, n in enumerate(train.maker.env.toposort()):
    #        print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch,n_out)#int32 make all 0...
    lr = theano._asarray(0.01, dtype='float32')
    for i in xrange(n_train):
        rval = train(xval, yval, lr)
    print_mode(mode)
    return rval
def test_conv_nnet2():
    """Compare run_conv_nnet2 GPU vs CPU (looser atol; see tolerance notes)."""
    numpy.random.seed(23456)
    rval_gpu = run_conv_nnet2(True)
    # Same seed so the CPU run sees identical parameters and data.
    if True:
        numpy.random.seed(23456)
        rval_cpu = run_conv_nnet2(False)
        print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
        assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
                           downsample_ops=True, verbose=0, version=-1):
    """Train a LeNet-like conv/pool/conv/softmax classifier on random data.

    isize: image size, an int (square) or a (rows, cols) tuple.
    ksize: square kernel size used by both conv layers.
    downsample_ops: pool with DownsampleFactorMax when True, otherwise
        pool by ::2 strided slicing.
    Returns (per-step losses, wall-clock seconds, compilation mode).
    """
    if use_gpu:
        shared_fn = tcn.shared_constructor
    else:
        shared_fn = shared

    isize1=isize
    isize2=isize
    if isinstance(isize,(tuple,)):
        isize1=isize[0]
        isize2=isize[1]

    shape_img = (n_batch, 1, isize1, isize2)
    n_kern = 20 # 6 were used in LeNet5
    shape_kern = (n_kern, 1, ksize, ksize)
    n_kern1 = 30 # 16 were used in LeNet5
    shape_kern1 = (n_kern1, n_kern, ksize, ksize)

    logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid')
    # The second layer sees the first layer's output pooled by 2 per dim.
    logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
        logical_hid_shape[1]/2), (ksize, ksize), 'valid')
    n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
    n_out = 10

    w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
    b0 = shared_fn(my_zeros((n_kern,)), 'b0')
    w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
    b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
    v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
    c = shared_fn(my_zeros(n_out), 'c')

    print 'ALLOCATING ARCH: w0 shape', w0.value.shape
    print 'ALLOCATING ARCH: w1 shape', w1.value.shape
    print 'ALLOCATING ARCH: v shape', v.value.shape

    x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
    y = tensor.fmatrix('y')
    lr = tensor.fscalar('lr')

    conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern,
                          n_batch, 1, 1, verbose=verbose, version=version)
    conv_op1 = conv.ConvOp(
        (n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
        shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
    conv_op.set_flops()
    conv_op1.set_flops()
    ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False)

    if downsample_ops:
        hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x'))))
    else:
        # Strided slicing keeps the top-left element of each 2x2 block.
        hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2])
    hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x')))
    hid_flat = hid1.reshape((n_batch, n_hid))
    out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
    loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
    print 'loss type', loss.type

    params = [w0, b0, w1, b1, v, c]
    gparams = tensor.grad(loss, params, warn_type=True)

    mode = get_mode(use_gpu)

    print 'building pfunc ...'
    train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])

    if False:
        for i, n in enumerate(train.maker.env.toposort()):
            print i, n

    xval = my_rand(*shape_img)
    yval = my_rand(n_batch,n_out)
    lr = theano._asarray(0.01, dtype='float32')

    # Time the training loop only (compilation excluded).
    rvals=my_zeros(n_train)
    t0 = time.time()
    for i in xrange(n_train):
        rvals[i] = train(xval, yval, lr)[0]
    t1 = time.time()
    print_mode(mode)
    return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                               ignore_error=False,
                               n_train=10,
                               gpu_only=False,
                               cpu_only=False,
                               float_atol=1e-06,
                               check_isfinite=True,
                               pickle=False,
                               verbose=0,
                               version=-1):
    """Run run_conv_nnet2_classif on GPU and CPU and compare the losses.

    float_atol: None means use the default value; otherwise overrides
        theano's float32 absolute tolerance for the duration of the runs.
    check_isfinite: forwarded to DEBUG_MODE.  For some parameters the
        CrossentropyCategorical1Hot op generates inf when not optimized.
    pickle: when True and profiling is active, dump the CPU profile mode.
    """
    if config.mode=='DEBUG_MODE': n_train=1

    numpy.random.seed(seed)

    import theano.tensor.basic
    import theano.compile.debugmode
    from theano.compile.mode import predefined_modes
    # Save the global tolerance/debug settings so they can be restored
    # even if a run raises.
    orig_float32_atol = theano.tensor.basic.float32_atol
    orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite

    try:
        predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
        if gpu_only:
            tcn.use()
        if float_atol:
            print "float_atol",float_atol
            theano.tensor.basic.float32_atol=float_atol
        if not cpu_only:
            rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
                isize, ksize, bsize, n_train, verbose=verbose, version=version)
    finally:
        predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
        theano.tensor.basic.float32_atol=orig_float32_atol

    if gpu_only:
        print "time gpu: %.3f"%(tg)
        return

    try:
        predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
        # Re-seed so the CPU run sees the same parameters and data.
        numpy.random.seed(seed)
        rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
            verbose=verbose, version=version)
        if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
            # NOTE(review): this local import shadows the `pickle` boolean
            # parameter from this point on.
            import pickle
            print "BEGIN GPU profile mode dump"
            #print pickle.dumps(gpu_mode)
            print "END GPU profile mode dump"
            print "BEGIN CPU profile mode dump"
            print pickle.dumps(cpu_mode)
            print "END CPU profile mode dump"
    finally:
        predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
        theano.tensor.basic.float32_atol=orig_float32_atol

    if not cpu_only:
        if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
            print "cpu:", rval_cpu
            print "gpu:", rval_gpu
            print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
        print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
        print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
        print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
    else:
        print "time cpu: %.3f"%(tc)
        print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
    if not ignore_error and not cpu_only and not gpu_only:
        assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
# Module-level switches read by the test_lenet_* wrappers below; edit by
# hand (or from an interactive session) to change how comparisons run.
gpu_only=False
cpu_only=False
ignore_error=False
verbose=0
version=-1
def test_lenet_28(): #MNIST
    """LeNet-style comparison at MNIST size (28x28, ksize 5, batch 60)."""
    cmp_run_conv_nnet2_classif(23485, 28, 5, 60, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_32(): #CIFAR10 / Shapeset
    """LeNet-style comparison at 32x32 inputs (CIFAR10 / Shapeset size)."""
    # Forward cpu_only like every other test_lenet_* wrapper; it was
    # accidentally omitted here (same default, so behavior is unchanged
    # unless the module-level flag is set).
    cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_32_long(): #CIFAR10 / Shapeset
    """Longer 32x32 run (smaller batch, more steps).

    This tests the gradient of downsample on the GPU, which does not
    receive specific testing elsewhere.
    """
    cmp_run_conv_nnet2_classif(23485, 32, 5, 30, n_train=50,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose, version=version)
def test_lenet_64(): # ???
    """LeNet-style comparison at 64x64 inputs (ksize 7, batch 10)."""
    # float_atol is loosened so the comparison passes in debug mode: the
    # CPU may use extended precision while the GPU does not.
    cmp_run_conv_nnet2_classif(23485, 64, 7, 10, n_train=10,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               float_atol=5e-4, check_isfinite=True, version=version)
def test_lenet_108(): # NORB
    """LeNet-style comparison at NORB size (108x108, ksize 7, batch 5)."""
    cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_train=4,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version, float_atol=7e-2)
def test_lenet_256(): # ImageNet
    """LeNet-style comparison at ImageNet-ish size (256x256, ksize 9)."""
    cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_train=5,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
# The name is deliberately misspelled ("tes_" instead of "test_") so the
# test collector does not run it automatically: this configuration does
# not work yet.
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len)
    """LeNet-style comparison at 720p (disabled via deliberate misnaming)."""
    cmp_run_conv_nnet2_classif(23485, (720,1280), 9, 2, n_train=3,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
# The name is deliberately misspelled ("tes_" instead of "test_") so the
# test collector does not run it automatically: this configuration does
# not work yet.
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len)
    """LeNet-style comparison at 1080p (disabled via deliberate misnaming)."""
    cmp_run_conv_nnet2_classif(23485, (1080,1920), 9, 2, n_train=3,
                               ignore_error=ignore_error, gpu_only=gpu_only,
                               cpu_only=cpu_only, verbose=verbose,
                               check_isfinite=True, version=version)
import sys, time
import theano
from theano.compile.sharedvalue import shared
from theano.compile.pfunc import pfunc
from theano import tensor
import theano.tensor.nnet
from theano import config
import theano.tensor.nnet.conv as conv
import theano.tensor.signal.downsample as downsample
import numpy
import theano, numpy
import theano.tensor as T
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
raise SkipTest('SKIP TO PREVENT THE BUILDBOT FROM CRASHING. THERE IS A DIFFICULT BUG TO FIX WITH MEMORY LEAK AND/OR WHEN Cuda_Ndarray alloc fail!')
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
import theano.sandbox.cuda as cuda
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn
import logging
logging.getLogger('theano.sandbox.cuda.tests.test_nnet').setLevel(logging.INFO)
def my_rand(*shape):
return theano._asarray(numpy.random.rand(*shape),dtype='float32')
def my_randn(*shape):
return theano._asarray(numpy.random.randn(*shape),dtype='float32')
def my_zeros(*shape):
return theano._asarray(numpy.zeros(*shape),dtype='float32')
def get_mode(use_gpu):
ret = theano.compile.get_default_mode()
if isinstance(ret, theano.compile.ProfileMode):
ret = theano.compile.ProfileMode()
if use_gpu:
ret = ret.including('gpu')
else:
ret = ret.excluding('gpu')
return ret
def print_mode(mode):
if mode != None and isinstance(mode,(theano.compile.ProfileMode,)):
mode.print_summary()
def print_diff_mode(a,b):
if a != None and isinstance(a,(theano.compile.ProfileMode,)) and isinstance(b,(theano.compile.ProfileMode,)):
a.print_diff_summary(b)
# Compilation modes used by the function-compilation checks below.
# Under FAST_COMPILE the GPU graph optimizations are not applied, so fall
# back to FAST_RUN as the base mode in that case.
if theano.config.mode=='FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def run_nnet(use_gpu, n_batch=60, n_in=1024, n_hid=2048, n_out=10, n_train=100):
if config.mode=='DEBUG_MODE': n_train=1
if use_gpu:
w = tcn.shared_constructor(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = tcn.shared_constructor(my_zeros(n_hid), 'b')
v = tcn.shared_constructor(my_zeros((n_hid, n_out)), 'c')
c = tcn.shared_constructor(my_zeros(n_out), 'c')
else:
w = shared(0.01*(my_rand(n_in,n_hid)-0.5), 'w')
b = shared(my_zeros(n_hid), 'b')
v = shared(my_zeros((n_hid, n_out)), 'c')
c = shared(my_zeros(n_out), 'c')
def test_GpuCrossentropySoftmax1HotWithBiasDx():
"""
This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias and GpuCrossentropySoftmax1HotWithBiasDx
x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
hid = tensor.tanh(tensor.dot(x, w)+b)
out = tensor.tanh(tensor.dot(hid, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
if 0: print 'loss type', loss.type
We check that we loop when their is too much threads
TODO: check that we loop when their is too much block(>32*1024)
"""
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
n_in = 1000
batch_size = 4097
n_out = 1250
mode = get_mode(use_gpu)
if theano.config.mode!="DEBUG_MODE":
n_in = 4098
n_out = 4099
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
x = T.fmatrix('x')
y = T.lvector('y')
if 0:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
xval = my_rand(n_batch, n_in)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
b = T.fvector()
W = T.fmatrix()
t0 = time.time()
rval = []
for i in xrange(n_train):
rval.append(train(xval, yval, lr))
dt = time.time() - t0
print_mode(mode)
return numpy.asarray(rval), dt
p_y_given_x = T.nnet.softmax(T.dot(x,W)+b)
y_pred = T.argmax(p_y_given_x)
loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
dW = T.grad(loss,W)
classify = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
mode = mode_without_gpu)
classify_gpu = theano.function( inputs = [x,y,b,W], outputs = [loss,y_pred,dW],
mode = mode_with_gpu)
xx = numpy.asarray(numpy.random.rand(batch_size,n_in),dtype=numpy.float32)
yy = numpy.ones((batch_size,),dtype='float32')
b_values = numpy.zeros((n_out,),dtype='float32')
W_values = numpy.asarray(numpy.random.rand(n_in,n_out),dtype='float32')
def test_run_nnet():
for n_in in 1024, 2048, 4096:
for n_hid in 1024, 2048, 4096:
numpy.random.seed(23456)
rval_cpu, tc = run_nnet(False, n_in=n_in, n_hid=n_hid)
numpy.random.seed(23456)
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
print "max abs diff:", numpy.max(numpy.absolute(rval_gpu-rval_cpu))
print "time cpu: %f, time gpu: %f, speed up %f"%(tc, tg, tc/tg)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def test_run_nnet_med():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
numpy.random.seed(23456)
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
n_batch = 16
n_kern = 20
shape_img = (n_batch, 1, 32, 32)
shape_kern = (n_kern, 1, 5, 5)
n_train=10
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(shape_img[2:],shape_kern[2:], 'valid')
n_hid = n_kern * logical_hid_shape[0] * logical_hid_shape[1]
n_out = 10
w = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w')
b = shared_fn(my_zeros((n_kern,)), 'b')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op.set_flops()
hid = tensor.tanh(conv_op(x, w)+b.dimshuffle((0,'x','x')))
hid_flat = hid.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w, b, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch, n_out)
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print 'training done'
print_mode(mode)
return rval
def test_conv_nnet1():
numpy.random.seed(23456)
rval_cpu = run_conv_nnet1(False)
numpy.random.seed(23456)
rval_gpu = run_conv_nnet1(True)
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-6)
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
#cumulativ rounding error affect this comparaison of result. So we lower the tolerance.
#TODO: why the last two example see the error lower? We are converging?
#n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9
#n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06
#n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05
#n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963
#n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05
#n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06
#n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05
#n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534
n_batch = 60
shape_img = (n_batch, 1, 32, 32)
n_kern = 20
shape_kern = (n_kern, 1, 5, 5)
n_kern1 = 10
shape_kern1 = (n_kern1, n_kern, 5, 5)
n_train=30
if config.mode=='DEBUG_MODE': n_train=1
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d(tuple(shape_img[2:]),tuple(shape_kern[2:]), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2, logical_hid_shape[1]/2), tuple(shape_kern1[2:]), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(my_zeros((n_hid, n_out)), 'c')
c = shared_fn(my_zeros(n_out), 'c')
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
conv_op.set_flops()
conv_op1.set_flops()
hid = tensor.tanh(conv_op(x, w0)+b0.dimshuffle((0,'x','x')))
hid1 = tensor.tanh(conv_op1(hid[:,:,::2,::2], w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.tanh(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(0.5 * (out-y)**2 * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
# for i, n in enumerate(train.maker.env.toposort()):
# print i, n
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)#int32 make all 0...
lr = theano._asarray(0.01, dtype='float32')
for i in xrange(n_train):
rval = train(xval, yval, lr)
print_mode(mode)
return rval
def test_conv_nnet2():
numpy.random.seed(23456)
rval_gpu = run_conv_nnet2(True)
if True:
numpy.random.seed(23456)
rval_cpu = run_conv_nnet2(False)
print rval_cpu[0], rval_gpu[0],rval_cpu[0]-rval_gpu[0]
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-4,atol=1e-4)
def run_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, n_train,
downsample_ops=True, verbose=0, version=-1):
if use_gpu:
shared_fn = tcn.shared_constructor
else:
shared_fn = shared
isize1=isize
isize2=isize
if isinstance(isize,(tuple,)):
isize1=isize[0]
isize2=isize[1]
shape_img = (n_batch, 1, isize1, isize2)
n_kern = 20 # 6 were used in LeNet5
shape_kern = (n_kern, 1, ksize, ksize)
n_kern1 = 30 # 16 were used in LeNet5
shape_kern1 = (n_kern1, n_kern, ksize, ksize)
logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d((isize1, isize2), (ksize, ksize), 'valid')
logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d((logical_hid_shape[0]/2,
logical_hid_shape[1]/2), (ksize, ksize), 'valid')
n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1]
n_out = 10
w0 = shared_fn(0.01*(my_rand(*shape_kern)-0.5), 'w0')
b0 = shared_fn(my_zeros((n_kern,)), 'b0')
w1 = shared_fn(0.01*(my_rand(*shape_kern1)-0.5), 'w1')
b1 = shared_fn(my_zeros((n_kern1,)), 'b1')
v = shared_fn(0.01*my_randn(n_hid, n_out), 'v')
c = shared_fn(my_zeros(n_out), 'c')
print 'ALLOCATING ARCH: w0 shape', w0.value.shape
print 'ALLOCATING ARCH: w1 shape', w1.value.shape
print 'ALLOCATING ARCH: v shape', v.value.shape
x = tensor.Tensor(dtype='float32', broadcastable=(0,1,0,0))('x')
y = tensor.fmatrix('y')
lr = tensor.fscalar('lr')
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern,
n_batch, 1, 1, verbose=verbose, version=version)
conv_op1 = conv.ConvOp(
(n_kern,logical_hid_shape[0]/2, logical_hid_shape[1]/2),
shape_kern1[2:], n_kern1, n_batch, 1, 1,verbose=verbose, version=version)
conv_op.set_flops()
conv_op1.set_flops()
ds_op = downsample.DownsampleFactorMax((2,2), ignore_border=False)
if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0)+b0.dimshuffle((0,'x','x'))))
else:
hid = tensor.tanh((conv_op(x, w0)+b0.dimshuffle((0,'x','x')))[:,:,::2,::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0,'x','x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v)+c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax(y, axis=1)) * lr)
print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
gparams = tensor.grad(loss, params, warn_type=True)
mode = get_mode(use_gpu)
print 'building pfunc ...'
train = pfunc([x,y,lr], [loss], mode=mode, updates=[(p, p-g) for p,g in zip(params, gparams)])
assert any([isinstance(node.op,T.nnet.CrossentropySoftmaxArgmax1HotWithBias) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,T.nnet.CrossentropySoftmax1HotWithBiasDx) for node in classify.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmaxArgmax1HotWithBias) for node in classify_gpu.maker.env.toposort()])
assert any([isinstance(node.op,cuda.nnet.GpuCrossentropySoftmax1HotWithBiasDx) for node in classify_gpu.maker.env.toposort()])
if False:
for i, n in enumerate(train.maker.env.toposort()):
print i, n
out=classify(xx,yy,b_values,W_values)
gout=classify_gpu(xx,yy,b_values,W_values)
xval = my_rand(*shape_img)
yval = my_rand(n_batch,n_out)
lr = theano._asarray(0.01, dtype='float32')
assert numpy.allclose(out[0],gout[0])
assert numpy.allclose(out[1],gout[1])
assert numpy.allclose(out[2],gout[2],atol=2e-6)
rvals=my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
return rvals, t1-t0, mode
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
ignore_error=False,
n_train=10,
gpu_only=False,
cpu_only=False,
float_atol=1e-06,
check_isfinite=True,
pickle=False,
verbose=0,
version=-1):
"""
float_atol: None mean use the default value.
check_isfinite: the debug mode option. We forward this value to debug mode.
For some parameter CrossentropyCategorical1Hot op generate inf when not optimized.
def test_softmax_with_bias():
"""
if config.mode=='DEBUG_MODE': n_train=1
This is basic test for GpuSoftmaxWithBias
numpy.random.seed(seed)
import theano.tensor.basic
import theano.compile.debugmode
from theano.compile.mode import predefined_modes
orig_float32_atol = theano.tensor.basic.float32_atol
orig_check_isfinite = predefined_modes["DEBUG_MODE"].check_isfinite
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
if gpu_only:
tcn.use()
if float_atol:
print "float_atol",float_atol
theano.tensor.basic.float32_atol=float_atol
if not cpu_only:
rval_gpu, tg, gpu_mode = run_conv_nnet2_classif(True,
isize, ksize, bsize, n_train, verbose=verbose, version=version)
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if gpu_only:
print "time gpu: %.3f"%(tg)
return
try:
predefined_modes["DEBUG_MODE"].check_isfinite = check_isfinite
numpy.random.seed(seed)
rval_cpu, tc, cpu_mode = run_conv_nnet2_classif(False, isize, ksize, bsize, n_train,
verbose=verbose, version=version)
if pickle and isinstance(cpu_mode,(theano.compile.ProfileMode,)):
import pickle
print "BEGIN GPU profile mode dump"
#print pickle.dumps(gpu_mode)
print "END GPU profile mode dump"
print "BEGIN CPU profile mode dump"
print pickle.dumps(cpu_mode)
print "END CPU profile mode dump"
finally:
predefined_modes["DEBUG_MODE"].check_isfinite = orig_check_isfinite
theano.tensor.basic.float32_atol=orig_float32_atol
if not cpu_only:
if verbose or not numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol):
print "cpu:", rval_cpu
print "gpu:", rval_gpu
print "abs diff:", numpy.absolute(rval_gpu-rval_cpu)
print "time cpu: %.3f, time gpu: %.3f, speed up %f"%(tc, tg, tc/tg)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
print "estimated time for one pass through MNIST with gpu: %f" % (tg * (60000.0 / (n_train*bsize)))
else:
print "time cpu: %.3f"%(tc)
print "estimated time for one pass through MNIST with cpu: %f" % (tc * (60000.0 / (n_train*bsize)))
We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
if not ignore_error and not cpu_only and not gpu_only:
assert numpy.allclose(rval_cpu, rval_gpu,rtol=1e-3,atol=float_atol)
#we need to test n>32*1024 to check that we make the block loop.
n,m=2<<15,5
gpu_only=False
cpu_only=False
ignore_error=False
verbose=0
version=-1
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
def test_lenet_28(): #MNIST
cmp_run_conv_nnet2_classif(23485, 28, 5, 60, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose, version=version)
z = T.nnet.softmax_with_bias(x, T.zeros_like(x[0,:]))
def test_lenet_32(): #CIFAR10 / Shapeset
cmp_run_conv_nnet2_classif(23485, 32, 5, 60, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
verbose=verbose, version=version)
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax_with_bias
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmaxWithBias)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
def test_lenet_32_long(): #CIFAR10 / Shapeset
# this tests the gradient of downsample on the GPU,
# which does not recieve specific testing
cmp_run_conv_nnet2_classif(23485, 32, 5, 30, n_train=50,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose, version=version)
def test_softmax():
"""
This is basic test for GpuSoftmax
def test_lenet_64(): # ???
#float_atol need to pass in debug mode
#needed as cpu use extended precision and gpu don't
cmp_run_conv_nnet2_classif(23485, 64, 7, 10, n_train=10,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
float_atol=5e-4, check_isfinite=True, version=version)
We check that we loop when their is too much block
TODO: check that we loop when their is too much thread.(THIS IS NOT IMPLEMENTED)
"""
x = T.fmatrix('x')
def test_lenet_108(): # NORB
cmp_run_conv_nnet2_classif(23485, 108, 7, 5, n_train=4,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version, float_atol=7e-2)
#we need to test n>32*1024 to check that we make the block loop.
n,m=2<<15,5
def test_lenet_256(): # ImageNet
cmp_run_conv_nnet2_classif(23485, 256, 9, 2, n_train=5,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
data = numpy.arange(n*m, dtype='float32').reshape(n,m)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def tes_lenet_hd(): #HD 720p: 1280(wid)x720(len)
cmp_run_conv_nnet2_classif(23485, (720,1280), 9, 2, n_train=3,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
z = T.nnet.softmax(x)
#I did a wanted error in the name as we don't want it to execute automatically for now as it don't work
def tes_lenet_full_hd(): #HD 1080p: 1920(wid)x1080(len)
cmp_run_conv_nnet2_classif(23485, (1080,1920), 9, 2, n_train=3,
ignore_error=ignore_error, gpu_only=gpu_only,
cpu_only=cpu_only, verbose=verbose,
check_isfinite=True, version=version)
f = theano.function([x],z, mode=mode_without_gpu)
f_gpu = theano.function([x],z, mode=mode_with_gpu)
assert f.maker.env.toposort()[-1].op==T.nnet.softmax
assert isinstance(f_gpu.maker.env.toposort()[-2].op,cuda.nnet.GpuSoftmax)
out=f(data)
gout=f_gpu(data)
assert numpy.allclose(out,gout),numpy.absolute(out-gout)
......@@ -254,7 +254,9 @@ class CudaNdarrayType(Type):
return ret
def c_libraries(self):
return ['cudart']
# returning cublas because the cuda_ndarray.cuh header includes calls to SetVector and
# cublasGetError
return ['cudart', 'cublas']
def c_support_code(cls):
return ""
......
......@@ -4,7 +4,7 @@ import theano.tensor as T
from theano.tensor.opt import register_specialize
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available
from theano.sandbox.cuda import cuda_available, cuda_enabled
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
......@@ -109,12 +109,11 @@ class GpuMultinomial(Multinomial):
raise TypeError('pvals must be cudandarray', pvals)
if not isinstance(unis.type, CudaNdarrayType):
raise TypeError('unis must be cudandarray', unis)
return Apply(self, [pvals, unis], [pvals.type()])
def c_code_cache_version(self):
#return ()
return (super(GpuMultinomial,self).c_code_cache_version(),1)
return ()
#return (super(GpuMultinomial,self).c_code_cache_version(),1)
def c_support_code_apply(self, node, nodename):
return """
......@@ -128,7 +127,7 @@ class GpuMultinomial(Multinomial):
float * global_outs
)
{
int n = 32*blockIdx.x + threadIdx.x;
int n = blockDim.x*blockIdx.x + threadIdx.x;
if (n < nb_multi)
{
......@@ -201,14 +200,31 @@ class GpuMultinomial(Multinomial):
int nb_outcomes = CudaNdarray_HOST_DIMS(%(z)s)[0];
int nb_multi = CudaNdarray_HOST_DIMS(%(z)s)[1];
int nb_block;
if (nb_multi %% 32 == 0)
nb_block = nb_multi/32;
else
nb_block = (int)((float)nb_multi/32. + 1.);
//TODO : change this for a beautiful constant
int max_nb_blocks = 2<<15 - 1;
int nb_blocks = max_nb_blocks + 1;
int nb_threads=16; // so it really starts at 32, because of the *2
do
{
nb_threads*=2;
if (nb_multi %% nb_threads == 0)
nb_blocks = nb_multi/nb_threads;
else
nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
} while (nb_blocks > max_nb_blocks);
//printf("\\nN=%%i b=%%i t=%%i t*b=%%i", nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
// TODO : next line is a bit hardcoded...
if (nb_threads > 512)
{
PyErr_Format(PyExc_ValueError, "Mutinomial is not implemented for as many rows in the matrix (%%i)", nb_multi);
%(fail)s;
}
dim3 n_blocks(nb_block,1,1);
dim3 n_threads(32,1,1);
dim3 n_blocks(nb_blocks,1,1);
dim3 n_threads(nb_threads,1,1);
int n_shared = 0;
k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
......@@ -244,6 +260,6 @@ gpu_multinomial = GpuMultinomial()
def use_gpu_multinomial(node):
if node.op == multinomial:
return [host_from_gpu(gpu_multinomial(*[gpu_from_host(i) for i in node.inputs]))]
if theano.config.device.startswith('gpu'):
if cuda_enabled:#theano.config.device.startswith('gpu'):
register_specialize(use_gpu_multinomial)
......@@ -685,7 +685,7 @@ class MRG_RandomStreams(object):
else:
raise NotImplementedError("MRG_RandomStreams.binomial with n > 1")
def multinomial(self, size=None, n=1, pvals=[[.5,.5]], ndim=None, dtype='int64'):
def multinomial(self, size=None, n=1, pvals=None, ndim=None, dtype='int64'):
"""
Sample `n` (currently `n` needs to be 1) times from a multinomial distribution defined by
probabilities pvals.
......@@ -696,13 +696,12 @@ class MRG_RandomStreams(object):
`size` and `ndim` are only there keep the same signature as other uniform, binomial, normal, etc.
todo : adapt multinomial to take that into account
"""
if pvals is None:
raise TypeError("You have to specify pvals")
pvals = as_tensor_variable(pvals)
if n == 1 and pvals.ndim == 2:
pvals = as_tensor_variable(pvals)
unis = self.uniform(size=pvals.shape[0:1], ndim=1)
return cast(multinomial(pvals.T, unis).T, dtype)
else:
raise NotImplementedError("MRG_RandomStreams.multinomial only implemented with n == 1 and pvals.ndim = 2")
......
......@@ -345,7 +345,7 @@ def test_uniform():
#print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s):'%str(size)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -403,7 +403,7 @@ def test_binomial():
print 'random?[-1,-10:]\n', out[-1,-10:]
basictest(f, steps, sample_size, prefix='mrg cpu', inputs=input, allow_01=True, target_avg = mean)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU with size=(%s) and mean(%d):'%(str(size),mean)
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -450,7 +450,7 @@ def test_normal0():
# now with odd number of samples
sample_size = (sample_size[0],sample_size[1]-1)
if mode!='FAST_COMPILE':
if mode!='FAST_COMPILE' and cuda_available:
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
......@@ -465,7 +465,7 @@ def test_normal0():
print 'random?[:10]\n', numpy.asarray(f())[0,0:10]
print '----'
sys.stdout.flush()
basictest(f, steps, sample_size_odd, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
basictest(f, steps, sample_size, target_avg=-5.0, target_std=2.0, prefix='gpu mrg ', allow_01=True)
print ''
......@@ -528,6 +528,7 @@ def test_multinomial():
print ''
print 'ON GPU:'
R = MRG_RandomStreams(234, use_cuda=True)
pvals = numpy.asarray(pvals, dtype='float32')
n = R.multinomial(pvals=pvals, dtype='float32')
assert n.dtype == 'float32' #well, it's really that this test w GPU doesn't make sense otw
f = theano.function([], theano.Out(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论