提交 ea927aef authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5474 from nouiz/abstract_bn

Follow up from abstract bn PR
......@@ -28,7 +28,6 @@ from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
assert_conv_shape)
from theano.tensor.signal.pool import (
Pool, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet import bn
from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable)
......@@ -87,10 +86,6 @@ def _make_handle(ctx):
return handle
def raise_no_cudnn(msg="cuDNN is required for convolution and pooling"):
    """Signal that cuDNN is unavailable by raising ``RuntimeError``.

    Parameters
    ----------
    msg : str
        The message attached to the raised exception.

    Raises
    ------
    RuntimeError
        Always raised, carrying `msg`.
    """
    error = RuntimeError(msg)
    raise error
def _dnn_check_compile():
preambule = """
#include <stdio.h>
......@@ -2733,7 +2728,7 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
return
if op.filter_flip:
conv_mode = 'conv'
......@@ -2776,7 +2771,7 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
inp2 = inputs[1]
if not dnn_available(inp1.type.context_name):
raise_no_cudnn()
return
if op.filter_flip:
conv_mode = 'conv'
......@@ -2902,7 +2897,7 @@ def local_dnn_convi_output_merge(node, *inputs):
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
img, ws, stride, pad = inputs
......@@ -2931,7 +2926,7 @@ pool_db2.register("local_gpua_pool_dnn_alternative",
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
inp, out, out_grad, ws, stride, pad = inputs
......@@ -2975,7 +2970,7 @@ pool_db2.register("local_gpua_pool_dnn_grad_stride",
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn()
return
if not op.ignore_border:
return
inp, out_grad, ws, stride, pad = inputs
......@@ -3018,7 +3013,7 @@ pool_db2.register("local_gpua_avg_pool_dnn_grad_stride",
def local_softmax_dnn(node):
if isinstance(node.op, GpuSoftmax):
if not dnn_available(node.outputs[0].type.context_name):
raise_no_cudnn()
return
ins = node.inputs[0].dimshuffle(0, 1, 'x', 'x')
ins = gpu_contiguous(ins)
out = GpuDnnSoftmax('accurate', 'channel')(ins)
......@@ -3035,9 +3030,6 @@ def local_log_softmax_dnn(node):
node.inputs[0].owner and
isinstance(node.inputs[0].owner.op, GpuDnnSoftmax) and
len(node.inputs[0].clients) == 1):
if version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need cuDNN v3 for LogSoftmax")
softmax_node = node.inputs[0].owner
new_softmax = GpuDnnSoftmax('log', softmax_node.op.mode)
return [new_softmax(softmax_node.inputs[0])]
......@@ -3051,9 +3043,8 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
inp = inputs[0]
if inp.ndim != 2:
return
if not dnn_available(ctx_name) or version(raises=False) < 3000:
# No log-softmax before cudnn v3
raise_no_cudnn("Need cuDNN v3 for LogSoftmax")
if not dnn_available(ctx_name):
return
inp = inp.dimshuffle(0, 1, 'x', 'x')
inp.tag.context_name = ctx_name
......@@ -3087,7 +3078,7 @@ gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
if not dnn_available(ctx_name):
raise_no_cudnn("cuDNN needed for SoftmaxGrad")
return
ins = []
for n in inputs:
n = as_gpuarray_variable(n, ctx_name)
......@@ -3100,9 +3091,6 @@ def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 2)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrain])
@register_opt2([bn.AbstractBatchNormTrain], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None
......@@ -3130,8 +3118,7 @@ def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
......@@ -3186,9 +3173,6 @@ def local_batch_norm_inference_inplace(node):
return [GpuDnnBatchNormInference(mode=node.op.mode, inplace=True)(*node.inputs)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormTrainGrad])
@register_opt2([bn.AbstractBatchNormTrainGrad], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
x, dy, scale, x_mean, x_invstd, epsilon = inputs
......@@ -3234,8 +3218,7 @@ def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
dy = as_gpuarray_variable(dy, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
......@@ -3257,9 +3240,6 @@ def local_abstract_batch_norm_train_grad_cudnn(op, ctx_name, inputs, outputs):
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
@register_opt('cudnn', 'fast_compile')
@op_lifter([bn.AbstractBatchNormInference])
@register_opt2([bn.AbstractBatchNormInference], 'cudnn', 'fast_compile')
def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, estimated_mean, estimated_variance, epsilon = inputs
......@@ -3280,8 +3260,7 @@ def local_abstract_batch_norm_inference_cudnn(op, ctx_name, inputs, outputs):
ctx = infer_context_name(*inputs)
if not dnn_available(ctx):
# TODO should this raise_no_cudnn?
return None
return
x = as_gpuarray_variable(x, context_name=ctx)
scale = as_gpuarray_variable(scale, context_name=ctx)
bias = as_gpuarray_variable(bias, context_name=ctx)
......
......@@ -22,6 +22,7 @@ from theano.scalar.basic import Scalar, Pow, Cast
from theano.scalar.basic_scipy import Erfinv, Erfcinv
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.nnet import bn
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
......@@ -1964,9 +1965,8 @@ abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpuarray_abstractconv_opts"
register_opt('fast_compile')(abstractconv_groupopt)
# cuDNN is first, but only registered if cuDNN is available.
# (we import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn)
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstractconv_cudnn, local_abstractconv_gw_cudnn,
local_abstractconv_gi_cudnn) # noqa: 402
abstractconv_groupopt.register('local_abstractconv_dnn',
......@@ -2005,3 +2005,56 @@ abstractconv_groupopt.register('local_abstractconv3d_gradinputs',
local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm',
'gpuarray', 'fast_compile', 'fast_run')
# Register cuDNN batch normalization implementation
# We import these opts here instead of at the top of this file
# to avoid a circular dependency problem with dnn
from .dnn import (local_abstract_batch_norm_train_cudnn,
local_abstract_batch_norm_train_grad_cudnn,
local_abstract_batch_norm_inference_cudnn) # noqa: 402
abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
abstract_batch_norm_groupopt.__name__ = "gpuarray_batchnorm_opts"
register_opt('fast_compile')(abstract_batch_norm_groupopt)
abstract_batch_norm_db = LocalGroupDB()
abstract_batch_norm_db2 = LocalGroupDB(
local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
abstract_batch_norm_db2.__name__ = "abstract_batch_norm_db2"
register_opt('fast_compile', name='abstract_batch_norm_db')(
abstract_batch_norm_db)
register_opt2([bn.AbstractBatchNormTrain,
bn.AbstractBatchNormTrainGrad,
bn.AbstractBatchNormInference],
'fast_compile', name='abstract_batch_norm_db2')(
abstract_batch_norm_db2)
for op, fct, cpu in [(bn.AbstractBatchNormTrain,
local_abstract_batch_norm_train_cudnn,
bn.local_abstract_batch_norm_train),
(bn.AbstractBatchNormTrainGrad,
local_abstract_batch_norm_train_grad_cudnn,
bn.local_abstract_batch_norm_train_grad),
(bn.AbstractBatchNormInference,
local_abstract_batch_norm_inference_cudnn,
bn.local_abstract_batch_norm_inference)]:
lifter = op_lifter([op])(fct)
abstract_batch_norm_db.register(fct.__name__,
lifter,
'gpuarray', 'fast_compile', 'fast_run',
'cudnn', 'batchnorm_dnn',
position=1)
abstract_batch_norm_db2.register(fct.__name__,
local_optimizer([op])(fct),
'gpuarray', 'fast_compile', 'fast_run',
'cudnn', 'batchnorm_dnn',
position=1)
# cpu is a normal optimization. We can't register it in
# GraphToGPU. So for now, only add it to the slower EQ phase. If
# there is no cuDNN, we still want to move it to the GPU now with
# a Theano graph so to have this graph on the GPU.
abstract_batch_norm_db.register(cpu.__name__, cpu,
'gpuarray', 'fast_compile', 'fast_run',
position='last')
......@@ -22,6 +22,7 @@ if theano.config.mode == 'FAST_COMPILE':
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
mode_without_gpu.check_py_code = False
# If using float16, cast reference input to float32
......
......@@ -26,6 +26,10 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
mode_with_gpu = mode_with_gpu.including()
# Globally disabled for mode_without_gpu
mode_with_gpu.check_py_code = False
# If using float16, set CUDNN precision to float32
def set_precision(floatX):
......@@ -1451,7 +1455,7 @@ def test_dnn_batchnorm_train():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((5, 10, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (2, 3, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......@@ -1505,7 +1509,7 @@ def test_dnn_batchnorm_train_without_running_averages():
bn.batch_normalization_train(x, scale, bias, 'per-activation')
# backward pass
grads_gpu = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_gpu: dy})
grads_abstract = T.grad(None, wrt=[x, scale, bias], known_grads={out_abstract: dy})
# compile
f_gpu = theano.function([x, scale, bias, dy],
[out_gpu, x_mean_gpu, x_invstd_gpu] +
......@@ -1533,6 +1537,44 @@ def test_dnn_batchnorm_train_without_running_averages():
f_abstract(X, Scale, Bias, Dy)
def test_without_dnn_batchnorm_train_without_running_averages():
    # Compile and run batch_normalization_train without running averages,
    # with cuDNN explicitly disabled, and make sure the graph still ends
    # up on the GPU (via plain GPU elemwise ops instead of cuDNN ops).
    utt.seed_rng()
    x, scale, bias, dy = (T.tensor4('x'), T.tensor4('scale'),
                          T.tensor4('bias'), T.tensor4('dy'))
    data_shape = (5, 10, 30, 25)
    param_shape = (1, 10, 30, 25)

    # forward pass
    out_abstract, x_mean_abstract, x_invstd_abstract = \
        bn.batch_normalization_train(x, scale, bias, 'per-activation')

    # backward pass
    grads_abstract = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out_abstract: dy})

    # compile, excluding the cudnn optimizations
    f_abstract = theano.function(
        [x, scale, bias, dy],
        [out_abstract, x_mean_abstract, x_invstd_abstract] + grads_abstract,
        mode=mode_with_gpu.excluding('cudnn'))

    # check that no cuDNN nor abstract batch-norm Ops remain, and that the
    # computation was still moved to the GPU.
    topo = f_abstract.maker.fgraph.toposort()
    assert not any(isinstance(n.op, dnn.GpuDnnBatchNorm) for n in topo)
    assert not any(isinstance(n.op, dnn.GpuDnnBatchNormGrad) for n in topo)
    abstract_ops = (bn.AbstractBatchNormTrain,
                    bn.AbstractBatchNormInference,
                    bn.AbstractBatchNormTrainGrad)
    assert not any(isinstance(n.op, abstract_ops) for n in topo)
    assert any(isinstance(n.op, dnn.GpuElemwise) for n in topo)

    # run
    X = 4 + 3 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Dy = -1 + 2 * np.random.randn(*data_shape).astype(theano.config.floatX)
    Scale = np.random.randn(*param_shape).astype(theano.config.floatX)
    Bias = np.random.randn(*param_shape).astype(theano.config.floatX)
    f_abstract(X, Scale, Bias, Dy)
def test_dnn_batchnorm_train_inplace():
# test inplace_running_mean and inplace_running_var
if not dnn.dnn_available(test_ctx_name):
......@@ -1628,7 +1670,7 @@ def test_batchnorm_inference():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((10, 2, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
......
......@@ -3705,6 +3705,19 @@ def local_gpu_batch_norm_inference_inplace(node):
inplace=True)(*node.inputs)]
def values_eq_approx_high_tol(a, b):
    """
    Approximate-equality check with a raised absolute tolerance.

    This is needed so that DebugMode does not raise spurious errors
    caused by rounding error. This happens because we reduce over the
    two last dimensions, so the absolute error can grow when the number
    of elements reduced over is significant.
    """
    return tensor.TensorType.values_eq_approx(a, b, atol=0.015)
@local_optimizer([bn.AbstractBatchNormTrainGrad])
def local_abstract_batch_norm_train_grad_cudnn(node):
if not isinstance(node.op, bn.AbstractBatchNormTrainGrad):
......@@ -3781,6 +3794,9 @@ def local_abstract_batch_norm_train_grad_cudnn(node):
if isinstance(node.outputs[2].type, tensor.TensorType):
g_wrt_bias = tensor.as_tensor_variable(g_wrt_bias)
# TODO copy_stack_trace?
g_wrt_inputs.tag.values_eq_approx = values_eq_approx_high_tol
g_wrt_scale.tag.values_eq_approx = values_eq_approx_high_tol
return [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
......
......@@ -38,6 +38,8 @@ if theano.config.mode == 'FAST_COMPILE':
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
mode_with_gpu.check_py_code = False
mode_without_gpu.check_py_code = False
def test_dnn_conv_desc_merge():
......@@ -732,7 +734,7 @@ def test_batchnorm_train():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
tensor6 = T.TensorType('float32', (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
......@@ -766,7 +768,7 @@ def test_batchnorm_train():
x_invstd_ref = T.inv(T.sqrt(x_var_ref + eps))
scale_ref = T.addbroadcast(scale, *axes)
bias_ref = T.addbroadcast(bias, *axes)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX)
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), 'float32')
out_ref = (x - x_mean_ref) * (scale_ref * x_invstd_ref) + bias_ref
out_running_mean_ref = running_mean * (1 - running_average_factor) + \
x_mean_ref * running_average_factor
......@@ -801,16 +803,16 @@ def test_batchnorm_train():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((5, 2, 30, 4, 10, 5), (4, 3, 1, 1, 1, 1), (2, 3, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Running_var = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Running_mean = numpy.random.randn(*param_shape).astype('float32')
Running_var = numpy.random.randn(*param_shape).astype('float32')
outputs_gpu = f_gpu(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Running_mean, Running_var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Running_mean, Running_var, Dy)
......@@ -844,7 +846,7 @@ def test_dnn_batchnorm_train_without_running_averages():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias, dy = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias'), T.tensor4('dy')
x, scale, bias, dy = T.ftensor4('x'), T.ftensor4('scale'), T.ftensor4('bias'), T.ftensor4('dy')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
......@@ -875,10 +877,10 @@ def test_dnn_batchnorm_train_without_running_averages():
bn.AbstractBatchNormTrainGrad))
for n in f_abstract.maker.fgraph.toposort()])
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
f_gpu(X, Scale, Bias, Dy)
f_abstract(X, Scale, Bias, Dy)
......@@ -891,14 +893,14 @@ def test_dnn_batchnorm_train_inplace():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias = T.tensor4('x'), T.tensor4('scale'), T.tensor4('bias')
x, scale, bias = T.ftensor4('x'), T.ftensor4('scale'), T.ftensor4('bias')
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
running_mean = shared(
numpy.random.randn(*param_shape).astype(theano.config.floatX),
numpy.random.randn(*param_shape).astype('float32'),
broadcastable=(True, False, False, False))
running_var = shared(
numpy.random.randn(*param_shape).astype(theano.config.floatX),
numpy.random.randn(*param_shape).astype('float32'),
broadcastable=(True, False, False, False))
# forward pass
......@@ -923,9 +925,9 @@ def test_dnn_batchnorm_train_inplace():
assert nodes[0].op.inplace_running_var
assert nodes[0].op.inplace_output
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
f(X, Scale, Bias)
......@@ -936,10 +938,10 @@ def test_batchnorm_inference():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
tensor6 = T.TensorType(theano.config.floatX, (False,) * 6)
tensor6 = T.TensorType('float32', (False,) * 6)
for mode in ('per-activation', 'spatial'):
for vartype in (tensor6, T.tensor5, T.tensor4, T.tensor3, T.matrix, T.vector):
for vartype in (tensor6, T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix, T.fvector):
x, scale, bias, mean, var = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var'))
ndim = x.ndim
......@@ -980,16 +982,16 @@ def test_batchnorm_inference():
bn.AbstractBatchNormTrainGrad)) for n
in f_abstract.maker.fgraph.toposort()])
# run
for data_shape in ((10, 20, 30, 40, 10, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
for data_shape in ((10, 2, 15, 4, 6, 5), (4, 3, 1, 1, 1, 1), (1, 1, 5, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes else s
for d, s in enumerate(data_shape))
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Dy = -1 + 2 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
outputs_gpu = f_gpu(X, Scale, Bias, Mean, Var, Dy)
outputs_abstract = f_abstract(X, Scale, Bias, Mean, Var, Dy)
outputs_ref = f_ref(X, Scale, Bias, Mean, Var, Dy)
......@@ -1017,7 +1019,7 @@ def test_batchnorm_inference_inplace():
raise SkipTest("batch normalization requires cudnn v5+")
utt.seed_rng()
x, scale, bias, mean, var = (T.tensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
x, scale, bias, mean, var = (T.ftensor4(n) for n in ('x', 'scale', 'bias', 'mean', 'var'))
data_shape = (5, 10, 30, 25)
param_shape = (1, 10, 30, 25)
......@@ -1031,11 +1033,11 @@ def test_batchnorm_inference_inplace():
assert nodes[0].op.inplace
# run
X = 4 + 3 * numpy.random.randn(*data_shape).astype(theano.config.floatX)
Scale = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Bias = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Mean = numpy.random.randn(*param_shape).astype(theano.config.floatX)
Var = numpy.random.rand(*param_shape).astype(theano.config.floatX)
X = 4 + 3 * numpy.random.randn(*data_shape).astype('float32')
Scale = numpy.random.randn(*param_shape).astype('float32')
Bias = numpy.random.randn(*param_shape).astype('float32')
Mean = numpy.random.randn(*param_shape).astype('float32')
Var = numpy.random.rand(*param_shape).astype('float32')
f(X, Scale, Bias, Mean, Var)
......@@ -1045,7 +1047,7 @@ def test_dnn_batchnorm_valid_and_invalid_axes():
if cuda.dnn.version() < (5000, 5000):
raise SkipTest("batch normalization requires cudnn v5+")
for vartype in (T.tensor5, T.tensor4, T.tensor3, T.matrix):
for vartype in (T.ftensor5, T.ftensor4, T.ftensor3, T.fmatrix):
x, scale, bias, mean, var, dy = (vartype(n)
for n in ('x', 'scale', 'bias', 'mean', 'var', 'dy'))
ndim = x.ndim
......
......@@ -88,6 +88,14 @@ def upcast(dtype, *dtypes):
return rval
def as_common_dtype(*variables):
    """
    Cast all given variables to their common (upcast) dtype.

    Works for both theano.scalar.Scalar and TensorVariable inputs
    (anything exposing a ``dtype`` attribute and an ``astype`` method).

    Parameters
    ----------
    *variables
        Variables whose dtypes are combined with :func:`upcast`.

    Returns
    -------
    generator
        The input variables, each cast to the common dtype, in the
        original order.
    """
    # Renamed from `vars` to avoid shadowing the builtin of that name;
    # callers are unaffected since *args cannot be passed by keyword.
    dtype = upcast(*[v.dtype for v in variables])
    return (v.astype(dtype) for v in variables)
def get_scalar_type(dtype):
"""
Return a Scalar(dtype) object.
......
......@@ -7,7 +7,7 @@ from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType
from theano.tensor import basic as T
from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite
from theano.scalar import Composite, as_common_dtype
from theano.scalar import add, sub, true_div, mul
......@@ -413,15 +413,27 @@ class AbstractBatchNormTrain(Op):
def make_node(self, x, scale, bias, epsilon=1e-4,
running_average_factor=0.1,
running_mean=None, running_var=None):
x = as_tensor_variable(x)
scale = as_tensor_variable(scale)
bias = as_tensor_variable(bias)
epsilon = as_tensor_variable(epsilon)
running_average_factor = as_tensor_variable(running_average_factor)
if running_mean is not None:
running_mean = as_tensor_variable(running_mean)
if running_var is not None:
running_var = as_tensor_variable(running_var)
assert x.ndim == scale.ndim == bias.ndim
assert ((running_mean is None and running_var is None) or
(running_mean is not None and running_var is not None))
assert (running_mean is None or running_mean.ndim == x.ndim)
assert (running_var is None or running_var.ndim == x.ndim)
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
if not isinstance(running_average_factor, theano.Variable):
running_average_factor = as_tensor_variable(running_average_factor)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon and running_average_factor)
if running_mean:
x, scale, bias, running_mean, running_var = as_common_dtype(
x, scale, bias, running_mean, running_var)
else:
x, scale, bias = as_common_dtype(x, scale, bias)
inputs = [x, scale, bias, epsilon, running_average_factor]
output_types = [x.type(), scale.type(), scale.type()]
if running_mean is not None and running_var is not None:
......@@ -513,9 +525,18 @@ class AbstractBatchNormInference(Op):
return [shape[0]]
def make_node(self, x, scale, bias, estimated_mean, estimated_variance, epsilon=1e-4):
x = as_tensor_variable(x)
scale = as_tensor_variable(scale)
bias = as_tensor_variable(bias)
estimated_mean = as_tensor_variable(estimated_mean)
estimated_variance = as_tensor_variable(estimated_variance)
epsilon = as_tensor_variable(epsilon)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon)
x, scale, bias, estimated_mean, estimated_variance = as_common_dtype(
x, scale, bias, estimated_mean, estimated_variance)
assert x.ndim == scale.ndim == bias.ndim == estimated_mean.ndim == estimated_variance.ndim
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
return Apply(self, [x, scale, bias, estimated_mean, estimated_variance, epsilon], [x.type()])
def grad(self, inputs, grads):
......@@ -561,9 +582,18 @@ class AbstractBatchNormTrainGrad(Op):
self.axes = axes
def make_node(self, x, dy, scale, x_mean, x_invstd, epsilon=1e-4):
x = as_tensor_variable(x)
dy = as_tensor_variable(dy)
scale = as_tensor_variable(scale)
x_mean = as_tensor_variable(x_mean)
x_invstd = as_tensor_variable(x_invstd)
epsilon = as_tensor_variable(epsilon)
# Upcast to common dtype on the non-scalar
# Keep as is dtype of scalar (epsilon)
x, dy, scale, x_mean, x_invstd = as_common_dtype(
x, dy, scale, x_mean, x_invstd)
assert x.ndim == dy.ndim == scale.ndim == x_mean.ndim == x_invstd.ndim
if not isinstance(epsilon, theano.Variable):
epsilon = as_tensor_variable(epsilon)
return Apply(self, [x, dy, scale, x_mean, x_invstd, epsilon],
[x.type(), scale.type(), scale.type()])
......@@ -612,6 +642,9 @@ def local_abstract_batch_norm_train(node):
mean = x.mean(axes, keepdims=True)
var = x.var(axes, keepdims=True)
# The epsilon should not upcast the dtype.
if var.dtype == 'float32' and epsilon.dtype == 'float64':
epsilon = epsilon.astype('float32')
invstd = T.inv(T.sqrt(var + epsilon))
out = (x - mean) * (scale * invstd) + bias
results = [out, mean, invstd]
......@@ -687,6 +720,10 @@ def local_abstract_batch_norm_inference(node):
not isinstance(epsilon.type, TensorType):
return None
# The epsilon should not upcast the dtype.
if estimated_variance.dtype == 'float32' and epsilon.dtype == 'float64':
epsilon = epsilon.astype('float32')
result = (x - estimated_mean) * (scale / T.sqrt(estimated_variance + epsilon)) + bias
result = T.patternbroadcast(result, node.outputs[0].broadcastable)
......
......@@ -201,7 +201,7 @@ def test_batch_normalization_train():
bn.AbstractBatchNormTrainGrad))
for n in f.maker.fgraph.toposort()])
# run
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (1, 1, 5, 5, 5)):
for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1), (2, 3, 5, 5, 5)):
data_shape = data_shape[:ndim]
param_shape = tuple(1 if d in axes2 else s
for d, s in enumerate(data_shape))
......
......@@ -203,9 +203,10 @@ class TensorType(Type):
"""
Convert a symbolic Variable into a TensorType, if compatible.
For the moment, only a TensorType or CudaNdarrayType will be
converted, provided they have the same number of dimensions,
broadcastable pattern, and dtype.
For the moment, only a TensorType, GpuArrayType and
CudaNdarrayType will be
converted, provided they have the same number of dimensions and
dtype and have "compatible" broadcastable pattern.
"""
if hasattr(other, '_as_TensorVariable'):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论