Merge pull request #5190 from gvtulder/f-batchnorm-abstract

Abstract Ops for batch normalization

Merge pull request #5190 from gvtulder/f-batchnorm-abstract
8b9f7336 · Frédéric Bastien · GitHub · 18f27c44 · 60238616 · 8b9f7336
--- a/doc/library/tensor/nnet/bn.txt
+++ b/doc/library/tensor/nnet/bn.txt
@@ -10,6 +10,9 @@
 .. moduleauthor:: LISA
-.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test>`. They must be added manually as they do not have the same user interface.
+.. autofunction:: theano.tensor.nnet.bn.batch_normalization_train
+.. autofunction:: theano.tensor.nnet.bn.batch_normalization_test
+.. seealso:: cuDNN batch normalization: :class:`theano.gpuarray.dnn.dnn_batch_normalization_train`, :class:`theano.gpuarray.dnn.dnn_batch_normalization_test`.
 .. autofunction:: theano.tensor.nnet.bn.batch_normalization
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
--- a/theano/gpuarray/dnn_batchnorm.c
+++ b/theano/gpuarray/dnn_batchnorm.c
@@ -2,8 +2,19 @@
 int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                     PyGpuArrayObject *bias, npy_float64 epsilon,
-                     PyGpuArrayObject **outp, PyGpuArrayObject **x_mean,
+                     npy_float64 running_average_factor,
-                     PyGpuArrayObject **x_invstd, cudnnHandle_t _handle) {
+#ifdef RUNNING_AVERAGES
+                     PyGpuArrayObject *in_running_mean,
+                     PyGpuArrayObject *in_running_var,
+#endif
+                     PyGpuArrayObject **outp,
+                     PyGpuArrayObject **x_mean,
+                     PyGpuArrayObject **x_invstd,
+#ifdef RUNNING_AVERAGES
+                     PyGpuArrayObject **out_running_mean,
+                     PyGpuArrayObject **out_running_var,
+#endif
+                     cudnnHandle_t _handle) {
  PyGpuContextObject *c = inp->context;
  if (c_set_tensorNd(inp, bn_input) != 0)
@@ -16,8 +27,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
    return 1;
  }
+#ifdef INPLACE_OUTPUT
+  Py_XDECREF(*outp);
+  *outp = inp;
+  Py_INCREF(*outp);
+#else
  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
+#endif
  if (theano_prep_output(x_mean, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
  if (theano_prep_output(x_invstd, scale->ga.nd, scale->ga.dimensions, scale->ga.typecode, GA_C_ORDER, c) != 0)
@@ -26,6 +43,31 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;
+#ifdef RUNNING_AVERAGES
+  // Select the running-statistics buffers handed to the batch-norm call
+  // below: either the input arrays themselves (INPLACE_* variants) or the
+  // result of theano_try_copy() on the previous output and the input.
+#ifdef INPLACE_RUNNING_MEAN
+  // Release the previously stored output object. The dereference matters:
+  // out_running_mean is the output slot (PyGpuArrayObject **), not a Python
+  // object, so it must never be passed to Py_XDECREF directly.
+  Py_XDECREF(*out_running_mean);
+  PyGpuArrayObject *running_mean = in_running_mean;
+  Py_INCREF(running_mean);
+#else
+  PyGpuArrayObject *running_mean = *out_running_mean;
+  running_mean = theano_try_copy(running_mean, in_running_mean);
+  if (running_mean == NULL) {
+    return 1;
+  }
+#endif
+#ifdef INPLACE_RUNNING_VAR
+  // Same as for the running mean: drop the old output object, not the slot.
+  Py_XDECREF(*out_running_var);
+  PyGpuArrayObject *running_var = in_running_var;
+  Py_INCREF(running_var);
+#else
+  PyGpuArrayObject *running_var = *out_running_var;
+  running_var = theano_try_copy(running_var, in_running_var);
+  if (running_var == NULL) {
+    return 1;
+  }
+#endif
+#endif
  {
    const float falpha = 1.;
    const float fbeta = 0.;
@@ -52,9 +94,15 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
      bn_params,
      PyGpuArray_DEV_DATA(scale),
      PyGpuArray_DEV_DATA(bias),
+#ifdef RUNNING_AVERAGES
+      running_average_factor,
+      PyGpuArray_DEV_DATA(running_mean),
+      PyGpuArray_DEV_DATA(running_var),
+#else
      0,
      NULL,  // running mean, deliberately unused
      NULL,  // running var, deliberately unused
+#endif
      epsilon,
      PyGpuArray_DEV_DATA(*x_mean),
      PyGpuArray_DEV_DATA(*x_invstd)
@@ -64,6 +112,10 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
                   cudnnGetErrorString(err));
      return 1;
    }
+#ifdef RUNNING_AVERAGES
+    *out_running_mean = running_mean;
+    *out_running_var = running_var;
+#endif
  }
  return 0;
 }
--- a/theano/gpuarray/dnn_batchnorm_inf.c
+++ b/theano/gpuarray/dnn_batchnorm_inf.c
@@ -16,8 +16,14 @@ int dnn_batchnorm_op(PyGpuArrayObject *inp, PyGpuArrayObject *scale,
    return 1;
  }
+#ifdef INPLACE_OUTPUT
+  Py_XDECREF(*outp);
+  *outp = inp;
+  Py_INCREF(*outp);
+#else
  if (theano_prep_output(outp, inp->ga.nd, inp->ga.dimensions, inp->ga.typecode, GA_C_ORDER, c) != 0)
    return 1;
+#endif
  if (c_set_tensorNd(*outp, bn_output) != 0)
    return 1;

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
--- a/theano/sandbox/cuda/__init__.py
+++ b/theano/sandbox/cuda/__init__.py
@@ -12,7 +12,7 @@ import warnings
 import theano
 from theano.compat import get_unbound_function
 from theano.compile import optdb
-from theano.gof import EquilibriumDB, SequenceDB
+from theano.gof import EquilibriumDB, SequenceDB, TopoOptimizer
 from theano.gof.cmodule import get_lib_extension
 from theano.gof.compilelock import get_lock, release_lock
 from theano import config
@@ -40,6 +40,17 @@ def register_opt(*tags, **kwargs):
    return f
+def register_inplace(*tags, **kwargs):
+    def f(local_opt):
+        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
+        optdb.register(
+            name, TopoOptimizer(
+                local_opt, failure_callback=TopoOptimizer.warn_inplace),
+            60, 'fast_run', 'inplace', 'gpu', *tags)
+        return local_opt
+    return f
 _logger_name = 'theano.sandbox.cuda'
 _logger = logging.getLogger(_logger_name)

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -3050,3 +3050,28 @@ conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
                       local_abstractconv3d_gradinputs_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
+# Register the cuDNN batch normalization implementations.
+# The three optimizers are collected in one LocalGroupDB. The group gets a
+# __name__ before it is handed to register_opt (presumably used there as the
+# registration name -- confirm against register_opt).
+abstract_batch_norm_groupopt = theano.gof.optdb.LocalGroupDB()
+abstract_batch_norm_groupopt.__name__ = "gpu_batchnorm_opts"
+register_opt('fast_compile')(abstract_batch_norm_groupopt)
+# NOTE(review): an earlier comment here stated that the cuDNN optimizations
+# are only registered if cuDNN is available, but the registration below is
+# unconditional; presumably the optimizers in .dnn check availability
+# themselves -- confirm.
+# These opts are imported here instead of at the top of this file
+# to avoid a circular dependency problem with dnn.
+from .dnn import (local_abstract_batch_norm_train_cudnn,
+                  local_abstract_batch_norm_train_grad_cudnn,
+                  local_abstract_batch_norm_inference_cudnn)     # noqa: E402
+# Each optimizer is registered at position 20 with the same set of tags.
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_dnn',
+                                      local_abstract_batch_norm_train_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_train_grad_dnn',
+                                      local_abstract_batch_norm_train_grad_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
+abstract_batch_norm_groupopt.register('local_abstract_batch_norm_inference_dnn',
+                                      local_abstract_batch_norm_inference_cudnn, 20,
+                                      'batchnorm_dnn',
+                                      'gpu', 'fast_compile', 'fast_run', 'cudnn')
--- a/theano/sandbox/cuda/tests/test_dnn.py
+++ b/theano/sandbox/cuda/tests/test_dnn.py
--- a/theano/tensor/nnet/bn.py
+++ b/theano/tensor/nnet/bn.py
--- a/theano/tensor/nnet/tests/test_bn.py
+++ b/theano/tensor/nnet/tests/test_bn.py