提交 5e5e5cc5 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #4413 from nouiz/harmdevries89-multinomial_newbackend

multinomial newbackend
...@@ -293,6 +293,9 @@ class GpuKernelBase(object): ...@@ -293,6 +293,9 @@ class GpuKernelBase(object):
# This is a shorthand for if your op only has a fixed version # This is a shorthand for if your op only has a fixed version
# You can reimplement it, but make sure to call kernel_version() # You can reimplement it, but make sure to call kernel_version()
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
v = self.c_code_cache_version()
if not v:
return ()
return (self.c_code_cache_version(), self.kernel_version(node)) return (self.c_code_cache_version(), self.kernel_version(node))
def kernel_version(self, node): def kernel_version(self, node):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
try: try:
...@@ -13,7 +13,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, ...@@ -13,7 +13,7 @@ from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt as register_gpu_opt, op_lifter
class GpuCumsum(GpuKernelBase): class GpuCumsum(GpuKernelBase, Op):
""" """
Parameters Parameters
---------- ----------
......
# TODO test dtype != float32
from __future__ import absolute_import, print_function, division
import os
try:
import pygpu
except ImportError:
pass
import theano
import theano.sandbox.multinomial
from theano import Apply, config
from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.sandbox import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter
from .type import GpuArrayType
class GPUAMultinomialFromUniform(gpuarray.basic_ops.GpuKernelBase, Op):
    """GPU (libgpuarray backend) one-hot multinomial sampler.

    For each row ``i`` of ``pvals`` (2D float32 probabilities) and the
    matching uniform draw ``unis[i]``, emits a one-hot row marking the
    first outcome whose cumulative probability exceeds ``unis[i]``.

    The result is written *transposed* — shape ``(n_outcomes, n_rows)``
    — for memory-access speed (see the kernel comment "write out
    transposed for speed"); the graph optimizer that introduces this op
    adds a GpuDimShuffle to restore the CPU op's orientation.

    Only float32 inputs/outputs are supported (asserted in make_node).
    """

    # The op's identity (for __eq__/__hash__) is its output dtype choice.
    __props__ = ("odtype",)

    def __init__(self, odtype):
        # odtype: either 'auto' (resolved to pvals.dtype in make_node)
        # or an explicit dtype string.
        Op.__init__(self)
        self.odtype = odtype

    def get_params(self, node):
        # Kernels run in the GPU context of the output variable.
        return node.outputs[0].type.context

    def c_headers(self):
        # numpy_compat.h comes from theano; gpuarray_helper.h ships next
        # to this module (see c_header_dirs).
        return ['<numpy_compat.h>', 'gpuarray_helper.h']

    def c_header_dirs(self):
        # Let the C compiler find gpuarray_helper.h in this package dir.
        return [os.path.dirname(__file__)]

    def make_node(self, pvals, unis):
        """Build the Apply node.

        Parameters
        ----------
        pvals : 2D float32 tensor — one probability row per draw.
        unis : 1D float32 tensor — one uniform sample per row of pvals.

        Returns
        -------
        Apply whose single output has the *transposed* shape
        ``(pvals.shape[1], pvals.shape[0])`` (note the swapped
        broadcastable pattern below).
        """
        assert pvals.dtype == 'float32'
        assert unis.dtype == 'float32'
        ctx_name = infer_context_name(pvals, unis)
        pvals = as_gpuarray_variable(pvals, ctx_name)
        unis = as_gpuarray_variable(unis, ctx_name)
        if pvals.ndim != 2:
            raise NotImplementedError('pvals ndim should be 2', pvals.ndim)
        if unis.ndim != 1:
            raise NotImplementedError('unis ndim should be 1', unis.ndim)
        if self.odtype == 'auto':
            odtype = pvals.dtype
        else:
            odtype = self.odtype
        # Kernel only handles float32 outputs.
        assert odtype == 'float32', odtype
        if odtype != pvals.dtype:
            raise NotImplementedError(
                'GpuMultinomialFromUniform works only if '
                'self.odtype == pvals.dtype', odtype, pvals.dtype)
        # Swapped broadcastable pattern: the output is transposed.
        br = (pvals.broadcastable[1], pvals.broadcastable[0])
        out = GpuArrayType(broadcastable=br,
                           dtype=odtype,
                           context_name=ctx_name)()
        return Apply(self, [pvals, unis], [out])

    def gpu_kernels(self, node, name):
        """Return the GPU kernel: one thread per multinomial row.

        Each thread walks its row's outcomes, accumulating probability
        until it passes the row's uniform sample, and writes a one-hot
        column into the transposed output.
        """
        code = """
KERNEL void k_multi_warp_multinomial(
        const ga_size nb_multi,
        const ga_size nb_outcomes,
        GLOBAL_MEM float * global_pvals,
        const ga_ssize pvals_row_stride,
        const ga_ssize pvals_col_stride,
        GLOBAL_MEM float * global_unis,
        const ga_ssize unis_stride,
        GLOBAL_MEM float * global_outs,
        const ga_ssize outs_row_stride,
        const ga_ssize outs_col_stride
)
{
    // each thread takes care of one multinomial draw
    int n = LDIM_0*GID_0 + LID_0;
    if (n < nb_multi)
    {
        float cummul = 0.;
        bool done = false;
        const float unis_n = global_unis[n*unis_stride];
        for (ga_size m = 0; m < nb_outcomes; ++m)
        {
            float current_out = 0.;
            if (!done)
            {
                cummul += global_pvals[m * pvals_col_stride +
                                       n * pvals_row_stride];
                if (unis_n < cummul)
                {
                    current_out = 1.;
                    done = true;
                }
            }
            //write out transposed for speed.
            global_outs[n * outs_col_stride +
                        m * outs_row_stride] = current_out;
        }
    }
}
"""
        # Param list must match the kernel signature above, in order.
        return [gpuarray.basic_ops.Kernel(
            code=code, name="k_multi_warp_multinomial",
            params=[pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.SIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.GpuArray,
                    pygpu.gpuarray.SSIZE,
                    pygpu.gpuarray.SSIZE],
            flags=gpuarray.basic_ops.Kernel.get_flags(node.outputs[0].dtype),
            objvar='k_multi_warp_multinomial_' + name)]

    def c_code(self, node, name, inp, outputs, sub):
        """Emit host C code: validate inputs, allocate the transposed
        output, pick a launch configuration, and call the kernel.

        NOTE(review): inside the template, ``2<<15 - 1`` parses as
        ``2 << (15 - 1)`` == 32768 in C ('-' binds tighter than '<<');
        the surrounding code suggests 65535 may have been intended —
        confirm upstream.
        NOTE(review): ``nb_multi % %nb_threads`` looks like a mangled
        ``%%`` escape (a literal '%' in this %%-formatted template) —
        confirm against the original source.
        """
        pvals, unis = inp
        out, = outputs
        fail = sub['fail']
        ctx = sub['params']
        # %(sync)d below: optionally block until the kernel finishes.
        sync = bool(config.gpuarray.sync)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        PyGpuArrayObject * pvals = %(pvals)s;
        PyGpuArrayObject * unis = %(unis)s;
        PyGpuArrayObject * out = %(out)s;
        size_t dims[2];
        if (PyGpuArray_NDIM(pvals) != 2)
        {
            PyErr_Format(PyExc_TypeError, "pvals wrong rank");
            %(fail)s
        }
        if (PyGpuArray_NDIM(unis) != 1)
        {
            PyErr_Format(PyExc_TypeError, "unis wrong rank");
            %(fail)s
        }
        if (PyGpuArray_DIMS(unis)[0] != PyGpuArray_DIMS(pvals)[0])
        {
            PyErr_Format(PyExc_ValueError, "unis.shape[0] != pvals.shape[0]");
            %(fail)s
        }
        dims[0] = PyGpuArray_DIMS(pvals)[1];
        dims[1] = PyGpuArray_DIMS(pvals)[0];
        if (theano_prep_output(&out, 2, dims, unis->ga.typecode,
                               GA_C_ORDER, %(ctx)s) != 0){
            %(fail)s
        }
        %(out)s = out;
        GpuArray_memset(&(out->ga), 0);
        { // NESTED SCOPE
        int nb_multi = PyGpuArray_DIMS(pvals)[0];
        int nb_outcomes = PyGpuArray_DIMS(pvals)[1];
        //TODO : change this for a beautiful constant
        int max_nb_blocks = 2<<15 - 1;
        size_t nb_blocks = max_nb_blocks + 1;
        size_t nb_threads=16; // so it really starts at 32, because of the *2
        do
        {
            nb_threads*=2;
            if (nb_multi % %nb_threads == 0)
                nb_blocks = nb_multi/nb_threads;
            else
                nb_blocks = (int)((float)nb_multi/(float)nb_threads + 1.);
        } while (nb_blocks > max_nb_blocks);
        //printf("\\nN=%%i b=%%i t=%%i t*b=%%i",
        //  nb_multi, nb_blocks, nb_threads, nb_blocks*nb_threads);
        // TODO : next line is a bit hardcoded...
        if (nb_threads > 512)
        {
            PyErr_Format(
                PyExc_ValueError,
                "Multinomial is not implemented for so many rows in the matrix (%%i)",
                nb_multi);
            %(fail)s
        }
        assert(nb_blocks*nb_threads >= nb_multi);
        void *args[10];
        ssize_t strides[5] = {
            PyGpuArray_STRIDES(pvals)[0]/sizeof(float),
            PyGpuArray_STRIDES(pvals)[1]/sizeof(float),
            PyGpuArray_STRIDES(unis)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[0]/sizeof(float),
            PyGpuArray_STRIDES(out)[1]/sizeof(float)
        };
        int err;
        args[0] = (void*)&PyGpuArray_DIMS(out)[1];
        args[1] = (void*)&PyGpuArray_DIMS(out)[0];
        args[2] = pvals->ga.data; //PyGpuArray_DEV_DATA(pvals);
        args[3] = (void*)&strides[0];
        args[4] = (void*)&strides[1];
        args[5] = unis->ga.data; //PyGpuArray_DEV_DATA(unis);
        args[6] = (void*)&strides[2];
        args[7] = out->ga.data; //PyGpuArray_DEV_DATA(out);
        args[8] = (void*)&strides[3];
        args[9] = (void*)&strides[4];
        err = GpuKernel_call(&%(kname)s, 1, &nb_threads, &nb_blocks, 0, args);
        if (err != GA_NO_ERROR) {
            PyErr_Format(
                PyExc_RuntimeError,
                "gpuarray error: %%s: %%s.\\n",
                "k_multi_warp_%(name)s",
                GpuKernel_error(&%(kname)s, err));
            %(fail)s;
        }
        if(%(sync)d)
            GpuArray_sync(&(out->ga));
        } // END NESTED SCOPE
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bump when the generated C code or the kernel changes.
        return (1,)
@register_opt()
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name):
    """Lift a single-sample float32 MultinomialFromUniform to the GPU.

    Returns the GPU replacement variable, or None when the node does
    not qualify (n_samples is not the constant 1, or any dtype differs
    from float32).
    """
    inputs = node.inputs
    if len(inputs) == 2:
        p, u = inputs
        n_samples = 1
    else:
        p, u, n_samples = inputs
    # Only the single-sample case has a GPU implementation.
    try:
        if get_scalar_constant_value(n_samples) != 1:
            return None
    except NotScalarConstantError:
        return None
    m, = node.outputs
    if not (p.dtype == u.dtype == m.dtype == 'float32'):
        return None
    # The GPU op emits its result transposed; dimshuffle it back so the
    # replacement matches the CPU op's output orientation.
    gpu_op = GPUAMultinomialFromUniform(node.op.odtype)
    transposed = gpu_op(p, u)
    return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(transposed)
...@@ -763,7 +763,7 @@ def local_gpua_gemm(node, context_name): ...@@ -763,7 +763,7 @@ def local_gpua_gemm(node, context_name):
@op_lifter([tensor.blas.BatchedDot]) @op_lifter([tensor.blas.BatchedDot])
def local_gpua_gemmbatch(node, context_name): def local_gpua_gemmbatch(node, context_name):
a, b = node.inputs a, b = node.inputs
c = tensor.AllocEmpty((a.shape[0], a.shape[1], b.shape[2])) c = tensor.AllocEmpty(a.dtype)(a.shape[0], a.shape[1], b.shape[2])
return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0) return gpugemmbatch_no_inplace(c, 1.0, a, b, 0.0)
......
from __future__ import absolute_import, print_function, division
import numpy
import theano
from theano import config, function, tensor
from ..multinomial import GPUAMultinomialFromUniform
import theano.tests.unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
def get_mode(gpu):
    """Return the compilation mode to test with: GPU mode when *gpu*
    is true, otherwise the plain CPU mode."""
    return mode_with_gpu if gpu else mode_without_gpu
def run_with_c(f, gpu=False):
    """Invoke test body *f* with the mode matching *gpu* and the flag itself."""
    f(get_mode(gpu), gpu)
def test_multinomial_0():
    # Exercises the MultinomialFromUniform Op directly, rather than via
    # the multinomial() helper of GPU random generation.
    probs = tensor.fmatrix()
    uni = tensor.fvector()
    draws = theano.sandbox.multinomial.MultinomialFromUniform('auto')(probs, uni)

    def check(mode, gpu):
        # Multiplying by 2 lets the multinomial reuse its output storage.
        fn = function([probs, uni], draws * 2, allow_input_downcast=True,
                      mode=mode)
        if gpu:
            assert any(type(node.op) is GPUAMultinomialFromUniform
                       for node in fn.maker.fgraph.toposort())

        # Both the first and the second sample can be drawn.
        utt.assert_allclose(fn([[1, 0], [0, 1]], [.1, .1]),
                            [[2, 0], [0, 2]])

        # Both second labels can be drawn.
        res = fn([[.2, .8], [.3, .7]], [.31, .31])
        utt.assert_allclose(res, [[0, 2], [0, 2]])

        # Both first labels can be drawn.
        res = fn([[.2, .8], [.3, .7]], [.21, .21])
        utt.assert_allclose(res, [[0, 2], [2, 0]])

        # A different input size checks that the output is reallocated
        # correctly and that the GPU version keeps the transposed layout
        # straight.
        res = fn([[.2, .8]], [.25])
        utt.assert_allclose(res, [[0, 2]])

    run_with_c(check)
    run_with_c(check, True)
# TODO: check a bigger example (make sure blocking on GPU is handled correctly)
def test_multinomial_large():
    # DEBUG_MODE will exercise this on the GPU as well.
    def check(mode, gpu):
        probs = tensor.fmatrix()
        uni = tensor.fvector()
        draws = theano.sandbox.multinomial.MultinomialFromUniform('auto')(
            probs, uni)
        fn = function([probs, uni], draws * 2, allow_input_downcast=True,
                      mode=mode)
        if gpu:
            assert any(type(node.op) is GPUAMultinomialFromUniform
                       for node in fn.maker.fgraph.toposort())

        # 10000 rows of 4 outcomes, normalized per row.
        pv = numpy.arange(10000 * 4,
                          dtype='float32').reshape((10000, 4)) + 0.1
        pv = pv / pv.sum(axis=1)[:, None]
        uv = numpy.ones_like(pv[:, 0]) * 0.5
        mv = fn(pv, uv)

        assert mv.shape == pv.shape
        if config.cast_policy == 'custom':
            assert mv.dtype == pv.dtype
        elif config.cast_policy == 'numpy+floatX':
            assert mv.dtype == config.floatX
        elif config.cast_policy == 'numpy':
            assert mv.dtype == 'float64'
        else:
            raise NotImplementedError(config.cast_policy)
        utt.assert_allclose(mv.sum(axis=1), 2)
        expected = numpy.asarray([0, 0, 2, 0]) + 0 * pv
        utt.assert_allclose(mv, expected)  # broadcast over all rows

    run_with_c(check)
    run_with_c(check, True)
def test_gpu_opt():
    # Overlaps somewhat with test_multinomial_0; here we check that the
    # op lands on the GPU when its output is moved there by the optimizer.
    uni = tensor.fvector()

    def compile_and_run(pv_var, rows):
        # Build, compile and run the graph for a given pvals variable
        # shape (matrix or row) and row count.
        samp = theano.sandbox.multinomial.MultinomialFromUniform('auto')(
            pv_var, uni)
        assert samp.dtype == 'float32', samp.dtype
        fn = function([pv_var, uni], samp, allow_input_downcast=True,
                      mode=get_mode(True))
        assert any(type(node.op) is GPUAMultinomialFromUniform
                   for node in fn.maker.fgraph.toposort())
        pv = numpy.arange(rows * 4,
                          dtype='float32').reshape((rows, 4)) + 0.1
        pv = pv / pv.sum(axis=1)[:, None]
        uv = numpy.ones_like(pv[:, 0]) * 0.5
        fn(pv, uv)

    compile_and_run(tensor.fmatrix(), 10000)
    # A row input used to fail in the past.
    compile_and_run(tensor.frow(), 1)
...@@ -9,11 +9,10 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value ...@@ -9,11 +9,10 @@ from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano.scalar import as_scalar from theano.scalar import as_scalar
import copy import copy
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp, register_opt
if cuda_available: if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType from theano.sandbox.cuda import CudaNdarrayType
from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host from theano.sandbox.cuda.basic_ops import host_from_gpu, gpu_from_host
from theano.sandbox.cuda.opt import register_opt
class MultinomialFromUniform(Op): class MultinomialFromUniform(Op):
...@@ -565,6 +564,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp): ...@@ -565,6 +564,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
""" % locals() """ % locals()
@register_opt()
@local_optimizer([MultinomialFromUniform]) @local_optimizer([MultinomialFromUniform])
def local_gpu_multinomial(node): def local_gpu_multinomial(node):
# TODO : need description for function # TODO : need description for function
...@@ -608,7 +608,3 @@ def local_gpu_multinomial(node): ...@@ -608,7 +608,3 @@ def local_gpu_multinomial(node):
# The dimshuffle is on the cpu, but will be moved to the # The dimshuffle is on the cpu, but will be moved to the
# gpu by an opt. # gpu by an opt.
return [gpu_from_host(ret)] return [gpu_from_host(ret)]
if cuda_available:
register_opt()(local_gpu_multinomial)
pass
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy
import os import os
import sys import sys
from six import reraise from six import reraise
...@@ -10,7 +9,7 @@ import numpy ...@@ -10,7 +9,7 @@ import numpy
import theano import theano
from theano import config, function, tensor from theano import config, function, tensor
from theano.sandbox import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers from theano.compile.mode import get_default_mode
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
from theano.compat import PY3 from theano.compat import PY3
...@@ -19,15 +18,12 @@ from theano.misc.pkl_utils import CompatUnpickler ...@@ -19,15 +18,12 @@ from theano.misc.pkl_utils import CompatUnpickler
def get_mode(gpu): def get_mode(gpu):
mode = get_default_mode() mode = get_default_mode()
mode = copy.copy(mode) if theano.config.mode == 'FAST_COMPILE':
mode = theano.compile.get_mode('FAST_RUN')
if gpu: if gpu:
mode = mode.including('gpu', 'gpu_local_optimizations', mode = mode.including('gpu', 'gpu_local_optimizations',
'local_cut_gpu_host_gpu', 'local_cut_gpu_host_gpu',
'local_gpu_multinomial') 'local_gpu_multinomial')
if isinstance(mode.linker, theano.gof.PerformLinker):
mode.linker = predefined_linkers['c|py']
if hasattr(mode.linker, 'c_thunks'):
mode.linker.c_thunks = True
return mode return mode
......
...@@ -6218,7 +6218,7 @@ class AllocEmpty(gof.Op): ...@@ -6218,7 +6218,7 @@ class AllocEmpty(gof.Op):
# specify the type of the data # specify the type of the data
def __init__(self, dtype): def __init__(self, dtype):
assert isinstance(dtype, str) assert isinstance(dtype, str), dtype
self.dtype = dtype.lower() self.dtype = dtype.lower()
def validate_shape(self, shape): def validate_shape(self, shape):
......
...@@ -285,7 +285,9 @@ class TestCorrConv2d(BaseTestConv2d): ...@@ -285,7 +285,9 @@ class TestCorrConv2d(BaseTestConv2d):
def tcase(self, i, f, s, b, flip, provide_shape): def tcase(self, i, f, s, b, flip, provide_shape):
o = self.get_output_shape(i, f, s, b) o = self.get_output_shape(i, f, s, b)
if not theano.config.blas.ldflags: if (not theano.config.blas.ldflags or
not theano.config.cxx or
theano.config.mode == "FAST_COMPILE"):
raise SkipTest("Need blas to test conv2d") raise SkipTest("Need blas to test conv2d")
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, provide_shape=provide_shape, verify_grad=True, provide_shape=provide_shape,
...@@ -541,7 +543,10 @@ class TestBilinearUpsampling(unittest.TestCase): ...@@ -541,7 +543,10 @@ class TestBilinearUpsampling(unittest.TestCase):
# If BLAS is not available on CPU, then we accept the fallback to the # If BLAS is not available on CPU, then we accept the fallback to the
# slow Python implementation for that test. # slow Python implementation for that test.
compile_mode = theano.compile.mode.get_default_mode() compile_mode = theano.compile.mode.get_default_mode()
if not theano.config.blas.ldflags: if theano.config.mode == "FAST_COMPILE":
compile_mode = compile_mode.excluding("conv_gemm")
compile_mode = compile_mode.excluding('AbstractConvCheck')
elif not theano.config.blas.ldflags or not theano.config.cxx:
compile_mode = compile_mode.excluding('AbstractConvCheck') compile_mode = compile_mode.excluding('AbstractConvCheck')
def numerical_kernel_1D(self, ratio): def numerical_kernel_1D(self, ratio):
......
...@@ -60,11 +60,11 @@ def test_bn_feature_maps(): ...@@ -60,11 +60,11 @@ def test_bn_feature_maps():
return n * G + B return n * G + B
numpy.random.seed(1234) numpy.random.seed(1234)
X = 1 + numpy.random.random([10, 20, 4, 4]).astype('float32') X = 1 + numpy.random.random([2, 3, 4, 4]).astype('float32')
B = 1 + numpy.random.random([20]).astype('float32') B = 1 + numpy.random.random([3]).astype('float32')
G = 1 + numpy.random.random([20]).astype('float32') G = 1 + numpy.random.random([3]).astype('float32')
M = 1 + numpy.random.random([20]).astype('float32') M = 1 + numpy.random.random([3]).astype('float32')
V = 1 + numpy.random.random([20]).astype('float32') V = 1 + numpy.random.random([3]).astype('float32')
x = theano.tensor.tensor4('x') x = theano.tensor.tensor4('x')
b = theano.tensor.vector('b') b = theano.tensor.vector('b')
......
...@@ -132,7 +132,8 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -132,7 +132,8 @@ class TestCorr2D(utt.InferShapeTester):
# TEST GRADIENT # TEST GRADIENT
if verify_grad: if verify_grad:
utt.verify_grad(sym_CorrMM, [orig_image_data, filter_data]) utt.verify_grad(sym_CorrMM, [orig_image_data, filter_data],
mode=self.mode)
@attr('slow') @attr('slow')
def test_basic(self): def test_basic(self):
...@@ -235,6 +236,8 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -235,6 +236,8 @@ class TestCorr2D(utt.InferShapeTester):
@attr('slow') @attr('slow')
def test_infer_shape_forward(self): def test_infer_shape_forward(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("CorrMM don't work in FAST_COMPILE")
def rand(*shape): def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64') r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
...@@ -264,6 +267,8 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -264,6 +267,8 @@ class TestCorr2D(utt.InferShapeTester):
@attr('slow') @attr('slow')
def test_infer_shape_gradW(self): def test_infer_shape_gradW(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("CorrMM don't work in FAST_COMPILE")
def rand(*shape): def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64') r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
...@@ -300,6 +305,8 @@ class TestCorr2D(utt.InferShapeTester): ...@@ -300,6 +305,8 @@ class TestCorr2D(utt.InferShapeTester):
@attr('slow') @attr('slow')
def test_infer_shape_gradI(self): def test_infer_shape_gradI(self):
if theano.config.mode == "FAST_COMPILE":
raise SkipTest("CorrMM don't work in FAST_COMPILE")
def rand(*shape): def rand(*shape):
r = numpy.asarray(numpy.random.rand(*shape), dtype='float64') r = numpy.asarray(numpy.random.rand(*shape), dtype='float64')
......
...@@ -279,16 +279,20 @@ class test_RopLop(RopLop_checker): ...@@ -279,16 +279,20 @@ class test_RopLop(RopLop_checker):
return conv_op(input, filters, border_mode=border_mode) return conv_op(input, filters, border_mode=border_mode)
output = sym_conv2d(input, filters).flatten() output = sym_conv2d(input, filters).flatten()
yv = tensor.Rop(output, [input, filters], [ev_input, ev_filters]) yv = tensor.Rop(output, [input, filters], [ev_input, ev_filters])
mode = None
if theano.config.mode == "FAST_COMPILE":
mode = "FAST_RUN"
rop_f = function([input, filters, ev_input, ev_filters], rop_f = function([input, filters, ev_input, ev_filters],
yv, on_unused_input='ignore') yv, on_unused_input='ignore', mode=mode)
sy, _ = theano.scan(lambda i, y, x1, x2, v1, v2: sy, _ = theano.scan(lambda i, y, x1, x2, v1, v2:
(tensor.grad(y[i], x1) * v1).sum() + (tensor.grad(y[i], x1) * v1).sum() +
(tensor.grad(y[i], x2) * v2).sum(), (tensor.grad(y[i], x2) * v2).sum(),
sequences=tensor.arange(output.shape[0]), sequences=tensor.arange(output.shape[0]),
non_sequences=[output, input, filters, non_sequences=[output, input, filters,
ev_input, ev_filters]) ev_input, ev_filters],
mode=mode)
scan_f = function([input, filters, ev_input, ev_filters], sy, scan_f = function([input, filters, ev_input, ev_filters], sy,
on_unused_input='ignore') on_unused_input='ignore', mode=mode)
dtype = theano.config.floatX dtype = theano.config.floatX
image_data = numpy.random.random(image_shape).astype(dtype) image_data = numpy.random.random(image_shape).astype(dtype)
filter_data = numpy.random.random(filter_shape).astype(dtype) filter_data = numpy.random.random(filter_shape).astype(dtype)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论