提交 126614a3 authored 作者: slefrancois's avatar slefrancois

add support for odd ffts and grads

上级 4bf5cc7f
...@@ -4,6 +4,7 @@ import numpy as np ...@@ -4,6 +4,7 @@ import numpy as np
import theano import theano
from theano import Op from theano import Op
import theano.tensor as T import theano.tensor as T
from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType) from theano.gpuarray import (basic_ops, GpuArrayType)
...@@ -37,7 +38,7 @@ class CuRFFTOp(Op): ...@@ -37,7 +38,7 @@ class CuRFFTOp(Op):
broadcastable=[False] * (inp.type.ndim + 1), broadcastable=[False] * (inp.type.ndim + 1),
context_name=inp.type.context_name) context_name=inp.type.context_name)
def make_node(self, inp): def make_node(self, inp, s):
if not scikits_cuda_available: if not scikits_cuda_available:
raise RuntimeError("scikits.cuda is needed for CuFFTOp") raise RuntimeError("scikits.cuda is needed for CuFFTOp")
...@@ -50,10 +51,13 @@ class CuRFFTOp(Op): ...@@ -50,10 +51,13 @@ class CuRFFTOp(Op):
inp = basic_ops.gpu_contiguous( inp = basic_ops.gpu_contiguous(
basic_ops.as_gpuarray_variable(inp, basic_ops.as_gpuarray_variable(inp,
basic_ops.infer_context_name(inp))) basic_ops.infer_context_name(inp)))
s = T.as_tensor_variable(s)
assert inp.dtype == "float32" assert inp.dtype == "float32"
assert s.ndim == 1
assert 'int' in s.dtype
return theano.Apply(self, [inp], [self.output_type(inp)()]) return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2):
...@@ -69,9 +73,12 @@ class CuRFFTOp(Op): ...@@ -69,9 +73,12 @@ class CuRFFTOp(Op):
def thunk(): def thunk():
input_shape = inputs[0][0].shape input_shape = inputs[0][0].shape
s = inputs[1][0]
assert (input_shape[1:] == s).all()
# construct output shape # construct output shape
output_shape = list(input_shape) output_shape = [input_shape[0]] + list(s)
# DFT of real input is symmetric, no need to store # DFT of real input is symmetric, no need to store
# redundant coefficients # redundant coefficients
output_shape[-1] = output_shape[-1] // 2 + 1 output_shape[-1] = output_shape[-1] // 2 + 1
...@@ -98,7 +105,7 @@ class CuRFFTOp(Op): ...@@ -98,7 +105,7 @@ class CuRFFTOp(Op):
# only initialise plan if necessary # only initialise plan if necessary
if plan[0] is None or plan_input_shape[0] != input_shape: if plan[0] is None or plan_input_shape[0] != input_shape:
plan_input_shape[0] = input_shape plan_input_shape[0] = input_shape
plan[0] = fft.Plan(input_shape[1:], np.float32, np.complex64, plan[0] = fft.Plan(s, np.float32, np.complex64,
batch=input_shape[0]) batch=input_shape[0])
# Sync GPU variables before computation # Sync GPU variables before computation
input_pycuda.sync() input_pycuda.sync()
...@@ -116,12 +123,17 @@ class CuRFFTOp(Op): ...@@ -116,12 +123,17 @@ class CuRFFTOp(Op):
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
gout, = output_grads gout, = output_grads
s = inputs[1]
# Divide the last dimension of the output gradients by 2, they are # Divide the last dimension of the output gradients by 2, they are
# double-counted by the real-IFFT due to symmetry, except the first # double-counted by the real-IFFT due to symmetry, except the first
# and last elements (for even transforms) which are unique. # and last elements (for even transforms) which are unique.
idx = [slice(None)] * (gout.ndim - 2) + [slice(1,-1)] + [slice(None)] idx = [slice(None)] * (gout.ndim - 2) \
+ [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
gout = T.set_subtensor(gout[idx], gout[idx]*0.5) gout = T.set_subtensor(gout[idx], gout[idx]*0.5)
return [cuirfft_op(gout)] return [cuirfft_op(gout, s), DisconnectedType()()]
def connection_pattern(self, node):
return [[True],[False]]
curfft_op = CuRFFTOp() curfft_op = CuRFFTOp()
...@@ -136,7 +148,7 @@ class CuIRFFTOp(Op): ...@@ -136,7 +148,7 @@ class CuIRFFTOp(Op):
broadcastable=[False] * (inp.type.ndim - 1), broadcastable=[False] * (inp.type.ndim - 1),
context_name=inp.type.context_name) context_name=inp.type.context_name)
def make_node(self, inp): def make_node(self, inp, s):
if not scikits_cuda_available: if not scikits_cuda_available:
raise RuntimeError("scikits.cuda is needed for CuIFFTOp") raise RuntimeError("scikits.cuda is needed for CuIFFTOp")
...@@ -149,10 +161,12 @@ class CuIRFFTOp(Op): ...@@ -149,10 +161,12 @@ class CuIRFFTOp(Op):
inp = basic_ops.gpu_contiguous( inp = basic_ops.gpu_contiguous(
basic_ops.as_gpuarray_variable(inp, basic_ops.as_gpuarray_variable(inp,
basic_ops.infer_context_name(inp))) basic_ops.infer_context_name(inp)))
s = T.as_tensor_variable(s)
assert inp.dtype == "float32" assert inp.dtype == "float32"
assert s.ndim == 1
return theano.Apply(self, [inp], [self.output_type(inp)()]) return theano.Apply(self, [inp, s], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2): def make_thunk(self, node, storage_map, _, _2):
...@@ -168,14 +182,11 @@ class CuIRFFTOp(Op): ...@@ -168,14 +182,11 @@ class CuIRFFTOp(Op):
def thunk(): def thunk():
input_shape = inputs[0][0].shape input_shape = inputs[0][0].shape
s = inputs[1][0]
# construct output shape # construct output shape
# chop off the extra length-2 dimension for real/imag # chop off the extra length-2 dimension for real/imag
output_shape = list(input_shape[:-1]) output_shape = [input_shape[0]] + list(s)
# restore full signal length
output_shape[-1] = (output_shape[-1] - 1) * 2
# if inputs[0][0][0,-1,1] != 0:
# output_shape[-1] += 1
output_shape = tuple(output_shape) output_shape = tuple(output_shape)
z = outputs[0] z = outputs[0]
...@@ -196,8 +207,7 @@ class CuIRFFTOp(Op): ...@@ -196,8 +207,7 @@ class CuIRFFTOp(Op):
# only initialise plan if necessary # only initialise plan if necessary
if plan[0] is None or plan_input_shape[0] != input_shape: if plan[0] is None or plan_input_shape[0] != input_shape:
plan_input_shape[0] = input_shape plan_input_shape[0] = input_shape
plan[0] = fft.Plan(output_shape[1:], plan[0] = fft.Plan(s,np.complex64, np.float32,
np.complex64, np.float32,
batch=output_shape[0]) batch=output_shape[0])
# Sync GPU variables before computation # Sync GPU variables before computation
input_pycuda.sync() input_pycuda.sync()
...@@ -219,17 +229,22 @@ class CuIRFFTOp(Op): ...@@ -219,17 +229,22 @@ class CuIRFFTOp(Op):
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
gout, = output_grads gout, = output_grads
gf = curfft_op(gout) s = inputs[1]
gf = curfft_op(gout, s)
# Multiply the last dimension of the gradient by 2, they represent # Multiply the last dimension of the gradient by 2, they represent
# both positive and negative frequencies, except the first # both positive and negative frequencies, except the first
# and last elements (for even transforms) which are unique. # and last elements (for even transforms) which are unique.
idx = [slice(None)] * (gf.ndim - 2) + [slice(1,-1)] + [slice(None)] idx = [slice(None)] * (gf.ndim - 2) \
+ [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(None)]
gf = T.set_subtensor(gf[idx], gf[idx]*2) gf = T.set_subtensor(gf[idx], gf[idx]*2)
return [gf] return [gf, DisconnectedType()()]
def connection_pattern(self, node):
return [[True],[False]]
cuirfft_op = CuIRFFTOp() cuirfft_op = CuIRFFTOp()
def curfft(inputs, norm=None): def curfft(inp, norm=None):
""" """
Performs the fast Fourier transform of a real-valued output on the GPU Performs the fast Fourier transform of a real-valued output on the GPU
through the gpuarray backend. through the gpuarray backend.
...@@ -246,7 +261,7 @@ def curfft(inputs, norm=None): ...@@ -246,7 +261,7 @@ def curfft(inputs, norm=None):
Parameters Parameters
---------- ----------
inputs inp
Array of real-valued float32 of size (m, ..., n), containing m inputs of Array of real-valued float32 of size (m, ..., n), containing m inputs of
size (..., n). size (..., n).
norm : {None, 'ortho', 'no_norm'} norm : {None, 'ortho', 'no_norm'}
...@@ -257,15 +272,17 @@ def curfft(inputs, norm=None): ...@@ -257,15 +272,17 @@ def curfft(inputs, norm=None):
""" """
s = inp.shape[1:]
cond_norm = _unitary(norm) cond_norm = _unitary(norm)
if cond_norm is None or cond_norm == "no_norm": if cond_norm is None or cond_norm == "no_norm":
return curfft_op(inputs) scaling = 1
elif cond_norm == "ortho": elif cond_norm == "ortho":
return curfft_op(inputs) / T.sqrt(((inputs.shape[1:]).prod()) scaling = T.sqrt(s.prod().astype('float32'))
.astype('float32'))
return curfft_op(inp, s) / scaling
def cuirfft(inputs, norm=None): def cuirfft(inp, norm=None, is_odd=False):
""" """
Performs the real-valued output inverse Fourier Transform using the Performs the real-valued output inverse Fourier Transform using the
gpuarray backend. gpuarray backend.
...@@ -281,7 +298,7 @@ def cuirfft(inputs, norm=None): ...@@ -281,7 +298,7 @@ def cuirfft(inputs, norm=None):
Parameters Parameters
---------- ----------
inputs inp
Array of float32 of size (m, ..., n//2+1, 2), containing m inputs Array of float32 of size (m, ..., n//2+1, 2), containing m inputs
with n/2+1 non-trivial elements on the last dimension and real with n/2+1 non-trivial elements on the last dimension and real
and imaginary parts stored as separate arrays. and imaginary parts stored as separate arrays.
...@@ -293,17 +310,21 @@ def cuirfft(inputs, norm=None): ...@@ -293,17 +310,21 @@ def cuirfft(inputs, norm=None):
""" """
s = inp.shape[1:-1]
if is_odd:
s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
else:
s = T.set_subtensor(s[-1], (s[-1] - 1) * 2)
cond_norm = _unitary(norm) cond_norm = _unitary(norm)
if cond_norm is None: if cond_norm is None:
return cuirfft_op(inputs) / ((inputs.shape[1:-2].prod() * scaling = s.prod().astype('float32')
((inputs.shape[-2] - 1) * 2))
.astype('float32'))
if cond_norm == "ortho": if cond_norm == "ortho":
return cuirfft_op(inputs) / T.sqrt((inputs.shape[1:-2].prod() * scaling = T.sqrt(s.prod().astype('float32'))
((inputs.shape[-2] - 1) * 2))
.astype('float32'))
if cond_norm == "no_norm": if cond_norm == "no_norm":
return cuirfft_op(inputs) scaling = 1
return cuirfft_op(inp, s) / scaling
def _unitary(norm): def _unitary(norm):
if norm not in (None, "ortho", "no_norm"): if norm not in (None, "ortho", "no_norm"):
......
...@@ -122,46 +122,82 @@ class TestFFT(unittest.TestCase): ...@@ -122,46 +122,82 @@ class TestFFT(unittest.TestCase):
# enough epsilon to get good accuracy. # enough epsilon to get good accuracy.
eps = 1e-1 eps = 1e-1
def f_rfft(inp):
return theano.gpuarray.fft.curfft(inp)
inputs_val = np.random.random((1, N, N)).astype('float32') inputs_val = np.random.random((1, N, N)).astype('float32')
utt.verify_grad(theano.gpuarray.fft.curfft_op, [inputs_val], eps=eps) utt.verify_grad(f_rfft, [inputs_val], eps=eps)
def f_irfft(inp):
return theano.gpuarray.fft.cuirfft(inp)
inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32') inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
utt.verify_grad(theano.gpuarray.fft.cuirfft_op, [inputs_val], eps=eps) utt.verify_grad(f_irfft, [inputs_val], eps=eps)
#
# def f_rfft(inp):
# def test_odd(self): return theano.gpuarray.fft.curfft(inp, norm='ortho')
# M = N - 1 inputs_val = np.random.random((1, N, N)).astype('float32')
# utt.verify_grad(f_rfft, [inputs_val], eps=eps)
# inputs_val = np.random.random((1, M, M)).astype('float32')
# inputs = theano.shared(inputs_val) def f_irfft(inp):
# return theano.gpuarray.fft.cuirfft(inp, norm='no_norm')
# rfft = theano.gpuarray.fft.curfft(inputs, norm='no_norm') inputs_val = np.random.random((1, N, N // 2 + 1, 2)).astype('float32')
# f_rfft = theano.function([], rfft, mode=mode_with_gpu) utt.verify_grad(f_irfft, [inputs_val], eps=eps)
# res_rfft = f_rfft()
# def test_odd(self):
# res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) + M = N - 1
# 1j * np.asarray(res_rfft[:, :, :, 1]))
# inputs_val = np.random.random((1, M, M)).astype('float32')
# rfft_ref = numpy.fft.rfftn(inputs_val, s=(M,M), axes=(1,2))#, s=(M, M), axes=(1,2)) inputs = theano.shared(inputs_val)
#
# utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4) rfft = theano.gpuarray.fft.curfft(inputs)
# f_rfft = theano.function([], rfft, mode=mode_with_gpu)
# m = rfft.type() res_rfft = f_rfft()
# ifft = theano.gpuarray.fft.cuirfft(m, norm='no_norm')
# f_ifft = theano.function([m], ifft, mode=mode_with_gpu) res_rfft_comp = (np.asarray(res_rfft[:, :, :, 0]) +
# res_ifft = f_ifft(res_rfft) 1j * np.asarray(res_rfft[:, :, :, 1]))
#
# utt.assert_allclose(inputs_val, np.asarray(res_ifft)/M**2) rfft_ref = numpy.fft.rfftn(inputs_val, s=(M,M), axes=(1,2))#, s=(M, M), axes=(1,2))
utt.assert_allclose(rfft_ref, res_rfft_comp, atol=1e-4, rtol=1e-4)
m = rfft.type()
ifft = theano.gpuarray.fft.cuirfft(m, is_odd=True)
f_ifft = theano.function([m], ifft, mode=mode_with_gpu)
res_ifft = f_ifft(res_rfft)
utt.assert_allclose(inputs_val, np.asarray(res_ifft))
inputs_val = np.random.random((1, M, M//2+1, 2)).astype('float32')
inputs = theano.shared(inputs_val)
irfft = theano.gpuarray.fft.cuirfft(inputs, norm='ortho', is_odd=True)
f_irfft = theano.function([], irfft, mode=mode_with_gpu)
res_irfft = f_irfft()
inputs_ref = inputs_val[:, :, :, 0] + 1j * inputs_val[:, :, :, 1]
irfft_ref = numpy.fft.irfftn(inputs_ref, s=(M, M), axes=(1,2), norm='ortho')
utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
# The numerical gradient of the FFT is sensitive, must set large
# enough epsilon to get good accuracy.
eps = 1e-1
def f_rfft(inp):
return theano.gpuarray.fft.curfft(inp)
inputs_val = np.random.random((1, M, M)).astype('float32')
utt.verify_grad(f_rfft, [inputs_val], eps=eps)
def f_irfft(inp):
return theano.gpuarray.fft.cuirfft(inp, is_odd=True)
inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
utt.verify_grad(f_irfft, [inputs_val], eps=eps)
# inputs_val = np.random.random((1, M//2+1, 2)).astype('float32') def f_rfft(inp):
# # inputs_val[0,0,1] = 0 return theano.gpuarray.fft.curfft(inp, norm='ortho')
# inputs = theano.shared(inputs_val) inputs_val = np.random.random((1, M, M)).astype('float32')
# utt.verify_grad(f_rfft, [inputs_val], eps=eps)
# irfft = theano.gpuarray.fft.cuirfft(inputs)
# f_irfft = theano.function([], irfft, mode=mode_with_gpu) def f_irfft(inp):
# res_irfft = f_irfft() return theano.gpuarray.fft.cuirfft(inp, norm='no_norm', is_odd=True)
# inputs_val = np.random.random((1, M, M // 2 + 1, 2)).astype('float32')
# inputs_ref = inputs_val[:, :, 0] + 1j * inputs_val[:, :, 1] utt.verify_grad(f_irfft, [inputs_val], eps=eps)
# irfft_ref = numpy.fft.irfft(inputs_ref, n=M, axis=1)#, s=(M, M), axes=(1,2))
#
# utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论