Continuing from the previous two parts which covered a Radix-n fully parallel if unsynthesisable FFT implementation, I will convert the VHDL code to something synthesisable. Well, it should elaborate in RTL even if it cannot fit inside a real device for anything other than small FFT point sizes.
- 'sfixed' Package
- Entity Specification
- Radix-2 Architecture
- Radix-4 Architecture
- Radix-n Architecture
- Synthesis Results
- Conclusions
- References
'sfixed' Package
library ieee;
use ieee.numeric_std.all;
use ieee.math_complex.all;
library ieee_proposed;
use ieee_proposed.fixed_pkg.all;
package fft_sfixed_pkg is
type complex_t is record
re : sfixed; -- Real part
im : sfixed; -- Imaginary part
end record;
type complex_arr_t is array (integer range <>) of complex_t;
type complex_2darr_t is array (integer range<>) of complex_arr_t;
type natural_vector is array (integer range <>) of natural;
type natural_vector_arr_t is array (integer range<>) of natural_vector;
function bit_reverse(i : natural; bits : positive) return natural;
function array_reverse(i : complex_arr_t) return complex_arr_t;
function "+" (
l : complex_t;
r : complex_t
) return complex_t;
function "-" (
l : complex_t;
r : complex_t
) return complex_t;
function "*" (
l : complex_t;
r : complex_t
) return complex_t;
function "=" (
l : complex_t;
r : complex_t
) return boolean;
function resize(
val : complex_t;
template : sfixed
) return complex_t;
function init_twiddles_half(
num : positive;
template : sfixed
) return complex_arr_t;
function twid_mod(
t : complex_arr_t;
n : natural
) return complex_t;
function twiddle_power(num : positive) return natural_vector;
function to_complex_t(
z : complex;
template : sfixed
) return complex_t;
function to_complex_t(
re : real;
im : real;
template : sfixed
) return complex_t;
function to_complex_arr_t(
i : work.fft_real_pkg.complex_vector;
template : sfixed
) return complex_arr_t;
function operation(
twids : complex_arr_t;
power : natural;
operand : complex_t;
id : string
) return complex_t;
function opt_pwr_arr(
powers : natural_vector;
group_width : positive
) return natural_vector_arr_t;
function part_pwr_arr(
powers : natural_vector;
group_width : positive
) return natural_vector_arr_t;
end package;
library local;
package body fft_sfixed_pkg is
function bit_reverse(i : natural; bits : positive) return natural is
variable tmp : unsigned(bits-1 downto 0) := to_unsigned(i, bits);
variable ret : unsigned(bits-1 downto 0);
begin
for j in 0 to bits-1 loop
ret(j) := tmp(tmp'high-j);
end loop;
return to_integer(ret);
end function;
function array_reverse(i : complex_arr_t) return complex_arr_t is
variable ret : complex_arr_t(i'range)(
re(i(i'low).re'range),
im(i(i'low).im'range)
);
begin
-- Copy the input array elements across to theire new location.
for j in i'range loop
ret(bit_reverse(j, local.math_pkg.ceil_log(i'length))) := i(j);
end loop;
return ret;
end function;
function "+" (
l : complex_t;
r : complex_t
) return complex_t is
begin
return (
re => l.re + r.re,
im => l.im + r.im
);
end function;
function "-" (
l : complex_t;
r : complex_t
) return complex_t is
begin
return (
re => l.re - r.re,
im => l.im - r.im
);
end function;
function "-" (
z : complex_t
) return complex_t is
begin
return (
re => - z.re,
im => - z.im
);
end function;
function "*" (
l : complex_t;
r : complex_t
) return complex_t is
begin
return (
re => (l.re * r.re) - (l.im * r.im),
im => (l.re * r.im) + (l.im * r.re)
);
end function;
function "=" (
l : complex_t;
r : complex_t
) return boolean is
begin
if (l.re = r.re) and (l.im = r.im) then
return true;
else
return false;
end if;
end function;
function resize(
val : complex_t;
template : sfixed
) return complex_t is
begin
return (
re => resize(
val.re,
template
),
im => resize(
val.im,
template
)
);
end function;
function init_twiddles_half(
num : positive;
template : sfixed
) return complex_arr_t is
-- NB. Really just need element 0 initialised
variable ret : complex_arr_t(0 to num-1)(
re(template'range),
im(template'range)
);
begin
-- Calculate the num'th root of unity and raise to the power of k.
for k in ret'range loop
ret(k) := (
re => to_sfixed( ieee.math_real.cos(ieee.math_real.math_pi * real(k) / real(num)), template),
im => to_sfixed(-ieee.math_real.sin(ieee.math_real.math_pi * real(k) / real(num)), template)
);
end loop;
return ret;
end function;
function twid_mod(
t : complex_arr_t;
n : natural
) return complex_t is
variable idx : natural := n mod (2*t'length);
begin
if idx < t'length then
return t(idx);
else
-- Exploit the symmetry of FFT twiddles
return -t(idx-t'length);
end if;
end function;
function twiddle_power(num : positive) return natural_vector is
variable ret : natural_vector(0 to num-1);
begin
for i in ret'range loop
ret(i) := bit_reverse(i, local.math_pkg.ceil_log(num));
end loop;
return ret;
end function;
function operation(
twids : complex_arr_t;
power : natural;
operand : complex_t;
id : string
) return complex_t is
constant debug : boolean := true;
variable p : natural := power mod (2*twids'length);
begin
if p = 0 then
-- n=0, avoid multiply by 1
return operand;
elsif (4 * p) = (2 * twids'length) then
-- n=N/4, avoid multiply by -i
return (re => operand.im, im => -operand.re);
elsif (2 * p) = (2 * twids'length) then
-- n=N/2, avoid multiply by -1
return (re => -operand.re, im => -operand.im);
elsif (4 * p) = (3 * (2 * twids'length)) then
-- n=3*N/4, avoid multiply by i
return (re => -operand.im, im => operand.re);
else
-- We have to perform the multiplication
if debug then
report "Multiplier ID " & id & ": For W^n_N, N=" & integer'image(2*twids'length) & " n=" & integer'image(p) severity note;
end if;
return twid_mod(twids, p) * operand;
end if;
end function;
function opt_pwr_arr(
powers : natural_vector;
group_width : positive
) return natural_vector_arr_t is
constant radix_c : positive := powers'length;
variable opt_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to radix_c-1);
begin
col : for j in 1 to radix_c-1 loop
row : for k in opt_pwr(opt_pwr'low)'range loop -- (0 to radix_c-1)
opt_pwr(j)(k) := (powers(j)*k*group_width) mod (radix_c*group_width); -- NB. o'length = radix_c * group_width
end loop;
end loop;
return opt_pwr;
end function;
function part_pwr_arr(
powers : natural_vector;
group_width : positive
) return natural_vector_arr_t is
constant radix_c : positive := powers'length;
variable part_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to group_width-1);
begin
col : for j in 1 to radix_c-1 loop
row : for k in part_pwr(part_pwr'low)'range loop -- (0 to group_width_c-1)
part_pwr(j)(k) := (powers(j)*k) mod (radix_c*group_width); -- NB. o'length = radix_c * group_width
end loop;
end loop;
return part_pwr;
end function;
function to_complex_t(
z : complex;
template : sfixed
) return complex_t is
begin
return (
re => to_sfixed(z.re, template),
im => to_sfixed(z.im, template)
);
end function;
function to_complex_t(
re : real;
im : real;
template : sfixed
) return complex_t is
begin
return (
re => to_sfixed(re, template),
im => to_sfixed(im, template)
);
end function;
function to_complex_arr_t(
i : work.fft_real_pkg.complex_vector;
template : sfixed
) return complex_arr_t is
variable ret : complex_arr_t(i'range)(
re(template'range),
im(template'range)
);
begin
for k in i'range loop
ret(k) := to_complex_t(i(k), template);
end loop;
return ret;
end function;
end package body;
The function names in common with the 'real' package have already been explained in part 1. The additional functions included here cover arithmetic operations on the custom complex_t type, e.g. complex addition, multiplication and resizing. The complex_t type is required in order to implement a complex type based on fixed point numbers, or the VHDL sfixed type. A previous exploration of sfixed and float types in VHDL shows that any desire to use the native multipliers means using fixed point arithmetic. That blog post was created as an avenue of investigation during researching the FFT implementation. The significant difference between sfixed and real type usage is the need to provide a 'template' for the fixed type's width. E.g. sfixed(6 downto -8), provides the size of the whole (5 bits + 1 sign bit) and fractional parts (8 bits) of the fixed point representation and can be used to define the size of the sfixed complex parts by using 'template'range'.
Entity Specification
These VHDL entities are used for multiple architectures implementing different radix FFTs.
library ieee;
use ieee.std_logic_1164.std_logic;
library ieee_proposed;
use ieee_proposed.fixed_pkg.all;
use work.fft_sfixed_pkg.all;
entity dft_sfixed is
generic (
log_num_inputs_g : positive; -- E.g. 1024 point FFT => log_num_inputs_g = 10
template_g : sfixed -- Provide an uninitialised vector solely for passing information about the fixed point range
);
port (
clk : std_logic;
reset : std_logic;
i : in complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
o : out complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
)
);
end entity;
library ieee;
use ieee.std_logic_1164.all;
library ieee_proposed;
use ieee_proposed.fixed_pkg.all;
use work.fft_sfixed_pkg.all;
entity dftr_sfixed is
generic (
log_num_inputs_g : positive; -- E.g. 1024 point FFT => log_num_inputs_g = 10
template_g : sfixed -- Provide an uninitialised vector solely for passing information about the fixed point range
);
port (
clk : std_logic;
reset : std_logic;
i : in complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
o : out complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
)
);
end entity;
As with the real type version, the generic log_num_inputs_g is unsurprisingly the log(number of inputs) specifically to base 2. This has been done deliberately in order to prevent instantiation of the FFT core with non powers of 2 inputs and outputs. dftr_real is the recursively instantiated entity only used after the top level entity dft_real has performed a permutation of the inputs according to "bit reversal". The architecture permuting the inputs is defined separately below because it references the specific radix2 recursive architecture to instantiate which must be compiled first.
Radix-2 Architecture
Large comments have been retained in order to show my workings as I "boot strapped" my knowledge.
architecture radix2 of dftr_sfixed is
constant radix_c : positive := 2;
constant twid_c : complex_arr_t(0 to 2**(log_num_inputs_g-1)-1) := init_twiddles_half(2**(log_num_inputs_g-1), template_g);
begin
recurse_g : if log_num_inputs_g = 1 generate
assert false
report "Adders: " & integer'image(o'length*(radix_c-1))
severity note;
-- Perform the Radix-2 FFT on two operands
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
o <= (others => (
re => (others => '0'),
im => (others => '0')
));
else
-- Perform the Radix-2 FFT on two operands
o(0) <= resize(i(0) + i(1), template_g);
o(1) <= resize(i(0) - i(1), template_g);
end if;
end if;
end process;
else generate
constant group_width_c : positive := o'length/radix_c;
constant powers_c : natural_vector := twiddle_power(radix_c);
constant opt_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to radix_c-1) := opt_pwr_arr(powers_c, group_width_c);
constant part_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to group_width_c-1) := part_pwr_arr(powers_c, group_width_c);
signal t : complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
signal prods : complex_2darr_t(0 to radix_c-1)(0 to group_width_c-1)(
re(template_g'range),
im(template_g'range)
);
signal sums : complex_2darr_t(o'range)(0 to radix_c-1)(
re(template_g'range),
im(template_g'range)
);
begin
-- Recurse and combine
dftr_i0 : entity work.dftr_sfixed(radix2)
generic map (
log_num_inputs_g => log_num_inputs_g-1,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(0 to i'length/2-1),
o => t(0 to t'length/2-1)
);
dftr_i1 : entity work.dftr_sfixed(radix2)
generic map (
log_num_inputs_g => log_num_inputs_g-1,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(i'length/2 to i'high),
o => t(t'length/2 to t'high)
);
-- Combine
-- Reusable multiplications
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
prods <= (others => (others => (
re => (others => '0'),
im => (others => '0')
)));
else
for j in prods'range loop -- j'th output
if j = 0 then
for k in 0 to group_width_c-1 loop
prods(0)(k) <= t(k);
end loop;
else
for k in prods(j)'range loop
prods(j)(k) <= resize(operation(twid_c, part_pwr(j)(k), t(k+(j*group_width_c)), prods'instance_name & integer'image(j) & "," & integer'image(k)), template_g);
end loop;
end if;
end loop;
end if;
end if;
end process;
col_g : for j in sums'range generate -- o'range
constant group_c : natural := (j / group_width_c);
begin
sums(j)(0) <= t(j-(group_c*group_width_c));
row_g : for k in 1 to radix_c-1 generate
constant id : string := prods'instance_name & integer'image(j) & "," & integer'image(k);
begin
sums(j)(k) <= resize(operation(twid_c, opt_pwr(k)(group_c), prods(k)(j-(group_c*group_width_c)), id), template_g);
end generate;
end generate;
assert false
report "Adders: " & integer'image(o'length*(radix_c-1))
severity note;
process(clk)
constant group_width_c : natural := o'length/2;
begin
if rising_edge(clk) then
if reset = '1' then
o <= (others => (
re => (others => '0'),
im => (others => '0')
));
else
for j in o'range loop
o(j) <= resize(sums(j)(0) + sums(j)(1), template_g);
end loop;
end if;
end if;
end process;
end generate;
end architecture;
-- Just perform the bit reversal on the input indices, then call the recursive component.
architecture radix2 of dft_sfixed is
begin
dftr_i : entity work.dftr_sfixed(radix2)
generic map (
log_num_inputs_g => log_num_inputs_g,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => array_reverse(i),
o => o
);
end architecture;
The main issue that needed solving here was the resizing of arithmetic results. The sfixed library forces the return values to have sufficient size to accommodate the arithmetic's result, e.g. addition adds a bit to the longest operand, multiplication combines the width of operands. My design above makes a simplifying assumption that the design will be simulated to ensure that no overflow occurs in the number representation and that the annoyance of managing the growing sizes of operands can be solved another day, when there is a specific application in mind.
Radix-4 Architecture
library local;
use work.adder_tree_pkg.all;
architecture radix4 of dftr_sfixed is
constant twid_c : complex_arr_t(0 to 2**(log_num_inputs_g-1)-1) := init_twiddles_half(2**(log_num_inputs_g-1), template_g);
begin
recurse_g : if log_num_inputs_g = 1 generate
constant radix_c : positive := 2;
begin
assert false
report "Adders: " & integer'image(o'length*(radix_c-1))
severity note;
-- Perform the Radix-2 FFT on two operands
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
o <= (others => (
re => (others => '0'),
im => (others => '0')
));
else
-- Perform the Radix-2 FFT on two operands
o(0) <= resize(i(0) + i(1), template_g);
o(1) <= resize(i(0) - i(1), template_g);
end if;
end if;
end process;
elsif log_num_inputs_g = 2 generate
constant radix_c : positive := 4;
constant group_width_c : positive := o'length/radix_c; -- Always 1
constant powers_c : natural_vector := twiddle_power(radix_c);
constant opt_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to radix_c-1) := opt_pwr_arr(powers_c, group_width_c);
constant part_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to group_width_c-1) := part_pwr_arr(powers_c, group_width_c);
signal prods : complex_2darr_t(1 to radix_c-1)(0 to group_width_c-1)(
re(template_g'range),
im(template_g'range)
);
signal sums : complex_2darr_t(o'range)(0 to radix_c-1)(
re(template_g'range),
im(template_g'range)
);
begin
-- Perform the Radix-4 FFT on four operands
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
prods <= (others => (others => (
re => (others => '0'),
im => (others => '0')
)));
else
prods <= (
(0 => i(1)),
(0 => i(2)),
(0 => i(3))
);
end if;
end if;
end process;
col_g : for j in sums'range generate -- o'range
sums(j)(0) <= i(0);
row_g : for k in 1 to radix_c-1 generate
constant id : string := prods'instance_name & integer'image(j) & "," & integer'image(k);
begin
sums(j)(k) <= resize(operation(twid_c, opt_pwr(k)(j), prods(k)(0), id), template_g);
end generate;
end generate;
assert false
report "Adders: " & integer'image(o'length*(radix_c-1))
severity note;
adders_g : for j in o'range generate
signal u : complex_t(
re(output_bits(template_g'high, sums(j)'length) downto template_g'low),
im(output_bits(template_g'high, sums(j)'length) downto template_g'low)
);
signal ot : complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
begin
adder_tree_complex_pipe_i : entity work.adder_tree_complex_pipe
generic map (
depth_g => local.math_pkg.ceil_log(sums(j)'length),
num_operands_g => sums(j)'length,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => sums(j),
o => u
);
-- Slice the results down to size and assume simulation will detect deviation from
-- Octave derived results comparisons.
o(j) <= (
u.re(template_g'range),
u.im(template_g'range)
);
end generate;
else generate
constant radix_c : positive := 4;
constant group_width_c : natural := o'length/4;
constant powers_c : natural_vector := twiddle_power(radix_c);
constant opt_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to radix_c-1) := opt_pwr_arr(powers_c, group_width_c);
constant part_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to group_width_c-1) := part_pwr_arr(powers_c, group_width_c);
signal t : complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
signal prods : complex_2darr_t(0 to radix_c-1)(0 to group_width_c-1)(
re(template_g'range),
im(template_g'range)
);
signal sums : complex_2darr_t(o'range)(0 to radix_c-1)(
re(template_g'range),
im(template_g'range)
);
begin
-- Recurse and combine
dftr_i0 : entity work.dftr_sfixed(radix4)
generic map (
log_num_inputs_g => log_num_inputs_g-2,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(0 to i'length/4-1),
o => t(0 to t'length/4-1)
);
dftr_i1 : entity work.dftr_sfixed(radix4)
generic map (
log_num_inputs_g => log_num_inputs_g-2,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(i'length/4 to i'length/2-1),
o => t(t'length/4 to t'length/2-1)
);
dftr_i2 : entity work.dftr_sfixed(radix4)
generic map (
log_num_inputs_g => log_num_inputs_g-2,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(i'length/2 to 3*i'length/4-1),
o => t(t'length/2 to 3*t'length/4-1)
);
dftr_i3 : entity work.dftr_sfixed(radix4)
generic map (
log_num_inputs_g => log_num_inputs_g-2,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => i(3*i'length/4 to i'high),
o => t(3*t'length/4 to t'high)
);
-- Combine
-- Reusable multiplications
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
prods <= (others => (others => (
re => (others => '0'),
im => (others => '0')
)));
else
for j in prods'range loop -- j'th output
if j = 0 then
for k in 0 to group_width_c-1 loop
prods(0)(k) <= t(k);
end loop;
else
for k in prods(j)'range loop
prods(j)(k) <= resize(operation(twid_c, part_pwr(j)(k), t(k+(j*group_width_c)), prods'instance_name & integer'image(j) & "," & integer'image(k)), template_g);
end loop;
end if;
end loop;
-- for j in o'range loop
-- if j < group_width_c then
-- prods(j) <= (
-- t(j ),
-- resize(operation(twid_c, 0, t(j+ group_width_c ), prods'instance_name & integer'image(0) & "," & integer'image(j+ group_width_c )), template_g),
-- resize(operation(twid_c, 0, t(j+(2*group_width_c)), prods'instance_name & integer'image(0) & "," & integer'image(j+(2*group_width_c))), template_g),
-- resize(operation(twid_c, 0, t(j+(3*group_width_c)), prods'instance_name & integer'image(0) & "," & integer'image(j+(3*group_width_c))), template_g)
-- );
-- elsif j < 2 * group_width_c then
-- prods(j) <= (
-- t(j- group_width_c ),
-- resize(operation(twid_c, part_pwr(j)(1), t(j ), prods'instance_name & integer'image(1) & "," & integer'image(j+ group_width_c )), template_g),
-- resize(operation(twid_c, part_pwr(j)(2), t(j+ group_width_c ), prods'instance_name & integer'image(1) & "," & integer'image(j+(2*group_width_c))), template_g),
-- resize(operation(twid_c, part_pwr(j)(3), t(j+(2*group_width_c)), prods'instance_name & integer'image(1) & "," & integer'image(j+(3*group_width_c))), template_g)
-- );
-- elsif j < 3 * group_width_c then
-- -- For each group of width group_width_c
-- else
-- prods(j) <= (
-- t(j-(3*group_width_c)),
-- resize(operation(twid_c, part_pwr(j)(1), t(j-(2*group_width_c)), prods'instance_name & integer'image(3) & "," & integer'image(j+ group_width_c )), template_g),
-- resize(operation(twid_c, part_pwr(j)(2), t(j- group_width_c ), prods'instance_name & integer'image(3) & "," & integer'image(j+(2*group_width_c))), template_g),
-- resize(operation(twid_c, part_pwr(j)(3), t(j ), prods'instance_name & integer'image(3) & "," & integer'image(j+(3*group_width_c))), template_g)
-- );
-- end if;
-- end loop;
end if;
end if;
end process;
col_g : for j in sums'range generate -- o'range
constant group_c : natural := (j / group_width_c);
begin
sums(j)(0) <= t(j-(group_c*group_width_c));
row_g : for k in 1 to radix_c-1 generate
constant id : string := prods'instance_name & integer'image(j) & "," & integer'image(k);
begin
sums(j)(k) <= resize(operation(twid_c, opt_pwr(k)(group_c), prods(k)(j-(group_c*group_width_c)), id), template_g);
end generate;
end generate;
assert false
report "Adders: " & integer'image(o'length*(radix_c-1))
severity note;
adders_g : for j in o'range generate
signal u : complex_t(
re(output_bits(template_g'high, sums(j)'length) downto template_g'low),
im(output_bits(template_g'high, sums(j)'length) downto template_g'low)
);
begin
adder_tree_complex_pipe_i : entity work.adder_tree_complex_pipe
generic map (
depth_g => local.math_pkg.ceil_log(sums(j)'length),
num_operands_g => sums(j)'length,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => sums(j),
o => u
);
-- Slice the results down to size and assume simulation will detect deviation from
-- Octave derived results comparisons.
o(j) <= (
u.re(template_g'range),
u.im(template_g'range)
);
end generate;
end generate;
end architecture;
-- Just perform the bit reversal on the input indices, then call the recursive component.
architecture radix4 of dft_sfixed is
begin
dftr_i : entity work.dftr_sfixed(radix4)
generic map (
log_num_inputs_g => log_num_inputs_g,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => array_reverse(i),
o => o
);
end architecture;
The new feature here is the pipelined addition logic. What we want to achieve is a minimum delay through the addition logic. If the adders are in a linear chain, even if pipelined they incur a linear number of clock cycle delays of \(n-1\) for Radix-n. By dictating that the addition is performed as a tree, in this case a perfectly balanced binary tree, we minimise the clock cycle delay to \(\log_2(n)\). Here I borrowed an existing solution from some previous work I did on pipelined adder trees, but augmented it to work for the complex_t type, i.e. a pair of trees adding the sfixed type instead of ieee.numeric_std.signed. Since the solution is so similar, the differences will not be covered further here.
The operands to the adder tree are assembled into an array called sums, and the pair of indexes arranged such that the first index to sums provides the one dimensional array of operands to be summed up by applying the indexed array directly to the adder tree inputs. The alternative is to trust the synthesis tool to rearrange the adders into a tree structure automatically. This part of the design could be pipelined differently, by allowing multiple additions per clock cycle if the multiplication logic propagation takes much longer than the additions, which would reduce the number of registers used. The code to implement that would require an additional generic to specify how aggressively the adder tree should be pipelined. For now, whilst awaiting synthesis results I have just gone for the maximum pipelining at the expense of register logic.
Radix-n Architecture
Completing the Radix-4 solution and having the real type Radix-8, I had gained sufficient insight to move directly to the synthesisable Radix-n solution.
library ieee;
use ieee.std_logic_1164.std_logic;
library ieee_proposed;
use ieee_proposed.fixed_pkg.all;
use work.fft_sfixed_pkg.all;
-- Perform the bit reversal on the input indices once at the top level, then call
-- the recursive component.
entity dft_multi_radix_sfixed is
generic (
log_num_inputs_g : positive; -- E.g. 1024 point FFT => log_num_inputs_g = 10
template_g : sfixed; -- Provide an uninitialised vector solely for passing information about the fixed point range
-- Not using the logarithm here which is inconsistent! Conventionally everyone refers to Radix-2, Radix-4, Radix-8 etc.
max_radix_g : positive -- Limit the FFT Radix to be used, allowing smaller radices for the last stage.
);
port (
clk : std_logic;
reset : std_logic;
i : in complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
o : out complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
)
);
end entity;
library ieee;
use ieee.std_logic_1164.all;
library ieee_proposed;
use ieee_proposed.fixed_pkg.all;
use work.fft_sfixed_pkg.all;
entity dftr_multi_radix_sfixed is
generic (
log_num_inputs_g : positive; -- E.g. 1024 point FFT => log_num_inputs_g = 10. Using the logarithm here ensures an erroneous inputs width cannot be selected.
template_g : sfixed; -- Provide an uninitialised vector solely for passing information about the fixed point range
-- Not using the logarithm here which is inconsistent! Conventionally everyone refers to Radix-2, Radix-4, Radix-8 etc.
max_radix_g : positive -- Limit the FFT Radix to be used, allowing smaller radices for the last stage.
);
port (
clk : std_logic;
reset : std_logic;
i : in complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
o : out complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
)
);
end entity;
library local;
use work.adder_tree_pkg.all;
architecture radix_n of dftr_multi_radix_sfixed is
constant radix_c : positive := minimum(max_radix_g, 2**log_num_inputs_g);
constant group_width_c : natural := o'length/radix_c;
constant twid_c : complex_arr_t(0 to (2**(log_num_inputs_g-1))-1) := init_twiddles_half(2**(log_num_inputs_g-1), template_g);
constant powers_c : natural_vector := twiddle_power(radix_c);
constant opt_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to radix_c-1) := opt_pwr_arr(powers_c, group_width_c);
constant part_pwr : natural_vector_arr_t(1 to radix_c-1)(0 to group_width_c-1) := part_pwr_arr(powers_c, group_width_c);
signal t : complex_arr_t(0 to (2**log_num_inputs_g)-1)(
re(template_g'range),
im(template_g'range)
);
signal prods : complex_2darr_t(1 to radix_c-1)(0 to group_width_c-1)(
re(template_g'range),
im(template_g'range)
);
signal sums : complex_2darr_t(o'range)(0 to radix_c-1)(
re(template_g'range),
im(template_g'range)
);
begin
recurse_g : if (2**log_num_inputs_g) <= radix_c generate
-- No recursion, just map the inputs to the intermediate values
t <= i;
else generate
-- Recurse with radix_c instantiations
dft_g : for j in 0 to radix_c-1 generate
dftr_i : entity work.dftr_multi_radix_sfixed
generic map (
log_num_inputs_g => log_num_inputs_g-local.math_pkg.ceil_log(radix_c),
template_g => template_g,
max_radix_g => max_radix_g
)
port map (
clk => clk,
reset => reset,
i => i(j*i'length/radix_c to (j+1)*i'length/radix_c-1),
o => t(j*t'length/radix_c to (j+1)*t'length/radix_c-1)
);
end generate;
end generate;
-- Combine with the radix_c butterfly
process(clk)
begin
if rising_edge(clk) then
if reset = '1' then
prods <= (others => (others => (
re => (others => '0'),
im => (others => '0')
)));
else
for j in prods'range loop -- j'th output
for k in prods(j)'range loop
prods(j)(k) <= resize(operation(twid_c, part_pwr(j)(k), t(k+(j*group_width_c)), prods'instance_name & integer'image(j) & "," & integer'image(k)), template_g);
end loop;
end loop;
end if;
end if;
end process;
col_g : for j in sums'range generate -- o'range
constant group_c : natural := (j / group_width_c);
begin
sums(j)(0) <= t(j-(group_c*group_width_c));
row_g : for k in 1 to radix_c-1 generate
constant id : string := prods'instance_name & integer'image(j) & "," & integer'image(k);
begin
sums(j)(k) <= resize(operation(twid_c, opt_pwr(k)(group_c), prods(k)(j-(group_c*group_width_c)), id), template_g);
end generate;
end generate;
adders_g : for j in o'range generate
signal u : complex_t(
re(output_bits(template_g'high, sums(j)'length) downto template_g'low),
im(output_bits(template_g'high, sums(j)'length) downto template_g'low)
);
begin
assert false
report "Adders: " & integer'image(radix_c-1)
severity note;
adder_tree_complex_pipe_i : entity work.adder_tree_complex_pipe
generic map (
depth_g => local.math_pkg.ceil_log(sums(j)'length),
num_operands_g => sums(j)'length,
template_g => template_g
)
port map (
clk => clk,
reset => reset,
i => sums(j),
o => u
);
-- Slice the results down to size and assume simulation will detect deviation from
-- Octave derived results comparisons.
o(j) <= (
u.re(template_g'range),
u.im(template_g'range)
);
end generate;
end architecture;
library local;
-- Just perform the bit reversal on the input indices, then call the recursive component.
architecture radix_n of dft_multi_radix_sfixed is
begin
assert 2**local.math_pkg.ceil_log(max_radix_g) = max_radix_g
report "max_radix_g must be a power of 2."
severity error;
dftr_i : entity work.dftr_multi_radix_sfixed
generic map (
log_num_inputs_g => log_num_inputs_g,
template_g => template_g,
max_radix_g => max_radix_g
)
port map (
clk => clk,
reset => reset,
i => array_reverse(i),
o => o
);
end architecture;
Here the same code efficiencies were made as for the real version.
- Recursion is not 'terminated' in its own special case clause, merely the recursive instantiations are omitted at leaf nodes. We no longer require the stage 1 butterflies to bootstrap our understanding and their implementation is lost in the general combiner for any number of inputs for any radix.
- The max_radix_g generic is provided to specify the intended radix to use at each stage of recursion. It is a maximum because a smaller radix may be used in order to finish off the recursion with the remaining inputs that do not fill an instantiation of the desired radix size. As we now have multi-radix support we can recurse on the same Radix-n component simply by specifying the radix as a generic value. The inconsistency here is that whilst the number of inputs is specified as the log of the size, the radix is not just because of the user familiarity with the FFT terminology. Your mileage may vary here, each to their own OCD.
- The operands for summation are assembled into a two dimensional array such that the first index peals off a one dimensional array of operands for the adder tree logic.
Synthesis Results
So as usual, Xilinx's Vivado has ruled itself out of the running by not properly supporting recursion. This leaves me trying only Intel Quartus Prime's Webpack. Here I tried both:
- Quartus 18.1.1 Build 646 04/11/2019 SJ Lite Edition
- Quartus 20.1.1 Build 720 11/11/2020 SJ Lite Edition
Error (10410): VHDL Type Conversion error at fft_sfixed_pkg.vhdl(XX): Type Conversion near text or symbol "complex_t" must have one argument
This is caused by VHDL constructs like:
variable ret : complex_arr_t(i'range)(
re(template'range),
im(template'range)
);
variable ret : complex_arr_t(i'range)(
re(i(i'low).re'range),
im(i(i'low).im'range)
);
The problem seems to be a lack of full VHDL-2008 support for record types containing unconstrained arrays.
- VHDL-2008 support for unconstrained arrays in records
- Unconstrained array of unconstrained records, VHDL-2008, Quartus Prime 16.0.0
- ieee.fixed_pkg in Quartus Prime 16.0 Lite Edition
Conclusions
If anyone would be willing to try my code for me and provide me with a graphic of the RTL elaboration of a suitably sized design then I'll have confirmation of synthesability and be able to include the results in this blog. After all the work on deriving an FFT implementation this feels rather anticlimactic. However, the fully parallel implementation quite simply is not practical at all beyond a useful project from which to learn about the Fast Fourier Transform. In reality, one would immediately choose an iterative approach, performing the successive layers of recursion on values stored in a RAM in order to limit the arithmetic resource requirements. So armed with what I now know, something like the implementation used in the article "Design and Implementation of 256-Point Radix-4 100 Gbit/s FFT Algorithm into FPGA for High-Speed Applications" would be more appropriate.
I have however demonstrated the benefits of Radix-2 & Radix-4 over any higher radix. Questions about the radix size are often posted on DSP forums, so are clearly of interest, along with questions about the benefits of mixed radix which I partially address here with the closing level of recursion. I believe I have demonstrated a practical way to derive the twiddle factor powers for any radix FFT at any stage of recursion. Finally, I think this recursive example is one where a "just as good" iterative implementation can be realised because the number of inputs and outputs is the same at each stage. However the adder trees cannot be done quite so easily in an iterative fashion (I think one would need a partially used two dimensional array with some clever indexing). As such no further attempts will be made with other implementations, I shall simply use Vivado's IP Catalogue to generate an FFT Core of the required size and be happy I understand the mathematics behind it.
References
- Github Source Code
- Cooley-Tukey fast Fourier transform algorithm
- An Algorithm for the Machine Calculation of Complex Fourier Series, James W. Cooley and John W. Tukey - Original Paper for historical curiosity
- A DFT and FFT Tutorial
- Lecture notes: The Fast Fourier Transform; Radix–2 and Radix–4 Algorithms, Kenneth E. Barner, University of Delaware
- Compiling VHDL For The Missing Fixed And Floating Point Libraries
- Adder Trees Pipelined Efficiently by Recursion
- Design and Implementation of 256-Point Radix-4 100 Gbit/s FFT Algorithm into FPGA for High-Speed Applications