use fearless_simd::{dispatch, Simd};
use crate::algorithms::bravo::{bit_rev_bravo_f32, bit_rev_bravo_f64};
use crate::kernels::codelets::{fft_dit_codelet_16_f64, fft_dit_codelet_32_f32};
use crate::kernels::dit::*;
use crate::options::Options;
use crate::parallel::run_maybe_in_parallel;
use crate::planner::{Direction, PlannerDit32, PlannerDit64};
const L1_BLOCK_SIZE: usize = 1024;
fn recursive_dit_fft_f64<S: Simd>(
simd: S,
reals: &mut [f64],
imags: &mut [f64],
size: usize,
planner: &PlannerDit64,
opts: &Options,
mut stage_twiddle_idx: usize,
) -> usize {
let log_size = size.ilog2() as usize;
if size <= L1_BLOCK_SIZE {
let codelet_stages = 4;
let start_stage = if stage_twiddle_idx == 0 && size >= power_of_two(codelet_stages) {
fft_dit_codelet_16_f64(simd, &mut reals[..size], &mut imags[..size]);
codelet_stages
} else {
0
};
for stage in start_stage..log_size {
stage_twiddle_idx = execute_dit_stage_f64(
simd,
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
);
}
stage_twiddle_idx
} else {
let half = size / 2;
let log_half = half.ilog2() as usize;
let (re_first_half, re_second_half) = reals.split_at_mut(half);
let (im_first_half, im_second_half) = imags.split_at_mut(half);
run_maybe_in_parallel(
size > opts.smallest_parallel_chunk_size,
|| recursive_dit_fft_f64(simd, re_first_half, im_first_half, half, planner, opts, 0),
|| recursive_dit_fft_f64(simd, re_second_half, im_second_half, half, planner, opts, 0),
);
stage_twiddle_idx = log_half.saturating_sub(6);
for stage in log_half..log_size {
stage_twiddle_idx = execute_dit_stage_f64(
simd,
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
);
}
stage_twiddle_idx
}
}
fn recursive_dit_fft_f32<S: Simd>(
simd: S,
reals: &mut [f32],
imags: &mut [f32],
size: usize,
planner: &PlannerDit32,
opts: &Options,
mut stage_twiddle_idx: usize,
) -> usize {
let log_size = size.ilog2() as usize;
if size <= L1_BLOCK_SIZE {
let codelet_stages = 5;
let start_stage = if stage_twiddle_idx == 0 && size >= power_of_two(codelet_stages) {
fft_dit_codelet_32_f32(simd, &mut reals[..size], &mut imags[..size]);
codelet_stages
} else {
0
};
for stage in start_stage..log_size {
stage_twiddle_idx = execute_dit_stage_f32(
simd,
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
);
}
stage_twiddle_idx
} else {
let half = size / 2;
let log_half = half.ilog2() as usize;
let (re_first_half, re_second_half) = reals.split_at_mut(half);
let (im_first_half, im_second_half) = imags.split_at_mut(half);
run_maybe_in_parallel(
size > opts.smallest_parallel_chunk_size,
|| recursive_dit_fft_f32(simd, re_first_half, im_first_half, half, planner, opts, 0),
|| recursive_dit_fft_f32(simd, re_second_half, im_second_half, half, planner, opts, 0),
);
stage_twiddle_idx = log_half.saturating_sub(6);
for stage in log_half..log_size {
stage_twiddle_idx = execute_dit_stage_f32(
simd,
&mut reals[..size],
&mut imags[..size],
stage,
planner,
stage_twiddle_idx,
);
}
stage_twiddle_idx
}
}
fn execute_dit_stage_f64<S: Simd>(
simd: S,
reals: &mut [f64],
imags: &mut [f64],
stage: usize,
planner: &PlannerDit64,
stage_twiddle_idx: usize,
) -> usize {
let dist = 1 << stage; let chunk_size = dist * 2;
if chunk_size == 2 {
simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
stage_twiddle_idx
} else if chunk_size == 4 {
fft_dit_chunk_4_f64(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 8 {
fft_dit_chunk_8_f64(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 16 {
fft_dit_chunk_16_f64(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 32 {
fft_dit_chunk_32_f64(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 64 {
fft_dit_chunk_64_f64(simd, reals, imags);
stage_twiddle_idx
} else {
let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
fft_dit_chunk_n_f64(simd, reals, imags, twiddles_re, twiddles_im, dist);
stage_twiddle_idx + 1
}
}
fn execute_dit_stage_f32<S: Simd>(
simd: S,
reals: &mut [f32],
imags: &mut [f32],
stage: usize,
planner: &PlannerDit32,
stage_twiddle_idx: usize,
) -> usize {
let dist = 1 << stage; let chunk_size = dist * 2;
if chunk_size == 2 {
simd.vectorize(|| fft_dit_chunk_2(simd, reals, imags));
stage_twiddle_idx
} else if chunk_size == 4 {
fft_dit_chunk_4_f32(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 8 {
fft_dit_chunk_8_f32(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 16 {
fft_dit_chunk_16_f32(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 32 {
fft_dit_chunk_32_f32(simd, reals, imags);
stage_twiddle_idx
} else if chunk_size == 64 {
fft_dit_chunk_64_f32(simd, reals, imags);
stage_twiddle_idx
} else {
let (twiddles_re, twiddles_im) = &planner.stage_twiddles[stage_twiddle_idx];
fft_dit_chunk_n_f32(simd, reals, imags, twiddles_re, twiddles_im, dist);
stage_twiddle_idx + 1
}
}
pub fn fft_64_dit_with_planner_and_opts(
reals: &mut [f64],
imags: &mut [f64],
direction: Direction,
planner: &PlannerDit64,
opts: &Options,
) {
dispatch!(planner.simd_level, simd => fft_64_dit_with_planner_and_opts_impl(simd, reals, imags, direction, planner, opts))
}
#[inline(always)] fn fft_64_dit_with_planner_and_opts_impl<S: Simd>(
simd: S,
reals: &mut [f64],
imags: &mut [f64],
direction: Direction,
planner: &PlannerDit64,
opts: &Options,
) {
assert_eq!(reals.len(), imags.len());
assert!(reals.len().is_power_of_two());
let n = reals.len();
let log_n = n.ilog2() as usize;
assert_eq!(log_n, planner.log_n);
let (reals, imags) = match direction {
Direction::Forward => (reals, imags),
Direction::Reverse => (imags, reals),
};
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| {
simd.vectorize(
#[inline(always)]
|| bit_rev_bravo_f64(simd, reals, log_n),
)
},
|| {
simd.vectorize(
#[inline(always)]
|| bit_rev_bravo_f64(simd, imags, log_n),
)
},
);
simd.vectorize(
#[inline(always)]
|| recursive_dit_fft_f64(simd, reals, imags, n, planner, opts, 0),
);
if let Direction::Reverse = direction {
let scaling_factor = 1.0 / n as f64;
for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) {
*z_re *= scaling_factor;
*z_im *= scaling_factor;
}
}
}
pub fn fft_32_dit_with_planner_and_opts(
reals: &mut [f32],
imags: &mut [f32],
direction: Direction,
planner: &PlannerDit32,
opts: &Options,
) {
dispatch!(planner.simd_level, simd => fft_32_dit_with_planner_and_opts_impl(simd, reals, imags, direction, planner, opts))
}
fn fft_32_dit_with_planner_and_opts_impl<S: Simd>(
simd: S,
reals: &mut [f32],
imags: &mut [f32],
direction: Direction,
planner: &PlannerDit32,
opts: &Options,
) {
assert_eq!(reals.len(), imags.len());
assert!(reals.len().is_power_of_two());
let n = reals.len();
let log_n = n.ilog2() as usize;
assert_eq!(log_n, planner.log_n);
let (reals, imags) = match direction {
Direction::Forward => (reals, imags),
Direction::Reverse => (imags, reals),
};
run_maybe_in_parallel(
opts.multithreaded_bit_reversal,
|| {
simd.vectorize(
#[inline(always)]
|| bit_rev_bravo_f32(simd, reals, log_n),
)
},
|| {
simd.vectorize(
#[inline(always)]
|| bit_rev_bravo_f32(simd, imags, log_n),
)
},
);
simd.vectorize(
#[inline(always)]
|| recursive_dit_fft_f32(simd, reals, imags, n, planner, opts, 0),
);
if let Direction::Reverse = direction {
let scaling_factor = 1.0 / n as f32;
for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) {
*z_re *= scaling_factor;
*z_im *= scaling_factor;
}
}
}
#[inline]
fn power_of_two(power: usize) -> usize {
debug_assert!(power < usize::BITS as usize);
1 << power
}