use crate::{
cufft::CuFFTPlan,
dsp::{bpsk_modulate, frequency_shift, real_to_complex},
};
use anyhow::Result;
use cudarc::{
driver::{CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg},
nvrtc::{compile_ptx_with_opts, safe::CompileOptions},
};
use ndarray::prelude::*;
use num_complex::Complex32;
use std::{ops::RangeInclusive, sync::Arc};
#[derive(Debug)]
pub struct CuFFTAcquisition {
stream: Arc<CudaStream>,
parameters: Parameters,
buffers: Buffers,
plans: Plans,
kernels: Kernels,
}
#[derive(Debug, Clone, PartialEq)]
pub struct AcquisitionConfiguration {
pub sample_rate: u32,
pub coherent_integration_ms: usize,
pub num_noncoherent_integrations: usize,
pub doppler_range: RangeInclusive<f64>,
pub doppler_oversampling: usize,
pub doppler_block_size: usize,
}
impl Default for AcquisitionConfiguration {
fn default() -> AcquisitionConfiguration {
AcquisitionConfiguration {
sample_rate: 8_000_000,
coherent_integration_ms: 20,
num_noncoherent_integrations: 14,
doppler_range: -10e3..=10e3,
doppler_oversampling: 2,
doppler_block_size: 8,
}
}
}
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
struct Parameters {
threads_per_block: usize,
samples_per_ms: usize,
coherent_ms: usize,
num_noncoherent: usize,
doppler_oversampling: usize,
doppler_block_size: usize,
num_doppler_blocks: usize,
first_doppler_bin: isize,
}
#[derive(Debug)]
struct Buffers {
signal_fft_cuda: CudaSlice<f32>,
signal_fft_cuda_is_set: bool,
replicas_fft_cuda: CudaSlice<f32>,
products_cuda: CudaSlice<f32>,
iffts_cuda: CudaSlice<f32>,
results_cuda: CudaSlice<f32>,
}
#[derive(Debug)]
struct Plans {
signal_plan: CuFFTPlan,
replicas_plan: CuFFTPlan,
iffts_plan: CuFFTPlan,
}
#[derive(Debug)]
struct Kernels {
signal_replica_products: CudaFunction,
noncoherent_accumulate: CudaFunction,
}
impl CuFFTAcquisition {
pub fn new(
ctx: Arc<CudaContext>,
stream: Arc<CudaStream>,
configuration: &AcquisitionConfiguration,
) -> Result<CuFFTAcquisition> {
let parameters = Parameters::new(configuration)?;
let kernels = Kernels::new(&ctx, ¶meters)?;
let plans = Plans::new(&stream, ¶meters)?;
let buffers = unsafe { Buffers::new(&stream, ¶meters)? };
Ok(CuFFTAcquisition {
stream,
parameters,
plans,
buffers,
kernels,
})
}
pub fn set_signal(&mut self, signal: &[Complex32]) -> Result<()> {
self.compute_signal_fft(signal)?;
Ok(())
}
pub fn acquire(&mut self, prns: &[usize]) -> Result<Array3<f32>> {
let mut results = Array3::zeros((
prns.len(),
self.parameters.results_dopplers(),
self.parameters.coherent_samples(),
));
for (prn_idx, &prn) in prns.iter().enumerate() {
self.compute_replicas_fft(prn)?;
unsafe { self.duplicate_replicas_fft() }?;
for doppler_block in 0..self.parameters.num_doppler_blocks {
let doppler_bin0 = self.parameters.first_doppler_bin
+ isize::try_from(doppler_block * self.parameters.doppler_block_size).unwrap();
let replica_offset = if doppler_bin0 >= 0 {
self.parameters.nfft() - usize::try_from(doppler_bin0).unwrap()
} else {
usize::try_from(-doppler_bin0).unwrap()
};
unsafe { self.compute_products(replica_offset) }?;
unsafe { self.compute_iffts() }?;
let doppler_bin0_hz = doppler_bin0 as f64 * self.parameters.doppler_bin_hz();
let carrier_frequency = 1575.42e6; let offset_per_noncoherent = -doppler_bin0_hz / carrier_frequency
* self.parameters.coherent_samples() as f64;
unsafe {
self.compute_noncoherent_accumulation(doppler_block, offset_per_noncoherent)
}?;
}
let mut results_slice = results.slice_mut(s![prn_idx, .., ..]);
self.stream.memcpy_dtoh(
&self.buffers.results_cuda,
results_slice.as_slice_mut().unwrap(),
)?;
}
Ok(results)
}
pub fn doppler_axis(&self) -> impl Iterator<Item = f64> {
self.parameters.doppler_axis()
}
pub fn sample_rate(&self) -> f64 {
self.parameters.samp_rate()
}
fn compute_signal_fft(&mut self, signal: &[Complex32]) -> Result<()> {
let num_signal_samples = self.parameters.num_signal_samples();
anyhow::ensure!(
signal.len() >= num_signal_samples,
"not enough signal samples; required {num_signal_samples} got {}",
signal.len()
);
let signal_cuda = self
.stream
.clone_htod(complex32_slice_to_f32(&signal[..num_signal_samples]))?;
self.plans.signal_plan.execute_c2c_forward(
signal_cuda.as_view(),
self.buffers.signal_fft_cuda.as_view_mut(),
)?;
self.buffers.signal_fft_cuda_is_set = true;
Ok(())
}
fn compute_replicas_fft(&mut self, prn: usize) -> Result<()> {
let code = crate::gps::l1ca::ca_code(prn)?;
let mut replicas = vec![Complex32::new(0.0, 0.0); self.parameters.num_replica_samples()];
let chip_rate = 1.023e6;
let replica_zero_doppler = real_to_complex(bpsk_modulate(
&code,
chip_rate,
(self.parameters.samples_per_ms * 1000) as f64,
))
.take(self.parameters.coherent_samples())
.collect::<Vec<_>>();
for (subdoppler_bin, replica) in replicas
.chunks_exact_mut(self.parameters.nfft())
.enumerate()
{
let subdoppler = isize::try_from(subdoppler_bin).unwrap()
- isize::try_from(self.parameters.doppler_oversampling).unwrap() / 2;
if subdoppler != 0 {
let doppler_cycles_sample = subdoppler as f64
/ (self.parameters.doppler_oversampling * self.parameters.nfft()) as f64;
let replica_with_doppler = frequency_shift(
replica_zero_doppler.iter().copied(),
doppler_cycles_sample,
1.0,
);
for (z, w) in replica.iter_mut().zip(replica_with_doppler) {
*z = w;
}
} else {
replica[..replica_zero_doppler.len()].copy_from_slice(&replica_zero_doppler);
}
}
let replicas_cuda = self.stream.clone_htod(complex32_slice_to_f32(&replicas))?;
self.plans.replicas_plan.execute_c2c_forward(
replicas_cuda.as_view(),
self.buffers
.replicas_fft_cuda
.slice_mut(..self.buffers.replicas_fft_cuda.len() - 2 * self.parameters.nfft()),
)?;
Ok(())
}
unsafe fn duplicate_replicas_fft(&mut self) -> Result<()> {
for n in 0..self.parameters.doppler_oversampling {
let offset = 4 * self.parameters.nfft() * n;
let (a, mut b) = self
.buffers
.replicas_fft_cuda
.split_at_mut(offset + 2 * self.parameters.nfft());
self.stream.memcpy_dtod(
&a.slice(offset..),
&mut b.slice_mut(..2 * self.parameters.nfft()),
)?;
}
Ok(())
}
unsafe fn compute_products(&mut self, replica_offset: usize) -> Result<()> {
anyhow::ensure!(
self.buffers.signal_fft_cuda_is_set,
"set_signal must be called before acquire"
);
let mut builder = self
.stream
.launch_builder(&self.kernels.signal_replica_products);
builder.arg(&self.buffers.signal_fft_cuda);
let replicas_slice = self.buffers.replicas_fft_cuda.slice(2 * replica_offset..);
builder.arg(&replicas_slice);
builder.arg(&mut self.buffers.products_cuda);
unsafe {
builder.launch(LaunchConfig {
grid_dim: (
self.parameters.block_x(),
self.parameters.num_noncoherent.try_into().unwrap(),
self.parameters.doppler_block_size.try_into().unwrap(),
),
block_dim: self.parameters.block_dim(),
shared_mem_bytes: 0,
})
}?;
Ok(())
}
unsafe fn compute_iffts(&mut self) -> Result<()> {
self.plans.iffts_plan.execute_c2c_inverse(
self.buffers.products_cuda.as_view(),
self.buffers.iffts_cuda.as_view_mut(),
)?;
Ok(())
}
unsafe fn compute_noncoherent_accumulation(
&mut self,
doppler_block: usize,
offset_per_noncoherent: f64,
) -> Result<()> {
let mut builder = self
.stream
.launch_builder(&self.kernels.noncoherent_accumulate);
builder.arg(&self.buffers.iffts_cuda);
let mut results_slice = self.buffers.results_cuda.slice_mut(
self.parameters.results_block_size() * doppler_block
..self.parameters.results_block_size() * (doppler_block + 1),
);
let offset_per_noncoherent = offset_per_noncoherent as f32;
builder.arg(&offset_per_noncoherent);
builder.arg(&mut results_slice);
unsafe {
builder.launch(LaunchConfig {
grid_dim: (
self.parameters
.coherent_samples()
.div_ceil(self.parameters.threads_per_block)
.try_into()
.unwrap(),
(self.parameters.doppler_oversampling * self.parameters.doppler_block_size)
.try_into()
.unwrap(),
1,
),
block_dim: self.parameters.block_dim(),
shared_mem_bytes: 0,
})
}?;
Ok(())
}
}
impl Parameters {
fn new(configuration: &AcquisitionConfiguration) -> Result<Parameters> {
anyhow::ensure!(
configuration.sample_rate.is_multiple_of(1000),
"there must be an integer number of samples per millisecond"
);
anyhow::ensure!(
configuration.coherent_integration_ms > 0,
"the coherent integration length must not be zero"
);
anyhow::ensure!(
configuration.num_noncoherent_integrations > 0,
"the number of non-coherent integrations must not be zero"
);
anyhow::ensure!(
configuration.doppler_oversampling > 0,
"the Doppler oversampling must not be zero"
);
anyhow::ensure!(
configuration.doppler_block_size > 0,
"the Doppler block size must not be zero"
);
let samples_per_ms = (configuration.sample_rate / 1000).try_into().unwrap();
let nfft = 2 * configuration.coherent_integration_ms * samples_per_ms;
let doppler_bin_width = configuration.sample_rate as f64 / nfft as f64;
let min_subdoppler = -isize::try_from(configuration.doppler_oversampling).unwrap() / 2;
let min_subdoppler = min_subdoppler as f64 * configuration.sample_rate as f64
/ (configuration.doppler_oversampling * nfft) as f64;
let first_doppler_bin = ((*configuration.doppler_range.start() + min_subdoppler)
/ doppler_bin_width)
.floor() as isize;
let first_doppler_bin = first_doppler_bin
.div_euclid(isize::try_from(configuration.doppler_oversampling).unwrap())
* isize::try_from(configuration.doppler_oversampling).unwrap();
let max_subdoppler =
configuration.doppler_oversampling - 1 - configuration.doppler_oversampling / 2;
let max_subdoppler = max_subdoppler as f64 * configuration.sample_rate as f64
/ (configuration.doppler_oversampling * nfft) as f64;
let last_doppler_bin = ((*configuration.doppler_range.end() - max_subdoppler)
/ doppler_bin_width)
.ceil() as isize;
let total_doppler_bins = usize::try_from(last_doppler_bin - first_doppler_bin + 1).unwrap();
let adjusted_doppler_bins =
total_doppler_bins.next_multiple_of(configuration.doppler_block_size);
let num_doppler_blocks = adjusted_doppler_bins / configuration.doppler_block_size;
let parameters = Parameters {
threads_per_block: 512,
samples_per_ms,
coherent_ms: configuration.coherent_integration_ms,
num_noncoherent: configuration.num_noncoherent_integrations,
doppler_oversampling: configuration.doppler_oversampling,
doppler_block_size: configuration.doppler_block_size,
num_doppler_blocks,
first_doppler_bin,
};
assert_eq!(parameters.nfft(), nfft);
Ok(parameters)
}
fn coherent_samples(&self) -> usize {
self.coherent_ms * self.samples_per_ms
}
fn nfft(&self) -> usize {
2 * self.coherent_samples()
}
fn num_signal_samples(&self) -> usize {
(self.num_noncoherent - 1) * self.coherent_samples() + self.nfft()
}
fn num_replica_samples(&self) -> usize {
self.nfft() * self.doppler_oversampling
}
fn num_product_samples(&self) -> usize {
self.nfft() * self.results_block_dopplers() * self.num_noncoherent
}
fn results_block_size(&self) -> usize {
self.coherent_samples() * self.results_block_dopplers()
}
fn results_block_dopplers(&self) -> usize {
self.doppler_oversampling * self.doppler_block_size
}
fn results_samples(&self) -> usize {
self.results_block_size() * self.num_doppler_blocks
}
fn results_dopplers(&self) -> usize {
self.results_block_dopplers() * self.num_doppler_blocks
}
fn block_dim(&self) -> (u32, u32, u32) {
(self.threads_per_block.try_into().unwrap(), 1, 1)
}
fn block_x(&self) -> u32 {
self.nfft()
.div_ceil(self.threads_per_block)
.try_into()
.unwrap()
}
fn samp_rate(&self) -> f64 {
(self.samples_per_ms * 1000) as f64
}
fn doppler_bin_hz(&self) -> f64 {
self.samp_rate() / self.nfft() as f64
}
fn doppler_axis(&self) -> impl Iterator<Item = f64> {
(self.first_doppler_bin..)
.take(self.num_doppler_blocks * self.doppler_block_size)
.flat_map(move |bin| {
let freq = bin as f64 * self.doppler_bin_hz();
(0..self.doppler_oversampling).map(move |subdoppler| {
let subdoppler = isize::try_from(subdoppler).unwrap()
- isize::try_from(self.doppler_oversampling).unwrap() / 2;
let subdoppler = self.samp_rate() * subdoppler as f64
/ (self.doppler_oversampling * self.nfft()) as f64;
freq + subdoppler
})
})
}
}
impl Buffers {
unsafe fn new(stream: &Arc<CudaStream>, parameters: &Parameters) -> Result<Buffers> {
let signal_fft_cuda =
unsafe { stream.alloc::<f32>(2 * parameters.nfft() * parameters.num_noncoherent) }?;
let replicas_fft_cuda =
unsafe { stream.alloc::<f32>(4 * parameters.num_replica_samples()) }?;
let products_cuda = unsafe { stream.alloc::<f32>(2 * parameters.num_product_samples()) }?;
let iffts_cuda = unsafe { stream.alloc::<f32>(2 * parameters.num_product_samples()) }?;
let results_cuda = unsafe { stream.alloc::<f32>(parameters.results_samples()) }?;
Ok(Buffers {
signal_fft_cuda,
signal_fft_cuda_is_set: false,
replicas_fft_cuda,
products_cuda,
iffts_cuda,
results_cuda,
})
}
}
impl Plans {
fn new(stream: &Arc<CudaStream>, parameters: &Parameters) -> Result<Plans> {
let signal_plan = CuFFTPlan::new_c2c_batch_with_idist(
stream,
parameters.nfft(),
parameters.num_noncoherent,
parameters.coherent_samples(),
)?;
let replicas_plan = CuFFTPlan::new_c2c_batch_with_odist(
stream,
parameters.nfft(),
parameters.doppler_oversampling,
2 * parameters.nfft(),
)?;
let iffts_plan = CuFFTPlan::new_c2c_batch(
stream,
parameters.nfft(),
parameters.doppler_block_size
* parameters.doppler_oversampling
* parameters.num_noncoherent,
)?;
Ok(Plans {
signal_plan,
replicas_plan,
iffts_plan,
})
}
}
impl Kernels {
fn new(ctx: &Arc<CudaContext>, parameters: &Parameters) -> Result<Kernels> {
let ptx = compile_ptx_with_opts(
kernels_ptx(parameters),
CompileOptions {
include_paths: vec!["/opt/cuda/targets/x86_64-linux/include/".to_string()],
..Default::default()
},
)?;
let module = ctx.load_module(ptx)?;
let signal_replica_products = module.load_function("signal_replica_products")?;
let noncoherent_accumulate = module.load_function("noncoherent_accumulate")?;
Ok(Kernels {
signal_replica_products,
noncoherent_accumulate,
})
}
}
fn kernels_ptx(parameters: &Parameters) -> String {
let nfft = parameters.nfft();
let coherent_samples = parameters.coherent_samples();
let doppler_oversampling = parameters.doppler_oversampling;
let num_noncoherent = parameters.num_noncoherent;
format!(
r#"
#include <cuComplex.h>
extern "C" __global__ void signal_replica_products(
const cuComplex* signal,
const cuComplex* replicas,
cuComplex* products
) {{
const int index = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = blockDim.x * gridDim.x;
const int signal_offset = blockIdx.y * {nfft};
const int replica_idx = blockIdx.z;
for (int i = 0; i < {doppler_oversampling}; ++i) {{
for (int j = index; j < {nfft}; j += stride) {{
const auto z = signal[signal_offset + j];
const auto w = replicas[i * 2 * {nfft} + j - replica_idx];
const auto prod = cuCmulf(z, cuConjf(w));
const int out_idx = (replica_idx * {doppler_oversampling} + i) * {nfft} * {num_noncoherent}
+ signal_offset + j;
products[out_idx] = prod;
}}
}}
}}
extern "C" __global__ void noncoherent_accumulate(
const cuComplex* z,
float offset_per_noncoherent,
float* out
) {{
const int index = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = blockDim.x * gridDim.x;
const int z_offset = blockIdx.y * {nfft} * {num_noncoherent};
const int out_offset = blockIdx.y * {coherent_samples};
for (int i = index; i < {coherent_samples}; i += stride) {{
float accumulator = 0.0f;
for (int j = 0; j < {num_noncoherent}; ++j) {{
const int offset = roundf(offset_per_noncoherent * j);
auto i_corrected = i + offset;
if (i_corrected < 0) {{
i_corrected -= {coherent_samples};
}} else if (i_corrected >= {coherent_samples}
&& j * {nfft} + i_corrected + {coherent_samples} < {nfft} * {num_noncoherent}) {{
i_corrected += {coherent_samples};
}}
const auto a = z[z_offset + j * {nfft} + i_corrected];
const float magsq = cuCrealf(a) * cuCrealf(a) + cuCimagf(a) * cuCimagf(a);
accumulator += magsq;
}}
out[out_offset + i] = accumulator;
}}
}}
"#
)
}
fn complex32_slice_to_f32(x: &[Complex32]) -> &[f32] {
unsafe { std::slice::from_raw_parts(x.as_ptr().cast::<f32>(), 2 * x.len()) }
}
#[cfg(test)]
mod test {
use super::*;
use num_complex::ComplexFloat;
use rand::prelude::*;
use std::sync::Mutex;
static GPU_MUTEX: Mutex<()> = Mutex::new(());
fn f32_slice_to_complex32(x: &[f32]) -> &[Complex32] {
assert!(x.len().is_multiple_of(2));
unsafe { std::slice::from_raw_parts(x.as_ptr().cast::<Complex32>(), x.len() / 2) }
}
fn new_complex32_random(rng: &mut ThreadRng, n: usize) -> Vec<Complex32> {
let range = -1.0..=1.0;
std::iter::repeat_with(|| {
Complex32::new(
rng.random_range(range.clone()),
rng.random_range(range.clone()),
)
})
.take(n)
.collect()
}
macro_rules! assert_close {
($x:expr, $y:expr, $relative_tolerance:expr) => {
assert_close!($x, $y, $relative_tolerance, 0.0);
};
($x:expr, $y:expr, $relative_tolerance:expr, $absolute_tolerance:expr) => {
assert_eq!($x.len(), $y.len());
for (a, b) in $x.iter().zip($y.iter()) {
assert!(
(2.0 * (a - b).abs() / (a.abs() + b.abs()) <= $relative_tolerance)
|| (a - b).abs() <= $absolute_tolerance,
"{a} and {b} differ by more than relative tolerance {} and absolute tolerance {}",
$relative_tolerance,
$absolute_tolerance,
);
}
};
}
#[test]
fn compute_signal_fft() {
let _gpu = GPU_MUTEX.lock().unwrap();
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let mut acquisition =
CuFFTAcquisition::new(ctx, stream.clone(), &AcquisitionConfiguration::default())
.unwrap();
let mut rng = rand::rng();
let signal = new_complex32_random(&mut rng, acquisition.parameters.num_signal_samples());
acquisition.compute_signal_fft(&signal).unwrap();
let signal_fft = stream
.clone_dtoh(&acquisition.buffers.signal_fft_cuda)
.unwrap();
let mut planner = rustfft::FftPlanner::new();
let fft = planner.plan_fft_forward(acquisition.parameters.nfft());
let signal_rustfft = signal
.windows(acquisition.parameters.nfft())
.step_by(acquisition.parameters.coherent_samples())
.flat_map(|window| {
let mut window = window.to_vec();
fft.process(&mut window);
window
})
.collect::<Vec<_>>();
assert_close!(f32_slice_to_complex32(&signal_fft), &signal_rustfft, 5e-3);
}
#[test]
fn compute_replicas_fft() {
let _gpu = GPU_MUTEX.lock().unwrap();
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let mut acquisition =
CuFFTAcquisition::new(ctx, stream.clone(), &AcquisitionConfiguration::default())
.unwrap();
let prn = 1;
acquisition.compute_replicas_fft(prn).unwrap();
let replicas_fft = stream
.clone_dtoh(&acquisition.buffers.replicas_fft_cuda)
.unwrap();
let mut planner = rustfft::FftPlanner::new();
let fft = planner.plan_fft_forward(acquisition.parameters.nfft());
for (bin, replica_fft) in f32_slice_to_complex32(&replicas_fft)
.chunks_exact(acquisition.parameters.nfft())
.step_by(2)
.enumerate()
{
let code = crate::gps::l1ca::ca_code(prn).unwrap();
let samp_rate = 8e6;
let bin = isize::try_from(bin).unwrap()
- isize::try_from(acquisition.parameters.doppler_oversampling).unwrap() / 2;
let doppler = samp_rate * (bin as f64)
/ (acquisition.parameters.doppler_oversampling * acquisition.parameters.nfft())
as f64;
let chip_rate = 1.023e6;
let samp_rate = 8e6;
let mut replica_rustfft = frequency_shift(
real_to_complex(bpsk_modulate(&code, chip_rate, samp_rate)),
doppler,
samp_rate,
)
.take(acquisition.parameters.coherent_samples())
.chain(std::iter::repeat(Complex32::new(0.0, 0.0)))
.take(acquisition.parameters.nfft())
.collect::<Vec<_>>();
fft.process(&mut replica_rustfft);
assert_close!(replica_fft, &replica_rustfft, 3e-3, 1e-3);
}
}
#[test]
fn compute_products() {
let _gpu = GPU_MUTEX.lock().unwrap();
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let mut acquisition =
CuFFTAcquisition::new(ctx, stream.clone(), &AcquisitionConfiguration::default())
.unwrap();
let mut rng = rand::rng();
let signal_fft = new_complex32_random(
&mut rng,
acquisition.parameters.nfft() * acquisition.parameters.num_noncoherent,
);
stream
.memcpy_htod(
complex32_slice_to_f32(&signal_fft),
&mut acquisition.buffers.signal_fft_cuda,
)
.unwrap();
acquisition.buffers.signal_fft_cuda_is_set = true;
let replicas_fft =
new_complex32_random(&mut rng, acquisition.parameters.num_replica_samples());
let mut replicas_fft_2x = Vec::new();
for replica in replicas_fft.chunks_exact(acquisition.parameters.nfft()) {
replicas_fft_2x.extend_from_slice(replica);
replicas_fft_2x.resize(
replicas_fft_2x.len() + acquisition.parameters.nfft(),
Complex32::new(0.0, 0.0),
);
}
stream
.memcpy_htod(
complex32_slice_to_f32(&replicas_fft_2x),
&mut acquisition.buffers.replicas_fft_cuda,
)
.unwrap();
let replica_offset = rng.random_range(
0..=acquisition.parameters.nfft() - acquisition.parameters.doppler_block_size,
);
unsafe {
acquisition.duplicate_replicas_fft().unwrap();
acquisition.compute_products(replica_offset).unwrap();
}
let products = stream
.clone_dtoh(&acquisition.buffers.products_cuda)
.unwrap();
let mut products_expected = Vec::new();
for doppler_idx in 0..acquisition.parameters.doppler_block_size {
for replica in replicas_fft.chunks_exact(acquisition.parameters.nfft()) {
for signal in signal_fft.chunks_exact(acquisition.parameters.nfft()) {
for (z, w) in signal.iter().zip(
replica
.iter()
.cycle()
.skip(replica_offset + acquisition.parameters.nfft() - doppler_idx),
) {
products_expected.push(z * w.conj());
}
}
}
}
assert_close!(f32_slice_to_complex32(&products), &products_expected, 1e-6);
}
#[test]
fn compute_iffts() {
let _gpu = GPU_MUTEX.lock().unwrap();
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let mut acquisition =
CuFFTAcquisition::new(ctx, stream.clone(), &AcquisitionConfiguration::default())
.unwrap();
let mut rng = rand::rng();
let products = new_complex32_random(&mut rng, acquisition.parameters.num_product_samples());
stream
.memcpy_htod(
complex32_slice_to_f32(&products),
&mut acquisition.buffers.products_cuda,
)
.unwrap();
unsafe { acquisition.compute_iffts().unwrap() };
let iffts = stream.clone_dtoh(&acquisition.buffers.iffts_cuda).unwrap();
let mut planner = rustfft::FftPlanner::new();
let fft = planner.plan_fft_inverse(acquisition.parameters.nfft());
let mut iffts_rustfft = products.clone();
fft.process(&mut iffts_rustfft);
assert_close!(f32_slice_to_complex32(&iffts), &iffts_rustfft, 2e-2);
}
#[test]
fn compute_noncoherent_accumulation() {
let _gpu = GPU_MUTEX.lock().unwrap();
let ctx = CudaContext::new(0).unwrap();
let stream = ctx.default_stream();
let mut acquisition =
CuFFTAcquisition::new(ctx, stream.clone(), &AcquisitionConfiguration::default())
.unwrap();
let mut rng = rand::rng();
let iffts = new_complex32_random(&mut rng, acquisition.parameters.num_product_samples());
stream
.memcpy_htod(
complex32_slice_to_f32(&iffts),
&mut acquisition.buffers.iffts_cuda,
)
.unwrap();
let doppler_block = rng.random_range(0..acquisition.parameters.num_doppler_blocks);
let offset_per_noncoherent: f32 = rng.random_range(-3.0..=3.0);
unsafe {
acquisition
.compute_noncoherent_accumulation(doppler_block, offset_per_noncoherent.into())
.unwrap()
};
let results_offset = doppler_block * acquisition.parameters.results_block_size();
let results_len = acquisition.parameters.results_block_size();
let partial_results = stream
.clone_dtoh(
&acquisition
.buffers
.results_cuda
.slice(results_offset..results_offset + results_len),
)
.unwrap();
let coherent_samples = acquisition.parameters.coherent_samples();
let nfft = acquisition.parameters.nfft();
let results_expected = iffts
.chunks_exact(nfft * acquisition.parameters.num_noncoherent)
.flat_map(|chunk| {
(0..coherent_samples).map(|idx| {
let mut accumulator = 0.0;
for fft_idx in 0..acquisition.parameters.num_noncoherent {
let offset = (offset_per_noncoherent * fft_idx as f32).round() as isize;
let idx = isize::try_from(idx).unwrap() + offset;
let coherent_samples_i = isize::try_from(coherent_samples).unwrap();
let (fft_idx, idx) = if idx < 0 {
assert!(fft_idx > 0);
(fft_idx - 1, idx + coherent_samples_i)
} else if idx >= coherent_samples_i
&& fft_idx + 1 < acquisition.parameters.num_noncoherent
{
(fft_idx + 1, idx - coherent_samples_i)
} else {
(fft_idx, idx)
};
let idx = usize::try_from(idx).unwrap();
let z = chunk[fft_idx * nfft + idx];
accumulator += z.norm_sqr();
}
accumulator
})
})
.collect::<Vec<f32>>();
assert_close!(&partial_results, &results_expected, 1e-6);
}
}