#[cfg(target_os = "linux")]
use crate::gpu::cubic_cell::host_substrate::HostMomentBatch;
#[cfg(target_os = "linux")]
use crate::gpu::cubic_cell::{
CubicCellDerivativeMomentHostView, CubicCellDerivativeMomentOutput, CubicCellMomentStatus,
GpuCellBranchTag, branch::classify_cell_for_gpu,
};
#[cfg(target_os = "linux")]
use crate::gpu::error::GpuError;
#[cfg(target_os = "linux")]
use crate::gpu::error::GpuResultExt;
#[cfg(target_os = "linux")]
use crate::gpu_err;
#[cfg(target_os = "linux")]
use std::sync::{Arc, Mutex, OnceLock};
#[cfg(target_os = "linux")]
use cudarc::driver::{CudaContext, CudaModule, CudaStream};
/// Attempts to evaluate the moments for `view` on the GPU, returning them in
/// host memory.
///
/// Returns `Ok(None)` when the backend probe reports
/// `GpuError::DriverLibraryUnavailable`; any other probe or dispatch failure
/// is returned as `Err`.
#[cfg(target_os = "linux")]
pub(crate) fn try_device_moments(
    view: &CubicCellDerivativeMomentHostView<'_>,
) -> Result<Option<HostMomentBatch>, GpuError> {
    match CubicCellGpuBackend::probe() {
        Ok(backend) => backend.dispatch(view).map(Some),
        // A missing driver is not an error for callers of this entry point.
        Err(GpuError::DriverLibraryUnavailable { .. }) => Ok(None),
        Err(other) => Err(other),
    }
}
/// Attempts to evaluate the moments for `view` on the GPU via the
/// device-resident dispatch path.
///
/// Returns `Ok(None)` when the backend probe reports
/// `GpuError::DriverLibraryUnavailable`; any other probe or dispatch failure
/// is returned as `Err`.
#[cfg(target_os = "linux")]
pub(crate) fn try_device_moments_resident(
    view: &CubicCellDerivativeMomentHostView<'_>,
) -> Result<Option<CubicCellDerivativeMomentOutput>, GpuError> {
    match CubicCellGpuBackend::probe() {
        Ok(backend) => backend.dispatch_device_resident(view).map(Some),
        // A missing driver is not an error for callers of this entry point.
        Err(GpuError::DriverLibraryUnavailable { .. }) => Ok(None),
        Err(other) => Err(other),
    }
}
/// Handle used to run the cubic-cell derivative-moment kernels through CUDA.
///
/// Obtain it with `CubicCellGpuBackend::probe`, which stores the probe result
/// in a process-wide `OnceLock` and hands out a `'static` reference.
#[cfg(target_os = "linux")]
#[must_use]
pub(crate) struct CubicCellGpuBackend {
/// CUDA context, stream and compiled-module cache backing this handle.
inner: CubicCellGpuContextLinux,
}
/// CUDA resources held by `CubicCellGpuBackend`.
#[cfg(target_os = "linux")]
struct CubicCellGpuContextLinux {
/// CUDA context; compiled PTX is loaded into modules through it.
ctx: Arc<CudaContext>,
/// Stream used for uploads, kernel launches and downloads; initialised from
/// `ctx.default_stream()` when the backend is probed.
stream: Arc<CudaStream>,
/// Cache of loaded modules keyed by the `max_degree` they were generated for.
modules: Mutex<std::collections::HashMap<usize, Arc<CudaModule>>>,
}
#[cfg(target_os = "linux")]
impl CubicCellGpuBackend {
/// Returns the process-wide backend, probing for it on first use.
///
/// The outcome of the first probe (success or failure) is stored in a
/// `OnceLock`; later calls return the stored backend or a clone of the
/// stored error without probing again.
pub(crate) fn probe() -> Result<&'static Self, GpuError> {
    static BACKEND: OnceLock<Result<CubicCellGpuBackend, GpuError>> = OnceLock::new();
    match BACKEND.get_or_init(Self::probe_linux) {
        Ok(backend) => Ok(backend),
        Err(err) => Err(err.clone()),
    }
}
/// Builds a backend from the global GPU runtime.
///
/// # Errors
/// * `GpuError::DriverLibraryUnavailable` when no global runtime exists.
/// * A `gpu_err!` error when no CUDA context can be obtained for the
///   runtime's selected device.
#[cfg(target_os = "linux")]
fn probe_linux() -> Result<Self, GpuError> {
    let Some(runtime) = crate::gpu::runtime::GpuRuntime::global() else {
        return Err(GpuError::DriverLibraryUnavailable {
            reason: "cubic_cell backend: no CUDA runtime available".to_string(),
        });
    };
    let maybe_ctx = crate::gpu::runtime::cuda_context_for(runtime.selected_device().ordinal);
    let ctx = maybe_ctx.ok_or_else(|| {
        gpu_err!(
            "cubic_cell backend: failed to create CUDA context for device {}",
            runtime.selected_device().ordinal
        )
    })?;
    // Grab the stream before `ctx` is moved into the struct below.
    let stream = ctx.default_stream();
    let inner = CubicCellGpuContextLinux {
        ctx,
        stream,
        modules: Mutex::new(std::collections::HashMap::new()),
    };
    Ok(CubicCellGpuBackend { inner })
}
/// Returns the loaded CUDA module for `max_degree`, compiling and caching it
/// on first request.
///
/// The cache lock is not held while the kernel source is compiled and
/// loaded. If another caller inserted a module for the same degree in the
/// meantime, that earlier entry is kept and returned.
///
/// # Errors
/// Fails if the cache mutex is poisoned, NVRTC compilation fails, or the
/// module cannot be loaded into the context.
#[cfg(target_os = "linux")]
fn module_for_degree(&self, max_degree: usize) -> Result<Arc<CudaModule>, GpuError> {
    // The guard is a temporary of this statement, so the lock is released
    // before any compilation starts.
    let cached = self
        .inner
        .modules
        .lock()
        .gpu_ctx("cubic_cell module cache mutex poisoned")?
        .get(&max_degree)
        .map(Arc::clone);
    if let Some(module) = cached {
        return Ok(module);
    }

    let source =
        crate::gpu::cubic_cell::kernel_src::build_cubic_deriv_moments_kernel_source(max_degree);
    let ptx = cudarc::nvrtc::compile_ptx(&source).gpu_ctx_with(|err| {
        format!("cubic_cell NVRTC compile (degree={max_degree}) failed: {err}")
    })?;
    let module = self.inner.ctx.load_module(ptx).gpu_ctx_with(|err| {
        format!("cubic_cell module load (degree={max_degree}) failed: {err}")
    })?;

    let mut cache = self
        .inner
        .modules
        .lock()
        .gpu_ctx("cubic_cell module cache mutex poisoned")?;
    // `or_insert` keeps an entry that raced in ahead of us.
    let stored = cache.entry(max_degree).or_insert(module);
    Ok(Arc::clone(stored))
}
/// Evaluates every cell in `view` and returns moments and statuses in host
/// memory.
///
/// Each cell is classified on the host first. A classification error, or a
/// tag that disagrees with `view.branches`, is recorded in `status` and the
/// cell's moment row stays zero. Accepted `Affine` / `AffineTail` cells are
/// evaluated by `populate_cpu_buckets`; accepted `NonAffineFinite` cells are
/// evaluated by `launch_nonaffine_bucket`.
///
/// # Errors
/// Propagates any error from `launch_nonaffine_bucket`.
#[cfg(target_os = "linux")]
fn dispatch(
    &self,
    view: &CubicCellDerivativeMomentHostView<'_>,
) -> Result<HostMomentBatch, GpuError> {
    let cell_count = view.cells.len();
    let stride = view.max_degree + 1;
    let mut moments = vec![0.0_f64; cell_count * stride];
    let mut status = vec![CubicCellMomentStatus::Ok as u8; cell_count];

    // Cell indices per evaluation path, in ascending cell order.
    let mut kernel_cells: Vec<usize> = Vec::new();
    let mut host_cells: Vec<usize> = Vec::new();
    for (idx, (&cell, slot)) in view.cells.iter().zip(status.iter_mut()).enumerate() {
        let tag = match classify_cell_for_gpu(cell) {
            Ok(tag) => tag,
            Err(code) => {
                *slot = code as u8;
                continue;
            }
        };
        // The caller-supplied tag must agree with the host classification.
        if tag != view.branches[idx] {
            *slot = CubicCellMomentStatus::InvalidInterval as u8;
            continue;
        }
        match tag {
            GpuCellBranchTag::NonAffineFinite => kernel_cells.push(idx),
            GpuCellBranchTag::Affine | GpuCellBranchTag::AffineTail => host_cells.push(idx),
        }
    }

    if !host_cells.is_empty() {
        self.populate_cpu_buckets(view, &host_cells, stride, &mut moments, &mut status);
    }
    if !kernel_cells.is_empty() {
        self.launch_nonaffine_bucket(view, &kernel_cells, stride, &mut moments, &mut status)?;
    }

    Ok(HostMomentBatch {
        moments,
        status,
        stride,
    })
}
/// Evaluates the cells listed in `cpu_idx` with the CPU reference routine
/// and writes the results into the full-batch `moments` / `status` buffers.
///
/// * `cpu_idx` – indices into `view.cells` to evaluate.
/// * `stride` – number of `f64` slots per cell row in `moments`.
///
/// A cell whose evaluation fails, or whose row contains a non-finite value,
/// gets a zeroed row and a non-`Ok` status.
#[cfg(target_os = "linux")]
fn populate_cpu_buckets(
    &self,
    view: &CubicCellDerivativeMomentHostView<'_>,
    cpu_idx: &[usize],
    stride: usize,
    moments: &mut [f64],
    status: &mut [u8],
) {
    use crate::families::cubic_cell_kernel::{
        DenestedCubicCell, evaluate_cell_derivative_moments_uncached,
    };
    for &cell_idx in cpu_idx {
        let source = view.cells[cell_idx];
        let row = &mut moments[cell_idx * stride..(cell_idx + 1) * stride];
        let reference_cell = DenestedCubicCell {
            left: source.left,
            right: source.right,
            c0: source.c0,
            c1: source.c1,
            c2: source.c2,
            c3: source.c3,
        };
        // `None` means the row is valid; `Some(code)` means it must be
        // cleared and flagged with `code`.
        let failure = match evaluate_cell_derivative_moments_uncached(
            reference_cell,
            view.max_degree,
        ) {
            Ok(state) => {
                let copy_len = stride.min(state.moments.len());
                row[..copy_len].copy_from_slice(&state.moments[..copy_len]);
                if row.iter().all(|value| value.is_finite()) {
                    None
                } else {
                    Some(CubicCellMomentStatus::NonFiniteEvaluation)
                }
            }
            Err(_) => Some(match view.branches[cell_idx] {
                GpuCellBranchTag::AffineTail => CubicCellMomentStatus::NonAffineInfiniteInterval,
                GpuCellBranchTag::Affine | GpuCellBranchTag::NonAffineFinite => {
                    CubicCellMomentStatus::InvalidInterval
                }
            }),
        };
        if let Some(code) = failure {
            row.fill(0.0);
            status[cell_idx] = code as u8;
        }
    }
}
/// Evaluates the cells listed in `nonaffine_idx` with the
/// `cubic_deriv_moments_d{max_degree}` kernel and scatters the results back
/// into the caller's full-batch `moments` / `status` buffers.
///
/// * `view` – supplies the cell data and `max_degree`.
/// * `nonaffine_idx` – indices into `view.cells`; launch position `pos`
///   corresponds to cell `nonaffine_idx[pos]`.
/// * `stride` – number of `f64` slots per cell row in `moments`.
/// * `moments`, `status` – full-batch outputs; only the rows/entries named by
///   `nonaffine_idx` are written.
///
/// # Errors
/// Returns a `GpuError` if obtaining the module or kernel function, any
/// upload, allocation, the launch, a download or the final synchronize fails,
/// or if the bucket size does not fit in `u32`.
///
/// # Panics
/// Slice indexing panics if an index in `nonaffine_idx` is out of range for
/// `view.cells`, `moments` or `status`.
#[cfg(target_os = "linux")]
fn launch_nonaffine_bucket(
&self,
view: &CubicCellDerivativeMomentHostView<'_>,
nonaffine_idx: &[usize],
stride: usize,
moments: &mut [f64],
status: &mut [u8],
) -> Result<(), GpuError> {
use cudarc::driver::{LaunchConfig, PushKernelArg};
let m = nonaffine_idx.len();
// Gather the selected cells into one host buffer per field.
let mut left = Vec::with_capacity(m);
let mut right = Vec::with_capacity(m);
let mut c0 = Vec::with_capacity(m);
let mut c1 = Vec::with_capacity(m);
let mut c2 = Vec::with_capacity(m);
let mut c3 = Vec::with_capacity(m);
let mut branch_code = Vec::with_capacity(m);
for &i in nonaffine_idx {
let c = view.cells[i];
left.push(c.left);
right.push(c.right);
c0.push(c.c0);
c1.push(c.c1);
c2.push(c.c2);
c3.push(c.c3);
// Code 1 is the value `dispatch_device_resident` assigns to
// `GpuCellBranchTag::NonAffineFinite`.
branch_code.push(1u8);
}
let max_degree = view.max_degree;
let module = self.module_for_degree(max_degree)?;
let kernel_name = format!("cubic_deriv_moments_d{max_degree}");
let func = module
.load_function(&kernel_name)
.gpu_ctx_with(|err| format!("cubic_cell load_function {kernel_name}: {err}"))?;
let stream = &self.inner.stream;
// Upload the inputs and allocate zero-initialised outputs on the stream.
let d_left = stream
.clone_htod(&left)
.gpu_ctx("cubic_cell memcpy_stod left")?;
let d_right = stream
.clone_htod(&right)
.gpu_ctx("cubic_cell memcpy_stod right")?;
let d_c0 = stream
.clone_htod(&c0)
.gpu_ctx("cubic_cell memcpy_stod c0")?;
let d_c1 = stream
.clone_htod(&c1)
.gpu_ctx("cubic_cell memcpy_stod c1")?;
let d_c2 = stream
.clone_htod(&c2)
.gpu_ctx("cubic_cell memcpy_stod c2")?;
let d_c3 = stream
.clone_htod(&c3)
.gpu_ctx("cubic_cell memcpy_stod c3")?;
let d_branch = stream
.clone_htod(&branch_code)
.gpu_ctx("cubic_cell memcpy_stod branch_code")?;
let mut d_moments = stream
.alloc_zeros::<f64>(m * stride)
.gpu_ctx("cubic_cell alloc_zeros moments")?;
let mut d_status = stream
.alloc_zeros::<u8>(m)
.gpu_ctx("cubic_cell alloc_zeros status")?;
// Launch geometry: 32 * warps_per_block threads per block and
// ceil(m / warps_per_block) blocks (at least one).
// NOTE(review): this looks like one warp per cell — confirm against the
// generated kernel source.
let warps_per_block: u32 = 4;
let block: u32 = 32 * warps_per_block;
let m_u32: u32 =
u32::try_from(m).map_err(|_| gpu_err!("cubic_cell n_cells={m} overflows u32"))?;
let grid: u32 = m_u32.div_ceil(warps_per_block).max(1);
let cfg = LaunchConfig {
grid_dim: (grid, 1, 1),
block_dim: (block, 1, 1),
shared_mem_bytes: 0,
};
let n_cells_u32 = m_u32;
let mut builder = stream.launch_builder(&func);
builder
.arg(&d_left)
.arg(&d_right)
.arg(&d_c0)
.arg(&d_c1)
.arg(&d_c2)
.arg(&d_c3)
.arg(&d_branch)
.arg(&mut d_moments)
.arg(&mut d_status)
.arg(&n_cells_u32);
// SAFETY (NOTE(review)): soundness relies on the argument list above
// matching the parameter order and types of the generated kernel, and on the
// kernel staying within `m * stride` moments and `m` status entries. The
// kernel signature is not visible in this file — confirm against `kernel_src`.
unsafe { builder.launch(cfg) }.gpu_ctx("cubic_cell kernel launch")?;
let host_moments = stream
.clone_dtoh(&d_moments)
.gpu_ctx("cubic_cell memcpy_dtov moments")?;
let host_status = stream
.clone_dtoh(&d_status)
.gpu_ctx("cubic_cell memcpy_dtov status")?;
// The downloaded buffers are only read after the stream has been synchronized.
stream.synchronize().gpu_ctx("cubic_cell synchronize")?;
// Scatter launch-order results back to their original cell positions.
for (pos, &cell_idx) in nonaffine_idx.iter().enumerate() {
let dst = &mut moments[cell_idx * stride..(cell_idx + 1) * stride];
let src = &host_moments[pos * stride..(pos + 1) * stride];
dst.copy_from_slice(src);
status[cell_idx] = host_status[pos];
// A row whose kernel status is not Ok is cleared after the copy.
if host_status[pos] != CubicCellMomentStatus::Ok as u8 {
for slot in dst.iter_mut() {
*slot = 0.0;
}
}
}
Ok(())
}
/// Runs the `cubic_deriv_moments_d{max_degree}` kernel over every cell in
/// `view` and returns the moments as a device buffer
/// (`CubicCellDerivativeMomentOutput::Device`); only the per-cell status is
/// downloaded to the host.
///
/// Each cell is classified on the host first. When classification fails or
/// disagrees with `view.branches`, the failure is recorded in the host status
/// and the cell's branch code stays at 255; the cell is still uploaded and is
/// part of the launch. For cells the host accepted, the returned status is
/// whatever the kernel reported.
///
/// NOTE(review): 255 is presumably treated by the kernel as "not a valid
/// branch" — confirm in the kernel source. Moment rows of rejected or failed
/// cells are not cleared here (unlike `launch_nonaffine_bucket`), so consumers
/// presumably must check `status` before using a row — verify.
///
/// # Errors
/// Returns a `GpuError` if obtaining the module or kernel function, any
/// upload, allocation, the launch, the status download or the synchronize
/// fails, or if the number of cells does not fit in `u32`.
///
/// # Panics
/// Panics if `view.cells` is empty. Indexing `view.branches[i]` also panics
/// when `view.branches` is shorter than `view.cells` and a cell at such an
/// index classifies successfully.
#[cfg(target_os = "linux")]
fn dispatch_device_resident(
&self,
view: &CubicCellDerivativeMomentHostView<'_>,
) -> Result<CubicCellDerivativeMomentOutput, GpuError> {
use cudarc::driver::{LaunchConfig, PushKernelArg};
let n_cells = view.cells.len();
let stride = view.max_degree + 1;
assert!(n_cells > 0, "caller must guard empty views");
let mut status_host = vec![CubicCellMomentStatus::Ok as u8; n_cells];
// 255 marks cells for which no branch code was assigned by the loop below.
let mut branch_code = vec![255_u8; n_cells];
let mut left = vec![0.0_f64; n_cells];
let mut right = vec![0.0_f64; n_cells];
let mut c0 = vec![0.0_f64; n_cells];
let mut c1 = vec![0.0_f64; n_cells];
let mut c2 = vec![0.0_f64; n_cells];
let mut c3 = vec![0.0_f64; n_cells];
for (i, &gpu_cell) in view.cells.iter().enumerate() {
// Cell data is copied for every cell, including ones rejected below.
left[i] = gpu_cell.left;
right[i] = gpu_cell.right;
c0[i] = gpu_cell.c0;
c1[i] = gpu_cell.c1;
c2[i] = gpu_cell.c2;
c3[i] = gpu_cell.c3;
match classify_cell_for_gpu(gpu_cell) {
Ok(host_tag) => {
// The caller-supplied tag must agree with the host classification.
if host_tag != view.branches[i] {
status_host[i] = CubicCellMomentStatus::InvalidInterval as u8;
continue;
}
branch_code[i] = match host_tag {
GpuCellBranchTag::Affine => 0,
GpuCellBranchTag::NonAffineFinite => 1,
GpuCellBranchTag::AffineTail => 2,
};
}
Err(code) => {
status_host[i] = code as u8;
}
}
}
let max_degree = view.max_degree;
let module = self.module_for_degree(max_degree)?;
let kernel_name = format!("cubic_deriv_moments_d{max_degree}");
let func = module
.load_function(&kernel_name)
.gpu_ctx_with(|err| format!("cubic_cell load_function {kernel_name}: {err}"))?;
let stream = &self.inner.stream;
// Upload the inputs and allocate zero-initialised outputs on the stream.
let d_left = stream
.clone_htod(&left)
.gpu_ctx("cubic_cell device-resident memcpy left")?;
let d_right = stream
.clone_htod(&right)
.gpu_ctx("cubic_cell device-resident memcpy right")?;
let d_c0 = stream
.clone_htod(&c0)
.gpu_ctx("cubic_cell device-resident memcpy c0")?;
let d_c1 = stream
.clone_htod(&c1)
.gpu_ctx("cubic_cell device-resident memcpy c1")?;
let d_c2 = stream
.clone_htod(&c2)
.gpu_ctx("cubic_cell device-resident memcpy c2")?;
let d_c3 = stream
.clone_htod(&c3)
.gpu_ctx("cubic_cell device-resident memcpy c3")?;
let d_branch = stream
.clone_htod(&branch_code)
.gpu_ctx("cubic_cell device-resident memcpy branch")?;
let mut d_moments = stream
.alloc_zeros::<f64>(n_cells * stride)
.map_err(|err| gpu_err!("cubic_cell device-resident alloc moments: {err}"))?;
let mut d_status = stream
.alloc_zeros::<u8>(n_cells)
.gpu_ctx("cubic_cell device-resident alloc status")?;
// Launch geometry: 32 * warps_per_block threads per block and
// ceil(n_cells / warps_per_block) blocks (at least one).
// NOTE(review): this looks like one warp per cell — confirm against the
// generated kernel source.
let warps_per_block: u32 = 4;
let block: u32 = 32 * warps_per_block;
let n_u32: u32 = u32::try_from(n_cells)
.map_err(|_| gpu_err!("cubic_cell n_cells={n_cells} overflows u32"))?;
let grid: u32 = n_u32.div_ceil(warps_per_block).max(1);
let cfg = LaunchConfig {
grid_dim: (grid, 1, 1),
block_dim: (block, 1, 1),
shared_mem_bytes: 0,
};
let n_cells_u32 = n_u32;
let mut builder = stream.launch_builder(&func);
builder
.arg(&d_left)
.arg(&d_right)
.arg(&d_c0)
.arg(&d_c1)
.arg(&d_c2)
.arg(&d_c3)
.arg(&d_branch)
.arg(&mut d_moments)
.arg(&mut d_status)
.arg(&n_cells_u32);
// SAFETY (NOTE(review)): soundness relies on the argument list above
// matching the parameter order and types of the generated kernel, and on the
// kernel staying within `n_cells * stride` moments and `n_cells` status
// entries. The kernel signature is not visible in this file — confirm against
// `kernel_src`.
unsafe { builder.launch(cfg) }.gpu_ctx("cubic_cell device-resident kernel launch")?;
// Only the status vector is brought back to the host; moments stay on device.
let kernel_status = stream
.clone_dtoh(&d_status)
.gpu_ctx("cubic_cell device-resident DtoH status")?;
stream
.synchronize()
.gpu_ctx("cubic_cell device-resident sync after kernel")?;
// A status set by the host loop above takes precedence; the kernel's status
// is adopted only for cells the host left as Ok.
for i in 0..n_cells {
if status_host[i] == CubicCellMomentStatus::Ok as u8 {
status_host[i] = kernel_status[i];
}
}
drop(d_status);
Ok(CubicCellDerivativeMomentOutput::Device {
d_moments,
status: status_host,
stride,
n_cells,
})
}
}
#[cfg(all(test, target_os = "linux"))]
mod tests {
use super::*;
use crate::gpu::cubic_cell::{
CubicCellDerivativeMomentHostView, CubicCellDerivativeMomentOutput,
CubicCellMomentResidency, CubicCellMomentStatus, GpuCellBranchTag, GpuDenestedCubicCell,
try_build_cubic_cell_derivative_moments,
};
use crate::gpu::error::GpuError;
use crate::gpu::error::GpuResultExt;
use crate::gpu::runtime::GpuRuntime;
/// Copies a device moment buffer back to host memory on the backend's
/// stream, synchronizing before the data is returned.
fn download_moments(
    backend: &CubicCellGpuBackend,
    d_moments: &cudarc::driver::CudaSlice<f64>,
) -> Result<Vec<f64>, GpuError> {
    let host = backend
        .inner
        .stream
        .clone_dtoh(d_moments)
        .gpu_ctx("cubic_cell tests::download_moments DtoH")?;
    backend
        .inner
        .stream
        .synchronize()
        .gpu_ctx("cubic_cell tests::download_moments sync")?;
    Ok(host)
}
/// Fixture: five finite-interval cells, each with a non-zero `c2` and/or
/// `c3` coefficient.
fn make_nonaffine_cells() -> Vec<GpuDenestedCubicCell> {
    // Columns: left, right, c0, c1, c2, c3.
    let rows: [[f64; 6]; 5] = [
        [-1.25, -0.2, -0.35, 0.85, 0.4, 0.0],
        [-0.5, 1.7, 0.2, -0.6, 0.25, 0.18],
        [0.1, 0.9, 0.05, 0.0, -0.3, 0.12],
        [-2.0, 2.0, 0.0, 0.0, 0.5, 0.0],
        [-0.8, 0.4, 0.1, -0.25, 0.05, -0.07],
    ];
    rows.iter()
        .map(|&[left, right, c0, c1, c2, c3]| GpuDenestedCubicCell {
            left,
            right,
            c0,
            c1,
            c2,
            c3,
        })
        .collect()
}
/// GPU host-resident dispatch must agree with the host substrate on
/// `NonAffineFinite` cells to a relative tolerance of 1e-8. Skipped when no
/// CUDA runtime is available.
#[test]
fn cubic_cell_gpu_nonaffine_matches_cpu_within_tol() {
    match GpuRuntime::global() {
        Some(runtime) => eprintln!(
            "[cubic_cell_gpu test] runtime selected device ordinal={}",
            runtime.selected_device().ordinal
        ),
        None => {
            eprintln!(
                "[cubic_cell_gpu test] no CUDA runtime — skipping NonAffineFinite parity test"
            );
            return;
        }
    }

    let cells = make_nonaffine_cells();
    let branches = vec![GpuCellBranchTag::NonAffineFinite; cells.len()];
    let view = CubicCellDerivativeMomentHostView {
        cells: &cells,
        branches: &branches,
        max_degree: 9,
        residency: CubicCellMomentResidency::Host,
    };

    let gpu_batch = try_device_moments(&view)
        .expect("device dispatch must succeed on a host with CUDA")
        .expect("Some(_) from device dispatch when GPU is present");
    let cpu_batch = crate::gpu::cubic_cell::host_substrate::build_host_moments(&view)
        .expect("host substrate produces parity reference");

    assert_eq!(gpu_batch.stride, cpu_batch.stride);
    assert_eq!(gpu_batch.status, cpu_batch.status);

    let stride = gpu_batch.stride;
    for cell_idx in 0..cells.len() {
        assert_eq!(
            gpu_batch.status[cell_idx],
            CubicCellMomentStatus::Ok as u8,
            "cell {cell_idx} must classify Ok"
        );
        let span = cell_idx * stride..(cell_idx + 1) * stride;
        let gpu_row = &gpu_batch.moments[span.clone()];
        let cpu_row = &cpu_batch.moments[span];
        for (k, (&got, &want)) in gpu_row.iter().zip(cpu_row.iter()).enumerate() {
            // Relative error, with the denominator floored at 1.
            let rel = (got - want).abs() / want.abs().max(1.0);
            assert!(
                rel <= 1e-8,
                "cell={cell_idx} k={k} gpu={got:.17e} cpu={want:.17e} rel={rel:.3e}"
            );
        }
    }
}
/// Without a CUDA runtime, `try_device_moments` must return `Ok(None)`
/// rather than an error. Skipped when a runtime is present.
#[test]
fn cubic_cell_gpu_returns_none_when_runtime_absent() {
    if let Some(_runtime) = GpuRuntime::global() {
        eprintln!(
            "[cubic_cell_gpu test] CUDA runtime present — skipping the absent-runtime case"
        );
        return;
    }

    let cells = make_nonaffine_cells();
    let branches = vec![GpuCellBranchTag::NonAffineFinite; cells.len()];
    let view = CubicCellDerivativeMomentHostView {
        cells: &cells,
        branches: &branches,
        max_degree: 9,
        residency: CubicCellMomentResidency::Host,
    };

    let out = try_device_moments(&view).expect("clean Ok on hosts without CUDA");
    assert!(
        out.is_none(),
        "expected Ok(None) on a host without a usable CUDA runtime"
    );
}
/// Device-resident dispatch must match the CPU reference for affine,
/// non-affine and infinite-interval cells at degrees 9, 15 and 21. Skipped
/// when no CUDA runtime is available.
#[cfg(target_os = "linux")]
#[test]
fn cubic_cell_device_residency_matches_cpu_all_branches() {
    use crate::families::cubic_cell_kernel::{
        DenestedCubicCell, evaluate_cell_derivative_moments_uncached,
    };
    if GpuRuntime::global().is_none() {
        eprintln!("[cubic_cell device-residency parity] no CUDA runtime — skipping");
        return;
    }

    // Columns: left, right, c0, c1, c2, c3.
    let rows: [[f64; 6]; 6] = [
        [-1.0, 1.0, 0.2, 0.7, 0.0, 0.0],
        [-1.25, -0.2, -0.35, 0.85, 0.4, 0.0],
        [-0.5, 1.7, 0.2, -0.6, 0.25, 0.18],
        [f64::NEG_INFINITY, -0.7, 0.1, 0.5, 0.0, 0.0],
        [1.2, f64::INFINITY, -0.05, 0.3, 0.0, 0.0],
        [f64::NEG_INFINITY, f64::INFINITY, 0.0, 0.0, 0.0, 0.0],
    ];
    let cpu_cells: Vec<DenestedCubicCell> = rows
        .iter()
        .map(|&[left, right, c0, c1, c2, c3]| DenestedCubicCell {
            left,
            right,
            c0,
            c1,
            c2,
            c3,
        })
        .collect();

    let mut cells_gpu: Vec<GpuDenestedCubicCell> = Vec::with_capacity(cpu_cells.len());
    let mut branches: Vec<GpuCellBranchTag> = Vec::with_capacity(cpu_cells.len());
    for c in &cpu_cells {
        cells_gpu.push(GpuDenestedCubicCell {
            left: c.left,
            right: c.right,
            c0: c.c0,
            c1: c.c1,
            c2: c.c2,
            c3: c.c3,
        });
        // Any infinite endpoint means a tail cell; otherwise the cell is
        // affine exactly when both higher-order coefficients are zero.
        let tag = if c.left.is_finite() && c.right.is_finite() {
            if c.c2 == 0.0 && c.c3 == 0.0 {
                GpuCellBranchTag::Affine
            } else {
                GpuCellBranchTag::NonAffineFinite
            }
        } else {
            GpuCellBranchTag::AffineTail
        };
        branches.push(tag);
    }

    for max_degree in [9_usize, 15, 21].iter().copied() {
        let view = CubicCellDerivativeMomentHostView {
            cells: &cells_gpu,
            branches: &branches,
            max_degree,
            residency: CubicCellMomentResidency::Device,
        };
        let out = try_build_cubic_cell_derivative_moments(view)
            .expect("device-residency dispatch must succeed with CUDA")
            .expect("non-empty input must yield output");
        let CubicCellDerivativeMomentOutput::Device {
            d_moments,
            status,
            stride,
            n_cells,
        } = out
        else {
            panic!(
                "device residency must produce CubicCellDerivativeMomentOutput::Device on a CUDA host"
            )
        };
        assert_eq!(stride, max_degree + 1);
        assert_eq!(n_cells, cpu_cells.len());
        assert_eq!(status.len(), cpu_cells.len());

        let backend = CubicCellGpuBackend::probe().expect("backend probe");
        let host_moments =
            download_moments(backend, &d_moments).expect("DtoH download for parity check");

        for (i, &cpu_cell) in cpu_cells.iter().enumerate() {
            assert_eq!(
                status[i],
                CubicCellMomentStatus::Ok as u8,
                "cell {i} must classify Ok (status={})",
                status[i]
            );
            let cpu_state = {
                // Bounds-check the device row before computing the reference.
                let _ = &host_moments[i * stride..(i + 1) * stride];
                evaluate_cell_derivative_moments_uncached(cpu_cell, max_degree)
                    .expect("cpu reference")
            };
            let row = &host_moments[i * stride..(i + 1) * stride];
            for (k, (&got, &want)) in row.iter().zip(cpu_state.moments.iter()).enumerate() {
                let abs = (got - want).abs();
                let rel = abs / want.abs().max(1.0);
                // Passes on either a tight absolute or a tight relative match.
                assert!(
                    abs <= 1e-12 || rel <= 1e-11,
                    "device parity drift at degree={max_degree} cell={i} k={k} \
                     gpu={got:.17e} cpu={want:.17e} abs={abs:.3e} rel={rel:.3e}"
                );
            }
        }
    }
}
}