use crate::raptorq::gf256::{
Gf256, Gf256ArchitectureClass, Gf256ProfilePackId, gf256_add_slice, gf256_addmul_slice,
gf256_mul_slice,
};
use serde::{Deserialize, Serialize};
use crate::time::wall_now;
use crate::types::Time;
use crate::util::DetHashSet;
use std::collections::BTreeMap;
/// One point in the kernel tuning space: a concrete combination of tile size,
/// unroll factor, prefetch distance and fusion shape for one architecture class.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct KernelCandidate {
/// Identifier for this candidate; `OfflineTuner::generate_candidates` derives it
/// from the architecture class and the four tuning parameters below.
pub candidate_id: String,
/// Architecture class the candidate was generated for.
pub architecture_class: Gf256ArchitectureClass,
/// Tile size parameter (drawn from `TuningSpace::tile_sizes`).
pub tile_bytes: usize,
/// Unroll factor (drawn from `TuningSpace::unroll_factors`).
pub unroll: usize,
/// Prefetch distance (drawn from `TuningSpace::prefetch_distances`); `0` means
/// the `prefetch_enabled` flag is not emitted.
pub prefetch_distance: usize,
/// Fusion shape (drawn from `TuningSpace::fusion_shapes`).
pub fusion_shape: FusionShape,
/// Descriptive flag strings produced by `OfflineTuner::derive_optimization_flags`.
pub optimization_flags: Vec<String>,
}
/// Fusion strategy of a kernel candidate.
///
/// The tuner maps each variant to an optimization flag and to a distinct set of
/// profile-pack thresholds.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FusionShape {
/// Reported as the `fusion_disabled` flag.
Split,
/// Reported as the `fusion_enabled` flag.
Fused,
/// Reported as the `fusion_adaptive` flag.
Balanced,
}
/// Outcome of benchmarking one candidate against one workload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
/// The candidate that was benchmarked.
pub candidate: KernelCandidate,
/// Identifier of the workload that was run (`TuningWorkload::workload_id`).
pub workload_id: String,
/// Number of timed iterations the tuner was configured with.
pub iterations: usize,
/// Summary statistics over the per-iteration latencies.
pub latency_stats: LatencyStats,
/// Throughput figure derived from the median latency and the buffer length.
pub throughput_ops_per_sec: f64,
/// Bandwidth figure derived from the throughput.
pub bandwidth_gbps: f64,
/// Whether the slice kernel output matched the element-wise reference.
pub bit_exactness_verified: bool,
/// Timestamp text; the tuner writes it as `t_ns=<nanoseconds>`.
pub benchmark_timestamp: String,
}
/// Summary statistics over a set of latency samples, in the `_ns` unit the
/// tuner records them in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyStats {
/// Median sample.
pub median_ns: f64,
/// 95th-percentile sample.
pub p95_ns: f64,
/// 99th-percentile sample.
pub p99_ns: f64,
/// Standard deviation of the samples.
pub stddev_ns: f64,
/// Smallest sample.
pub min_ns: f64,
/// Largest sample.
pub max_ns: f64,
}
/// The parameter lists whose cartesian product `OfflineTuner::generate_candidates`
/// enumerates.
#[derive(Debug, Clone)]
pub struct TuningSpace {
/// Architecture class this space was built for.
pub architecture_class: Gf256ArchitectureClass,
/// Candidate tile sizes.
pub tile_sizes: Vec<usize>,
/// Candidate unroll factors.
pub unroll_factors: Vec<usize>,
/// Candidate prefetch distances.
pub prefetch_distances: Vec<usize>,
/// Candidate fusion shapes.
pub fusion_shapes: Vec<FusionShape>,
}
/// A benchmark workload: which GF(256) slice operation to run, on how much data.
#[derive(Debug, Clone)]
pub struct TuningWorkload {
/// Identifier used to match benchmark results back to this workload.
pub workload_id: String,
/// Length in bytes of the generated test buffer.
pub data_size: usize,
/// Raw byte turned into the `Gf256` scalar for `Mul` and `AddMul`.
pub multiplicand: u8,
/// The slice operation to execute.
pub operation: GF256Operation,
/// Multiplier applied to this workload's score when candidates are ranked.
pub weight: f64,
}
/// The GF(256) slice operation a workload exercises.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GF256Operation {
/// Runs `gf256_mul_slice`.
Mul,
/// Runs `gf256_addmul_slice`.
AddMul,
/// Runs `gf256_add_slice`.
Add,
}
/// Weights used to fold latency, throughput and bandwidth into one score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationCriteria {
/// Weight on the latency term, `1 / (median_ns + 1)`.
pub latency_weight: f64,
/// Weight on `BenchmarkResult::throughput_ops_per_sec`.
pub throughput_weight: f64,
/// Weight on `BenchmarkResult::bandwidth_gbps`.
pub bandwidth_weight: f64,
/// NOTE(review): not read by any code visible in this file — confirm whether
/// a consumer elsewhere applies it.
pub min_improvement_threshold: f64,
}
/// Serializable description of a tuning decision, as produced by
/// `OfflineTuner::emit_profile_pack`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilePackSpec {
/// Schema identifier string for this record.
pub schema_version: String,
/// Profile pack chosen from the tuner's architecture class.
pub profile_pack: Gf256ProfilePackId,
/// Architecture class the tuner ran for.
pub architecture_class: Gf256ArchitectureClass,
/// Identifier of the tuning corpus.
pub tuning_corpus_id: String,
/// `candidate_id` of the selected candidate.
pub selected_tuning_candidate_id: String,
/// Sorted, de-duplicated ids of every other benchmarked candidate.
pub rejected_tuning_candidate_ids: Vec<String>,
// The six fields below are the tuple returned by
// `OfflineTuner::derive_thresholds_from_candidate`, in order.
pub mul_min_total: usize,
pub mul_max_total: usize,
pub addmul_min_total: usize,
pub addmul_max_total: usize,
pub addmul_min_lane: usize,
pub max_lane_ratio: usize,
/// Replay pointer string.
pub replay_pointer: String,
/// Command line text naming the architecture and the selected candidate.
pub command_bundle: String,
/// Identifier of the decision artifact.
pub decision_artifact_id: String,
/// Role label for the decision.
pub decision_role: String,
/// Free-text summary of why the candidate was selected.
pub selected_candidate_summary: String,
/// Free-text summary of why the other candidates were rejected.
pub rejected_candidate_set_summary: String,
// The three delta fields hold either a percentage formatted with three
// decimals or a sentinel string such as `no_baseline_data`.
pub selected_mul_delta_vs_baseline_pct: String,
pub selected_addmul_delta_vs_baseline_pct: String,
pub selected_targeted_addmul_average_delta_pct: String,
}
/// Number of timed iterations a freshly constructed `OfflineTuner` runs per
/// candidate/workload pair.
pub const DEFAULT_BENCHMARK_ITERATIONS: usize = 100;
/// Offline tuner: enumerates kernel candidates, benchmarks them against a set
/// of workloads, ranks them and emits a profile-pack description.
pub struct OfflineTuner {
// Architecture class every generated candidate is tagged with.
architecture_class: Gf256ArchitectureClass,
// Parameter lists enumerated by `generate_candidates`.
tuning_space: TuningSpace,
// Workloads each candidate is benchmarked against.
workloads: Vec<TuningWorkload>,
// Weights used when ranking candidates.
criteria: OptimizationCriteria,
// Timed iterations per benchmark; the builder clamps this to at least 1.
benchmark_iterations: usize,
// Results accumulated by `run_systematic_benchmarks`.
benchmark_results: Vec<BenchmarkResult>,
// When set, replaces the wall clock for benchmark timestamps.
clock_anchor: Option<Time>,
}
impl OfflineTuner {
/// Creates a tuner for `architecture_class` that ranks candidates with `criteria`.
///
/// The tuning space and workloads are the per-architecture defaults, the
/// iteration count is [`DEFAULT_BENCHMARK_ITERATIONS`], no results are stored
/// yet and no clock anchor is set.
pub fn new(architecture_class: Gf256ArchitectureClass, criteria: OptimizationCriteria) -> Self {
    Self {
        architecture_class,
        tuning_space: Self::default_tuning_space_for_arch(architecture_class),
        workloads: Self::default_workloads_for_arch(architecture_class),
        criteria,
        benchmark_iterations: DEFAULT_BENCHMARK_ITERATIONS,
        benchmark_results: Vec::new(),
        clock_anchor: None,
    }
}
/// Builder step: pins the clock used for benchmark timestamps to `anchor`
/// instead of the wall clock.
#[must_use]
pub fn with_clock_anchor(mut self, anchor: Time) -> Self {
    self.clock_anchor = anchor.into();
    self
}
/// Returns the pinned clock anchor, or `None` when the wall clock is in use.
#[must_use]
pub fn clock_anchor(&self) -> Option<Time> {
self.clock_anchor
}
fn benchmark_clock(&self) -> Time {
self.clock_anchor.unwrap_or_else(wall_now)
}
/// Builder step: sets the number of timed iterations per benchmark.
///
/// A request for zero iterations is raised to one so that the latency
/// statistics always have at least one sample to index.
#[must_use]
pub fn with_benchmark_iterations(mut self, iterations: usize) -> Self {
    self.benchmark_iterations = if iterations == 0 { 1 } else { iterations };
    self
}
/// Returns the configured number of timed iterations per benchmark.
#[must_use]
pub fn benchmark_iterations(&self) -> usize {
self.benchmark_iterations
}
/// Enumerates every combination of the tuning space as a [`KernelCandidate`].
///
/// Combinations are produced with tile size as the outermost dimension, then
/// unroll factor, prefetch distance and fusion shape. Each candidate id is
/// the lower-cased rendering of the architecture and the four parameters,
/// with spaces replaced by underscores.
pub fn generate_candidates(&self) -> Vec<KernelCandidate> {
    let arch = self.architecture_class;
    let space = &self.tuning_space;

    // Lazily walk the cartesian product in the same order as four nested loops.
    let combinations = space.tile_sizes.iter().flat_map(move |&tile_bytes| {
        space.unroll_factors.iter().flat_map(move |&unroll| {
            space
                .prefetch_distances
                .iter()
                .flat_map(move |&prefetch_distance| {
                    space.fusion_shapes.iter().map(move |&fusion_shape| {
                        (tile_bytes, unroll, prefetch_distance, fusion_shape)
                    })
                })
        })
    });

    combinations
        .map(|(tile_bytes, unroll, prefetch_distance, fusion_shape)| {
            let raw_id = format!(
                "{:?}-t{}-u{}-pf{}-{:?}-v1",
                arch, tile_bytes, unroll, prefetch_distance, fusion_shape
            );
            KernelCandidate {
                candidate_id: raw_id.to_lowercase().replace(' ', "_"),
                architecture_class: arch,
                tile_bytes,
                unroll,
                prefetch_distance,
                fusion_shape,
                optimization_flags: Self::derive_optimization_flags(
                    arch,
                    tile_bytes,
                    unroll,
                    prefetch_distance,
                    fusion_shape,
                ),
            }
        })
        .collect()
}
/// Benchmarks every generated candidate against every workload and appends
/// each result to the stored results.
///
/// # Errors
///
/// Returns the first error reported by a benchmark. Results recorded before
/// that error stay stored.
pub fn run_systematic_benchmarks(&mut self) -> Result<(), TuningError> {
    for candidate in self.generate_candidates() {
        for workload_index in 0..self.workloads.len() {
            let outcome =
                self.benchmark_candidate(&candidate, &self.workloads[workload_index])?;
            self.benchmark_results.push(outcome);
        }
    }
    Ok(())
}
/// Benchmarks one candidate on one workload.
///
/// Generates the workload's test buffer, measures performance on it, then
/// checks bit-exactness on the same buffer, and stamps the result with the
/// benchmark clock.
///
/// # Errors
///
/// Propagates errors from the measurement and from the bit-exactness check.
fn benchmark_candidate(
    &self,
    candidate: &KernelCandidate,
    workload: &TuningWorkload,
) -> Result<BenchmarkResult, TuningError> {
    let input = self.generate_test_data(workload);
    let (latency_stats, throughput_ops_per_sec, bandwidth_gbps) =
        self.measure_performance(candidate, workload, &input)?;
    let bit_exactness_verified = self.verify_bit_exactness(candidate, workload, &input)?;
    let benchmark_timestamp = format!("t_ns={}", self.benchmark_clock().as_nanos());

    Ok(BenchmarkResult {
        candidate: candidate.clone(),
        workload_id: workload.workload_id.clone(),
        iterations: self.benchmark_iterations,
        latency_stats,
        throughput_ops_per_sec,
        bandwidth_gbps,
        bit_exactness_verified,
        benchmark_timestamp,
    })
}
/// Picks the candidate with the highest accumulated weighted score.
///
/// Each stored result contributes
/// `latency_weight / (median_ns + 1) + throughput_weight * throughput +
/// bandwidth_weight * bandwidth`, multiplied by its workload's weight.
/// Contributions are summed per candidate id.
///
/// # Errors
///
/// * [`TuningError::NoBenchmarkResults`] when no results are stored.
/// * [`TuningError::NoValidCandidates`] when no best candidate can be resolved.
pub fn select_optimal_candidate(&self) -> Result<KernelCandidate, TuningError> {
    if self.benchmark_results.is_empty() {
        return Err(TuningError::NoBenchmarkResults);
    }

    let weights = &self.criteria;
    let mut totals: BTreeMap<String, f64> = BTreeMap::new();
    for run in &self.benchmark_results {
        let inverse_latency = 1.0 / (run.latency_stats.median_ns + 1.0);
        let combined = weights.latency_weight * inverse_latency
            + weights.throughput_weight * run.throughput_ops_per_sec
            + weights.bandwidth_weight * run.bandwidth_gbps;
        let contribution = combined * self.workload_weight(&run.workload_id);
        *totals
            .entry(run.candidate.candidate_id.clone())
            .or_insert(0.0) += contribution;
    }

    // Incomparable scores (NaN) are treated as equal.
    let (winner_id, _) = totals
        .iter()
        .max_by(|lhs, rhs| {
            lhs.1
                .partial_cmp(rhs.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .ok_or(TuningError::NoValidCandidates)?;

    self.benchmark_results
        .iter()
        .map(|run| &run.candidate)
        .find(|candidate| &candidate.candidate_id == winner_id)
        .cloned()
        .ok_or(TuningError::NoValidCandidates)
}
/// Builds the [`ProfilePackSpec`] describing `selected` as the chosen candidate.
///
/// The profile pack id follows the tuner's architecture class, the thresholds
/// come from `selected`, and the three delta strings compare `selected`
/// against the baseline (the first generated candidate). Every other
/// benchmarked candidate id is listed as rejected, sorted and de-duplicated.
///
/// # Errors
///
/// The visible implementation always returns `Ok`.
pub fn emit_profile_pack(
    &self,
    selected: &KernelCandidate,
) -> Result<ProfilePackSpec, TuningError> {
    let arch = self.architecture_class;
    let profile_pack = match arch {
        Gf256ArchitectureClass::GenericScalar => Gf256ProfilePackId::ScalarConservativeV1,
        Gf256ArchitectureClass::X86Avx2 => Gf256ProfilePackId::X86Avx2BalancedV1,
        Gf256ArchitectureClass::Aarch64Neon => Gf256ProfilePackId::Aarch64NeonBalancedV1,
    };

    let (
        mul_min_total,
        mul_max_total,
        addmul_min_total,
        addmul_max_total,
        addmul_min_lane,
        max_lane_ratio,
    ) = Self::derive_thresholds_from_candidate(selected);

    let baseline = self.baseline_candidate();
    let baseline_id = baseline.as_ref().map(|c| c.candidate_id.as_str());
    let selected_mul_delta_vs_baseline_pct =
        self.format_aggregate_delta_pct(selected, baseline_id, GF256Operation::Mul);
    let selected_addmul_delta_vs_baseline_pct =
        self.format_aggregate_delta_pct(selected, baseline_id, GF256Operation::AddMul);
    let selected_targeted_addmul_average_delta_pct = self
        .format_per_workload_average_delta_pct(selected, baseline_id, GF256Operation::AddMul);

    // An ordered set of borrowed ids gives sorted, unique output; only the
    // survivors are cloned.
    let rejected: std::collections::BTreeSet<&String> = self
        .benchmark_results
        .iter()
        .map(|run| &run.candidate.candidate_id)
        .filter(|id| **id != selected.candidate_id)
        .collect();
    let rejected_tuning_candidate_ids: Vec<String> = rejected.into_iter().cloned().collect();

    let command_bundle = format!(
        "offline_tuner --arch {:?} --candidate {}",
        self.architecture_class, selected.candidate_id
    );

    Ok(ProfilePackSpec {
        schema_version: "raptorq-gf256-profile-pack-v2".to_string(),
        profile_pack,
        architecture_class: arch,
        tuning_corpus_id: "offline_kernel_superoptimization_v1".to_string(),
        selected_tuning_candidate_id: selected.candidate_id.clone(),
        rejected_tuning_candidate_ids,
        mul_min_total,
        mul_max_total,
        addmul_min_total,
        addmul_max_total,
        addmul_min_lane,
        max_lane_ratio,
        replay_pointer: "replay:offline-kernel-superopt-v1".to_string(),
        command_bundle,
        decision_artifact_id: "offline_kernel_superoptimization_v1".to_string(),
        decision_role: "automated_offline_kernel_optimization".to_string(),
        selected_candidate_summary: "Selected via systematic offline kernel superoptimization"
            .to_string(),
        rejected_candidate_set_summary: "Rejected candidates had lower multi-objective scores"
            .to_string(),
        selected_mul_delta_vs_baseline_pct,
        selected_addmul_delta_vs_baseline_pct,
        selected_targeted_addmul_average_delta_pct,
    })
}
/// Returns the baseline candidate: the first one `generate_candidates`
/// produces, or `None` when the tuning space yields no candidates.
fn baseline_candidate(&self) -> Option<KernelCandidate> {
    let mut all = self.generate_candidates();
    if all.is_empty() {
        None
    } else {
        Some(all.swap_remove(0))
    }
}
/// Mean of the median latencies recorded for `candidate_id` across the
/// workloads whose operation is `op`.
///
/// Returns `None` when no workload uses `op` or when no matching result is
/// stored.
fn mean_median_ns(&self, candidate_id: &str, op: GF256Operation) -> Option<f64> {
    let relevant: DetHashSet<&str> = self
        .workloads
        .iter()
        .filter_map(|w| (w.operation == op).then_some(w.workload_id.as_str()))
        .collect();
    if relevant.is_empty() {
        return None;
    }

    let (total, samples) = self
        .benchmark_results
        .iter()
        .filter(|run| {
            run.candidate.candidate_id == candidate_id
                && relevant.contains(run.workload_id.as_str())
        })
        .map(|run| run.latency_stats.median_ns)
        .fold((0.0_f64, 0usize), |(sum, count), ns| (sum + ns, count + 1));

    (samples > 0).then(|| total / samples as f64)
}
/// Formats the latency improvement of `selected` over the baseline for `op`,
/// comparing the two candidates' mean median latencies.
///
/// Returns the percentage with three decimals (positive means `selected` is
/// faster), or one of these sentinels:
/// * `no_baseline_candidate` when `baseline_id` is `None`;
/// * `0.000` when `selected` is the baseline;
/// * `no_baseline_data` or `baseline_zero_latency` when the baseline mean is
///   missing or not positive;
/// * `no_selected_data` when `selected` has no matching results.
fn format_aggregate_delta_pct(
    &self,
    selected: &KernelCandidate,
    baseline_id: Option<&str>,
    op: GF256Operation,
) -> String {
    let baseline_id = match baseline_id {
        Some(id) => id,
        None => return String::from("no_baseline_candidate"),
    };
    if selected.candidate_id == baseline_id {
        return String::from("0.000");
    }

    let baseline_ns = match self.mean_median_ns(baseline_id, op) {
        None => return String::from("no_baseline_data"),
        Some(ns) if ns > 0.0 => ns,
        Some(_) => return String::from("baseline_zero_latency"),
    };
    let selected_ns = match self.mean_median_ns(&selected.candidate_id, op) {
        Some(ns) => ns,
        None => return String::from("no_selected_data"),
    };

    let delta = (baseline_ns - selected_ns) / baseline_ns * 100.0;
    format!("{delta:.3}")
}
/// Formats the average per-workload latency improvement of `selected` over
/// the baseline for workloads whose operation is `op`.
///
/// A workload contributes only when both candidates have a result for it and
/// the baseline median is positive. Returns the mean percentage with three
/// decimals, or a sentinel: `no_baseline_candidate`, `0.000` when `selected`
/// is the baseline, or `no_paired_workload_data` when nothing contributes.
fn format_per_workload_average_delta_pct(
    &self,
    selected: &KernelCandidate,
    baseline_id: Option<&str>,
    op: GF256Operation,
) -> String {
    let Some(baseline_id) = baseline_id else {
        return String::from("no_baseline_candidate");
    };
    if selected.candidate_id == baseline_id {
        return String::from("0.000");
    }

    // Median of the first stored result for a (candidate, workload) pair.
    let median_for = |candidate_id: &str, workload_id: &str| {
        self.benchmark_results
            .iter()
            .find(|run| {
                run.candidate.candidate_id == candidate_id && run.workload_id == workload_id
            })
            .map(|run| run.latency_stats.median_ns)
    };

    let deltas: Vec<f64> = self
        .workloads
        .iter()
        .filter(|w| w.operation == op)
        .filter_map(|w| {
            let baseline_ns = median_for(baseline_id, &w.workload_id)?;
            let selected_ns = median_for(&selected.candidate_id, &w.workload_id)?;
            (baseline_ns > 0.0).then(|| (baseline_ns - selected_ns) / baseline_ns * 100.0)
        })
        .collect();

    if deltas.is_empty() {
        return String::from("no_paired_workload_data");
    }
    let mean = deltas.iter().sum::<f64>() / deltas.len() as f64;
    format!("{mean:.3}")
}
fn default_tuning_space_for_arch(arch: Gf256ArchitectureClass) -> TuningSpace {
match arch {
Gf256ArchitectureClass::GenericScalar => TuningSpace {
architecture_class: arch,
tile_sizes: vec![8, 16, 32],
unroll_factors: vec![1, 2],
prefetch_distances: vec![0],
fusion_shapes: vec![FusionShape::Split, FusionShape::Balanced],
},
Gf256ArchitectureClass::X86Avx2 => TuningSpace {
architecture_class: arch,
tile_sizes: vec![16, 32, 64],
unroll_factors: vec![2, 4, 8],
prefetch_distances: vec![0, 32, 64, 128],
fusion_shapes: vec![
FusionShape::Split,
FusionShape::Fused,
FusionShape::Balanced,
],
},
Gf256ArchitectureClass::Aarch64Neon => TuningSpace {
architecture_class: arch,
tile_sizes: vec![16, 32, 64],
unroll_factors: vec![1, 2, 4],
prefetch_distances: vec![0, 16, 32, 64],
fusion_shapes: vec![
FusionShape::Split,
FusionShape::Fused,
FusionShape::Balanced,
],
},
}
}
fn default_workloads_for_arch(_arch: Gf256ArchitectureClass) -> Vec<TuningWorkload> {
vec![
TuningWorkload {
workload_id: "small_mul".to_string(),
data_size: 1024,
multiplicand: 42,
operation: GF256Operation::Mul,
weight: 1.0,
},
TuningWorkload {
workload_id: "medium_mul".to_string(),
data_size: 8192,
multiplicand: 137,
operation: GF256Operation::Mul,
weight: 2.0,
},
TuningWorkload {
workload_id: "large_mul".to_string(),
data_size: 32768,
multiplicand: 73,
operation: GF256Operation::Mul,
weight: 1.5,
},
TuningWorkload {
workload_id: "small_addmul".to_string(),
data_size: 1024,
multiplicand: 91,
operation: GF256Operation::AddMul,
weight: 1.0,
},
TuningWorkload {
workload_id: "medium_addmul".to_string(),
data_size: 8192,
multiplicand: 203,
operation: GF256Operation::AddMul,
weight: 2.0,
},
TuningWorkload {
workload_id: "large_addmul".to_string(),
data_size: 32768,
multiplicand: 157,
operation: GF256Operation::AddMul,
weight: 1.5,
},
]
}
/// Derives the descriptive flag list for a candidate.
///
/// Flags appear in this order: the architecture flag (`avx2`, `neon` or
/// `scalar`), `aggressive_unroll` for AVX2 with an unroll factor of at least
/// 4, `prefetch_enabled` for a non-zero prefetch distance, then one fusion
/// flag. The tile size does not influence the flags.
fn derive_optimization_flags(
    arch: Gf256ArchitectureClass,
    _tile_bytes: usize,
    unroll: usize,
    prefetch_distance: usize,
    fusion_shape: FusionShape,
) -> Vec<String> {
    let mut names: Vec<&'static str> = Vec::new();

    match arch {
        Gf256ArchitectureClass::X86Avx2 => {
            names.push("avx2");
            if unroll >= 4 {
                names.push("aggressive_unroll");
            }
        }
        Gf256ArchitectureClass::Aarch64Neon => names.push("neon"),
        Gf256ArchitectureClass::GenericScalar => names.push("scalar"),
    }

    if prefetch_distance > 0 {
        names.push("prefetch_enabled");
    }

    names.push(match fusion_shape {
        FusionShape::Fused => "fusion_enabled",
        FusionShape::Balanced => "fusion_adaptive",
        FusionShape::Split => "fusion_disabled",
    });

    names.into_iter().map(String::from).collect()
}
/// Derives the profile-pack thresholds from a candidate.
///
/// Returns `(mul_min_total, mul_max_total, addmul_min_total,
/// addmul_max_total, addmul_min_lane, max_lane_ratio)`. All but the last are
/// multiples or fractions of the tile size chosen by fusion shape;
/// `max_lane_ratio` is the unroll factor, at least 1. For `Split` the mul
/// window is `(usize::MAX, 0)`, a minimum above the maximum.
fn derive_thresholds_from_candidate(
    candidate: &KernelCandidate,
) -> (usize, usize, usize, usize, usize, usize) {
    let tile = candidate.tile_bytes;
    let max_lane_ratio = candidate.unroll.max(1);

    let (mul_min, mul_max, addmul_min, addmul_max, addmul_lane) = match candidate.fusion_shape {
        FusionShape::Fused => (tile * 4, tile * 16, tile * 2, tile * 8, tile),
        FusionShape::Split => (usize::MAX, 0, tile, tile * 4, tile / 2),
        FusionShape::Balanced => (tile * 2, tile * 8, tile, tile * 6, tile / 2),
    };

    (
        mul_min,
        mul_max,
        addmul_min,
        addmul_max,
        addmul_lane,
        max_lane_ratio,
    )
}
/// Produces the deterministic pseudo-random input buffer for `workload`.
///
/// The buffer is `workload.data_size` bytes long. The generator starts from a
/// fixed seed on every call, so equal sizes always yield equal buffers.
fn generate_test_data(&self, workload: &TuningWorkload) -> Vec<u8> {
    let mut state = 0x1234_5678_9ABC_DEF0u64;
    std::iter::repeat_with(|| {
        // Three shift-xor steps, then keep the low byte of a multiplicative scramble.
        state ^= state >> 12;
        state ^= state << 25;
        state ^= state >> 27;
        (state.wrapping_mul(0x2545_F491_4F6C_DD1D) & 0xFF) as u8
    })
    .take(workload.data_size)
    .collect()
}
/// Times the workload's kernel and summarizes the samples.
///
/// Runs `self.benchmark_iterations` timed executions of the workload's
/// operation on `test_data`. The timed region is the whole
/// `execute_*_kernel` call. `_candidate` is currently unused, so every
/// candidate is timed against the same kernel entry points.
///
/// Returns `(latency_stats, throughput_ops_per_sec, bandwidth_gbps)`:
/// * `latency_stats` — median, p95, p99, min, max and population standard
///   deviation of the samples;
/// * `throughput_ops_per_sec` — bytes (one GF(256) element each) processed
///   per second at the median latency;
/// * `bandwidth_gbps` — that same byte rate expressed in GiB per second.
///
/// # Errors
///
/// Propagates any error returned by the kernel execution helpers.
fn measure_performance(
    &self,
    _candidate: &KernelCandidate,
    workload: &TuningWorkload,
    test_data: &[u8],
) -> Result<(LatencyStats, f64, f64), TuningError> {
    const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;

    // `new` and `with_benchmark_iterations` keep this at 1 or more, so the
    // indexing below always has a sample.
    let iterations = self.benchmark_iterations;
    let mut latencies = Vec::with_capacity(iterations);
    for _ in 0..iterations {
        let start = wall_now();
        let digest = match workload.operation {
            GF256Operation::Mul => self.execute_mul_kernel(workload, test_data)?,
            GF256Operation::AddMul => self.execute_addmul_kernel(workload, test_data)?,
            GF256Operation::Add => self.execute_add_kernel(test_data)?,
        };
        // Keep the optimizer from discarding the kernel work.
        std::hint::black_box(digest);
        // NOTE(review): assumes `duration_since` yields nanoseconds — confirm.
        #[allow(clippy::cast_precision_loss)]
        latencies.push(wall_now().duration_since(start) as f64);
    }

    // `total_cmp` is a total order on f64, so sorting has no panic path.
    latencies.sort_unstable_by(f64::total_cmp);

    let sample_count = latencies.len();
    let median_ns = latencies[sample_count / 2];
    let p95_ns = latencies[(sample_count * 95) / 100];
    let p99_ns = latencies[(sample_count * 99) / 100];
    let min_ns = latencies[0];
    let max_ns = latencies[sample_count - 1];

    let mean = latencies.iter().sum::<f64>() / sample_count as f64;
    let variance =
        latencies.iter().map(|l| (l - mean).powi(2)).sum::<f64>() / sample_count as f64;
    let stddev_ns = variance.sqrt();

    let latency_stats = LatencyStats {
        median_ns,
        p95_ns,
        p99_ns,
        stddev_ns,
        min_ns,
        max_ns,
    };

    // A coarse clock can report a zero median; floor it at one nanosecond so
    // the rates stay finite instead of becoming infinite.
    let resolvable_median_ns = median_ns.max(1.0);
    let runs_per_sec = 1_000_000_000.0 / resolvable_median_ns;
    let throughput_ops_per_sec = runs_per_sec * test_data.len() as f64;
    // The throughput already counts bytes per second. Scaling by the buffer
    // length a second time would inflate bandwidth by that length.
    let bandwidth_gbps = throughput_ops_per_sec / BYTES_PER_GIB;

    Ok((latency_stats, throughput_ops_per_sec, bandwidth_gbps))
}
/// Checks that the slice kernel for the workload's operation produces exactly
/// the same bytes as an element-wise `Gf256` reference.
///
/// For `Mul` the input buffer is multiplied in place. For `AddMul` and `Add`
/// the destination starts zeroed and `test_data` is the source. `_candidate`
/// is not consulted.
///
/// # Errors
///
/// Returns [`TuningError::BitExactnessVerificationFailed`] on any mismatch,
/// so a successful return is always `Ok(true)`.
fn verify_bit_exactness(
    &self,
    _candidate: &KernelCandidate,
    workload: &TuningWorkload,
    test_data: &[u8],
) -> Result<bool, TuningError> {
    let scalar = Gf256::new(workload.multiplicand);

    let (expected, actual): (Vec<u8>, Vec<u8>) = match workload.operation {
        GF256Operation::Mul => {
            let expected = test_data
                .iter()
                .map(|&byte| Gf256::new(byte).mul_field(scalar).raw())
                .collect();
            let mut actual = test_data.to_vec();
            gf256_mul_slice(&mut actual, scalar);
            (expected, actual)
        }
        GF256Operation::AddMul => {
            let src_data = test_data.to_vec();
            let expected = src_data
                .iter()
                .map(|&src_byte| {
                    let product = Gf256::new(src_byte).mul_field(scalar);
                    Gf256::new(0).add(product).raw()
                })
                .collect();
            let mut actual = vec![0u8; test_data.len()];
            gf256_addmul_slice(&mut actual, &src_data, scalar);
            (expected, actual)
        }
        GF256Operation::Add => {
            let src_data = test_data.to_vec();
            let expected = src_data
                .iter()
                .map(|&src_byte| Gf256::new(0).add(Gf256::new(src_byte)).raw())
                .collect();
            let mut actual = vec![0u8; test_data.len()];
            gf256_add_slice(&mut actual, &src_data);
            (expected, actual)
        }
    };

    if expected != actual {
        return Err(TuningError::BitExactnessVerificationFailed);
    }
    Ok(true)
}
/// Weight of the workload named `workload_id`, or `1.0` when the tuner has no
/// workload with that id.
fn workload_weight(&self, workload_id: &str) -> f64 {
    match self
        .workloads
        .iter()
        .find(|candidate| candidate.workload_id == workload_id)
    {
        Some(found) => found.weight,
        None => 1.0,
    }
}
/// Runs `gf256_mul_slice` on a private copy of `data`, using the workload's
/// multiplicand, and returns a digest of the result.
fn execute_mul_kernel(
    &self,
    workload: &TuningWorkload,
    data: &[u8],
) -> Result<u64, TuningError> {
    let mut scratch = data.to_owned();
    let scalar = Gf256::new(workload.multiplicand);
    gf256_mul_slice(&mut scratch, scalar);
    let digest = Self::digest_kernel_output(&scratch);
    Ok(digest)
}
/// Runs `gf256_addmul_slice` from a copy of `data` into a zeroed destination
/// of the same length, using the workload's multiplicand, and returns a
/// digest of the destination.
fn execute_addmul_kernel(
    &self,
    workload: &TuningWorkload,
    data: &[u8],
) -> Result<u64, TuningError> {
    let src_data = data.to_owned();
    let mut accumulator = vec![0u8; data.len()];
    let scalar = Gf256::new(workload.multiplicand);
    gf256_addmul_slice(&mut accumulator, &src_data, scalar);
    let digest = Self::digest_kernel_output(&accumulator);
    Ok(digest)
}
/// Runs `gf256_add_slice` from a copy of `data` into a zeroed destination of
/// the same length and returns a digest of the destination.
fn execute_add_kernel(&self, data: &[u8]) -> Result<u64, TuningError> {
    let src_data = data.to_owned();
    let mut accumulator = vec![0u8; data.len()];
    gf256_add_slice(&mut accumulator, &src_data);
    let digest = Self::digest_kernel_output(&accumulator);
    Ok(digest)
}
/// Folds `bytes` into a 64-bit digest: starting from a fixed basis, each byte
/// is mixed in by a wrapping multiply with a fixed prime followed by an XOR.
///
/// An empty slice yields the basis itself.
fn digest_kernel_output(bytes: &[u8]) -> u64 {
    const BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x100_0000_01b3;

    let mut digest = BASIS;
    for &byte in bytes {
        digest = digest.wrapping_mul(PRIME) ^ u64::from(byte);
    }
    digest
}
}
/// Errors reported by the offline tuner.
#[derive(Debug, thiserror::Error)]
pub enum TuningError {
/// Candidate selection was requested before any benchmark result was stored.
#[error("No benchmark results available for optimization")]
NoBenchmarkResults,
/// Candidate selection could not resolve a best candidate.
#[error("No valid candidates found after filtering")]
NoValidCandidates,
/// A kernel execution failed; the payload carries the description.
#[error("Kernel execution failed: {0}")]
KernelExecutionFailed(String),
/// The slice kernel output differed from the element-wise reference.
#[error("Bit-exactness verification failed")]
BitExactnessVerificationFailed,
/// Wrapped I/O error; `?` converts `std::io::Error` into this variant.
#[error("I/O error during tuning: {0}")]
IoError(#[from] std::io::Error),
}
#[cfg(test)]
mod tests {
#![allow(
clippy::pedantic,
clippy::nursery,
clippy::expect_fun_call,
clippy::map_unwrap_or,
clippy::cast_possible_wrap,
clippy::future_not_send
)]
use super::*;
// Two independently built tuners sharing an anchor must both report it,
// through the public getter and through the internal benchmark clock.
#[test]
fn test_clock_anchor_pins_benchmark_timestamp_3wxmb3() {
    let anchor = Time::from_nanos(0xdead_beef_0000_0000);
    let tuners = [
        OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
            .with_clock_anchor(anchor),
        OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
            .with_clock_anchor(anchor),
    ];
    for tuner in &tuners {
        assert_eq!(tuner.clock_anchor(), Some(anchor));
    }
    for tuner in &tuners {
        assert_eq!(tuner.benchmark_clock(), anchor);
    }
}
// Without an anchor the getter reports `None` and the benchmark clock can
// still be read repeatedly without panicking.
#[test]
fn test_no_clock_anchor_falls_back_to_wall_now_3wxmb3() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    assert!(tuner.clock_anchor().is_none());
    for _ in 0..2 {
        let _reading = tuner.benchmark_clock();
    }
}
// The scalar tuning space yields at least one candidate and every candidate
// id is unique.
#[test]
fn test_candidate_generation() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    let candidates = tuner.generate_candidates();
    assert!(!candidates.is_empty());

    let distinct_ids: std::collections::HashSet<&String> =
        candidates.iter().map(|c| &c.candidate_id).collect();
    assert!(distinct_ids.len() == candidates.len());
}
// The AVX2 tuning space is tagged with its architecture and contains the
// expected representative values in every dimension.
#[test]
fn test_tuning_space_x86_avx2() {
    let TuningSpace {
        architecture_class,
        tile_sizes,
        unroll_factors,
        prefetch_distances,
        fusion_shapes,
    } = OfflineTuner::default_tuning_space_for_arch(Gf256ArchitectureClass::X86Avx2);

    assert_eq!(architecture_class, Gf256ArchitectureClass::X86Avx2);
    assert!(tile_sizes.contains(&32));
    assert!(unroll_factors.contains(&4));
    assert!(prefetch_distances.contains(&64));
    assert!(fusion_shapes.contains(&FusionShape::Fused));
}
// The default workload list is non-empty and covers both `Mul` and `AddMul`.
#[test]
fn test_workload_generation() {
    let workloads = OfflineTuner::default_workloads_for_arch(Gf256ArchitectureClass::X86Avx2);
    assert!(!workloads.is_empty());

    let covers = |op: GF256Operation| workloads.iter().any(|w| w.operation == op);
    assert!(covers(GF256Operation::Mul));
    assert!(covers(GF256Operation::AddMul));
}
// Shared criteria fixture for the tests in this module.
fn test_criteria() -> OptimizationCriteria {
OptimizationCriteria {
latency_weight: 0.5,
throughput_weight: 0.3,
bandwidth_weight: 0.2,
min_improvement_threshold: 5.0,
}
}
// A freshly built tuner uses the default iteration count.
#[test]
fn benchmark_iterations_defaults_to_constant() {
    let configured = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
        .benchmark_iterations();
    assert_eq!(configured, DEFAULT_BENCHMARK_ITERATIONS);
}
// An explicit non-zero iteration count is stored unchanged.
#[test]
fn benchmark_iterations_override_is_honored() {
    let configured = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
        .with_benchmark_iterations(7)
        .benchmark_iterations();
    assert_eq!(configured, 7);
}
// Requesting zero iterations is raised to one.
#[test]
fn benchmark_iterations_clamps_zero_to_one() {
    let configured = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
        .with_benchmark_iterations(0)
        .benchmark_iterations();
    assert_eq!(
        configured, 1,
        "zero iterations would break median/p95 indexing"
    );
}
// A real benchmark run records the configured iteration count in its result.
#[test]
fn benchmark_result_reports_configured_iterations() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria())
        .with_benchmark_iterations(3);
    let candidate = tuner
        .generate_candidates()
        .into_iter()
        .next()
        .expect("at least one candidate");
    let workload = tuner
        .workloads
        .first()
        .expect("default workloads non-empty");

    let iterations = tuner
        .benchmark_candidate(&candidate, workload)
        .expect("benchmark runs")
        .iterations;
    assert_eq!(iterations, 3);
}
// Each execute helper must return the digest of what the matching slice
// kernel produces for the same input and scalar.
#[test]
fn benchmark_execution_uses_real_workload_kernel_inputs() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    let workload = TuningWorkload {
        workload_id: "mul_oracle".to_string(),
        data_size: 64,
        multiplicand: 137,
        operation: GF256Operation::Mul,
        weight: 1.0,
    };
    let data = tuner.generate_test_data(&workload);
    let scalar = Gf256::new(workload.multiplicand);

    // Mul: multiply a copy of the input in place.
    let mut expected_mul = data.clone();
    gf256_mul_slice(&mut expected_mul, scalar);
    let mul_digest = tuner
        .execute_mul_kernel(&workload, &data)
        .expect("mul execution");
    assert_eq!(mul_digest, OfflineTuner::digest_kernel_output(&expected_mul));

    // AddMul: accumulate into a zeroed destination.
    let mut expected_addmul = vec![0u8; data.len()];
    gf256_addmul_slice(&mut expected_addmul, &data, scalar);
    let addmul_digest = tuner
        .execute_addmul_kernel(&workload, &data)
        .expect("addmul execution");
    assert_eq!(
        addmul_digest,
        OfflineTuner::digest_kernel_output(&expected_addmul)
    );

    // Add: accumulate into a zeroed destination.
    let mut expected_add = vec![0u8; data.len()];
    gf256_add_slice(&mut expected_add, &data);
    let add_digest = tuner.execute_add_kernel(&data).expect("add execution");
    assert_eq!(add_digest, OfflineTuner::digest_kernel_output(&expected_add));
}
// Builds a benchmark result with a chosen median latency; the other latency
// statistics are fixed multiples of that median.
fn synthetic_bench(
    candidate: &KernelCandidate,
    workload_id: &str,
    median_ns: f64,
) -> BenchmarkResult {
    let latency_stats = LatencyStats {
        median_ns,
        p95_ns: median_ns * 1.2,
        p99_ns: median_ns * 1.5,
        stddev_ns: median_ns * 0.1,
        min_ns: median_ns * 0.8,
        max_ns: median_ns * 2.0,
    };
    let throughput_ops_per_sec = 1.0e9 / median_ns;

    BenchmarkResult {
        candidate: candidate.clone(),
        workload_id: workload_id.to_string(),
        iterations: 100,
        latency_stats,
        throughput_ops_per_sec,
        bandwidth_gbps: 0.0,
        bit_exactness_verified: true,
        benchmark_timestamp: "synthetic".to_string(),
    }
}
// With paired synthetic results, the profile pack reports the exact latency
// improvement of the selected candidate over the baseline.
#[test]
fn baseline_delta_reports_percentage_when_data_present() {
    let mut tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    let candidates = tuner.generate_candidates();
    let baseline = candidates.first().expect("baseline").clone();
    let selected = candidates.last().expect("selected").clone();
    assert_ne!(
        baseline.candidate_id, selected.candidate_id,
        "baseline and selected must differ so the delta is non-trivial"
    );

    // (workload id, baseline median, selected median)
    let samples = [
        ("small_mul", 200.0, 100.0),
        ("medium_mul", 200.0, 100.0),
        ("large_mul", 200.0, 100.0),
        ("small_addmul", 400.0, 300.0),
        ("medium_addmul", 400.0, 300.0),
        ("large_addmul", 400.0, 300.0),
    ];
    for (wl, baseline_ns, selected_ns) in samples {
        tuner
            .benchmark_results
            .push(synthetic_bench(&baseline, wl, baseline_ns));
        tuner
            .benchmark_results
            .push(synthetic_bench(&selected, wl, selected_ns));
    }

    let pack = tuner.emit_profile_pack(&selected).expect("profile pack");
    assert_eq!(pack.selected_mul_delta_vs_baseline_pct, "50.000");
    assert_eq!(pack.selected_addmul_delta_vs_baseline_pct, "25.000");
    assert_eq!(pack.selected_targeted_addmul_average_delta_pct, "25.000");
}
// With no stored results, the delta fields fall back to their sentinels.
#[test]
fn baseline_delta_sentinel_when_no_benchmarks_run() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    let selected = tuner
        .generate_candidates()
        .pop()
        .expect("selected candidate");

    let pack = tuner.emit_profile_pack(&selected).expect("profile pack");
    let aggregate_sentinel = "no_baseline_data";
    assert_eq!(pack.selected_mul_delta_vs_baseline_pct, aggregate_sentinel);
    assert_eq!(pack.selected_addmul_delta_vs_baseline_pct, aggregate_sentinel);
    assert_eq!(
        pack.selected_targeted_addmul_average_delta_pct,
        "no_paired_workload_data"
    );
}
// Selecting the baseline itself reports a zero delta in every field.
#[test]
fn baseline_delta_zero_when_selected_equals_baseline() {
    let tuner = OfflineTuner::new(Gf256ArchitectureClass::GenericScalar, test_criteria());
    let baseline = tuner
        .generate_candidates()
        .into_iter()
        .next()
        .expect("baseline");

    let pack = tuner.emit_profile_pack(&baseline).expect("profile pack");
    for reported in [
        &pack.selected_mul_delta_vs_baseline_pct,
        &pack.selected_addmul_delta_vs_baseline_pct,
        &pack.selected_targeted_addmul_average_delta_pct,
    ] {
        assert_eq!(reported, "0.000");
    }
}
}