use crate::raptorq::gf256::{Gf256ArchitectureClass, Gf256ProfilePackId};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::time::{Duration, Instant};
/// One concrete point in the kernel tuning search space: a specific
/// combination of tiling, unrolling, prefetching, and fusion choices
/// for a given architecture class.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct KernelCandidate {
    /// Stable, human-readable id derived from the parameters below
    /// (built by `OfflineTuner::generate_candidates`).
    pub candidate_id: String,
    /// Architecture class this candidate targets.
    pub architecture_class: Gf256ArchitectureClass,
    /// Tile size in bytes; also the seed value for the dispatch
    /// thresholds in `derive_thresholds_from_candidate`.
    pub tile_bytes: usize,
    /// Inner-loop unroll factor (tuning spaces only produce values >= 1).
    pub unroll: usize,
    /// Software prefetch distance; 0 disables prefetching.
    pub prefetch_distance: usize,
    /// How mul/add stages are fused in the kernel.
    pub fusion_shape: FusionShape,
    /// Free-form flag strings summarizing the code-gen choices
    /// (see `derive_optimization_flags`).
    pub optimization_flags: Vec<String>,
}
/// Fusion strategy for a GF(256) kernel's mul/add stages.
/// The flag strings emitted for each variant ("fusion_disabled" /
/// "fusion_enabled" / "fusion_adaptive") document the intent.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FusionShape {
    /// Fusion disabled: stages run as separate passes.
    Split,
    /// Fusion enabled: stages combined into one pass.
    Fused,
    /// Adaptive fusion between the two extremes.
    Balanced,
}
/// Measurements collected from running one candidate against one workload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkResult {
    /// The candidate that was benchmarked (full copy for self-contained results).
    pub candidate: KernelCandidate,
    /// Id of the workload this result belongs to.
    pub workload_id: String,
    /// Number of timed iterations that produced `latency_stats`.
    pub iterations: usize,
    /// Latency distribution over the timed iterations.
    pub latency_stats: LatencyStats,
    /// Byte-operations processed per second, derived from median latency.
    pub throughput_ops_per_sec: f64,
    /// Memory bandwidth in GiB/s, derived from throughput.
    pub bandwidth_gbps: f64,
    /// Whether the candidate matched the reference implementation bit-for-bit.
    pub bit_exactness_verified: bool,
    /// Debug-formatted `SystemTime` captured when the result was recorded.
    pub benchmark_timestamp: String,
}
/// Summary statistics (in nanoseconds) of a latency sample set.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyStats {
    /// Median latency (upper median for even-sized samples).
    pub median_ns: f64,
    /// 95th-percentile latency.
    pub p95_ns: f64,
    /// 99th-percentile latency.
    pub p99_ns: f64,
    /// Population standard deviation of the sample.
    pub stddev_ns: f64,
    /// Fastest observed iteration.
    pub min_ns: f64,
    /// Slowest observed iteration.
    pub max_ns: f64,
}
/// The parameter grid swept during candidate generation; the candidate
/// set is the full Cartesian product of the four value lists.
#[derive(Debug, Clone)]
pub struct TuningSpace {
    /// Architecture the grid was built for.
    pub architecture_class: Gf256ArchitectureClass,
    /// Candidate tile sizes in bytes.
    pub tile_sizes: Vec<usize>,
    /// Candidate unroll factors.
    pub unroll_factors: Vec<usize>,
    /// Candidate prefetch distances (0 = disabled).
    pub prefetch_distances: Vec<usize>,
    /// Candidate fusion strategies.
    pub fusion_shapes: Vec<FusionShape>,
}
/// One benchmark scenario: an operation applied to a buffer of a given
/// size, with a weight controlling its influence on candidate scoring.
#[derive(Debug, Clone)]
pub struct TuningWorkload {
    /// Unique id used to join results back to their workload.
    pub workload_id: String,
    /// Input buffer size in bytes.
    pub data_size: usize,
    /// Scalar GF(256) multiplicand for the operation.
    /// NOTE(review): the simulated kernels do not currently read this —
    /// it is carried for the real kernel implementations.
    pub multiplicand: u8,
    /// Which GF(256) bulk operation to run.
    pub operation: GF256Operation,
    /// Relative weight applied in `select_optimal_candidate` scoring.
    pub weight: f64,
}
/// GF(256) bulk operation exercised by a tuning workload.
/// (Naming note: `Gf256Operation` would match Rust acronym convention,
/// but renaming would break existing references.)
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GF256Operation {
    /// Plain scalar multiply over a buffer.
    Mul,
    /// Multiply-accumulate; simulated at ~3x the cost of `Mul`.
    AddMul,
    /// Plain addition; simulated at ~half the cost of `Mul`.
    Add,
}
/// Weights for the multi-objective score used to rank candidates
/// (see `OfflineTuner::select_optimal_candidate`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationCriteria {
    /// Weight on the inverse-latency term.
    pub latency_weight: f64,
    /// Weight on the throughput term.
    pub throughput_weight: f64,
    /// Weight on the bandwidth term.
    pub bandwidth_weight: f64,
    /// Minimum improvement (percent) required to accept a candidate.
    /// NOTE(review): not consulted anywhere in this module yet — presumably
    /// enforced by a downstream gate; confirm before relying on it.
    pub min_improvement_threshold: f64,
}
/// Serializable description of a selected kernel profile pack, including
/// the provenance of the tuning decision that produced it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilePackSpec {
    /// Schema identifier for consumers of this spec.
    pub schema_version: String,
    /// Profile pack chosen for the architecture class.
    pub profile_pack: Gf256ProfilePackId,
    /// Architecture the pack applies to.
    pub architecture_class: Gf256ArchitectureClass,
    /// Identifier of the tuning corpus that drove the decision.
    pub tuning_corpus_id: String,
    /// Candidate id that won the selection.
    pub selected_tuning_candidate_id: String,
    /// Candidate ids considered but not selected.
    pub rejected_tuning_candidate_ids: Vec<String>,
    // Dispatch thresholds derived from the winning candidate's tile size
    // (see `derive_thresholds_from_candidate`). Exact runtime semantics are
    // defined by the consumer — TODO confirm against the gf256 module.
    pub mul_min_total: usize,
    pub mul_max_total: usize,
    pub addmul_min_total: usize,
    pub addmul_max_total: usize,
    pub addmul_min_lane: usize,
    pub max_lane_ratio: usize,
    /// Pointer to a replay artifact for reproducing the decision.
    pub replay_pointer: String,
    /// Command line that reproduces the tuning run.
    pub command_bundle: String,
    /// Audit identifier of the decision artifact.
    pub decision_artifact_id: String,
    /// Role/process under which the decision was made.
    pub decision_role: String,
    /// Human-readable summary of why the winner was chosen.
    pub selected_candidate_summary: String,
    /// Human-readable summary of why the rest were rejected.
    pub rejected_candidate_set_summary: String,
    // Measured deltas vs. baseline as percentage strings; emitted as
    // "pending_measurement" until a real measurement fills them in.
    pub selected_mul_delta_vs_baseline_pct: String,
    pub selected_addmul_delta_vs_baseline_pct: String,
    pub selected_targeted_addmul_average_delta_pct: String,
}
/// Offline superoptimizer for GF(256) kernels: enumerates a tuning space
/// of kernel candidates, benchmarks each against a weighted workload mix,
/// selects a winner, and emits a `ProfilePackSpec` for it.
pub struct OfflineTuner {
    // Target architecture; fixes the tuning space, workloads, and pack id.
    architecture_class: Gf256ArchitectureClass,
    // Parameter grid swept by `generate_candidates`.
    tuning_space: TuningSpace,
    // Benchmark scenarios; each carries a scoring weight.
    workloads: Vec<TuningWorkload>,
    // Multi-objective weights used when scoring candidates.
    criteria: OptimizationCriteria,
    // Results accumulated by `run_systematic_benchmarks`.
    benchmark_results: Vec<BenchmarkResult>,
}
impl OfflineTuner {
/// Creates a tuner for `architecture_class`, seeded with the default
/// tuning space and workload set for that architecture and an empty
/// result log.
pub fn new(architecture_class: Gf256ArchitectureClass, criteria: OptimizationCriteria) -> Self {
    Self {
        tuning_space: Self::default_tuning_space_for_arch(architecture_class),
        workloads: Self::default_workloads_for_arch(architecture_class),
        architecture_class,
        criteria,
        benchmark_results: Vec::new(),
    }
}
/// Enumerates the full Cartesian product of the tuning space as concrete
/// kernel candidates, each with a unique, human-readable id.
pub fn generate_candidates(&self) -> Vec<KernelCandidate> {
    let space = &self.tuning_space;
    let mut out = Vec::with_capacity(
        space.tile_sizes.len()
            * space.unroll_factors.len()
            * space.prefetch_distances.len()
            * space.fusion_shapes.len(),
    );
    for &tile in &space.tile_sizes {
        for &unroll in &space.unroll_factors {
            for &prefetch in &space.prefetch_distances {
                for &shape in &space.fusion_shapes {
                    // Id encodes every tuning parameter, normalized to
                    // lowercase snake-case so it is filesystem/log friendly.
                    let id = format!(
                        "{:?}-t{}-u{}-pf{}-{:?}-v1",
                        self.architecture_class, tile, unroll, prefetch, shape
                    )
                    .to_lowercase()
                    .replace(' ', "_");
                    out.push(KernelCandidate {
                        candidate_id: id,
                        architecture_class: self.architecture_class,
                        tile_bytes: tile,
                        unroll,
                        prefetch_distance: prefetch,
                        fusion_shape: shape,
                        optimization_flags: Self::derive_optimization_flags(
                            self.architecture_class,
                            tile,
                            unroll,
                            prefetch,
                            shape,
                        ),
                    });
                }
            }
        }
    }
    out
}
/// Benchmarks every candidate in the tuning space against every workload,
/// storing all results in `self.benchmark_results`.
///
/// Any results from a previous run are discarded first, so repeated calls
/// do not double-count candidates in `select_optimal_candidate`.
///
/// # Errors
/// Propagates the first `TuningError` returned by `benchmark_candidate`.
pub fn run_systematic_benchmarks(&mut self) -> Result<(), TuningError> {
    let candidates = self.generate_candidates();
    println!(
        "Starting systematic benchmarking of {} candidates across {} workloads",
        candidates.len(),
        self.workloads.len()
    );
    // BUG FIX: previously results accumulated across calls, so re-running
    // the sweep inflated the per-candidate score sums during selection.
    self.benchmark_results.clear();
    for (i, candidate) in candidates.iter().enumerate() {
        println!(
            "Benchmarking candidate {}/{}: {}",
            i + 1,
            candidates.len(),
            candidate.candidate_id
        );
        for workload in &self.workloads {
            let result = self.benchmark_candidate(candidate, workload)?;
            self.benchmark_results.push(result);
        }
    }
    println!("Completed {} benchmark runs", self.benchmark_results.len());
    Ok(())
}
/// Runs one candidate against one workload and packages the measurements
/// into a self-contained `BenchmarkResult`.
fn benchmark_candidate(
    &self,
    candidate: &KernelCandidate,
    workload: &TuningWorkload,
) -> Result<BenchmarkResult, TuningError> {
    let data = self.generate_test_data(workload);
    let (latency_stats, throughput, bandwidth) =
        self.measure_performance(candidate, workload, &data)?;
    let verified = self.verify_bit_exactness(candidate, workload, &data)?;
    let timestamp = format!("{:?}", std::time::SystemTime::now());
    Ok(BenchmarkResult {
        candidate: candidate.clone(),
        workload_id: workload.workload_id.clone(),
        iterations: 100, // keep in sync with `measure_performance`
        latency_stats,
        throughput_ops_per_sec: throughput,
        bandwidth_gbps: bandwidth,
        bit_exactness_verified: verified,
        benchmark_timestamp: timestamp,
    })
}
/// Selects the candidate with the highest workload-weighted
/// multi-objective score across all recorded benchmark results.
///
/// Results that failed bit-exactness verification are excluded from
/// scoring: a fast but incorrect kernel must never win selection.
///
/// # Errors
/// - `TuningError::NoBenchmarkResults` if no benchmarks have been run.
/// - `TuningError::NoValidCandidates` if every result failed verification.
pub fn select_optimal_candidate(&self) -> Result<KernelCandidate, TuningError> {
    if self.benchmark_results.is_empty() {
        return Err(TuningError::NoBenchmarkResults);
    }
    let mut candidate_scores: HashMap<String, f64> = HashMap::new();
    for result in &self.benchmark_results {
        // BUG FIX: correctness is a hard constraint, not a scored
        // objective — skip results that failed bit-exactness.
        if !result.bit_exactness_verified {
            continue;
        }
        // Lower latency => higher score; +1.0 guards division by zero.
        let latency_score = 1.0 / (result.latency_stats.median_ns + 1.0);
        let throughput_score = result.throughput_ops_per_sec;
        let bandwidth_score = result.bandwidth_gbps;
        let weighted_score = self.criteria.latency_weight * latency_score
            + self.criteria.throughput_weight * throughput_score
            + self.criteria.bandwidth_weight * bandwidth_score;
        // Scores are summed over workloads, scaled by each workload's weight.
        *candidate_scores
            .entry(result.candidate.candidate_id.clone())
            .or_insert(0.0) += weighted_score * self.workload_weight(&result.workload_id);
    }
    let best_candidate_id = candidate_scores
        .iter()
        .max_by(|a, b| a.1.partial_cmp(b.1).unwrap_or(std::cmp::Ordering::Equal))
        .ok_or(TuningError::NoValidCandidates)?
        .0;
    self.benchmark_results
        .iter()
        .find(|r| &r.candidate.candidate_id == best_candidate_id)
        .map(|r| r.candidate.clone())
        .ok_or(TuningError::NoValidCandidates)
}
/// Builds the serializable profile-pack spec for the `selected` candidate,
/// including dispatch thresholds derived from its tile size and an audit
/// trail of the tuning decision.
///
/// # Errors
/// Currently infallible; the `Result` is kept for interface stability.
pub fn emit_profile_pack(
    &self,
    selected: &KernelCandidate,
) -> Result<ProfilePackSpec, TuningError> {
    let profile_pack_id = match self.architecture_class {
        Gf256ArchitectureClass::GenericScalar => Gf256ProfilePackId::ScalarConservativeV1,
        Gf256ArchitectureClass::X86Avx2 => Gf256ProfilePackId::X86Avx2BalancedV1,
        Gf256ArchitectureClass::Aarch64Neon => Gf256ProfilePackId::Aarch64NeonBalancedV1,
    };
    let (mul_min_total, mul_max_total, addmul_min_total, addmul_max_total, addmul_min_lane) =
        Self::derive_thresholds_from_candidate(selected);
    // BUG FIX: the rejected-candidate list was always emitted empty even
    // though the benchmark log holds every considered candidate. Record
    // all non-selected candidate ids (sorted + deduped across workloads)
    // so the pack carries a complete audit trail of the decision.
    let mut rejected_tuning_candidate_ids: Vec<String> = self
        .benchmark_results
        .iter()
        .map(|r| r.candidate.candidate_id.clone())
        .filter(|id| id != &selected.candidate_id)
        .collect();
    rejected_tuning_candidate_ids.sort();
    rejected_tuning_candidate_ids.dedup();
    Ok(ProfilePackSpec {
        schema_version: "raptorq-gf256-profile-pack-v2".to_string(),
        profile_pack: profile_pack_id,
        architecture_class: self.architecture_class,
        tuning_corpus_id: "offline_kernel_superoptimization_v1".to_string(),
        selected_tuning_candidate_id: selected.candidate_id.clone(),
        rejected_tuning_candidate_ids,
        mul_min_total,
        mul_max_total,
        addmul_min_total,
        addmul_max_total,
        addmul_min_lane,
        max_lane_ratio: 4,
        replay_pointer: "replay:offline-kernel-superopt-v1".to_string(),
        command_bundle: format!(
            "offline_tuner --arch {:?} --candidate {}",
            self.architecture_class, selected.candidate_id
        ),
        decision_artifact_id: "offline_kernel_superoptimization_v1".to_string(),
        decision_role: "automated_offline_kernel_optimization".to_string(),
        selected_candidate_summary: "Selected via systematic offline kernel superoptimization"
            .to_string(),
        rejected_candidate_set_summary: "Rejected candidates had lower multi-objective scores"
            .to_string(),
        // Deltas are filled in by a later measurement pass.
        selected_mul_delta_vs_baseline_pct: "pending_measurement".to_string(),
        selected_addmul_delta_vs_baseline_pct: "pending_measurement".to_string(),
        selected_targeted_addmul_average_delta_pct: "pending_measurement".to_string(),
    })
}
/// Default parameter grid per architecture: wider vector machines get
/// larger tiles, deeper unrolls, and longer prefetch distances.
fn default_tuning_space_for_arch(arch: Gf256ArchitectureClass) -> TuningSpace {
    let (tile_sizes, unroll_factors, prefetch_distances, fusion_shapes) = match arch {
        Gf256ArchitectureClass::GenericScalar => (
            vec![8, 16, 32],
            vec![1, 2],
            vec![0],
            vec![FusionShape::Split, FusionShape::Balanced],
        ),
        Gf256ArchitectureClass::X86Avx2 => (
            vec![16, 32, 64],
            vec![2, 4, 8],
            vec![0, 32, 64, 128],
            vec![FusionShape::Split, FusionShape::Fused, FusionShape::Balanced],
        ),
        Gf256ArchitectureClass::Aarch64Neon => (
            vec![16, 32, 64],
            vec![1, 2, 4],
            vec![0, 16, 32, 64],
            vec![FusionShape::Split, FusionShape::Fused, FusionShape::Balanced],
        ),
    };
    TuningSpace {
        architecture_class: arch,
        tile_sizes,
        unroll_factors,
        prefetch_distances,
        fusion_shapes,
    }
}
fn default_workloads_for_arch(_arch: Gf256ArchitectureClass) -> Vec<TuningWorkload> {
vec![
TuningWorkload {
workload_id: "small_mul".to_string(),
data_size: 1024,
multiplicand: 42,
operation: GF256Operation::Mul,
weight: 1.0,
},
TuningWorkload {
workload_id: "medium_mul".to_string(),
data_size: 8192,
multiplicand: 137,
operation: GF256Operation::Mul,
weight: 2.0,
},
TuningWorkload {
workload_id: "large_mul".to_string(),
data_size: 32768,
multiplicand: 73,
operation: GF256Operation::Mul,
weight: 1.5,
},
TuningWorkload {
workload_id: "small_addmul".to_string(),
data_size: 1024,
multiplicand: 91,
operation: GF256Operation::AddMul,
weight: 1.0,
},
TuningWorkload {
workload_id: "medium_addmul".to_string(),
data_size: 8192,
multiplicand: 203,
operation: GF256Operation::AddMul,
weight: 2.0,
},
TuningWorkload {
workload_id: "large_addmul".to_string(),
data_size: 32768,
multiplicand: 157,
operation: GF256Operation::AddMul,
weight: 1.5,
},
]
}
/// Summarizes a candidate's code-gen choices as human-readable flag
/// strings: ISA tag, optional aggressive-unroll marker (AVX2 only,
/// unroll >= 4), prefetch marker, and a fusion-mode tag.
fn derive_optimization_flags(
    arch: Gf256ArchitectureClass,
    _tile_bytes: usize,
    unroll: usize,
    prefetch_distance: usize,
    fusion_shape: FusionShape,
) -> Vec<String> {
    let isa = match arch {
        Gf256ArchitectureClass::X86Avx2 => "avx2",
        Gf256ArchitectureClass::Aarch64Neon => "neon",
        Gf256ArchitectureClass::GenericScalar => "scalar",
    };
    let mut flags = vec![isa.to_string()];
    if matches!(arch, Gf256ArchitectureClass::X86Avx2) && unroll >= 4 {
        flags.push("aggressive_unroll".to_string());
    }
    if prefetch_distance > 0 {
        flags.push("prefetch_enabled".to_string());
    }
    let fusion = match fusion_shape {
        FusionShape::Fused => "fusion_enabled",
        FusionShape::Balanced => "fusion_adaptive",
        FusionShape::Split => "fusion_disabled",
    };
    flags.push(fusion.to_string());
    flags
}
/// Derives dispatch thresholds from the winning candidate's tile size,
/// returned as (mul_min_total, mul_max_total, addmul_min_total,
/// addmul_max_total, addmul_min_lane).
fn derive_thresholds_from_candidate(
    candidate: &KernelCandidate,
) -> (usize, usize, usize, usize, usize) {
    let t = candidate.tile_bytes;
    match candidate.fusion_shape {
        FusionShape::Fused => (t * 4, t * 16, t * 2, t * 8, t),
        // Split emits the empty range [usize::MAX, 0] for mul totals,
        // which effectively disables that path.
        FusionShape::Split => (usize::MAX, 0, t, t * 4, t / 2),
        FusionShape::Balanced => (t * 2, t * 8, t, t * 6, t / 2),
    }
}
/// Fills a buffer of `workload.data_size` bytes with a deterministic
/// pseudo-random stream (xorshift64* with a fixed seed), so every
/// candidate is measured against identical input bytes.
fn generate_test_data(&self, workload: &TuningWorkload) -> Vec<u8> {
    let mut state: u64 = 0x1234_5678_9ABC_DEF0;
    (0..workload.data_size)
        .map(|_| {
            // xorshift64* step: shift/xor scramble, then a multiplicative
            // output mix; the low byte is the generated value.
            state ^= state >> 12;
            state ^= state << 25;
            state ^= state >> 27;
            (state.wrapping_mul(0x2545_F491_4F6C_DD1D) & 0xFF) as u8
        })
        .collect()
}
/// Times 100 runs of the workload's kernel on `test_data` and derives
/// latency statistics, throughput (byte-ops/s), and bandwidth (GiB/s).
///
/// # Errors
/// Propagates any `TuningError` raised by the simulated kernels.
fn measure_performance(
    &self,
    candidate: &KernelCandidate,
    workload: &TuningWorkload,
    test_data: &[u8],
) -> Result<(LatencyStats, f64, f64), TuningError> {
    let iterations = 100;
    let mut latencies = Vec::with_capacity(iterations);
    for _ in 0..iterations {
        let start = Instant::now();
        match workload.operation {
            GF256Operation::Mul => {
                self.simulate_mul_kernel(candidate, test_data)?;
            }
            GF256Operation::AddMul => {
                self.simulate_addmul_kernel(candidate, test_data)?;
            }
            GF256Operation::Add => {
                self.simulate_add_kernel(candidate, test_data)?;
            }
        }
        latencies.push(start.elapsed().as_nanos() as f64);
    }
    // Values come from `as_nanos`, so no NaNs: partial_cmp cannot fail.
    latencies.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let median_ns = latencies[latencies.len() / 2];
    let p95_ns = latencies[(latencies.len() * 95) / 100];
    let p99_ns = latencies[(latencies.len() * 99) / 100];
    let min_ns = latencies[0];
    let max_ns = latencies[latencies.len() - 1];
    let mean = latencies.iter().sum::<f64>() / latencies.len() as f64;
    let variance =
        latencies.iter().map(|l| (l - mean).powi(2)).sum::<f64>() / latencies.len() as f64;
    let stddev_ns = variance.sqrt();
    let latency_stats = LatencyStats {
        median_ns,
        p95_ns,
        p99_ns,
        stddev_ns,
        min_ns,
        max_ns,
    };
    // `.max(1.0)` guards the division when a run measures as 0 ns.
    let kernel_invocations_per_sec = 1_000_000_000.0 / median_ns.max(1.0);
    // One GF(256) byte-operation per input byte per kernel invocation.
    let throughput_ops_per_sec = kernel_invocations_per_sec * test_data.len() as f64;
    // BUG FIX: the old formula multiplied by `test_data.len()` a second
    // time, producing bytes^2/s. Throughput already counts bytes processed
    // per second, so dividing by 2^30 yields GiB/s directly.
    let bandwidth_gbps = throughput_ops_per_sec / (1024.0 * 1024.0 * 1024.0);
    Ok((latency_stats, throughput_ops_per_sec, bandwidth_gbps))
}
/// Verifies that the candidate kernel produces bit-identical output to
/// the reference implementation for this workload.
///
/// NOTE(review): placeholder — always returns `Ok(true)` without
/// inspecting its inputs, so `TuningError::BitExactnessVerificationFailed`
/// is currently unreachable from here. Real verification against the
/// gf256 reference path still needs to be wired in.
fn verify_bit_exactness(
    &self,
    _candidate: &KernelCandidate,
    _workload: &TuningWorkload,
    _test_data: &[u8],
) -> Result<bool, TuningError> {
    Ok(true)
}
/// Looks up the scoring weight for `workload_id` in the configured
/// workload set; unknown ids fall back to a neutral weight of 1.0.
fn workload_weight(&self, workload_id: &str) -> f64 {
    for workload in &self.workloads {
        if workload.workload_id == workload_id {
            return workload.weight;
        }
    }
    1.0
}
/// Stand-in for the real mul kernel: sleeps for a duration proportional
/// to the data size, scaled down by the unroll factor.
fn simulate_mul_kernel(
    &self,
    candidate: &KernelCandidate,
    data: &[u8],
) -> Result<(), TuningError> {
    // `.max(1)` guards against a division by zero should a malformed
    // candidate carry `unroll == 0` (default tuning spaces never do).
    std::thread::sleep(Duration::from_nanos(
        data.len() as u64 / candidate.unroll.max(1) as u64,
    ));
    Ok(())
}
/// Stand-in for the real addmul kernel: modeled at 3x the cost of the
/// mul kernel, scaled down by the unroll factor.
fn simulate_addmul_kernel(
    &self,
    candidate: &KernelCandidate,
    data: &[u8],
) -> Result<(), TuningError> {
    // `.max(1)` guards against a division by zero should a malformed
    // candidate carry `unroll == 0` (default tuning spaces never do).
    std::thread::sleep(Duration::from_nanos(
        data.len() as u64 * 3 / candidate.unroll.max(1) as u64,
    ));
    Ok(())
}
/// Stand-in for the real add kernel: modeled at half the cost of the
/// mul kernel, scaled down by the unroll factor.
fn simulate_add_kernel(
    &self,
    candidate: &KernelCandidate,
    data: &[u8],
) -> Result<(), TuningError> {
    // `.max(1)` guards against a division by zero should a malformed
    // candidate carry `unroll == 0` (default tuning spaces never do).
    std::thread::sleep(Duration::from_nanos(
        data.len() as u64 / (candidate.unroll.max(1) * 2) as u64,
    ));
    Ok(())
}
}
/// Errors produced by the offline tuning pipeline.
#[derive(Debug, thiserror::Error)]
pub enum TuningError {
    /// `select_optimal_candidate` was called before any benchmarks ran.
    #[error("No benchmark results available for optimization")]
    NoBenchmarkResults,
    /// Every candidate was filtered out during selection.
    #[error("No valid candidates found after filtering")]
    NoValidCandidates,
    /// A kernel invocation failed; carries a description of the failure.
    #[error("Kernel execution failed: {0}")]
    KernelExecutionFailed(String),
    /// A candidate's output differed from the reference implementation.
    #[error("Bit-exactness verification failed")]
    BitExactnessVerificationFailed,
    /// Wrapped `std::io::Error` from reading/writing tuning artifacts.
    #[error("I/O error during tuning: {0}")]
    IoError(#[from] std::io::Error),
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated candidate set must cover the full Cartesian product
    /// of the tuning space with unique ids.
    #[test]
    fn test_candidate_generation() {
        let tuner = OfflineTuner::new(
            Gf256ArchitectureClass::GenericScalar,
            OptimizationCriteria {
                latency_weight: 0.5,
                throughput_weight: 0.3,
                bandwidth_weight: 0.2,
                min_improvement_threshold: 5.0,
            },
        );
        let candidates = tuner.generate_candidates();
        assert!(!candidates.is_empty());
        // Scalar space: 3 tile sizes x 2 unroll factors x 1 prefetch
        // distance x 2 fusion shapes = 12 candidates.
        assert_eq!(candidates.len(), 12);
        let mut candidate_ids = std::collections::HashSet::new();
        for candidate in &candidates {
            assert!(candidate_ids.insert(&candidate.candidate_id));
        }
    }

    /// The AVX2 tuning space must include its characteristic grid points.
    #[test]
    fn test_tuning_space_x86_avx2() {
        let space = OfflineTuner::default_tuning_space_for_arch(Gf256ArchitectureClass::X86Avx2);
        assert_eq!(space.architecture_class, Gf256ArchitectureClass::X86Avx2);
        assert!(space.tile_sizes.contains(&32));
        assert!(space.unroll_factors.contains(&4));
        assert!(space.prefetch_distances.contains(&64));
        assert!(space.fusion_shapes.contains(&FusionShape::Fused));
    }

    /// The default workload mix must exercise both Mul and AddMul.
    #[test]
    fn test_workload_generation() {
        let workloads = OfflineTuner::default_workloads_for_arch(Gf256ArchitectureClass::X86Avx2);
        assert!(!workloads.is_empty());
        assert!(workloads.iter().any(|w| w.operation == GF256Operation::Mul));
        assert!(
            workloads
                .iter()
                .any(|w| w.operation == GF256Operation::AddMul)
        );
    }
}