use std::fs;
use std::path::PathBuf;
use std::process;
use clap::{Parser, Subcommand};
use serde_json;
use asupersync::raptorq::gf256::{Gf256ArchitectureClass, active_kernel};
use asupersync::raptorq::offline_tuner::{OfflineTuner, OptimizationCriteria};
use asupersync::runtime::scheduler::SchedulerEvidenceArtifact;
#[derive(Debug, Clone)]
struct ValidationConfig {
size: usize,
scalar: u8,
seed: u64,
scenario: &'static str,
}
impl ValidationConfig {
fn generate_data(&self) -> Vec<u8> {
let mut data = Vec::with_capacity(self.size);
for i in 0..self.size {
let value = ((i as u64).wrapping_mul(17).wrapping_add(self.seed)) % 256;
data.push(value as u8);
}
data
}
}
fn reference_mul_slice(data: &mut [u8], scalar: u8) {
use asupersync::raptorq::gf256::Gf256;
let gf_scalar = Gf256::new(scalar);
for byte in data {
*byte = Gf256::new(*byte).mul_field(gf_scalar).raw();
}
}
fn reference_addmul_slice(dst: &mut [u8], src: &[u8], scalar: u8) {
use asupersync::raptorq::gf256::Gf256;
assert_eq!(dst.len(), src.len(), "slice length mismatch");
let gf_scalar = Gf256::new(scalar);
for (dst_byte, src_byte) in dst.iter_mut().zip(src) {
let product = Gf256::new(*src_byte).mul_field(gf_scalar);
*dst_byte = Gf256::new(*dst_byte).add(product).raw();
}
}
fn validate_mul_slice_bit_exact(config: &ValidationConfig, verbose: bool) -> Result<(), String> {
use asupersync::raptorq::gf256::{Gf256, gf256_mul_slice};
let mut reference_data = config.generate_data();
let mut test_data = reference_data.clone();
reference_mul_slice(&mut reference_data, config.scalar);
gf256_mul_slice(&mut test_data, Gf256::new(config.scalar));
if reference_data == test_data {
if verbose {
println!(
" mul_slice bit-exact: size={}, scalar={}",
config.size, config.scalar
);
}
Ok(())
} else {
Err(format!(
"mul_slice not bit-exact: size={}, scalar={}, first_diff={}",
config.size,
config.scalar,
reference_data
.iter()
.zip(&test_data)
.position(|(a, b)| a != b)
.unwrap_or(0)
))
}
}
fn validate_addmul_slice_bit_exact(config: &ValidationConfig, verbose: bool) -> Result<(), String> {
use asupersync::raptorq::gf256::{Gf256, gf256_addmul_slice};
let src_data = config.generate_data();
let mut reference_dst = vec![0u8; config.size];
let mut test_dst = vec![0u8; config.size];
for (i, byte) in reference_dst.iter_mut().enumerate() {
*byte = ((i as u64 * 23 + config.seed + 1000) % 256) as u8;
}
test_dst.copy_from_slice(&reference_dst);
reference_addmul_slice(&mut reference_dst, &src_data, config.scalar);
gf256_addmul_slice(&mut test_dst, &src_data, Gf256::new(config.scalar));
if reference_dst == test_dst {
if verbose {
println!(
" addmul_slice bit-exact: size={}, scalar={}",
config.size, config.scalar
);
}
Ok(())
} else {
Err(format!(
"addmul_slice not bit-exact: size={}, scalar={}, first_diff={}",
config.size,
config.scalar,
reference_dst
.iter()
.zip(&test_dst)
.position(|(a, b)| a != b)
.unwrap_or(0)
))
}
}
#[derive(Parser)]
#[command(name = "offline_tuner")]
#[command(about = "Offline tuning workflows for RaptorQ kernels and scheduler evidence artifacts")]
#[command(version)]
struct Cli {
#[command(subcommand)]
command: Commands,
#[arg(short, long, global = true)]
verbose: bool,
#[arg(short, long, global = true, default_value = "tuning_results")]
output_dir: PathBuf,
}
#[derive(Subcommand)]
enum Commands {
Optimize {
#[arg(long, value_enum)]
arch: Option<ArchitectureArg>,
#[arg(long)]
auto_detect: bool,
#[arg(long, default_value = "0.5")]
latency_weight: f64,
#[arg(long, default_value = "0.3")]
throughput_weight: f64,
#[arg(long, default_value = "0.2")]
bandwidth_weight: f64,
#[arg(long, default_value = "5.0")]
min_improvement_threshold: f64,
},
Candidates {
#[arg(long, value_enum)]
arch: ArchitectureArg,
},
EmitProfile {
#[arg(long)]
results_file: PathBuf,
#[arg(long, default_value = "optimized_profile.json")]
output_file: PathBuf,
},
Validate {
#[arg(long, value_enum)]
arch: ArchitectureArg,
#[arg(long)]
profile_file: Option<PathBuf>,
},
SchedulerRecommend {
#[arg(long)]
evidence_file: PathBuf,
#[arg(long, default_value = "scheduler_tuning_report.json")]
output_file: PathBuf,
},
}
#[derive(clap::ValueEnum, Clone, Debug)]
enum ArchitectureArg {
#[value(name = "scalar")]
Scalar,
#[value(name = "x86-avx2")]
X86Avx2,
#[value(name = "aarch64-neon")]
Aarch64Neon,
}
impl From<ArchitectureArg> for Gf256ArchitectureClass {
fn from(arg: ArchitectureArg) -> Self {
match arg {
ArchitectureArg::Scalar => Gf256ArchitectureClass::GenericScalar,
ArchitectureArg::X86Avx2 => Gf256ArchitectureClass::X86Avx2,
ArchitectureArg::Aarch64Neon => Gf256ArchitectureClass::Aarch64Neon,
}
}
}
fn main() {
let cli = Cli::parse();
if cli.verbose {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("debug")).init();
} else {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
}
if let Err(e) = fs::create_dir_all(&cli.output_dir) {
eprintln!("Error: Failed to create output directory: {}", e);
process::exit(1);
}
let result = match cli.command {
Commands::Optimize {
arch,
auto_detect,
latency_weight,
throughput_weight,
bandwidth_weight,
min_improvement_threshold,
} => run_optimization(
arch,
auto_detect,
&cli.output_dir,
cli.verbose,
OptimizationCriteria {
latency_weight,
throughput_weight,
bandwidth_weight,
min_improvement_threshold,
},
),
Commands::Candidates { arch } => {
generate_candidates(arch.into(), &cli.output_dir, cli.verbose)
}
Commands::EmitProfile {
results_file,
output_file,
} => emit_profile_pack(results_file, output_file, cli.verbose),
Commands::Validate { arch, profile_file } => {
validate_kernels(arch.into(), profile_file, cli.verbose)
}
Commands::SchedulerRecommend {
evidence_file,
output_file,
} => emit_scheduler_recommendation(evidence_file, output_file, cli.verbose),
};
if let Err(e) = result {
eprintln!("Error: {}", e);
process::exit(1);
}
}
fn run_optimization(
arch: Option<ArchitectureArg>,
auto_detect: bool,
output_dir: &PathBuf,
verbose: bool,
criteria: OptimizationCriteria,
) -> Result<(), Box<dyn std::error::Error>> {
let target_arch = if auto_detect {
let kernel = active_kernel();
match kernel {
asupersync::raptorq::gf256::Gf256Kernel::Scalar => {
Gf256ArchitectureClass::GenericScalar
}
#[cfg(all(
feature = "simd-intrinsics",
any(target_arch = "x86", target_arch = "x86_64")
))]
asupersync::raptorq::gf256::Gf256Kernel::X86Avx2 => Gf256ArchitectureClass::X86Avx2,
#[cfg(all(feature = "simd-intrinsics", target_arch = "aarch64"))]
asupersync::raptorq::gf256::Gf256Kernel::Aarch64Neon => {
Gf256ArchitectureClass::Aarch64Neon
}
}
} else {
arch.ok_or("Must specify --arch or --auto-detect")?.into()
};
println!(
"Starting offline kernel superoptimization for {:?}",
target_arch
);
println!(
"Optimization criteria: latency={:.2}, throughput={:.2}, bandwidth={:.2}",
criteria.latency_weight, criteria.throughput_weight, criteria.bandwidth_weight
);
let mut tuner = OfflineTuner::new(target_arch, criteria.clone());
let candidates = tuner.generate_candidates();
println!("Generated {} kernel candidates", candidates.len());
if verbose {
println!("Candidates:");
for (i, candidate) in candidates.iter().enumerate() {
println!(
" {}: {} (tile={}, unroll={}, prefetch={}, fusion={:?})",
i + 1,
candidate.candidate_id,
candidate.tile_bytes,
candidate.unroll,
candidate.prefetch_distance,
candidate.fusion_shape
);
}
}
println!("Running systematic benchmarks...");
tuner.run_systematic_benchmarks()?;
let optimal = tuner.select_optimal_candidate()?;
println!("Selected optimal candidate: {}", optimal.candidate_id);
if verbose {
println!("Optimal configuration:");
println!(" Tile size: {} bytes", optimal.tile_bytes);
println!(" Unroll factor: {}", optimal.unroll);
println!(" Prefetch distance: {} bytes", optimal.prefetch_distance);
println!(" Fusion shape: {:?}", optimal.fusion_shape);
println!(" Optimization flags: {:?}", optimal.optimization_flags);
}
let profile_pack = tuner.emit_profile_pack(&optimal)?;
let results_file = output_dir.join(format!("tuning_results_{:?}.json", target_arch));
let profile_file = output_dir.join(format!("optimized_profile_{:?}.json", target_arch));
let tuning_results = serde_json::json!({
"target_architecture": format!("{:?}", target_arch),
"optimization_criteria": criteria,
"selected_candidate": optimal,
"generated_at": format!("{:?}", std::time::SystemTime::now()),
"total_candidates": candidates.len(),
});
fs::write(
&results_file,
serde_json::to_string_pretty(&tuning_results)?,
)?;
fs::write(&profile_file, serde_json::to_string_pretty(&profile_pack)?)?;
println!("Optimization complete!");
println!("Results saved to: {}", results_file.display());
println!("Profile pack saved to: {}", profile_file.display());
Ok(())
}
fn generate_candidates(
arch: Gf256ArchitectureClass,
output_dir: &PathBuf,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let criteria = OptimizationCriteria {
latency_weight: 0.5,
throughput_weight: 0.3,
bandwidth_weight: 0.2,
min_improvement_threshold: 5.0,
};
let tuner = OfflineTuner::new(arch, criteria);
let candidates = tuner.generate_candidates();
println!(
"Generated {} kernel candidates for {:?}",
candidates.len(),
arch
);
if verbose {
for (i, candidate) in candidates.iter().enumerate() {
println!(
"{}. {} (tile={}, unroll={}, prefetch={}, fusion={:?})",
i + 1,
candidate.candidate_id,
candidate.tile_bytes,
candidate.unroll,
candidate.prefetch_distance,
candidate.fusion_shape
);
}
}
let output_file = output_dir.join(format!("candidates_{:?}.json", arch));
let candidates_json = serde_json::json!({
"architecture": format!("{:?}", arch),
"candidate_count": candidates.len(),
"candidates": candidates,
"generated_at": format!("{:?}", std::time::SystemTime::now()),
});
fs::write(
&output_file,
serde_json::to_string_pretty(&candidates_json)?,
)?;
println!("Candidates saved to: {}", output_file.display());
Ok(())
}
fn emit_profile_pack(
results_file: PathBuf,
output_file: PathBuf,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Loading tuning results from: {}", results_file.display());
let results_json = fs::read_to_string(&results_file)?;
let results: serde_json::Value = serde_json::from_str(&results_json)?;
let selected_candidate = results["selected_candidate"].clone();
let arch_str = results["target_architecture"]
.as_str()
.unwrap_or("GenericScalar");
let arch = match arch_str {
"X86Avx2" => Gf256ArchitectureClass::X86Avx2,
"Aarch64Neon" => Gf256ArchitectureClass::Aarch64Neon,
_ => Gf256ArchitectureClass::GenericScalar,
};
let criteria: OptimizationCriteria =
serde_json::from_value(results["optimization_criteria"].clone())?;
let optimal: asupersync::raptorq::offline_tuner::KernelCandidate =
serde_json::from_value(selected_candidate.clone())?;
if verbose {
println!(
"Selected candidate: {}",
selected_candidate["candidate_id"]
.as_str()
.unwrap_or("unknown")
);
}
let tuner = OfflineTuner::new(arch, criteria);
let profile_pack = tuner.emit_profile_pack(&optimal)?;
fs::write(&output_file, serde_json::to_string_pretty(&profile_pack)?)?;
println!(
"Profile pack generated and saved to: {}",
output_file.display()
);
Ok(())
}
fn validate_kernels(
arch: Gf256ArchitectureClass,
profile_file: Option<PathBuf>,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!("Validating bit-exactness for {:?} kernels", arch);
if let Some(profile_path) = profile_file {
println!("Using profile pack: {}", profile_path.display());
} else {
println!("Using default profile pack for {:?}", arch);
}
let validation_scenarios = vec![
ValidationConfig {
size: 1,
scalar: 1,
seed: 0,
scenario: "single_byte",
},
ValidationConfig {
size: 15,
scalar: 17,
seed: 42,
scenario: "sub_simd_odd",
},
ValidationConfig {
size: 16,
scalar: 255,
seed: 123,
scenario: "exactly_simd",
},
ValidationConfig {
size: 17,
scalar: 2,
seed: 456,
scenario: "just_over_simd",
},
ValidationConfig {
size: 64,
scalar: 85,
seed: 789,
scenario: "cache_line",
},
ValidationConfig {
size: 256,
scalar: 42,
seed: 1011,
scenario: "typical_block",
},
ValidationConfig {
size: 1024,
scalar: 170,
seed: 1314,
scenario: "large_block",
},
];
let mut total_tests = 0;
let mut failed_tests = 0;
for config in &validation_scenarios {
total_tests += 1;
if let Err(e) = validate_mul_slice_bit_exact(config, verbose) {
println!("FAILED: mul_slice for scenario {}: {}", config.scenario, e);
failed_tests += 1;
} else if verbose {
println!("PASSED: mul_slice for scenario {}", config.scenario);
}
total_tests += 1;
if let Err(e) = validate_addmul_slice_bit_exact(config, verbose) {
println!(
"FAILED: addmul_slice for scenario {}: {}",
config.scenario, e
);
failed_tests += 1;
} else if verbose {
println!("PASSED: addmul_slice for scenario {}", config.scenario);
}
}
if failed_tests == 0 {
println!("Bit-exactness validation: PASSED ({} tests)", total_tests);
Ok(())
} else {
println!(
"Bit-exactness validation: FAILED ({}/{} tests failed)",
failed_tests, total_tests
);
Err(format!(
"Bit-exactness validation failed: {}/{} tests failed",
failed_tests, total_tests
)
.into())
}
}
fn emit_scheduler_recommendation(
evidence_file: PathBuf,
output_file: PathBuf,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
println!(
"Loading scheduler evidence artifact from: {}",
evidence_file.display()
);
let artifact_json = fs::read_to_string(&evidence_file)?;
let artifact: SchedulerEvidenceArtifact = serde_json::from_str(&artifact_json)?;
let report = artifact.tune_report()?;
if verbose {
println!("Run label: {}", report.source_run_label);
println!("Profile: {}", report.profile_name);
println!("Confidence: {}%", report.confidence_percent);
println!("Reasons: {:?}", report.reason_codes);
}
fs::write(&output_file, serde_json::to_string_pretty(&report)?)?;
println!(
"Scheduler tuning report generated and saved to: {}",
output_file.display()
);
Ok(())
}