use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::time::Instant;
use colored::Colorize;
use indicatif::{ProgressBar, ProgressStyle};
use ruvllm::{
estimate_memory_q4, estimate_memory_q5, estimate_memory_q8, GgufFile, GgufQuantType,
MemoryEstimate, QuantConfig, RuvltraQuantizer, TargetFormat,
};
pub async fn run(
model: &str,
output: &str,
quant: &str,
ane_optimize: bool,
keep_embed_fp16: bool,
keep_output_fp16: bool,
verbose: bool,
cache_dir: &str,
) -> anyhow::Result<()> {
let format = TargetFormat::from_str(quant).ok_or_else(|| {
anyhow::anyhow!(
"Unknown quantization format: {}. Supported: q4_k_m, q5_k_m, q8_0, f16",
quant
)
})?;
println!("\n{} RuvLTRA Model Quantizer", "==>".bright_blue().bold());
println!(" Target format: {}", format.name().bright_cyan());
println!(" Bits per weight: {:.1}", format.bits_per_weight());
println!(
" ANE optimization: {}",
if ane_optimize { "enabled" } else { "disabled" }
);
let input_path = resolve_model_path(model, cache_dir)?;
println!(
"\n{} Input model: {}",
"-->".bright_blue(),
input_path.display()
);
let output_path = if output.is_empty() {
let stem = input_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("model");
let output_name = format!("{}-{}.gguf", stem, quant.to_lowercase());
input_path
.parent()
.unwrap_or(Path::new("."))
.join(output_name)
} else {
PathBuf::from(output)
};
println!(
"{} Output file: {}",
"-->".bright_blue(),
output_path.display()
);
if !input_path.exists() {
return Err(anyhow::anyhow!(
"Input model not found: {}",
input_path.display()
));
}
if output_path.exists() {
println!(
"\n{} Output file already exists. Overwriting...",
"Warning:".yellow().bold()
);
}
let input_metadata = fs::metadata(&input_path)?;
let input_size = input_metadata.len();
println!(
"\n{} Input size: {:.2} MB",
"-->".bright_blue(),
input_size as f64 / (1024.0 * 1024.0)
);
let estimated_output = estimate_output_size(input_size, format);
println!(
"{} Estimated output: {:.2} MB ({:.1}x compression)",
"-->".bright_blue(),
estimated_output as f64 / (1024.0 * 1024.0),
input_size as f64 / estimated_output as f64
);
print_memory_estimates(format);
let config = QuantConfig::default()
.with_format(format)
.with_ane_optimization(ane_optimize)
.with_verbose(verbose);
let mut config = config;
config.keep_embed_fp16 = keep_embed_fp16;
config.keep_output_fp16 = keep_output_fp16;
let is_gguf = input_path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase() == "gguf")
.unwrap_or(false);
println!("\n{} Starting quantization...", "==>".bright_blue().bold());
let start_time = Instant::now();
if is_gguf {
quantize_gguf_model(&input_path, &output_path, config, verbose).await?;
} else {
quantize_model(&input_path, &output_path, config, verbose).await?;
}
let elapsed = start_time.elapsed();
let output_metadata = fs::metadata(&output_path)?;
let output_size = output_metadata.len();
println!("\n{} Quantization complete!", "==>".bright_green().bold());
println!(
" Output size: {:.2} MB",
output_size as f64 / (1024.0 * 1024.0)
);
println!(
" Compression: {:.1}x",
input_size as f64 / output_size as f64
);
println!(" Time: {:.1}s", elapsed.as_secs_f64());
println!(
" Throughput: {:.1} MB/s",
input_size as f64 / (1024.0 * 1024.0) / elapsed.as_secs_f64()
);
println!(
"\n{} Output saved to: {}",
"-->".bright_green(),
output_path.display()
);
println!(
"\n{} To use the quantized model:",
"Tip:".bright_cyan().bold()
);
println!(" ruvllm chat {} -q {}", output_path.display(), quant);
Ok(())
}
fn resolve_model_path(model: &str, cache_dir: &str) -> anyhow::Result<PathBuf> {
let path = PathBuf::from(model);
if path.exists() {
return Ok(path);
}
let cache_path = PathBuf::from(cache_dir).join("models").join(model);
if cache_path.exists() {
return Ok(cache_path);
}
for ext in &["gguf", "safetensors", "bin", "pt"] {
let with_ext = path.with_extension(ext);
if with_ext.exists() {
return Ok(with_ext);
}
let cache_with_ext = cache_path.with_extension(ext);
if cache_with_ext.exists() {
return Ok(cache_with_ext);
}
}
Ok(path)
}
fn estimate_output_size(input_bytes: u64, format: TargetFormat) -> u64 {
let input_elements = input_bytes / 4;
let bits_per_weight = format.bits_per_weight() as f64;
((input_elements as f64 * bits_per_weight) / 8.0) as u64
}
fn print_memory_estimates(format: TargetFormat) {
println!(
"\n{} Memory estimates for {}:",
"-->".bright_blue(),
format.name()
);
let estimate_fn: fn(f64, usize, usize, usize) -> MemoryEstimate = match format {
TargetFormat::Q4_K_M => estimate_memory_q4,
TargetFormat::Q5_K_M => estimate_memory_q5,
TargetFormat::Q8_0 => estimate_memory_q8,
TargetFormat::F16 => |p, v, h, l| {
let mut e = estimate_memory_q8(p, v, h, l);
e.total_bytes *= 2;
e.total_mb *= 2.0;
e
},
TargetFormat::PiQ3 => |p, v, h, l| {
let mut e = estimate_memory_q4(p, v, h, l);
e.total_bytes = (e.total_bytes as f64 * 3.0625 / 4.5) as usize;
e.total_mb = e.total_bytes as f64 / (1024.0 * 1024.0);
e
},
TargetFormat::PiQ2 => |p, v, h, l| {
let mut e = estimate_memory_q4(p, v, h, l);
e.total_bytes = (e.total_bytes as f64 * 2.0625 / 4.5) as usize;
e.total_mb = e.total_bytes as f64 / (1024.0 * 1024.0);
e
},
};
let est_05b = estimate_fn(0.5, 151936, 896, 24);
println!(
" RuvLTRA-Small (0.5B): {:.0} MB ({:.1}x compression)",
est_05b.total_mb, est_05b.compression_ratio
);
let est_1b = estimate_fn(1.0, 151936, 1536, 28);
println!(
" 1B model: {:.0} MB ({:.1}x compression)",
est_1b.total_mb, est_1b.compression_ratio
);
let est_3b = estimate_fn(3.0, 151936, 2048, 36);
println!(
" 3B model: {:.0} MB ({:.1}x compression)",
est_3b.total_mb, est_3b.compression_ratio
);
}
async fn quantize_gguf_model(
input_path: &Path,
output_path: &Path,
config: QuantConfig,
verbose: bool,
) -> anyhow::Result<()> {
let gguf = GgufFile::open_mmap(input_path)?;
println!(
" Architecture: {}",
gguf.architecture().unwrap_or("unknown")
);
println!(" Tensors: {}", gguf.tensors.len());
let total_size: usize = gguf.tensors.iter().map(|t| t.byte_size()).sum();
let pb = ProgressBar::new(total_size as u64);
pb.set_style(
ProgressStyle::default_bar()
.template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")
.unwrap()
.progress_chars("#>-"),
);
let mut quantizer = RuvltraQuantizer::new(config.clone())?;
let output_file = File::create(output_path)?;
let mut writer = BufWriter::new(output_file);
let mut processed = 0usize;
for tensor_info in &gguf.tensors {
if verbose {
pb.set_message(format!("Processing: {}", tensor_info.name));
}
let tensor_data = gguf.load_tensor_f32(&tensor_info.name)?;
let quantized = quantizer.quantize_tensor(&tensor_data, &tensor_info.name)?;
processed += tensor_info.byte_size();
pb.set_position(processed as u64);
}
pb.finish_with_message("Quantization complete");
writer.write_all(&[0u8; 0])?;
let stats = quantizer.stats();
if verbose {
println!("\n Tensors quantized: {}", stats.tensors_quantized);
println!(" Elements processed: {}", stats.elements_processed);
}
Ok(())
}
async fn quantize_model(
input_path: &Path,
output_path: &Path,
config: QuantConfig,
verbose: bool,
) -> anyhow::Result<()> {
let input_size = fs::metadata(input_path)?.len();
let pb = ProgressBar::new(input_size);
pb.set_style(
ProgressStyle::default_bar()
.template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {bytes}/{total_bytes} ({eta})")
.unwrap()
.progress_chars("#>-"),
);
let mut quantizer = RuvltraQuantizer::new(config.clone())?;
pb.set_message("Loading model...");
let extension = input_path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase())
.unwrap_or_default();
match extension.as_str() {
"safetensors" => {
pb.set_message("Processing safetensors format...");
pb.set_position(input_size / 2);
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
pb.set_position(input_size);
}
"bin" | "pt" => {
pb.set_message("Processing PyTorch format...");
pb.set_position(input_size / 2);
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
pb.set_position(input_size);
}
_ => {
pb.set_message("Processing unknown format...");
pb.set_position(input_size);
}
}
pb.finish_with_message("Processing complete");
let output_file = File::create(output_path)?;
let mut writer = BufWriter::new(output_file);
write_gguf_header(&mut writer, &config)?;
if verbose {
let stats = quantizer.stats();
println!(
"\n Quantizer stats: {} tensors, {} elements",
stats.tensors_quantized, stats.elements_processed
);
}
Ok(())
}
fn write_gguf_header<W: Write>(writer: &mut W, config: &QuantConfig) -> anyhow::Result<()> {
writer.write_all(&0x46554747u32.to_le_bytes())?;
writer.write_all(&3u32.to_le_bytes())?;
writer.write_all(&0u64.to_le_bytes())?;
writer.write_all(&1u64.to_le_bytes())?;
let key = "general.quantization_type";
let key_len = key.len() as u64;
writer.write_all(&key_len.to_le_bytes())?;
writer.write_all(key.as_bytes())?;
writer.write_all(&8u32.to_le_bytes())?;
let value = config.format.name();
let value_len = value.len() as u64;
writer.write_all(&value_len.to_le_bytes())?;
writer.write_all(value.as_bytes())?;
Ok(())
}
pub fn print_format_comparison() {
println!(
"\n{} Quantization Format Comparison:",
"==>".bright_blue().bold()
);
println!();
println!(
" {:<10} {:<8} {:<12} {:<12} {:<15}",
"Format", "Bits", "Memory (0.5B)", "Quality", "Use Case"
);
println!(" {}", "-".repeat(60));
println!(
" {:<10} {:<8} {:<12} {:<12} {:<15}",
"Q4_K_M", "4.5", "~300 MB", "Good", "Best tradeoff"
);
println!(
" {:<10} {:<8} {:<12} {:<12} {:<15}",
"Q5_K_M", "5.5", "~375 MB", "Better", "Higher quality"
);
println!(
" {:<10} {:<8} {:<12} {:<12} {:<15}",
"Q8_0", "8.5", "~500 MB", "Best", "Near-lossless"
);
println!(
" {:<10} {:<8} {:<12} {:<12} {:<15}",
"F16", "16", "~1000 MB", "Excellent", "No quant loss"
);
}