use anyhow::Result;
use serde::Serialize;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use weave::{LensContext, ReadMode};
/// Mode names benchmarked by default, i.e. when the caller neither names a
/// single mode nor asks for all modes. Names that `ReadMode::parse` does not
/// recognise are silently dropped.
const HEADLINE_MODES: &[&str] = &["map", "signatures", "aggressive", "entropy"];
/// Quality percentage below which a result is flagged with `⚠` in the report.
/// Best-mode selection also prefers modes that score at or above this value.
const QUALITY_WARN: f64 = 93.0;
/// File extensions (without the dot) that make a file eligible for benchmarking.
const ELIGIBLE_EXTS: &[&str] = &[
"rs", "ts", "tsx", "js", "jsx", "py", "go", "java", "kt", "rb", "php", "swift",
];
/// Directory names that the recursive file scan never descends into.
const SKIP_DIRS: &[&str] = &[
"target",
"node_modules",
".git",
"dist",
"build",
"__pycache__",
".next",
"vendor",
"coverage",
];
/// Outcome of applying one read mode to one file.
#[derive(Serialize)]
pub struct ModeResult {
/// Name of the mode, as reported by `ReadMode::name`.
pub mode: String,
/// Estimated token count of the unmodified file content.
pub tokens_before: usize,
/// Token count reported for the mode's output, clamped to at least 1.
pub tokens_after: usize,
/// Percentage of tokens removed (never negative).
pub savings_pct: f64,
/// Score returned by `quality_score` for the original vs. compressed text.
pub quality_pct: f64,
}
/// Benchmark results for a single file across every mode that was run.
#[derive(Serialize)]
pub struct FileResult {
/// Path relative to the scan target, or the bare file name when the target
/// is the file itself.
pub path: String,
/// Estimated token count of the original file.
pub tokens: usize,
/// One entry per mode, in the order the modes were run.
pub modes: Vec<ModeResult>,
/// Name of the mode chosen as best, or `"–"` when no mode saved anything.
pub best_mode: String,
/// Savings percentage of `best_mode` (0.0 when there is none).
pub best_savings_pct: f64,
}
/// Top-level structure emitted by the JSON output of the benchmark.
#[derive(Serialize)]
pub struct BenchmarkReport {
/// Number of files that were actually benchmarked (length of `results`).
pub files_scanned: usize,
/// Sum of the original token counts of all benchmarked files.
pub total_tokens: usize,
/// Mean of each file's `best_savings_pct`.
pub avg_savings_pct: f64,
/// Mean `quality_pct` over every mode result of every file.
pub avg_quality_pct: f64,
/// Per-file details.
pub results: Vec<FileResult>,
}
/// Runs the compression benchmark over `path` and prints the outcome.
///
/// Eligible files are collected from `path`, each one is passed through the
/// selected read modes, and every result is scored for token savings and
/// quality. The report goes to stdout, either as a table or as pretty-printed
/// JSON (`BenchmarkReport`).
///
/// # Arguments
/// * `path` - File or directory to benchmark.
/// * `mode` - Name of a single mode to run; takes precedence over `all_modes`.
/// * `all_modes` - Run every named mode except `Full` and `Auto`. When neither
///   this nor `mode` is given, only `HEADLINE_MODES` are run.
/// * `json` - Emit JSON instead of the human-readable table.
/// * `min_kb` - Minimum file size in KiB; ignored when `path` is a single file.
///
/// # Errors
/// Returns an error when `mode` is not a recognised mode name or when the
/// report cannot be serialised. Files that cannot be read as text are skipped.
pub fn handle(
    path: String,
    mode: Option<String>,
    all_modes: bool,
    json: bool,
    min_kb: u64,
) -> Result<()> {
    // Files estimated below this many tokens are skipped entirely.
    const MIN_TOKENS: usize = 40;

    forge::budget::estimator::TokenEstimator::warmup();
    let target = Path::new(&path);
    // A file named explicitly is always benchmarked, whatever its size.
    // `saturating_mul` keeps an absurdly large `min_kb` from overflowing.
    let effective_min_bytes = if target.is_file() {
        0
    } else {
        min_kb.saturating_mul(1024)
    };
    let files = collect_files(target, effective_min_bytes);
    if files.is_empty() {
        eprintln!("bctx benchmark: no eligible files found under '{path}'");
        eprintln!(" tip: use --min-kb 0 to include very small files");
        return Ok(());
    }

    let run_modes: Vec<ReadMode> = match &mode {
        Some(m) => {
            let rm = ReadMode::parse(m).ok_or_else(|| {
                anyhow::anyhow!("unknown mode '{m}' — run `bctx modes` to list all modes")
            })?;
            vec![rm]
        }
        None if all_modes => ReadMode::all_named()
            .iter()
            .filter(|m| !matches!(m, ReadMode::Full | ReadMode::Auto))
            .cloned()
            .collect(),
        None => HEADLINE_MODES
            .iter()
            .filter_map(|name| ReadMode::parse(name))
            .collect(),
    };

    let mut report_results: Vec<FileResult> = Vec::with_capacity(files.len());
    for file_path in &files {
        // Best effort: unreadable files are left out of the report.
        let Ok(content) = std::fs::read_to_string(file_path) else {
            continue;
        };
        let tokens_before = forge::budget::estimator::TokenEstimator::count(&content);
        if tokens_before < MIN_TOKENS {
            continue;
        }

        // Show paths relative to the scan target; when the target *is* the
        // file the relative path is empty, so fall back to the file name.
        let rel = {
            let stripped = file_path
                .strip_prefix(target)
                .unwrap_or(file_path)
                .to_string_lossy()
                .to_string();
            if stripped.is_empty() {
                file_path
                    .file_name()
                    .map(|n| n.to_string_lossy().to_string())
                    .unwrap_or_else(|| file_path.to_string_lossy().to_string())
            } else {
                stripped
            }
        };

        let ctx = LensContext::new(tokens_before * 2);
        let mut mode_results: Vec<ModeResult> = Vec::with_capacity(run_modes.len());
        for rm in &run_modes {
            let output = rm.apply(&content, &ctx);
            // Clamp to 1 so an empty output never reports zero tokens.
            let after = output.tokens_after.max(1);
            mode_results.push(ModeResult {
                mode: rm.name(),
                tokens_before,
                tokens_after: after,
                savings_pct: savings_pct(tokens_before, after),
                quality_pct: quality_score(&content, &output.content),
            });
        }

        let (best_mode, best_savings_pct) = pick_best_mode(&mode_results);
        report_results.push(FileResult {
            path: rel,
            tokens: tokens_before,
            modes: mode_results,
            best_mode,
            best_savings_pct,
        });
    }

    if report_results.is_empty() {
        eprintln!("bctx benchmark: all files were too small to benchmark (< {MIN_TOKENS} tokens)");
        return Ok(());
    }

    let total_tokens: usize = report_results.iter().map(|r| r.tokens).sum();
    let avg_savings = report_results
        .iter()
        .map(|r| r.best_savings_pct)
        .sum::<f64>()
        / report_results.len() as f64;
    // Quality is averaged over every (file, mode) pair, not per file.
    let mode_runs: usize = report_results.iter().map(|r| r.modes.len()).sum();
    let avg_quality = report_results
        .iter()
        .flat_map(|r| r.modes.iter().map(|m| m.quality_pct))
        .sum::<f64>()
        / mode_runs.max(1) as f64;

    if json {
        let report = BenchmarkReport {
            files_scanned: report_results.len(),
            total_tokens,
            avg_savings_pct: avg_savings,
            avg_quality_pct: avg_quality,
            results: report_results,
        };
        println!("{}", serde_json::to_string_pretty(&report)?);
        return Ok(());
    }

    print_report(
        &report_results,
        total_tokens,
        avg_savings,
        avg_quality,
        all_modes || mode.is_some(),
        &path,
    );
    Ok(())
}

/// Chooses the mode to report as "best" for one file.
///
/// Only modes with positive savings are considered. Among those, the highest
/// savings at or above `QUALITY_WARN` wins; if none reaches the threshold, the
/// highest savings overall is used. Returns `("–", 0.0)` when nothing saved
/// any tokens.
fn pick_best_mode(modes: &[ModeResult]) -> (String, f64) {
    let mut trusted: Option<&ModeResult> = None;
    let mut any_quality: Option<&ModeResult> = None;
    for candidate in modes.iter().filter(|r| r.savings_pct > 0.0) {
        // `>=` lets the later entry win ties, matching `Iterator::max_by`.
        if any_quality.map_or(true, |best| candidate.savings_pct >= best.savings_pct) {
            any_quality = Some(candidate);
        }
        if candidate.quality_pct >= QUALITY_WARN
            && trusted.map_or(true, |best| candidate.savings_pct >= best.savings_pct)
        {
            trusted = Some(candidate);
        }
    }
    trusted
        .or(any_quality)
        .map(|r| (r.mode.clone(), r.savings_pct))
        .unwrap_or_else(|| ("–".into(), 0.0))
}
/// Prints the human-readable benchmark report to stdout.
///
/// With `detail` set, each file gets its own table listing every mode's
/// before/after token counts, savings and quality. Otherwise a single compact
/// table is printed with one row per file, one savings column per mode, and a
/// closing "average" row followed by usage hints that mention `scan_path`.
///
/// Results whose quality is below `QUALITY_WARN` are marked with `⚠`.
fn print_report(
    results: &[FileResult],
    total_tokens: usize,
    avg_savings: f64,
    avg_quality: f64,
    detail: bool,
    scan_path: &str,
) {
    let n = results.len();
    println!();
    println!(
        " \x1b[1mbctx benchmark\x1b[0m · {} file{} · {} tokens total",
        n,
        if n == 1 { "" } else { "s" },
        fmt_tokens(total_tokens)
    );
    println!();

    if detail {
        let bar = "─".repeat(62);
        for r in results {
            println!(
                " \x1b[38;5;208m{}\x1b[0m ({} tok)",
                r.path,
                fmt_tokens(r.tokens)
            );
            println!(" {bar}");
            println!(
                " {:12} {:>8} {:>8} {:>7} {:>8}",
                "MODE", "BEFORE", "AFTER", "SAVED", "QUALITY"
            );
            println!(" {bar}");
            for m in &r.modes {
                let warn = if m.quality_pct < QUALITY_WARN {
                    " ⚠"
                } else {
                    " "
                };
                println!(
                    " {:12} {:>8} {:>8} {:>6.0}% {:>6.0}%{}",
                    m.mode,
                    fmt_tokens(m.tokens_before),
                    fmt_tokens(m.tokens_after),
                    m.savings_pct,
                    m.quality_pct,
                    warn
                );
            }
            println!(" {bar}");
            println!(
                " best: \x1b[32m{} → {:.0}% saved\x1b[0m",
                r.best_mode, r.best_savings_pct
            );
            println!();
        }
    } else {
        // Column headers come from the first file; every file is run through
        // the same list of modes.
        let mode_names: Vec<&str> = results
            .first()
            .map(|r| r.modes.iter().map(|m| m.mode.as_str()).collect())
            .unwrap_or_default();
        let col_w = 8usize;
        // Measure in characters, not bytes: format widths pad by character.
        let path_w = results
            .iter()
            .map(|r| r.path.chars().count())
            .max()
            .unwrap_or(20)
            .clamp(20, 42);

        print!(" {:path_w$}", "FILE");
        for name in &mode_names {
            let short = name.chars().take(6).collect::<String>();
            print!(" {:>col_w$}", short.to_uppercase());
        }
        println!(" {:>col_w$} QUALITY", "BEST");
        let sep = "─".repeat(path_w + mode_names.len() * (col_w + 2) + col_w * 2 + 12);
        println!(" {sep}");

        for r in results {
            let short_path = shorten_path_left(&r.path, path_w);
            print!(" {:path_w$}", short_path);
            for m in &r.modes {
                let warn = if m.quality_pct < QUALITY_WARN {
                    "⚠"
                } else {
                    ""
                };
                print!(" {:>col_w$}", format!("{:.0}%{}", m.savings_pct, warn));
            }
            let file_quality =
                r.modes.iter().map(|m| m.quality_pct).sum::<f64>() / r.modes.len().max(1) as f64;
            println!(
                " {:>col_w$} {:>5.0}%",
                format!("{} {:.0}%", r.best_mode, r.best_savings_pct),
                file_quality
            );
        }

        println!(" {sep}");
        print!(" {:path_w$}", "average");
        for i in 0..mode_names.len() {
            // Checked access: a file with fewer mode entries is left out of
            // that column's average instead of causing an index panic.
            let column: Vec<f64> = results
                .iter()
                .filter_map(|r| r.modes.get(i))
                .map(|m| m.savings_pct)
                .collect();
            let avg = column.iter().sum::<f64>() / column.len().max(1) as f64;
            print!(" {:>col_w$}", format!("{avg:.0}%"));
        }
        println!(
            " {:>col_w$} {:>5.0}%",
            format!("{avg_savings:.0}%"),
            avg_quality
        );
        println!();
        if avg_quality < QUALITY_WARN {
            println!(" ⚠ Some results below quality threshold ({QUALITY_WARN:.0}%). Run with --all-modes for detail.");
        }
        println!(" Run \x1b[38;5;208mbctx benchmark {scan_path} --all-modes\x1b[0m for per-mode breakdown.");
        println!(" Run \x1b[38;5;208mbctx benchmark {scan_path} --json\x1b[0m for machine-readable output.");
    }
    println!();
}

/// Shortens `path` to at most `width` characters by keeping its tail and
/// prefixing `…`. Works on characters, so multi-byte paths cannot cause a
/// slice-boundary panic.
fn shorten_path_left(path: &str, width: usize) -> String {
    let len = path.chars().count();
    if len <= width {
        return path.to_string();
    }
    // One column is taken by the ellipsis itself.
    let keep = width.saturating_sub(1);
    let tail: String = path.chars().skip(len - keep).collect();
    format!("…{tail}")
}
/// Estimates, as a percentage, how much of `original` is still recognisable in
/// `compressed`.
///
/// The score blends two ratios:
/// * 70%: share of the original's distinct identifiers that also occur in the
///   compressed text;
/// * 30%: share of the original's non-blank lines that contain at least one
///   identifier occurring in the compressed text.
///
/// An original without any identifiers scores 100.
pub fn quality_score(original: &str, compressed: &str) -> f64 {
    let source_ids = extract_identifiers(original);
    if source_ids.is_empty() {
        return 100.0;
    }
    let kept_ids = extract_identifiers(compressed);

    let surviving = source_ids
        .iter()
        .filter(|id| kept_ids.contains(*id))
        .count();
    let id_coverage = surviving as f64 / source_ids.len() as f64;

    // Walk the non-blank lines once, counting how many are still "represented".
    let mut total_lines = 0usize;
    let mut covered_lines = 0usize;
    for line in original.lines() {
        if line.trim().is_empty() {
            continue;
        }
        total_lines += 1;
        let line_ids = extract_identifiers(line);
        if line_ids.iter().any(|tok| kept_ids.contains(*tok)) {
            covered_lines += 1;
        }
    }
    let struct_density = match total_lines {
        0 => 1.0,
        count => covered_lines as f64 / count as f64,
    };

    (id_coverage * 0.70 + struct_density * 0.30) * 100.0
}
/// Collects the distinct identifier-like tokens of `text`.
///
/// A token is a maximal run of ASCII letters, ASCII digits and `_`. Tokens
/// shorter than three bytes and tokens made only of digits are discarded.
/// Any other character, including all non-ASCII ones, acts as a separator.
fn extract_identifiers(text: &str) -> HashSet<&str> {
    text.split(|c: char| !(c.is_ascii_alphanumeric() || c == '_'))
        .filter(|tok| tok.len() >= 3 && !is_all_digits(tok))
        .collect()
}

/// Returns `true` when `s` contains nothing but ASCII digits (vacuously true
/// for the empty string).
fn is_all_digits(s: &str) -> bool {
    !s.bytes().any(|b| !b.is_ascii_digit())
}
/// Percentage of tokens removed when going from `before` to `after`.
///
/// Yields 0.0 when `before` is zero or when `after` is not smaller than
/// `before`; the result is never negative.
fn savings_pct(before: usize, after: usize) -> f64 {
    match before {
        0 => 0.0,
        total => {
            let saved = total.saturating_sub(after);
            (saved as f64 / total as f64 * 100.0).max(0.0)
        }
    }
}
/// Formats a token count compactly: plain digits below 1 000, one decimal with
/// a `K` suffix below a million, and one decimal with an `M` suffix above.
///
/// Counts just under a million would round up to `1000.0K`; those are shown as
/// `1.0M` so the unit always matches the magnitude that is printed.
fn fmt_tokens(n: usize) -> String {
    if n >= 1_000_000 {
        format!("{:.1}M", n as f64 / 1_000_000.0)
    } else if n >= 1_000 {
        let thousands = format!("{:.1}", n as f64 / 1_000.0);
        // Decide the unit after rounding, so 999_999 is not shown as "1000.0K".
        if thousands == "1000.0" {
            "1.0M".to_string()
        } else {
            format!("{thousands}K")
        }
    } else {
        n.to_string()
    }
}
/// Gathers every benchmarkable file reachable from `root` and returns the
/// paths in sorted order. `root` may be a directory or a single file; files
/// smaller than `min_bytes` are left out.
fn collect_files(root: &Path, min_bytes: u64) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = Vec::new();
    collect_recursive(root, min_bytes, &mut found);
    found.sort();
    found
}
/// Appends to `out` every eligible file found under `dir`.
///
/// `dir` is normally a directory, walked depth-first while skipping any
/// sub-directory named in `SKIP_DIRS`. If it cannot be listed but is a regular
/// file, that file itself is considered. Unreadable directory entries are
/// ignored.
fn collect_recursive(dir: &Path, min_bytes: u64, out: &mut Vec<PathBuf>) {
    let Ok(entries) = std::fs::read_dir(dir) else {
        // Not a listable directory: the caller may have passed a single file.
        if dir.is_file() && is_benchmarkable(dir, min_bytes) {
            out.push(dir.to_path_buf());
        }
        return;
    };
    for entry in entries.flatten() {
        let path = entry.path();
        if path.is_dir() {
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if !SKIP_DIRS.contains(&name) {
                collect_recursive(&path, min_bytes, out);
            }
        } else if path.is_file() && is_benchmarkable(&path, min_bytes) {
            out.push(path);
        }
    }
}

/// Returns `true` when `file` has one of the `ELIGIBLE_EXTS` extensions and is
/// at least `min_bytes` long.
///
/// The size comes from `std::fs::metadata`, which follows symlinks, so a
/// symlinked source file is measured by its target rather than by the link.
/// A file whose metadata cannot be read counts as zero bytes.
fn is_benchmarkable(file: &Path, min_bytes: u64) -> bool {
    let ext = file.extension().and_then(|e| e.to_str());
    if !matches!(ext, Some(e) if ELIGIBLE_EXTS.contains(&e)) {
        return false;
    }
    let size = std::fs::metadata(file).map(|m| m.len()).unwrap_or(0);
    size >= min_bytes
}
// Unit tests for the scoring helpers, the formatting helpers, the file
// collector and the `handle` entry point. The filesystem tests rely on the
// `tempfile` crate for scratch directories.
#[cfg(test)]
mod tests {
use super::*;
// Scoring a text against itself must give the maximum score.
#[test]
fn quality_identical_content_is_100() {
let code = "fn authenticate(user: &str, password: &str) -> bool { true }";
assert_eq!(quality_score(code, code) as u64, 100);
}
// An empty compressed text preserves no identifiers and no lines.
#[test]
fn quality_empty_compressed_is_low() {
let code = "fn authenticate(user: &str, token: &str) -> Result<Session> { Ok(()) }";
let q = quality_score(code, "");
assert!(q < 10.0, "empty compressed should score very low, got {q}");
}
// Keeping only the signature drops the body's identifiers, so the score
// should land between the two extremes.
#[test]
fn quality_partial_compression_is_mid_range() {
let original = "fn process_request(req: HttpRequest, db: Database) -> Response {
let user = db.find_user(&req.user_id);
let session = Session::new(user);
Response::ok(session)
}";
let compressed = "fn process_request(req: HttpRequest, db: Database) -> Response";
let q = quality_score(original, compressed);
assert!(
q > 40.0 && q < 90.0,
"partial compression should be mid-range, got {q}"
);
}
// Underscores are part of an identifier; mixed-case names are kept whole.
#[test]
fn extract_identifiers_finds_snake_case() {
let ids = extract_identifiers("fn user_profile(id: UserId) -> Profile");
assert!(ids.contains("user_profile"));
assert!(ids.contains("UserId"));
assert!(ids.contains("Profile"));
}
// Tokens shorter than three bytes are ignored.
#[test]
fn extract_identifiers_skips_short_tokens() {
let ids = extract_identifiers("fn do(id: u8) -> Ok");
assert!(!ids.contains("do"));
assert!(!ids.contains("id"));
assert!(!ids.contains("u8"));
}
// Purely numeric tokens are ignored regardless of length.
#[test]
fn extract_identifiers_skips_pure_numbers() {
let ids = extract_identifiers("timeout = 3000 retries = 5");
assert!(!ids.contains("3000"));
assert!(!ids.contains("5"));
}
// Covers a normal reduction, no reduction, and the zero-token guard.
#[test]
fn savings_pct_correct() {
assert_eq!(savings_pct(1000, 200) as u64, 80);
assert_eq!(savings_pct(1000, 1000) as u64, 0);
assert_eq!(savings_pct(0, 0) as u64, 0);
}
// One sample per unit: plain, K and M.
#[test]
fn fmt_tokens_formats_correctly() {
assert_eq!(fmt_tokens(500), "500");
assert_eq!(fmt_tokens(1500), "1.5K");
assert_eq!(fmt_tokens(1_200_000), "1.2M");
}
// Files inside a directory listed in SKIP_DIRS must not be collected.
#[test]
fn collect_files_skips_target_dir() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
std::fs::create_dir(dir.path().join("target")).unwrap();
std::fs::write(dir.path().join("target").join("lib.rs"), "fn main() {}").unwrap();
std::fs::write(dir.path().join("lib.rs"), "fn main() {}").unwrap();
let files = collect_files(dir.path(), 0);
assert_eq!(files.len(), 1);
assert!(files[0].ends_with("lib.rs"));
}
// Only the file at or above the 30-byte minimum is returned.
#[test]
fn collect_files_respects_min_bytes() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
std::fs::write(dir.path().join("small.rs"), "fn f(){}").unwrap();
std::fs::write(
dir.path().join("large.rs"),
"fn function_with_more_content() { let x = 1; let y = 2; x + y }",
)
.unwrap();
let files = collect_files(dir.path(), 30);
assert_eq!(files.len(), 1);
assert!(files[0].ends_with("large.rs"));
}
// Passing a file instead of a directory yields that single file.
#[test]
fn collect_files_handles_single_file_path() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
let file = dir.path().join("auth.rs");
std::fs::write(
&file,
"pub fn authenticate(token: &str) -> bool { !token.is_empty() }",
)
.unwrap();
let files = collect_files(&file, 0);
assert_eq!(files.len(), 1);
}
// Smoke test: default (headline) modes with table output.
#[test]
fn handle_on_temp_dir_succeeds() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
std::fs::write(
dir.path().join("lib.rs"),
"pub fn authenticate(token: &str) -> bool {\n !token.is_empty()\n}\n\npub fn validate_expiry(exp: u64, now: u64) -> bool {\n now < exp\n}\n",
)
.unwrap();
let path = dir.path().to_string_lossy().to_string();
assert!(handle(path.clone(), None, false, false, 0).is_ok());
}
// Smoke test: JSON output.
#[test]
fn handle_json_on_temp_dir_succeeds() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
std::fs::write(
dir.path().join("server.rs"),
"pub fn start(port: u16) -> std::io::Result<()> {\n println!(\"listening on {port}\");\n Ok(())\n}\n",
)
.unwrap();
let path = dir.path().to_string_lossy().to_string();
assert!(handle(path, None, false, true, 0).is_ok());
}
// Smoke test: every named mode, which selects the detailed table.
#[test]
fn handle_all_modes_on_temp_dir_succeeds() {
use tempfile::tempdir;
let dir = tempdir().unwrap();
std::fs::write(
dir.path().join("routes.rs"),
"pub fn register(app: &mut App) {\n app.route(\"/health\", get(health_handler));\n app.route(\"/users\", get(list_users));\n}\n",
)
.unwrap();
let path = dir.path().to_string_lossy().to_string();
assert!(handle(path, None, true, false, 0).is_ok());
}
}