1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::time::Instant;
4
5use walkdir::WalkDir;
6
7use crate::core::compressor;
8use crate::core::deps;
9use crate::core::entropy;
10use crate::core::preservation;
11use crate::core::signatures;
12use crate::core::tokens::count_tokens;
13
/// Per-token input cost in USD, derived from the default per-million price.
const COST_PER_TOKEN: f64 = crate::core::stats::DEFAULT_INPUT_PRICE_PER_M / 1_000_000.0;
/// Files larger than this many bytes are skipped during the scan.
const MAX_FILE_SIZE: u64 = 100 * 1024;
/// Upper bound on the number of files measured per benchmark run.
const MAX_FILES: usize = 50;
/// Fixed token cost assumed for a cache-hit re-read of a file.
const CACHE_HIT_TOKENS: usize = 13;
18
/// Result of compressing a single file with one compression mode.
#[derive(Debug, Clone)]
pub struct ModeMeasurement {
    /// Mode name ("map", "signatures", "aggressive", "entropy", "cache_hit").
    pub mode: String,
    /// Token count of the compressed output.
    pub tokens: usize,
    /// Percentage saved relative to the file's raw token count.
    pub savings_pct: f64,
    /// Wall-clock time the compression took, in microseconds.
    pub latency_us: u64,
    /// Quality score from preservation::measure; -1.0 is the N/A sentinel
    /// (used for cache_hit, which has no content to compare).
    pub preservation_score: f64,
}
29
/// Per-file benchmark result across all compression modes.
#[derive(Debug, Clone)]
pub struct FileMeasurement {
    /// Path relative to the project root (falls back to the full path when
    /// strip_prefix fails).
    #[allow(dead_code)]
    pub path: String,
    /// File extension, without the leading dot.
    pub ext: String,
    /// Token count of the unmodified file content.
    pub raw_tokens: usize,
    /// One measurement per compression mode.
    pub modes: Vec<ModeMeasurement>,
}
38
/// File count and token volume for one file extension.
#[derive(Debug, Clone)]
pub struct LanguageStats {
    /// File extension, without the leading dot.
    pub ext: String,
    /// Number of measured files with this extension.
    pub count: usize,
    /// Sum of raw token counts across those files.
    pub total_tokens: usize,
}
45
/// Aggregate statistics for one compression mode across all measured files.
#[derive(Debug, Clone)]
pub struct ModeSummary {
    /// Mode name ("map", "signatures", "aggressive", "entropy", "cache_hit").
    pub mode: String,
    /// Sum of compressed token counts across files.
    pub total_compressed_tokens: usize,
    /// Unweighted mean of per-file savings percentages.
    pub avg_savings_pct: f64,
    /// Mean compression latency in microseconds (integer average).
    pub avg_latency_us: u64,
    /// Mean preservation score over files with a valid score;
    /// -1.0 when no file had one.
    pub avg_preservation: f64,
}
54
/// Token and cost totals for the simulated coding session.
#[derive(Debug, Clone)]
pub struct SessionSimResult {
    /// Session total with no compression.
    pub raw_tokens: usize,
    /// Session total using lean-ctx compression without CCP.
    pub lean_tokens: usize,
    /// Session total using lean-ctx compression plus CCP resume.
    pub lean_ccp_tokens: usize,
    /// USD cost of the raw session (tokens x COST_PER_TOKEN).
    pub raw_cost: f64,
    /// USD cost of the lean session.
    pub lean_cost: f64,
    /// USD cost of the lean + CCP session.
    pub ccp_cost: f64,
}
64
/// Complete benchmark result for one project directory.
#[derive(Debug, Clone)]
pub struct ProjectBenchmark {
    /// Root directory that was scanned.
    pub root: String,
    /// Number of candidate files selected by the scan.
    pub files_scanned: usize,
    /// Number of files that were successfully read and measured.
    pub files_measured: usize,
    /// Sum of raw token counts across measured files.
    pub total_raw_tokens: usize,
    /// Per-extension breakdown, largest token volume first.
    pub languages: Vec<LanguageStats>,
    /// Aggregate statistics per compression mode.
    pub mode_summaries: Vec<ModeSummary>,
    /// Simulated coding-session comparison (raw vs lean vs lean+CCP).
    pub session_sim: SessionSimResult,
    /// Per-file detail, retained for downstream consumers.
    #[allow(dead_code)]
    pub file_results: Vec<FileMeasurement>,
}
77
/// True for directory names that must be excluded from the project scan:
/// build output, VCS metadata, dependency caches, and similar non-source trees.
fn is_skipped_dir(name: &str) -> bool {
    const SKIPPED: &[&str] = &[
        "node_modules",
        ".git",
        "target",
        "dist",
        "build",
        ".next",
        ".nuxt",
        "__pycache__",
        ".cache",
        "coverage",
        "vendor",
        ".svn",
        ".hg",
    ];
    SKIPPED.contains(&name)
}
98
/// True when `ext` (without the leading dot) is a source/text extension the
/// benchmark knows how to tokenize and compress.
fn is_text_ext(ext: &str) -> bool {
    const TEXT_EXTS: &[&str] = &[
        // Systems / application languages
        "rs", "c", "cpp", "h", "hpp", "go", "java", "cs", "kt", "swift", "zig",
        "scala", "dart",
        // Web / scripting
        "ts", "tsx", "js", "jsx", "py", "rb", "php", "vue", "svelte", "lua",
        "ex", "exs", "r", "R",
        // Markup / styles
        "html", "css", "scss", "less", "xml", "md", "txt",
        // Config / data
        "json", "yaml", "yml", "toml",
        // Shells
        "sh", "bash", "zsh", "fish",
        // Query / schema
        "sql", "graphql", "proto",
    ];
    TEXT_EXTS.contains(&ext)
}
148
149fn scan_project(root: &str) -> Vec<PathBuf> {
150 let mut files: Vec<(PathBuf, u64)> = Vec::new();
151
152 for entry in WalkDir::new(root)
153 .max_depth(8)
154 .into_iter()
155 .filter_entry(|e| {
156 let name = e.file_name().to_string_lossy();
157 if e.file_type().is_dir() {
158 if e.depth() > 0 && name.starts_with('.') {
159 return false;
160 }
161 return !is_skipped_dir(&name);
162 }
163 true
164 })
165 {
166 let entry = match entry {
167 Ok(e) => e,
168 Err(_) => continue,
169 };
170
171 if entry.file_type().is_dir() {
172 continue;
173 }
174
175 let path = entry.path().to_path_buf();
176 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
177
178 if !is_text_ext(ext) {
179 continue;
180 }
181
182 let size = entry.metadata().map(|m| m.len()).unwrap_or(0);
183 if size == 0 || size > MAX_FILE_SIZE {
184 continue;
185 }
186
187 files.push((path, size));
188 }
189
190 files.sort_by_key(|x| std::cmp::Reverse(x.1));
191
192 let mut selected = Vec::new();
193 let mut ext_counts: HashMap<String, usize> = HashMap::new();
194
195 for (path, _size) in &files {
196 if selected.len() >= MAX_FILES {
197 break;
198 }
199 let ext = path
200 .extension()
201 .and_then(|e| e.to_str())
202 .unwrap_or("")
203 .to_string();
204 let count = ext_counts.entry(ext.clone()).or_insert(0);
205 if *count < 10 {
206 *count += 1;
207 selected.push(path.clone());
208 }
209 }
210
211 selected
212}
213
214fn measure_mode(content: &str, ext: &str, mode: &str, raw_tokens: usize) -> ModeMeasurement {
217 let start = Instant::now();
218
219 let compressed = match mode {
220 "map" => {
221 let sigs = signatures::extract_signatures(content, ext);
222 let dep_info = deps::extract_deps(content, ext);
223 let mut parts = Vec::new();
224 if !dep_info.imports.is_empty() {
225 parts.push(format!("deps: {}", dep_info.imports.join(", ")));
226 }
227 if !dep_info.exports.is_empty() {
228 parts.push(format!("exports: {}", dep_info.exports.join(", ")));
229 }
230 let key_sigs: Vec<String> = sigs
231 .iter()
232 .filter(|s| s.is_exported || s.indent == 0)
233 .map(|s| s.to_compact())
234 .collect();
235 if !key_sigs.is_empty() {
236 parts.push(key_sigs.join("\n"));
237 }
238 parts.join("\n")
239 }
240 "signatures" => {
241 let sigs = signatures::extract_signatures(content, ext);
242 sigs.iter()
243 .map(|s| s.to_compact())
244 .collect::<Vec<_>>()
245 .join("\n")
246 }
247 "aggressive" => compressor::aggressive_compress(content, Some(ext)),
248 "entropy" => entropy::entropy_compress(content).output,
249 "cache_hit" => "cached re-read ~13tok".to_string(),
250 _ => content.to_string(),
251 };
252
253 let latency = start.elapsed();
254 let tokens = if mode == "cache_hit" {
255 CACHE_HIT_TOKENS
256 } else {
257 count_tokens(&compressed)
258 };
259
260 let savings_pct = if raw_tokens > 0 {
261 (1.0 - tokens as f64 / raw_tokens as f64) * 100.0
262 } else {
263 0.0
264 };
265
266 let preservation_score = if mode == "cache_hit" {
267 -1.0
268 } else {
269 preservation::measure(content, &compressed, ext).overall()
270 };
271
272 ModeMeasurement {
273 mode: mode.to_string(),
274 tokens,
275 savings_pct,
276 latency_us: latency.as_micros() as u64,
277 preservation_score,
278 }
279}
280
281fn measure_file(path: &Path, root: &str) -> Option<FileMeasurement> {
282 let content = std::fs::read_to_string(path).ok()?;
283 if content.is_empty() {
284 return None;
285 }
286
287 let ext = path
288 .extension()
289 .and_then(|e| e.to_str())
290 .unwrap_or("")
291 .to_string();
292
293 let raw_tokens = count_tokens(&content);
294 if raw_tokens == 0 {
295 return None;
296 }
297
298 let modes = ["map", "signatures", "aggressive", "entropy", "cache_hit"];
299 let measurements: Vec<ModeMeasurement> = modes
300 .iter()
301 .map(|m| measure_mode(&content, &ext, m, raw_tokens))
302 .collect();
303
304 let display_path = path
305 .strip_prefix(root)
306 .unwrap_or(path)
307 .to_string_lossy()
308 .to_string();
309
310 Some(FileMeasurement {
311 path: display_path,
312 ext,
313 raw_tokens,
314 modes: measurements,
315 })
316}
317
318fn aggregate_languages(files: &[FileMeasurement]) -> Vec<LanguageStats> {
321 let mut map: HashMap<String, (usize, usize)> = HashMap::new();
322 for f in files {
323 let entry = map.entry(f.ext.clone()).or_insert((0, 0));
324 entry.0 += 1;
325 entry.1 += f.raw_tokens;
326 }
327 let mut stats: Vec<LanguageStats> = map
328 .into_iter()
329 .map(|(ext, (count, total_tokens))| LanguageStats {
330 ext,
331 count,
332 total_tokens,
333 })
334 .collect();
335 stats.sort_by_key(|x| std::cmp::Reverse(x.total_tokens));
336 stats
337}
338
339fn aggregate_modes(files: &[FileMeasurement]) -> Vec<ModeSummary> {
340 let mode_names = ["map", "signatures", "aggressive", "entropy", "cache_hit"];
341 let mut summaries = Vec::new();
342
343 for mode_name in &mode_names {
344 let mut total_tokens = 0usize;
345 let mut total_savings = 0.0f64;
346 let mut total_latency = 0u64;
347 let mut total_preservation = 0.0f64;
348 let mut preservation_count = 0usize;
349 let mut count = 0usize;
350
351 for f in files {
352 if let Some(m) = f.modes.iter().find(|m| m.mode == *mode_name) {
353 total_tokens += m.tokens;
354 total_savings += m.savings_pct;
355 total_latency += m.latency_us;
356 if m.preservation_score >= 0.0 {
357 total_preservation += m.preservation_score;
358 preservation_count += 1;
359 }
360 count += 1;
361 }
362 }
363
364 if count == 0 {
365 continue;
366 }
367
368 summaries.push(ModeSummary {
369 mode: mode_name.to_string(),
370 total_compressed_tokens: total_tokens,
371 avg_savings_pct: total_savings / count as f64,
372 avg_latency_us: total_latency / count as u64,
373 avg_preservation: if preservation_count > 0 {
374 total_preservation / preservation_count as f64
375 } else {
376 -1.0
377 },
378 });
379 }
380
381 summaries
382}
383
384fn simulate_session(files: &[FileMeasurement]) -> SessionSimResult {
387 if files.is_empty() {
388 return SessionSimResult {
389 raw_tokens: 0,
390 lean_tokens: 0,
391 lean_ccp_tokens: 0,
392 raw_cost: 0.0,
393 lean_cost: 0.0,
394 ccp_cost: 0.0,
395 };
396 }
397
398 let file_count = files.len().min(15);
399 let selected = &files[..file_count];
400
401 let first_read_raw: usize = selected.iter().map(|f| f.raw_tokens).sum();
402
403 let first_read_lean: usize = selected
404 .iter()
405 .enumerate()
406 .map(|(i, f)| {
407 let mode = if i % 3 == 0 { "aggressive" } else { "map" };
408 f.modes
409 .iter()
410 .find(|m| m.mode == mode)
411 .map(|m| m.tokens)
412 .unwrap_or(f.raw_tokens)
413 })
414 .sum();
415
416 let cache_reread_count = 10usize.min(file_count);
417 let cache_raw: usize = selected[..cache_reread_count]
418 .iter()
419 .map(|f| f.raw_tokens)
420 .sum();
421 let cache_lean: usize = cache_reread_count * CACHE_HIT_TOKENS;
422
423 let shell_count = 8usize;
424 let shell_raw = shell_count * 500;
425 let shell_lean = shell_count * 200;
426
427 let resume_raw: usize = selected.iter().map(|f| f.raw_tokens).sum();
428 let resume_lean: usize = selected
429 .iter()
430 .map(|f| {
431 f.modes
432 .iter()
433 .find(|m| m.mode == "map")
434 .map(|m| m.tokens)
435 .unwrap_or(f.raw_tokens)
436 })
437 .sum();
438 let resume_ccp = 400usize;
439
440 let raw_total = first_read_raw + cache_raw + shell_raw + resume_raw;
441 let lean_total = first_read_lean + cache_lean + shell_lean + resume_lean;
442 let ccp_total = first_read_lean + cache_lean + shell_lean + resume_ccp;
443
444 SessionSimResult {
445 raw_tokens: raw_total,
446 lean_tokens: lean_total,
447 lean_ccp_tokens: ccp_total,
448 raw_cost: raw_total as f64 * COST_PER_TOKEN,
449 lean_cost: lean_total as f64 * COST_PER_TOKEN,
450 ccp_cost: ccp_total as f64 * COST_PER_TOKEN,
451 }
452}
453
454pub fn run_project_benchmark(path: &str) -> ProjectBenchmark {
457 let root = if path.is_empty() { "." } else { path };
458 let scanned = scan_project(root);
459 let files_scanned = scanned.len();
460
461 let file_results: Vec<FileMeasurement> = scanned
462 .iter()
463 .filter_map(|p| measure_file(p, root))
464 .collect();
465
466 let total_raw_tokens: usize = file_results.iter().map(|f| f.raw_tokens).sum();
467 let languages = aggregate_languages(&file_results);
468 let mode_summaries = aggregate_modes(&file_results);
469 let session_sim = simulate_session(&file_results);
470
471 ProjectBenchmark {
472 root: root.to_string(),
473 files_scanned,
474 files_measured: file_results.len(),
475 total_raw_tokens,
476 languages,
477 mode_summaries,
478 session_sim,
479 file_results,
480 }
481}
482
/// Render the benchmark as a fixed-width report for terminal display.
pub fn format_terminal(b: &ProjectBenchmark) -> String {
    let mut out = Vec::new();
    // Heavy double-line rule (U+2550) frames the report.
    let sep = "\u{2550}".repeat(66);

    out.push(sep.to_string());
    out.push(format!(" lean-ctx Benchmark — {}", b.root));
    out.push(sep.to_string());

    // Header: measured-file count with a short top-5 language breakdown.
    let lang_summary: Vec<String> = b
        .languages
        .iter()
        .take(5)
        .map(|l| format!("{} {}", l.count, l.ext))
        .collect();
    out.push(format!(
        " Scanned: {} files ({})",
        b.files_measured,
        lang_summary.join(", ")
    ));
    out.push(format!(
        " Total raw tokens: {}",
        format_num(b.total_raw_tokens)
    ));
    out.push(String::new());

    // Per-mode performance table.
    out.push(" Mode Performance:".to_string());
    out.push(format!(
        " {:<14} {:>10} {:>10} {:>10} {:>10}",
        "Mode", "Tokens", "Savings", "Latency", "Quality"
    ));
    out.push(format!(" {}", "\u{2500}".repeat(58)));

    for m in &b.mode_summaries {
        // Negative preservation is the N/A sentinel (e.g. cache_hit mode).
        let qual = if m.avg_preservation < 0.0 {
            "N/A".to_string()
        } else {
            format!("{:.1}%", m.avg_preservation * 100.0)
        };
        // Switch to milliseconds once latency exceeds 1000µs.
        let latency = if m.avg_latency_us > 1000 {
            format!("{:.1}ms", m.avg_latency_us as f64 / 1000.0)
        } else {
            format!("{}μs", m.avg_latency_us)
        };
        out.push(format!(
            " {:<14} {:>10} {:>9.1}% {:>10} {:>10}",
            m.mode,
            format_num(m.total_compressed_tokens),
            m.avg_savings_pct,
            latency,
            qual,
        ));
    }

    // Session-simulation comparison table.
    out.push(String::new());
    out.push(" Session Simulation (30-min coding):".to_string());
    out.push(format!(
        " {:<24} {:>10} {:>10} {:>10}",
        "Approach", "Tokens", "Cost", "Savings"
    ));
    out.push(format!(" {}", "\u{2500}".repeat(58)));

    let s = &b.session_sim;
    // Raw row has no savings figure — em dash placeholder.
    out.push(format!(
        " {:<24} {:>10} {:>10} {:>10}",
        "Raw (no compression)",
        format_num(s.raw_tokens),
        format!("${:.3}", s.raw_cost),
        "\u{2014}",
    ));

    // Savings percentages are relative to the raw session total.
    let lean_pct = if s.raw_tokens > 0 {
        (1.0 - s.lean_tokens as f64 / s.raw_tokens as f64) * 100.0
    } else {
        0.0
    };
    out.push(format!(
        " {:<24} {:>10} {:>10} {:>9.1}%",
        "lean-ctx (no CCP)",
        format_num(s.lean_tokens),
        format!("${:.3}", s.lean_cost),
        lean_pct,
    ));

    let ccp_pct = if s.raw_tokens > 0 {
        (1.0 - s.lean_ccp_tokens as f64 / s.raw_tokens as f64) * 100.0
    } else {
        0.0
    };
    out.push(format!(
        " {:<24} {:>10} {:>10} {:>9.1}%",
        "lean-ctx + CCP",
        format_num(s.lean_ccp_tokens),
        format!("${:.3}", s.ccp_cost),
        ccp_pct,
    ));

    out.push(sep.to_string());
    out.join("\n")
}
584
/// Render the benchmark as a Markdown report (header, three tables, footer).
pub fn format_markdown(b: &ProjectBenchmark) -> String {
    let mut out = Vec::new();

    out.push("# lean-ctx Benchmark Report".to_string());
    out.push(String::new());
    out.push(format!("**Project:** `{}`", b.root));
    out.push(format!("**Files measured:** {}", b.files_measured));
    out.push(format!(
        "**Total raw tokens:** {}",
        format_num(b.total_raw_tokens)
    ));
    out.push(String::new());

    // Language table, one row per extension.
    out.push("## Languages".to_string());
    out.push(String::new());
    out.push("| Extension | Files | Tokens |".to_string());
    out.push("|-----------|------:|-------:|".to_string());
    for l in &b.languages {
        out.push(format!(
            "| {} | {} | {} |",
            l.ext,
            l.count,
            format_num(l.total_tokens)
        ));
    }
    out.push(String::new());

    // Per-mode performance table.
    out.push("## Mode Performance".to_string());
    out.push(String::new());
    out.push("| Mode | Tokens | Savings | Latency | Quality |".to_string());
    out.push("|------|-------:|--------:|--------:|--------:|".to_string());
    for m in &b.mode_summaries {
        // Negative preservation is the N/A sentinel (e.g. cache_hit mode).
        let qual = if m.avg_preservation < 0.0 {
            "N/A".to_string()
        } else {
            format!("{:.1}%", m.avg_preservation * 100.0)
        };
        // Switch to milliseconds once latency exceeds 1000µs.
        let latency = if m.avg_latency_us > 1000 {
            format!("{:.1}ms", m.avg_latency_us as f64 / 1000.0)
        } else {
            format!("{}μs", m.avg_latency_us)
        };
        out.push(format!(
            "| {} | {} | {:.1}% | {} | {} |",
            m.mode,
            format_num(m.total_compressed_tokens),
            m.avg_savings_pct,
            latency,
            qual
        ));
    }
    out.push(String::new());

    // Session-simulation comparison table.
    out.push("## Session Simulation (30-min coding)".to_string());
    out.push(String::new());
    out.push("| Approach | Tokens | Cost | Savings |".to_string());
    out.push("|----------|-------:|-----:|--------:|".to_string());

    let s = &b.session_sim;
    out.push(format!(
        "| Raw (no compression) | {} | ${:.3} | — |",
        format_num(s.raw_tokens),
        s.raw_cost
    ));

    // Savings percentages are relative to the raw session total.
    let lean_pct = if s.raw_tokens > 0 {
        (1.0 - s.lean_tokens as f64 / s.raw_tokens as f64) * 100.0
    } else {
        0.0
    };
    out.push(format!(
        "| lean-ctx (no CCP) | {} | ${:.3} | {:.1}% |",
        format_num(s.lean_tokens),
        s.lean_cost,
        lean_pct
    ));

    let ccp_pct = if s.raw_tokens > 0 {
        (1.0 - s.lean_ccp_tokens as f64 / s.raw_tokens as f64) * 100.0
    } else {
        0.0
    };
    out.push(format!(
        "| lean-ctx + CCP | {} | ${:.3} | {:.1}% |",
        format_num(s.lean_ccp_tokens),
        s.ccp_cost,
        ccp_pct
    ));

    out.push(String::new());
    out.push(format!(
        "*Generated by lean-ctx benchmark v{} — https://leanctx.com*",
        env!("CARGO_PKG_VERSION")
    ));

    out.join("\n")
}
684
/// Render the benchmark as pretty-printed JSON.
pub fn format_json(b: &ProjectBenchmark) -> String {
    // A negative preservation score is the N/A sentinel; emit JSON null.
    let modes: Vec<serde_json::Value> = b.mode_summaries.iter().map(|m| {
        serde_json::json!({
            "mode": m.mode,
            "total_compressed_tokens": m.total_compressed_tokens,
            "avg_savings_pct": round2(m.avg_savings_pct),
            "avg_latency_us": m.avg_latency_us,
            "avg_preservation": if m.avg_preservation < 0.0 { serde_json::Value::Null } else { serde_json::json!(round2(m.avg_preservation * 100.0)) },
        })
    }).collect();

    let languages: Vec<serde_json::Value> = b
        .languages
        .iter()
        .map(|l| {
            serde_json::json!({
                "ext": l.ext,
                "count": l.count,
                "total_tokens": l.total_tokens,
            })
        })
        .collect();

    let s = &b.session_sim;
    let report = serde_json::json!({
        "version": env!("CARGO_PKG_VERSION"),
        "root": b.root,
        "files_scanned": b.files_scanned,
        "files_measured": b.files_measured,
        "total_raw_tokens": b.total_raw_tokens,
        "languages": languages,
        "mode_summaries": modes,
        "session_simulation": {
            "raw_tokens": s.raw_tokens,
            "lean_tokens": s.lean_tokens,
            "lean_ccp_tokens": s.lean_ccp_tokens,
            // Costs are rounded to cents for stable output.
            "raw_cost_usd": round2(s.raw_cost),
            "lean_cost_usd": round2(s.lean_cost),
            "ccp_cost_usd": round2(s.ccp_cost),
        },
    });

    // Serialization of json! values cannot realistically fail; fall back to
    // an empty object rather than panicking.
    serde_json::to_string_pretty(&report).unwrap_or_else(|_| "{}".to_string())
}
731
/// Render a token count compactly: millions as "x.xM", thousands as "x.xK",
/// smaller values verbatim.
fn format_num(n: usize) -> String {
    match n {
        1_000_000.. => format!("{:.1}M", n as f64 / 1_000_000.0),
        1_000.. => format!("{:.1}K", n as f64 / 1_000.0),
        _ => n.to_string(),
    }
}
743
/// Round a float to two decimal places (ties round away from zero,
/// per `f64::round`).
fn round2(v: f64) -> f64 {
    f64::round(v * 100.0) / 100.0
}