use anyhow::Result;
use clap::{Parser, Subcommand, ValueEnum};
use serde::Serialize;
use std::cmp::Reverse;

use std::collections::{BTreeSet, HashMap, HashSet};
use std::io::{self, BufRead};
use std::path::Path;

use crate::errors;
use crate::model::Section;
use crate::pack::{pack_by_ids, PackSearchOptions};
use crate::parse::{load_markdown, parse_markdown};
use crate::render::{
    render_pack, render_read, render_search, render_sections, render_stats, render_tree,
    FileSectionsMap, PackIncluded, SectionsEntry, StatsEntry,
};
use crate::search::{discover_markdown_files, get_doc_section_summaries, search_files};
use crate::tokens::{estimate_tokens, truncate_to_tokens};

const TRUNCATION_NOTICE: &str = "\n\n<!-- mdlens: truncated at token budget -->";

#[derive(Parser)]
#[command(name = "mdlens")]
#[command(about = "Token-efficient Markdown structure CLI for AI agents")]
#[command(
    long_about = "mdlens parses Markdown files into a hierarchical section tree with\ndotted IDs, token estimates, and bounded-context packing.\n\nDesigned for AI agents that need to navigate, search, and pack\nMarkdown documentation into context windows efficiently.\n\nAgent quickstart:\n 1. For question answering over a Markdown directory, start with:\n mdlens scout <dir> \"<question>\" --max-tokens 1400\n 2. Answer from scout when [highlights] and [evidence] are sufficient.\n 3. If one detail is missing, use a listed section id:\n mdlens read <file> --id <N.N> --max-tokens 1200\n 4. Use search/tree/sections only when scout points at the wrong file or you\n need broader navigation.\n\nScout is the recommended first command for arbitrary messy English markdown.\nIt returns query expansion, a compact file map, ranked highlights, and bounded\nevidence sections with parent heading/status context.\n\nAnswering from scout:\n - Read [highlights] first, then [evidence].\n - Preserve distinctive evidence terms: flags, IDs, metrics, option names,\n labels, row values, and short policy/risk phrases.\n - Copy short source phrases exactly when they are likely answer terms; avoid\n changing singular/plural or rewriting concise labels into paraphrases.\n - If scout already names the answer plus its rule, risk, command, or policy,\n answer directly instead of continuing broad retrieval.\n - For current-vs-stale questions, prefer current/current loader sections and\n treat Do Not Use, copied tables, stale notes, and old runbooks as\n distractors.\n - For table questions, keep the table header with the selected row; do not\n average unrelated rows unless the document says to.\n - For why, policy, safety, privacy, negative, or tradeoff questions, include\n the compact rule/risk/rationale bullets, not only the command or metric.\n - For multi-file comparisons, answer each named entity separately, then\n summarize the shared pattern.\n - If evidence is missing, say the corpus does not specify the fact.\n\nRun `mdlens scout --help` for detailed scout-specific guidance."
)]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    Tree(TreeArgs),
    Read(ReadArgs),
    Search(SearchArgs),
    Scout(ScoutArgs),
    Pack(PackArgs),
    Stats(StatsArgs),
    Sections(SectionsArgs),
}

#[derive(clap::Args)]
struct TreeArgs {
    path: String,
    #[arg(long)]
    json: bool,
    #[arg(long)]
    max_depth: Option<usize>,
    #[arg(long)]
    include_preamble: bool,
    #[arg(long)]
    files: bool,
}

#[derive(clap::Args)]
struct ReadArgs {
    file: String,
    #[arg(long, conflicts_with_all = ["heading_path", "lines"])]
    id: Option<String>,
    #[arg(long, conflicts_with_all = ["id", "lines"])]
    heading_path: Option<String>,
    #[arg(long, conflicts_with_all = ["id", "heading_path"])]
    lines: Option<String>,
    #[arg(long)]
    parents: bool,
    #[arg(long, conflicts_with = "no_children")]
    children: bool,
    #[arg(long, conflicts_with = "children")]
    no_children: bool,
    #[arg(long)]
    max_tokens: Option<usize>,
    #[arg(long)]
    json: bool,
}

#[derive(clap::Args)]
struct SearchArgs {
    path: String,
    query: String,
    #[arg(long)]
    json: bool,
    #[arg(long)]
    regex: bool,
    #[arg(long)]
    case_sensitive: bool,
    #[arg(long, default_value_t = 20)]
    max_results: usize,
    #[arg(long, default_value_t = 2)]
    context_lines: usize,
    #[arg(long)]
    content: bool,
    #[arg(long)]
    preview: Option<usize>,
    #[arg(long)]
    max_tokens: Option<usize>,
}

#[derive(clap::Args)]
#[command(
    long_about = "One-shot agent evidence pack for answering a natural-language question over Markdown.\n\n`scout` is optimized for agent workflows: fewer shell calls, bounded output,\nand enough section context to answer without dumping whole files. It searches\nsection text, headings, paths, parent context, and table rows; ranks likely\nevidence; then emits a compact pack."
)]
#[command(
    after_help = "Agent workflow:\n - Use scout as the first retrieval call for QA over a directory:\n mdlens scout docs/ \"What policy changed between the old and current loader?\" --max-tokens 1400\n - Use --json when a harness wants structured metadata plus the same rendered evidence pack.\n - Read [highlights] first. They are globally ranked compact evidence lines.\n - Then read [evidence]. Each block names file, section id, heading path, line\n span, token estimate, and ranking reason.\n - If the answer is present, stop and answer directly. Preserve distinctive\n terms: flags, IDs, metrics, option names, row values, labels, and short\n policy phrases.\n - Copy short source phrases exactly when they are likely answer terms; avoid\n changing singular/plural or rewriting concise labels into paraphrases.\n - If exactly one fact is missing, use the section map from [files] and read\n one section:\n mdlens read <file> --id <section-id> --max-tokens 1200\n - Use `mdlens search` only when scout clearly found the wrong file or when\n you need a second independent query.\n\nHow to interpret scout output:\n [queries] Search expansions derived from the question.\n [files] Candidate files, picked section ids, and nearby unread sections.\n [focus] Dominant file when the question appears single-file.\n [highlights] Globally ranked lines/table rows likely to answer the question.\n [evidence] Bounded excerpts from the selected sections.\n\nQuestion-shape guidance:\n - Current-vs-stale questions: prefer sections marked current/current loader;\n treat Do Not Use, stale notes, copied tables, and old runbooks as distractors.\n - Table questions: keep the table header with the selected row; do not average\n unrelated rows unless the document says to.\n - Why, policy, safety, privacy, negative, or tradeoff questions: include the\n compact rule/risk/rationale bullets, not only the command or metric.\n - Multi-file comparison: answer each named entity separately, then summarize\n the shared pattern.\n - Missing evidence: say the corpus does not specify the fact rather than\n guessing from file names.\n\nUseful defaults:\n --max-tokens 1400 keeps scout cheap for most agent turns.\n --max-sections 12 gives enough diversity before packing.\n --max-files 4 keeps the file map readable."
)]
struct ScoutArgs {
    path: String,
    question: String,
    #[arg(long)]
    json: bool,
    #[arg(long, default_value_t = 1400)]
    max_tokens: usize,
    #[arg(long, default_value_t = 12)]
    max_sections: usize,
    #[arg(long, default_value_t = 4)]
    max_files: usize,
}

#[derive(clap::Args)]
struct PackArgs {
    path: String,
    #[arg(long, conflicts_with_all = ["paths", "search"])]
    ids: Option<String>,
    #[arg(long, conflicts_with_all = ["ids", "search"])]
    paths: Option<String>,
    #[arg(long, conflicts_with_all = ["ids", "paths"])]
    search: Option<String>,
    #[arg(long)]
    max_tokens: usize,
    #[arg(long)]
    parents: bool,
    #[arg(long, conflicts_with = "no_dedupe")]
    dedupe: bool,
    #[arg(long, conflicts_with = "dedupe")]
    no_dedupe: bool,
    #[arg(long)]
    regex: bool,
    #[arg(long)]
    case_sensitive: bool,
    #[arg(long, default_value_t = 20)]
    max_results: usize,
    #[arg(long, default_value_t = 2)]
    context_lines: usize,
    #[arg(long)]
    json: bool,
}

#[derive(Clone, ValueEnum)]
enum StatsSort {
    Path,
    Tokens,
    Lines,
}

#[derive(clap::Args)]
struct StatsArgs {
    path: String,
    #[arg(long)]
    json: bool,
    #[arg(long, value_enum, default_value_t = StatsSort::Path)]
    sort: StatsSort,
    #[arg(long)]
    top: Option<usize>,
}

#[derive(clap::Args)]
struct SectionsArgs {
    #[arg(value_name = "FILE")]
    files: Vec<String>,
    #[arg(long)]
    content: bool,
    #[arg(long)]
    children: bool,
    #[arg(long)]
    preview: Option<usize>,
    #[arg(long)]
    max_depth: Option<usize>,
    #[arg(long)]
    max_tokens: Option<usize>,
    #[arg(long)]
    max_sections: Option<usize>,
    #[arg(long)]
    max_files: Option<usize>,
    #[arg(long)]
    json: bool,
    #[arg(long)]
    heading_paths: bool,
    #[arg(long)]
    lines: bool,
    #[arg(long, default_value_t = true)]
    dedupe: bool,
    #[arg(long, conflicts_with = "dedupe")]
    no_dedupe: bool,
}

#[derive(Clone)]
struct SectionHit {
    path: String,
    line: usize,
}

enum SectionInput {
    File(String),
    Hit(SectionHit),
}

pub fn run() -> Result<()> {
    let cli = Cli::parse();

    match cli.command {
        Commands::Tree(args) => cmd_tree(args),
        Commands::Read(args) => cmd_read(args),
        Commands::Search(args) => cmd_search(args),
        Commands::Scout(args) => cmd_scout(args),
        Commands::Pack(args) => cmd_pack(args),
        Commands::Stats(args) => cmd_stats(args),
        Commands::Sections(args) => cmd_sections(args),
    }
}

fn cmd_tree(args: TreeArgs) -> Result<()> {
    let files = crate::search::discover_markdown_files(&args.path)?;

    if files.len() == 1 {
        let doc = parse_markdown(&files[0])?;
        if args.json {
            let output = TreeJsonOutput {
                schema_version: 1,
                path: doc.path.clone(),
                line_count: doc.line_count,
                byte_count: doc.byte_count,
                char_count: doc.char_count,
                word_count: doc.word_count,
                token_estimate: doc.token_estimate,
                sections: serialize_sections(
                    &doc.sections,
                    args.max_depth,
                    args.include_preamble,
                    0,
                ),
            };
            println!("{}", serde_json::to_string_pretty(&output)?);
        } else {
            println!(
                "{}",
                render_tree(&doc, args.max_depth, args.include_preamble)
            );
        }
    } else {
        let depth_capped = args.max_depth.is_none();
        let effective_depth = args.max_depth.or(Some(1));

        if args.json {
            let mut file_outputs = Vec::new();
            for file in &files {
                let doc = parse_markdown(file)?;
                file_outputs.push(TreeFileJsonOutput {
                    path: doc.path.clone(),
                    line_count: doc.line_count,
                    byte_count: doc.byte_count,
                    char_count: doc.char_count,
                    word_count: doc.word_count,
                    token_estimate: doc.token_estimate,
                    sections: serialize_sections(
                        &doc.sections,
                        effective_depth,
                        args.include_preamble,
                        0,
                    ),
                });
            }
            let output = TreeMultiJsonOutput {
                schema_version: 1,
                files: file_outputs,
            };
            println!("{}", serde_json::to_string_pretty(&output)?);
        } else {
            for file in &files {
                let doc = parse_markdown(file)?;
                println!(
                    "\n{}",
                    render_tree(&doc, effective_depth, args.include_preamble)
                );
            }
            if depth_capped {
                eprintln!("[tree] directory mode: showing depth ≤1 by default; use --max-depth N for more");
            }
        }
    }

    Ok(())
}

fn cmd_read(args: ReadArgs) -> Result<()> {
    let parsed = load_markdown(&args.file)?;
    let doc = &parsed.doc;
    let lines = &parsed.lines;
    let include_children = !args.no_children || args.children;

    let (section_text, section_meta, selector_type, selector_value, section_ref) =
        if let Some(ref id) = args.id {
            let section = doc
                .find_section_by_id(id)
                .ok_or_else(|| anyhow::anyhow!("section id not found: {id}"))?;
            let content = if include_children {
                section.extract_content(lines)
            } else {
                section.extract_direct_content(lines)
            }
            .join("\n");
            (
                content,
                SectionMeta::from(section),
                "id",
                id.clone(),
                Some(section),
            )
        } else if let Some(ref path_str) = args.heading_path {
            let section = find_unique_section_by_path(doc, path_str)?;
            let content = if include_children {
                section.extract_content(lines)
            } else {
                section.extract_direct_content(lines)
            }
            .join("\n");
            (
                content,
                SectionMeta::from(section),
                "path",
                path_str.clone(),
                Some(section),
            )
        } else if let Some(ref lines_str) = args.lines {
            let parts: Vec<&str> = lines_str.split(':').collect();
            if parts.len() != 2 {
                return Err(anyhow::anyhow!(
                    "invalid line range: {}; expected format START:END",
                    lines_str
                ));
            }
            let start: usize = parts[0].trim().parse()?;
            let end: usize = parts[1].trim().parse()?;
            if start > end {
                return Err(errors::invalid_line_range(start, end));
            }
            if start < 1 || end > lines.len() {
                return Err(anyhow::anyhow!(
                    "line range {}:{} out of bounds (file has {} lines)",
                    start,
                    end,
                    lines.len()
                ));
            }
            let content = lines[(start - 1)..end].join("\n");
            let token_est = estimate_tokens(&content);
            (
                content,
                SectionMeta {
                    id: format!("lines:{}:{}", start, end),
                    title: format!("Lines {}-{}", start, end),
                    level: 0,
                    path: vec![format!("Lines {}-{}", start, end)],
                    line_start: start,
                    line_end: end,
                    token_estimate: token_est,
                },
                "lines",
                format!("{}:{}", start, end),
                None,
            )
        } else {
            return Err(anyhow::anyhow!(
                "exactly one of --id, --heading-path, or --lines is required"
            ));
        };

    let mut full_content = String::new();

    if args.parents {
        if let Some(sec) = section_ref {
            let parents = find_parent_headings(doc, sec);
            for line_idx in parents {
                if !full_content.is_empty() {
                    full_content.push_str("\n\n");
                }
                full_content.push_str(&lines[line_idx - 1]);
            }
        }
    }

    if !full_content.is_empty() && !section_text.is_empty() {
        full_content.push_str("\n\n");
    }
    full_content.push_str(&section_text);

    let truncated = if let Some(max_tokens) = args.max_tokens {
        if estimate_tokens(&full_content) > max_tokens {
            full_content = truncate_content_to_tokens(&full_content, max_tokens);
            true
        } else {
            false
        }
    } else {
        false
    };

    if args.json {
        let output = ReadJsonOutput {
            schema_version: 1,
            path: doc.path.clone(),
            selector: ReadSelector {
                r#type: selector_type.to_string(),
                value: selector_value.to_string(),
            },
            section: SectionJsonOutput {
                id: section_meta.id.clone(),
                title: section_meta.title.clone(),
                level: section_meta.level,
                path: section_meta.path.clone(),
                line_start: section_meta.line_start,
                line_end: section_meta.line_end,
                token_estimate: section_meta.token_estimate,
                children: Vec::new(),
            },
            content: full_content,
            truncated,
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        let section = Section {
            id: section_meta.id.clone(),
            slug: Section::slugify(&section_meta.title),
            title: section_meta.title.clone(),
            level: section_meta.level,
            path: section_meta.path.clone(),
            line_start: section_meta.line_start,
            line_end: section_meta.line_end,
            content_line_start: section_meta.line_start,
            byte_start: 0,
            byte_end: 0,
            char_count: 0,
            word_count: 0,
            token_estimate: section_meta.token_estimate,
            children: Vec::new(),
        };
        println!("{}", render_read(&section, &full_content, truncated));
    }

    Ok(())
}

struct SectionMeta {
    id: String,
    title: String,
    level: u8,
    path: Vec<String>,
    line_start: usize,
    line_end: usize,
    token_estimate: usize,
}

impl From<&Section> for SectionMeta {
    fn from(s: &Section) -> Self {
        SectionMeta {
            id: s.id.clone(),
            title: s.title.clone(),
            level: s.level,
            path: s.path.clone(),
            line_start: s.line_start,
            line_end: s.line_end,
            token_estimate: s.token_estimate,
        }
    }
}

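// Collects the heading line numbers of `section`'s ancestors, ordered
// outermost-first, by walking up a freshly built child-to-parent id map.
// Callers use this to prepend parent headings as context for a section.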
fn find_parent_headings(doc: &crate::model::Document, section: &Section) -> Vec<usize> {
    let mut parent_map: std::collections::HashMap<String, Option<String>> =
        std::collections::HashMap::new();
    build_parent_map(&doc.sections, None, &mut parent_map);
    let mut chain = Vec::new();
    let mut current_id = section.id.clone();
    while let Some(Some(pid)) = parent_map.get(&current_id) {
        if let Some(parent_sec) = doc.find_section_by_id(pid) {
            chain.push(parent_sec.line_start);
        }
        current_id = pid.clone();
    }
    chain.reverse();
    chain
}

fn find_unique_section_by_path<'a>(
    doc: &'a crate::model::Document,
    path_str: &str,
) -> Result<&'a Section> {
    let path = parse_heading_path(path_str);
    let matches = doc.find_sections_by_path(&path);
    match matches.len() {
        0 => Err(anyhow::anyhow!("path not found: {path_str}")),
        1 => Ok(matches[0]),
        _ => Err(errors::ambiguous_path(path_str, &matches)),
    }
}

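// Splits a `>`-delimited heading path into trimmed segments, honoring
// backslash escapes so a literal `>` can appear inside a heading title,
// e.g. `"Usage > Basic \> Advanced"` yields ["Usage", "Basic > Advanced"].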
fn parse_heading_path(path: &str) -> Vec<String> {
    let mut parts = Vec::new();
    let mut current = String::new();
    let mut escaped = false;

    for ch in path.chars() {
        if escaped {
            current.push(ch);
            escaped = false;
            continue;
        }

        match ch {
            '\\' => escaped = true,
            '>' => {
                let part = current.trim();
                if !part.is_empty() {
                    parts.push(part.to_string());
                }
                current.clear();
            }
            _ => current.push(ch),
        }
    }

    let part = current.trim();
    if !part.is_empty() {
        parts.push(part.to_string());
    }

    parts
}

fn build_parent_map(
    sections: &[Section],
    parent_id: Option<String>,
    map: &mut std::collections::HashMap<String, Option<String>>,
) {
    for section in sections {
        map.insert(section.id.clone(), parent_id.clone());
        build_parent_map(&section.children, Some(section.id.clone()), map);
    }
}

fn cmd_search(args: SearchArgs) -> Result<()> {
    let mut results = search_files(
        &args.path,
        &args.query,
        args.case_sensitive,
        args.regex,
        args.max_results,
        args.context_lines,
    )?;

    if args.content || args.preview.is_some() || args.max_tokens.is_some() {
        enrich_search_results(&mut results, args.content, args.preview)?;
    }

    if let Some(max_tokens) = args.max_tokens {
        let mut kept = Vec::new();
        let mut total_tokens = 0usize;
        for result in results {
            let item_tokens = if args.content {
                result
                    .body
                    .as_ref()
                    .map(|body| estimate_tokens(body))
                    .unwrap_or(result.token_estimate)
            } else if let Some(preview) = &result.preview {
                estimate_tokens(preview)
            } else {
                result.token_estimate
            };
            if total_tokens + item_tokens > max_tokens {
                break;
            }
            total_tokens += item_tokens;
            kept.push(result);
        }
        results = kept;
    }

    if args.json {
        let output = SearchJsonOutput {
            schema_version: 1,
            query: args.query,
            root: args.path,
            results: results
                .iter()
                .map(|r| SearchJsonResult {
                    path: r.path.clone(),
                    section_id: r.section_id.clone(),
                    section_title: r.section_title.clone(),
                    section_path: r.section_path.clone(),
                    line_start: r.line_start,
                    line_end: r.line_end,
                    token_estimate: r.token_estimate,
                    match_count: r.match_count,
                    body: r.body.clone(),
                    preview: r.preview.clone(),
                    snippets: r
                        .snippets
                        .iter()
                        .map(|s| SearchJsonSnippet {
                            line_start: s.line_start,
                            line_end: s.line_end,
                            text: s.text.clone(),
                        })
                        .collect(),
                })
                .collect(),
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        let file_sections = build_file_sections_map(&results);
        println!("{}", render_search(&results, args.content, &file_sections));
    }

    Ok(())
}

fn build_file_sections_map(results: &[crate::render::SearchResult]) -> FileSectionsMap {
    let unique_files: std::collections::HashSet<&str> =
        results.iter().map(|r| r.path.as_str()).collect();
    let mut map = FileSectionsMap::new();
    for path in unique_files {
        if let Ok(summaries) = get_doc_section_summaries(path) {
            map.insert(path.to_string(), summaries);
        }
    }
    map
}

#[derive(Clone, Serialize)]
struct ScoutCandidate {
    path: String,
    section_id: String,
    score: i32,
    reason: String,
}

struct ScoutHighlight {
    score: i32,
    path: String,
    section_id: String,
    line_no: usize,
    line: String,
}

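// Scout pipeline: expand the question into queries, gather candidate sections
// from content search, BM25-style lexical ranking, path/name matches, named
// targets, and section neighbors; then rank, dedupe, prune covered parents,
// diversify across files, and render a bounded evidence pack.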
fn cmd_scout(args: ScoutArgs) -> Result<()> {
    let queries = scout_queries(&args.question);
    let mut candidates: Vec<ScoutCandidate> = Vec::new();
    let per_query_results = (args.max_sections * 3).max(args.max_sections).min(60);

    for query in &queries {
        let results = search_files(&args.path, query, false, false, per_query_results, 2)?;
        for result in results {
            let query_tokens = signal_tokens(query);
            let normalized_path = normalize_for_match(&result.path);
            let path_quality_score = scout_path_quality_score(&result.path);
            let path_hits = query_tokens
                .iter()
                .filter(|token| normalized_path.contains(&normalize_for_match(token)))
                .count() as i32;
            let path_boost = if path_hits > 0 {
                180 + path_hits * 45
            } else {
                0
            };
            let broad_penalty = if path_hits == 0 && query_tokens.len() <= 1 {
                60
            } else {
                0
            };
            candidates.push(ScoutCandidate {
                path: result.path,
                section_id: result.section_id,
                score: 100
                    + path_boost
                    + path_quality_score
                    + result.match_count as i32 * 5
                    + scout_heading_score(
                        &result.section_path,
                        &result.section_title,
                        &args.question,
                    )
                    - result.token_estimate as i32 / 250
                    - broad_penalty,
                reason: format!("content match: {query}"),
            });
        }
    }

    add_lexical_scout_candidates(
        &args.path,
        &args.question,
        &mut candidates,
        args.max_sections * 4,
    )?;
    add_path_match_candidates(&args.path, &args.question, &mut candidates)?;
    add_named_target_candidates(&args.path, &args.question, &mut candidates)?;
    add_neighbor_candidates(&mut candidates)?;

    candidates.sort_by(|lhs, rhs| {
        rhs.score
            .cmp(&lhs.score)
            .then(lhs.path.cmp(&rhs.path))
            .then(lhs.section_id.cmp(&rhs.section_id))
    });
    dedupe_scout_candidates(&mut candidates);
    prune_parent_scout_candidates(&mut candidates);
    let candidate_pool = candidates.clone();
    diversify_scout_candidates(&mut candidates, args.max_sections, &args.question);
    ensure_named_target_coverage(
        &mut candidates,
        &candidate_pool,
        args.max_sections,
        &args.question,
    )?;
    candidates.truncate(args.max_sections);

    let mut out = String::new();
    out.push_str(&format!(
        "[scout] question=\"{}\" budget=~{}t candidates={}\n",
        args.question,
        args.max_tokens,
        candidates.len()
    ));
    if !queries.is_empty() {
        out.push_str(&format!("[queries] {}\n", queries.join(" | ")));
    }
    out.push('\n');
    let evidence_candidates = order_scout_evidence(
        focused_scout_candidates(&candidates, &args.question),
        &args.question,
    )?;
    let map_candidates = if evidence_candidates.len() < candidates.len() {
        &evidence_candidates
    } else {
        &candidates
    };
    render_scout_file_maps(&mut out, map_candidates, args.max_files)?;
    if !evidence_candidates.is_empty() && evidence_candidates.len() < candidates.len() {
        out.push_str(&format!("\n[focus] {}\n", evidence_candidates[0].path));
    }
    out.push_str("\n[highlights]\n");
    render_scout_highlights(&mut out, &evidence_candidates, &args.question, 10)?;
    out.push_str("\n[evidence]\n");
    render_scout_evidence(
        &mut out,
        &evidence_candidates,
        &args.question,
        args.max_tokens,
    )?;

    if args.json {
        let output = ScoutJsonOutput {
            schema_version: 1,
            root: args.path,
            question: args.question,
            token_budget: args.max_tokens,
            candidate_count: candidates.len(),
            queries,
            candidates: evidence_candidates,
            rendered_text: out,
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        print!("{out}");
    }
    Ok(())
}

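// Derives up to 12 search queries from the question: capitalized/named
// phrases (with a hyphen-split variant), question-shape semantic expansions,
// and long or distinctive tokens (hyphenated, underscored, or digit-bearing).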
fn scout_queries(question: &str) -> Vec<String> {
    let mut queries = Vec::new();
    let phrases = extract_capitalized_phrases(question);
    for phrase in phrases {
        let cleaned = clean_query_phrase(&phrase);
        push_unique_query(&mut queries, cleaned.clone());
        if cleaned.contains('-') {
            push_unique_query(&mut queries, cleaned.replace('-', " "));
        }
    }

    for phrase in scout_semantic_queries(question) {
        push_unique_query(&mut queries, phrase);
    }

    let signal_tokens = signal_tokens(question);
    for token in signal_tokens.into_iter().take(8) {
        if token.len() >= 8
            || token.contains('-')
            || token.contains('_')
            || token.chars().any(|c| c.is_ascii_digit())
        {
            push_unique_query(&mut queries, token);
        }
    }

    if queries.is_empty() {
        push_unique_query(&mut queries, question.to_string());
    }
    queries.truncate(12);
    queries
}

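// Ranks every section in the corpus with a BM25-style lexical score plus
// title/path boosts and structural priors:
//   score(term) = idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len / avg_len))
//   idf = ln((N - df + 0.5) / (df + 0.5) + 1), with k1 = 1.2 and b = 0.75.
// The per-section total is scaled by query-term coverage, then adjusted by
// heading, path-quality, and source-authority priors and a compactness bonus.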
fn add_lexical_scout_candidates(
    root: &str,
    question: &str,
    candidates: &mut Vec<ScoutCandidate>,
    limit: usize,
) -> Result<()> {
    let query_terms = lexical_query_terms(question);
    if query_terms.is_empty() {
        return Ok(());
    }

    struct LexicalSection {
        path: String,
        section_id: String,
        section_path: Vec<String>,
        section_title: String,
        token_estimate: usize,
        len: usize,
        terms: HashMap<String, usize>,
        title_terms: HashSet<String>,
        path_terms: HashSet<String>,
    }

    let files = discover_markdown_files(root)?;
    let mut sections = Vec::new();
    let mut df: HashMap<String, usize> = HashMap::new();
    let mut total_len = 0usize;

    for file in files {
        let parsed = load_markdown(&file)?;
        let path_terms = lexical_terms(&file).into_iter().collect::<HashSet<_>>();
        for section in flatten_doc_sections(&parsed.doc.sections) {
            if section.title == "<preamble>" {
                continue;
            }
            let content = section.extract_content(&parsed.lines).join("\n");
            let title_text = section.path.join(" ");
            let mut terms = lexical_terms(&format!("{title_text}\n{content}"));
            if terms.is_empty() {
                continue;
            }
            let title_terms = lexical_terms(&title_text)
                .into_iter()
                .collect::<HashSet<_>>();
            let mut tf = HashMap::new();
            let mut unique = HashSet::new();
            for term in terms.drain(..) {
                *tf.entry(term.clone()).or_insert(0) += 1;
                unique.insert(term);
            }
            for term in unique {
                *df.entry(term).or_insert(0) += 1;
            }
            let len = tf.values().sum::<usize>().max(1);
            total_len += len;
            sections.push(LexicalSection {
                path: file.clone(),
                section_id: section.id.clone(),
                section_path: section.path.clone(),
                section_title: section.title.clone(),
                token_estimate: section.token_estimate,
                len,
                terms: tf,
                title_terms,
                path_terms: path_terms.clone(),
            });
        }
    }

    let n = sections.len();
    if n == 0 {
        return Ok(());
    }
    let avg_len = total_len as f64 / n as f64;
    let unique_query_terms = query_terms.into_iter().collect::<BTreeSet<_>>();
    let mut scored = Vec::new();

    for section in sections {
        let mut score = 0.0f64;
        let mut matched = 0usize;
        for term in &unique_query_terms {
            let tf = section.terms.get(term).copied().unwrap_or(0) as f64;
            let title_hit = section.title_terms.contains(term);
            let path_hit = section.path_terms.contains(term);
            if tf == 0.0 && !title_hit && !path_hit {
                continue;
            }
            matched += 1;
            let doc_freq = df.get(term).copied().unwrap_or(1) as f64;
            let idf = ((n as f64 - doc_freq + 0.5) / (doc_freq + 0.5) + 1.0).ln();
            let k1 = 1.2;
            let b = 0.75;
            let bm25 = if tf > 0.0 {
                idf * (tf * (k1 + 1.0)) / (tf + k1 * (1.0 - b + b * section.len as f64 / avg_len))
            } else {
                0.0
            };
            score += bm25;
            if title_hit {
                score += idf * 1.8;
            }
            if path_hit {
                score += idf * 1.1;
            }
        }
        if matched == 0 {
            continue;
        }
        let coverage = matched as f64 / unique_query_terms.len().max(1) as f64;
        let structural_prior =
            scout_heading_score(&section.section_path, &section.section_title, question) as f64
                / 25.0;
        let path_prior = scout_path_quality_score(&section.path) as f64 / 20.0;
        let authority_prior =
            scout_source_authority_score(&section.path, &section.section_path, "", question) as f64
                / 15.0;
        let compactness = -(section.token_estimate as f64 / 900.0);
        let final_score = (score * (0.75 + coverage)
            + structural_prior
            + path_prior
            + authority_prior
            + compactness)
            * 100.0;
        scored.push((
            final_score.round() as i32,
            section.path,
            section.section_id,
            matched,
        ));
    }

    scored.sort_by(|lhs, rhs| {
        rhs.0
            .cmp(&lhs.0)
            .then(rhs.3.cmp(&lhs.3))
            .then(lhs.1.cmp(&rhs.1))
            .then(lhs.2.cmp(&rhs.2))
    });
    for (score, path, section_id, matched) in scored.into_iter().take(limit.max(1)) {
        candidates.push(ScoutCandidate {
            path,
            section_id,
            score,
            reason: format!("lexical relevance: {matched} query terms"),
        });
    }
    Ok(())
}

fn lexical_query_terms(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    for token in lexical_terms(text) {
        if token.len() >= 3
            && !matches!(
                token.as_str(),
                "answer" | "doc" | "docs" | "file" | "markdown" | "readme" | "section"
            )
            && !out.contains(&token)
        {
            out.push(token);
        }
    }
    out
}

fn lexical_terms(text: &str) -> Vec<String> {
    text.split(|c: char| !c.is_ascii_alphanumeric() && c != '_' && c != '-')
        .filter_map(normalize_lexical_term)
        .collect()
}

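// Normalizes a raw token for lexical matching: lowercases, drops short
// tokens and stopwords, keeps pure numbers as-is, and applies a light
// suffix stemmer so e.g. "loaders" and "loader" compare equal.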
fn normalize_lexical_term(raw: &str) -> Option<String> {
    let mut token = raw.trim().trim_matches('-').to_ascii_lowercase();
    if token.len() < 3 || is_stopword(&token) {
        return None;
    }
    if token.chars().all(|c| c.is_ascii_digit()) {
        return Some(token);
    }
    for suffix in ["ing", "edly", "ed", "es", "s"] {
        if token.len() > suffix.len() + 3 && token.ends_with(suffix) {
            token.truncate(token.len() - suffix.len());
            break;
        }
    }
    Some(token)
}

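// Heuristic heading prior: rewards overlap between question tokens and the
// heading path, plus curated question-shape/heading pairings (e.g. "install"
// questions boost install headings), and penalizes boilerplate headings such
// as license or citation when the question does not ask about them.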
fn scout_heading_score(section_path: &[String], section_title: &str, question: &str) -> i32 {
    let question_l = question.to_ascii_lowercase();
    let heading_l = format!("{} {}", section_path.join(" "), section_title).to_ascii_lowercase();
    let mut score = 0;

    for token in signal_tokens(question).iter().take(8) {
        if heading_l.contains(&token.to_ascii_lowercase()) {
            score += 20;
        }
    }
    for (needle, heading, weight) in [
        ("install", "install", 90),
        ("command", "install", 45),
        ("usage", "usage", 70),
        ("example", "example", 55),
        ("configure", "configuration", 70),
        ("config", "configuration", 70),
        ("option", "option", 65),
        ("hyperparameter", "hyperparameter", 75),
        ("limitation", "limitation", 90),
        ("caveat", "caveat", 90),
        ("good fit", "for you", 130),
        ("compared", "for you", 90),
        ("yourself", "for you", 90),
        ("proxy", "proxy", 120),
        ("external", "external", 45),
        ("caveat", "finding", 50),
        ("caveat", "bottom line", 35),
        ("caveat", "unambiguous", 55),
        ("uniformly", "unambiguous", 55),
        ("conclude", "conclude", 70),
        ("conclude", "bottom line", 65),
        ("why", "finding", 35),
        ("why", "conclude", 35),
        ("analysis", "analysis", 45),
        ("failure", "failure", 55),
        ("recommend", "recommendation", 95),
        ("policy", "recommendation", 65),
        ("policy", "policy", 95),
        ("privacy", "privacy", 95),
        ("mask", "privacy", 75),
        ("masking", "privacy", 75),
        ("rule", "rule", 90),
        ("rules", "rule", 90),
        ("counting", "counting", 100),
        ("safety", "safety", 100),
        ("hazard", "safety", 75),
        ("hazard", "hazard", 85),
        ("risk", "risk", 80),
        ("why", "policy", 70),
        ("why", "rule", 70),
        ("why", "risk", 65),
        ("treat", "policy", 70),
        ("treat", "rule", 70),
        ("treat", "risk", 65),
        ("reflected", "policy", 65),
        ("reflection", "policy", 65),
        ("glare", "risk", 65),
        ("corrupted", "risk", 55),
        ("current", "current loader", 90),
        ("loader", "current loader", 90),
        ("flag", "current loader", 85),
        ("flag", "do not use", 75),
        ("stale", "do not use", 95),
        ("still", "do not use", 70),
        ("recommended", "current loader", 85),
        ("direction", "recommendation", 45),
    ] {
        if question_l.contains(needle) && heading_l.contains(heading) {
            score += weight;
        }
    }
    if (question_l.contains("hard") || question_l.contains("remains"))
        && heading_l.contains("ambiguity")
    {
        score += 80;
    }
    for (low_value, penalty) in [
        ("license", 70),
        ("citation", 80),
        ("cite", 80),
        ("contact", 55),
        ("contribute", 55),
        ("acknowledg", 55),
    ] {
        if heading_l.contains(low_value) && !question_l.contains(low_value) {
            score -= penalty;
        }
    }
    score
}

fn scout_path_quality_score(path: &str) -> i32 {
    let stem = Path::new(path)
        .file_stem()
        .and_then(|name| name.to_str())
        .unwrap_or(path)
        .to_ascii_lowercase();
    let mut score = 0;
    for marker in [
        "policy",
        "runbook",
        "guide",
        "manual",
        "spec",
        "reference",
        "card",
        "schema",
        "protocol",
    ] {
        if stem.contains(marker) {
            score += 45;
        }
    }
    for marker in [
        "scratch",
        "tmp",
        "temp",
        "draft",
        "random",
        "copied",
        "copy",
        "chat",
        "conversation",
    ] {
        if stem.contains(marker) {
            score -= 180;
        }
    }
    score
}

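// Source-authority prior: boosts sections whose path, headings, or content
// carry authoritative markers ("source of truth", "current loader", ...) and
// penalizes explicit low-authority markers ("not authoritative", "maybe
// stale", ...). The penalty is doubled unless the question itself asks for
// informal or stale sources.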
fn scout_source_authority_score(
    path: &str,
    section_path: &[String],
    content: &str,
    question: &str,
) -> i32 {
    let mut score = scout_path_quality_score(path);
    let question_l = question.to_ascii_lowercase();
    let heading_l = section_path.join(" ").to_ascii_lowercase();
    let content_l = content.to_ascii_lowercase();
    let combined = format!("{heading_l}\n{content_l}");

    for marker in [
        "source of truth",
        "current",
        "locked",
        "policy",
        "rule",
        "spec",
        "reference",
        "runbook",
        "known risk",
        "export notes",
        "current loader",
        "annotation policy",
    ] {
        if combined.contains(marker) {
            score += 28;
        }
    }

    let asks_for_informal = [
        "scratch",
        "draft",
        "old note",
        "old notes",
        "stale",
        "historical",
        "outdated",
        "do not use",
    ]
    .iter()
    .any(|needle| question_l.contains(needle));
    let low_authority_multiplier = if asks_for_informal { 1 } else { 2 };
    for (marker, penalty) in [
        ("not authoritative", 180),
        ("maybe stale", 140),
        ("random copied", 120),
        ("todo maybe", 110),
        ("scratch note", 100),
        ("copied wrong", 80),
        ("old notes disagree", 75),
    ] {
        if combined.contains(marker) {
            score -= penalty * low_authority_multiplier;
        }
    }

    score
}

fn wants_multi_file_evidence(question: &str) -> bool {
    let question_l = question.to_ascii_lowercase();
    [
        " across ",
        " between ",
        " compare ",
        " compares ",
        " comparing ",
        " contrast ",
        " both ",
        " each ",
        " multiple ",
        " multi-file ",
    ]
    .iter()
    .any(|needle| format!(" {question_l} ").contains(needle))
}

fn scout_semantic_queries(question: &str) -> Vec<String> {
    let question_l = question.to_ascii_lowercase();
    let mut queries = Vec::new();

    if question_l.contains("external") {
        queries.push("external".to_string());
        if question_l.contains("proxy") {
            queries.push("proxy".to_string());
        }
        if question_l.contains("panel") {
            queries.push("panel".to_string());
            queries.push("agreement".to_string());
        }
    }
    if question_l.contains("caveat")
        || question_l.contains("not specify")
        || question_l.contains("does not specify")
        || question_l.contains("uniformly")
    {
        queries.push("caveat".to_string());
        queries.push("not uniformly".to_string());
        queries.push("not specified".to_string());
    }
    if question_l.contains("compare")
        || question_l.contains("compared")
        || question_l.contains("difference")
        || question_l.contains("changed")
    {
        queries.push("compared".to_string());
        queries.push("difference".to_string());
    }
    if question_l.contains("best") && question_l.contains("candidate") {
        queries.push("best candidate".to_string());
    }
    if question_l.contains("failure") && question_l.contains("analysis") {
        queries.push("failure analysis".to_string());
    }
    if question_l.contains("recommend") || question_l.contains("policy direction") {
        queries.push("recommendation".to_string());
    }
    if question_l.contains("why")
        || question_l.contains("rule")
        || question_l.contains("policy")
        || question_l.contains("privacy")
        || question_l.contains("safety")
        || question_l.contains("hazard")
        || question_l.contains("counting")
        || question_l.contains("treat")
        || question_l.contains("reflected")
        || question_l.contains("reflection")
        || question_l.contains("glare")
        || question_l.contains("corrupted")
    {
        queries.push("policy".to_string());
        queries.push("rule".to_string());
        queries.push("known risk".to_string());
    }
    if question_l.contains("stale")
        || question_l.contains("current")
        || question_l.contains("recommended")
        || question_l.contains("still")
        || question_l.contains("flag")
        || question_l.contains("loader")
    {
        queries.push("current loader".to_string());
        queries.push("do not use".to_string());
        queries.push("stale flag".to_string());
    }

    queries
}

fn push_unique_query(queries: &mut Vec<String>, query: String) {
    let query = query
        .trim()
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_string();
    if query.len() < 3 {
        return;
    }
    if is_stopword(&query) {
        return;
    }
    if !queries
        .iter()
        .any(|existing| existing.eq_ignore_ascii_case(&query))
    {
        queries.push(query);
    }
}

fn clean_query_phrase(phrase: &str) -> String {
    phrase
        .split_whitespace()
        .filter_map(|token| {
            let cleaned =
                token.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '/');
            if cleaned.eq_ignore_ascii_case("readme") || is_stopword(cleaned) {
                None
            } else {
                Some(cleaned.to_string())
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}

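// Collects runs of "signal" words (capitalized, digit-bearing, hyphenated,
// or slash-containing) as likely named-entity phrases, flushing the current
// run at commas, semicolons, and plain lowercase words; single-word runs
// must be at least 5 characters to count.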
fn extract_capitalized_phrases(text: &str) -> Vec<String> {
    let mut phrases = Vec::new();
    let mut current: Vec<String> = Vec::new();
    for raw in text.split_whitespace() {
        let word = raw.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '/');
        let is_signal = word
            .chars()
            .next()
            .is_some_and(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
            || word.chars().any(|c| c.is_ascii_digit())
            || word.contains('-')
            || word.contains('/');
        if is_signal && word.len() > 1 {
            current.push(word.to_string());
            if raw.ends_with(',') || raw.ends_with(';') {
                if current.len() >= 2 || current[0].len() >= 5 {
                    phrases.push(current.join(" "));
                }
                current.clear();
            }
        } else if !current.is_empty() {
            if current.len() >= 2 || current[0].len() >= 5 {
                phrases.push(current.join(" "));
            }
            current.clear();
        }
    }
    if !current.is_empty() && (current.len() >= 2 || current[0].len() >= 5) {
        phrases.push(current.join(" "));
    }
    phrases
}

fn signal_tokens(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    for raw in text.split(|c: char| !c.is_ascii_alphanumeric() && c != '_' && c != '-') {
        let token = raw.trim().trim_matches('-');
        if token.len() < 3 {
            continue;
        }
        if is_stopword(token) {
            continue;
        }
        if !out
            .iter()
            .any(|existing: &String| existing.eq_ignore_ascii_case(token))
        {
            out.push(token.to_string());
        }
    }
    out
}

fn is_stopword(token: &str) -> bool {
    matches!(
        token.to_ascii_lowercase().as_str(),
        "about"
            | "according"
            | "added"
            | "after"
            | "against"
            | "answer"
            | "are"
            | "across"
            | "before"
            | "between"
            | "can"
            | "compared"
            | "complete"
            | "does"
            | "during"
            | "explain"
            | "fit"
            | "for"
            | "from"
            | "given"
            | "good"
            | "has"
            | "have"
            | "how"
            | "in"
            | "instead"
            | "into"
            | "its"
            | "list"
            | "provide"
            | "readme"
            | "row"
            | "run"
            | "should"
            | "than"
            | "that"
            | "the"
            | "their"
            | "there"
            | "these"
            | "they"
            | "this"
            | "toolbox"
            | "using"
            | "user"
            | "wants"
            | "what"
            | "when"
            | "where"
            | "which"
            | "while"
            | "with"
            | "without"
            | "would"
            | "yourself"
            | "and"
    )
}

fn add_path_match_candidates(
    root: &str,
    question: &str,
    candidates: &mut Vec<ScoutCandidate>,
) -> Result<()> {
    let files = discover_markdown_files(root)?;
    let question_tokens = signal_tokens(question);
    if question_tokens.is_empty() {
        return Ok(());
    }
    for path in files {
        let normalized = normalize_for_match(&path);
        let mut hits = 0;
        for token in &question_tokens {
            if normalized.contains(&normalize_for_match(token)) {
                hits += 1;
            }
        }
        let source_like_path = scout_path_quality_score(&path) > 0;
        let policy_or_multi_question = wants_multi_file_evidence(question)
            || question.to_ascii_lowercase().contains("why")
            || question.to_ascii_lowercase().contains("rule")
            || question.to_ascii_lowercase().contains("policy")
            || question.to_ascii_lowercase().contains("safety")
            || question.to_ascii_lowercase().contains("privacy");
        let required_hits = if source_like_path && policy_or_multi_question {
            1
        } else {
            2
        };
        if hits < required_hits {
            continue;
        }
        let parsed = load_markdown(&path)?;
        for section in parsed.doc.sections.iter().take(2) {
            candidates.push(ScoutCandidate {
                path: path.clone(),
                section_id: section.id.clone(),
                score: 240 + hits * 30,
                reason: "path/name match".to_string(),
            });
        }
        if let Some(best) = best_named_section(&parsed.doc.sections, question) {
            candidates.push(ScoutCandidate {
                path: path.clone(),
                section_id: best.id.clone(),
                score: 300
                    + hits * 45
                    + scout_path_quality_score(&path)
                    + scout_heading_score(&best.path, &best.title, question),
                reason: "path/name match + relevant heading".to_string(),
            });
        }
    }
    Ok(())
}

fn add_named_target_candidates(
    root: &str,
    question: &str,
    candidates: &mut Vec<ScoutCandidate>,
) -> Result<()> {
    let targets = target_phrases_from_question(question);
    if targets.len() < 2 {
        return Ok(());
    }

    for target in targets {
        let results = search_files(root, &target, false, false, 12, 2)?;
        let mut seen_files = HashSet::new();
        for result in results.into_iter().take(8) {
            let content_authority =
                scout_source_authority_score(&result.path, &result.section_path, "", question);
            candidates.push(ScoutCandidate {
                path: result.path.clone(),
                section_id: result.section_id.clone(),
                score: 620
                    + content_authority
                    + result.match_count as i32 * 20
                    + scout_heading_score(&result.section_path, &result.section_title, question),
                reason: format!("named target: {target}"),
            });

            if seen_files.insert(result.path.clone()) {
                let parsed = load_markdown(&result.path)?;
                if let Some(best) = best_named_section(&parsed.doc.sections, question) {
                    candidates.push(ScoutCandidate {
                        path: result.path.clone(),
                        section_id: best.id.clone(),
                        score: 760
                            + scout_source_authority_score(&result.path, &best.path, "", question)
                            + scout_heading_score(&best.path, &best.title, question),
                        reason: format!("named target + relevant heading: {target}"),
                    });
                }
            }
        }
    }
    Ok(())
}

fn normalize_for_match(text: &str) -> String {
    text.chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() {
                c.to_ascii_lowercase()
            } else {
                ' '
            }
        })
        .collect::<String>()
}

fn best_named_section<'a>(sections: &'a [Section], question: &str) -> Option<&'a Section> {
    let mut best: Option<(&Section, i32)> = None;
    score_named_sections(sections, question, &mut best);
    best.map(|(section, _)| section)
}

fn score_named_sections<'a>(
    sections: &'a [Section],
    question: &str,
    best: &mut Option<(&'a Section, i32)>,
) {
    for section in sections {
        let title = section.title.to_ascii_lowercase();
        let mut score = 0;
        for (needle, weight) in [
            ("usage", 30),
            ("install", 30),
            ("quick", 20),
            ("example", 20),
            ("configuration", 20),
            ("training", 20),
            ("preprocess", 20),
            ("limitation", 25),
            ("caveat", 25),
            ("documentation", 10),
            ("overview", 10),
            ("policy", 120),
            ("privacy", 110),
            ("rule", 115),
            ("counting", 110),
            ("safety", 115),
            ("risk", 90),
            ("current", 75),
            ("loader", 75),
            ("stale", 75),
            ("do not use", 90),
        ] {
            if title.contains(needle) {
                score += weight;
            }
        }
        for token in signal_tokens(question).iter().take(8) {
            if title.contains(&token.to_ascii_lowercase()) {
                score += 25;
            }
        }
        if score > 0 && best.is_none_or(|(_, best_score)| score > best_score) {
            *best = Some((section, score));
        }
        score_named_sections(&section.children, question, best);
    }
}

fn add_neighbor_candidates(candidates: &mut Vec<ScoutCandidate>) -> Result<()> {
    let originals = candidates.to_vec();
    let mut by_file: HashMap<String, HashSet<String>> = HashMap::new();
    for candidate in &originals {
        by_file
            .entry(candidate.path.clone())
            .or_default()
            .insert(candidate.section_id.clone());
    }
    for (path, ids) in by_file {
        let parsed = load_markdown(&path)?;
        let flat = flatten_doc_sections(&parsed.doc.sections);
        for (idx, section) in flat.iter().enumerate() {
            if !ids.contains(&section.id) {
                continue;
            }
            let start = idx.saturating_sub(1);
            let end = (idx + 1).min(flat.len().saturating_sub(1));
            for neighbor in flat.iter().take(end + 1).skip(start) {
                if neighbor.id == section.id {
                    continue;
                }
                candidates.push(ScoutCandidate {
                    path: path.clone(),
                    section_id: neighbor.id.clone(),
                    score: 70,
                    reason: format!("neighbor of §{}", section.id),
                });
            }
        }
    }
    Ok(())
}

fn flatten_doc_sections(sections: &[Section]) -> Vec<&Section> {
    let mut out = Vec::new();
    collect_flat_sections(sections, &mut out);
    out.sort_by_key(|section| section.line_start);
    out
}

fn collect_flat_sections<'a>(sections: &'a [Section], out: &mut Vec<&'a Section>) {
    for section in sections {
        out.push(section);
        collect_flat_sections(&section.children, out);
    }
}

fn dedupe_scout_candidates(candidates: &mut Vec<ScoutCandidate>) {
    let mut seen = HashSet::new();
    candidates
        .retain(|candidate| seen.insert(format!("{}::{}", candidate.path, candidate.section_id)));
}

fn prune_parent_scout_candidates(candidates: &mut Vec<ScoutCandidate>) {
    let ids_by_file: HashMap<String, Vec<String>> =
        candidates
            .iter()
            .fold(HashMap::new(), |mut by_file, candidate| {
                by_file
                    .entry(candidate.path.clone())
                    .or_default()
                    .push(candidate.section_id.clone());
                by_file
            });

    candidates.retain(|candidate| {
        !ids_by_file.get(&candidate.path).is_some_and(|ids| {
            ids.iter()
                .any(|id| is_child_section_id(&candidate.section_id, id))
        })
    });
}

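// For multi-file comparison questions, re-selects candidates so evidence is
// spread across files: first try greedy named-target coverage, then fall
// back to capping each file at two sections before refilling by rank.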
fn diversify_scout_candidates(
    candidates: &mut Vec<ScoutCandidate>,
    max_sections: usize,
    question: &str,
) {
    if !wants_multi_file_evidence(question) || candidates.len() <= max_sections {
        return;
    }

    let mut targets = target_phrases_from_question(question);
    if targets.len() < 2 {
        targets = target_tokens_from_question(question);
    }
    if let Some(selected) =
        target_coverage_scout_candidates(candidates, max_sections, &targets, question)
    {
        *candidates = selected;
        return;
    }

    let mut selected = Vec::new();
    let mut selected_keys = HashSet::new();
    let mut per_file_count: HashMap<String, usize> = HashMap::new();

    for candidate in candidates.iter() {
        if selected.len() >= max_sections {
            break;
        }
        let count = per_file_count.get(&candidate.path).copied().unwrap_or(0);
        if count >= 2 {
            continue;
        }
        let key = format!("{}::{}", candidate.path, candidate.section_id);
        if selected_keys.insert(key) {
            selected.push(candidate.clone());
            *per_file_count.entry(candidate.path.clone()).or_default() += 1;
        }
    }

    for candidate in candidates.iter() {
        if selected.len() >= max_sections {
            break;
        }
        let key = format!("{}::{}", candidate.path, candidate.section_id);
        if selected_keys.insert(key) {
            selected.push(candidate.clone());
        }
    }

    if selected.len() >= 2 {
        *candidates = selected;
    }
}

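// Greedy max-coverage selection over named targets: each round picks the
// candidate that covers the most not-yet-covered targets (weighted by rank,
// authority, and a per-file repetition penalty), then tops up remaining
// slots by rank while skipping strongly low-authority sections.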
fn target_coverage_scout_candidates(
    candidates: &[ScoutCandidate],
    max_sections: usize,
    targets: &[String],
    question: &str,
) -> Option<Vec<ScoutCandidate>> {
    if targets.len() < 2 || max_sections == 0 {
        return None;
    }

    let mut cache: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();
    let mut selected = Vec::new();
    let mut selected_keys = HashSet::new();
    let mut covered_targets: HashSet<String> = HashSet::new();
    let mut per_file_count: HashMap<String, usize> = HashMap::new();

    while selected.len() < max_sections {
        let mut best_idx = None;
        let mut best_score = i32::MIN;
        let mut best_new_targets = HashSet::new();

        for (idx, candidate) in candidates.iter().enumerate() {
            let key = format!("{}::{}", candidate.path, candidate.section_id);
            if selected_keys.contains(&key) {
                continue;
            }
            let Ok((target_hits, authority)) =
                scout_candidate_target_hits(candidate, targets, question, &mut cache)
            else {
                continue;
            };
            let new_targets = target_hits
                .difference(&covered_targets)
                .cloned()
                .collect::<HashSet<_>>();
            if new_targets.is_empty() && covered_targets.len() < targets.len() {
                continue;
            }
            let same_file_penalty =
                per_file_count.get(&candidate.path).copied().unwrap_or(0) as i32 * 160;
            let coverage_gain = new_targets.len() as i32 * 420 + target_hits.len() as i32 * 35;
            let score = candidate.score + authority + coverage_gain - same_file_penalty;
            if score > best_score {
                best_score = score;
                best_idx = Some(idx);
                best_new_targets = new_targets;
            }
        }

        let Some(idx) = best_idx else {
            break;
        };
        let candidate = candidates[idx].clone();
        let key = format!("{}::{}", candidate.path, candidate.section_id);
        selected_keys.insert(key);
        for target in best_new_targets {
            covered_targets.insert(target);
        }
        *per_file_count.entry(candidate.path.clone()).or_default() += 1;
        selected.push(candidate);

        if covered_targets.len() >= targets.len() {
            break;
        }
    }

    if selected.len() < 2 {
        return None;
    }

    for candidate in candidates {
        if selected.len() >= max_sections {
            break;
        }
        let key = format!("{}::{}", candidate.path, candidate.section_id);
        if selected_keys.contains(&key) {
            continue;
        }
        let Ok((_, authority)) =
            scout_candidate_target_hits(candidate, targets, question, &mut cache)
        else {
            continue;
        };
        if authority < -250 && selected.len() >= 2 {
            continue;
        }
        selected_keys.insert(key);
        selected.push(candidate.clone());
    }

    Some(selected)
}

fn ensure_named_target_coverage(
    selected: &mut Vec<ScoutCandidate>,
    pool: &[ScoutCandidate],
    max_sections: usize,
    question: &str,
) -> Result<()> {
    let targets = target_phrases_from_question(question);
    if targets.len() < 2 || max_sections == 0 {
        return Ok(());
    }

    let mut cache: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();
    let mut selected_keys = selected
        .iter()
        .map(|candidate| format!("{}::{}", candidate.path, candidate.section_id))
        .collect::<HashSet<_>>();
    let mut covered = HashSet::new();
    for candidate in selected.iter() {
        let (hits, _) = scout_candidate_target_hits(candidate, &targets, question, &mut cache)?;
        covered.extend(hits);
    }

    for target in targets {
        if covered.contains(&target) {
            continue;
        }

        let mut best: Option<(ScoutCandidate, i32)> = None;
        for candidate in pool {
            let key = format!("{}::{}", candidate.path, candidate.section_id);
            if selected_keys.contains(&key) {
                continue;
            }
            let (hits, authority) = scout_candidate_target_hits(
                candidate,
                std::slice::from_ref(&target),
                question,
                &mut cache,
            )?;
            if hits.is_empty() {
                continue;
            }
            let score = candidate.score + authority;
            if best
                .as_ref()
                .is_none_or(|(_, best_score)| score > *best_score)
            {
                best = Some((candidate.clone(), score));
            }
        }

        let Some((candidate, _)) = best else {
            continue;
        };
        let key = format!("{}::{}", candidate.path, candidate.section_id);
        if selected.len() >= max_sections {
            selected.pop();
        }
        selected_keys.insert(key);
        covered.insert(target);
        selected.push(candidate);
    }

    Ok(())
}

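// Returns which target phrases a candidate section mentions, plus an
// authority score. Targets found in the file path or heading ("source"
// hits) are weighted far above content-only hits.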
fn scout_candidate_target_hits(
    candidate: &ScoutCandidate,
    targets: &[String],
    question: &str,
    cache: &mut HashMap<String, crate::parse::ParsedMarkdown>,
) -> Result<(HashSet<String>, i32)> {
    if !cache.contains_key(&candidate.path) {
        cache.insert(candidate.path.clone(), load_markdown(&candidate.path)?);
    }
    let parsed = cache.get(&candidate.path).expect("cached parsed markdown");
    let Some(section) = parsed.doc.find_section_by_id(&candidate.section_id) else {
        return Ok((HashSet::new(), scout_path_quality_score(&candidate.path)));
    };
    let content = section.extract_content(&parsed.lines).join("\n");
    let source_haystack =
        normalize_compact(&format!("{}\n{}", candidate.path, section.path.join(" ")));
    let haystack = normalize_compact(&format!(
        "{}\n{}\n{}",
        candidate.path,
        section.path.join(" "),
        content
    ));
    let hits = targets
        .iter()
        .filter(|target| haystack.contains(&normalize_compact(target)))
        .cloned()
        .collect::<HashSet<_>>();
    let source_hit_count = targets
        .iter()
        .filter(|target| source_haystack.contains(&normalize_compact(target)))
        .count() as i32;
    let mut authority =
        scout_source_authority_score(&candidate.path, &section.path, &content, question);
    authority += source_hit_count * 360;
    if source_hit_count == 0 && !hits.is_empty() {
        authority -= 120;
    }
    Ok((hits, authority))
}

/// Returns true when `child` is a dotted-ID descendant of `parent`
/// (e.g. "2.1" under "2"), not merely a longer sibling like "21".
fn is_child_section_id(parent: &str, child: &str) -> bool {
    child.len() > parent.len()
        && child.starts_with(parent)
        && child[parent.len()..].starts_with('.')
}

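// A small sanity-test sketch for the dotted-ID hierarchy; the cases below
// are illustrative examples, not taken from the project's original suite.
#[cfg(test)]
mod section_id_tests {
    use super::is_child_section_id;

    #[test]
    fn child_ids_require_a_dot_boundary() {
        assert!(is_child_section_id("2", "2.1"));
        assert!(is_child_section_id("2.1", "2.1.3"));
        // "21" shares the "2" prefix but is a sibling, not a child.
        assert!(!is_child_section_id("2", "21"));
        assert!(!is_child_section_id("2.1", "2.1"));
    }
}
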
/// Narrows candidates after ranking: multi-file questions keep files that
/// match named targets, while single-file questions let a distinctive or
/// clearly dominant top file crowd out the rest.
fn focused_scout_candidates(candidates: &[ScoutCandidate], question: &str) -> Vec<ScoutCandidate> {
    let Some(top) = candidates.first() else {
        return Vec::new();
    };
    if wants_multi_file_evidence(question) {
        let targets = target_tokens_from_question(question);
        if !targets.is_empty() {
            let focused = candidates
                .iter()
                .filter(|candidate| path_matches_any_target(&candidate.path, &targets))
                .cloned()
                .collect::<Vec<_>>();
            if focused.len() >= 2 {
                return focused;
            }
        }
        return candidates.to_vec();
    }
    let top_path_tokens = distinctive_path_tokens(&top.path);
    if scout_path_quality_score(&top.path) > 0 && !top_path_tokens.is_empty() {
        let focused = candidates
            .iter()
            .filter(|candidate| {
                candidate.path == top.path
                    || distinctive_path_tokens(&candidate.path)
                        .iter()
                        .any(|token| top_path_tokens.contains(token))
            })
            .cloned()
            .collect::<Vec<_>>();
        if focused.len() >= 2 {
            return focused;
        }
    }
    let best_other_score = candidates
        .iter()
        .find(|candidate| candidate.path != top.path)
        .map(|candidate| candidate.score);
    let dominant_file =
        top.score >= 280 && best_other_score.is_none_or(|score| top.score - score >= 80);
    if dominant_file {
        candidates
            .iter()
            .filter(|candidate| candidate.path == top.path)
            .cloned()
            .collect()
    } else {
        candidates.to_vec()
    }
}

/// Reorders evidence so rationale/policy sections come first when the
/// question asks for reasons rather than raw facts; order is stable otherwise.
fn order_scout_evidence(
    mut candidates: Vec<ScoutCandidate>,
    question: &str,
) -> Result<Vec<ScoutCandidate>> {
    let question_l = question.to_ascii_lowercase();
    if !wants_rationale_or_policy_evidence(&question_l) {
        return Ok(candidates);
    }

    let mut cache: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();
    let mut scored = Vec::new();
    for (idx, candidate) in candidates.drain(..).enumerate() {
        if !cache.contains_key(&candidate.path) {
            cache.insert(candidate.path.clone(), load_markdown(&candidate.path)?);
        }
        let parsed = cache.get(&candidate.path).expect("cached parsed markdown");
        let score = parsed
            .doc
            .find_section_by_id(&candidate.section_id)
            .map(|section| {
                let content = section.extract_content(&parsed.lines).join("\n");
                candidate.score
                    + scout_rationale_evidence_score(&section.path, &content, &question_l)
            })
            .unwrap_or(candidate.score);
        scored.push((score, idx, candidate));
    }
    scored.sort_by(|lhs, rhs| rhs.0.cmp(&lhs.0).then(lhs.1.cmp(&rhs.1)));
    Ok(scored
        .into_iter()
        .map(|(_, _, candidate)| candidate)
        .collect())
}

fn wants_rationale_or_policy_evidence(question_l: &str) -> bool {
    [
        "why",
        "what makes",
        "rather than",
        "policy",
        "privacy",
        "safety",
        "allow",
        "allows",
        "exporting",
        "mask",
        "masking",
        "rationale",
        "reason",
    ]
    .iter()
    .any(|needle| question_l.contains(needle))
}

fn asks_for_metric_or_table(question_l: &str) -> bool {
    [
        "metric",
        "score",
        "baseline",
        "table",
        "row",
        "0.",
        "current score",
    ]
    .iter()
    .any(|needle| question_l.contains(needle))
}

fn scout_rationale_evidence_score(section_path: &[String], content: &str, question_l: &str) -> i32 {
    let text = format!("{}\n{}", section_path.join(" "), content).to_ascii_lowercase();
    let mut score = 0;
    score += scout_rationale_marker_score(&text);
    score += scout_question_token_overlap_score(&text, question_l, 28, 220);
    if !asks_for_metric_or_table(question_l) {
        for needle in [
            "metric | score",
            "| score |",
            "baseline",
            "current metric",
            "benchmark",
            "leaderboard",
        ] {
            if text.contains(needle) {
                score -= 220;
            }
        }
    }
    score
}

/// Extracts lowercase file-stem tokens that distinguish a path, dropping
/// short tokens and generic stems like "readme" or "notes".
fn distinctive_path_tokens(path: &str) -> HashSet<String> {
    let stem = Path::new(path)
        .file_stem()
        .and_then(|name| name.to_str())
        .unwrap_or(path);
    stem.split(|c: char| !c.is_ascii_alphanumeric())
        .map(str::to_ascii_lowercase)
        .filter(|token| {
            token.len() >= 4
                && !matches!(
                    token.as_str(),
                    "readme"
                        | "index"
                        | "docs"
                        | "doc"
                        | "notes"
                        | "note"
                        | "eval"
                        | "scene"
                        | "card"
                        | "annotation"
                        | "policy"
                        | "scratch"
                        | "draft"
                        | "copy"
                        | "copied"
                        | "tmp"
                        | "temp"
                        | "anchor"
                )
        })
        .collect()
}

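// Illustrative sketch test (hypothetical path, not from the original suite):
// short tokens and generic stems are dropped, distinctive stems survive.
#[cfg(test)]
mod path_token_tests {
    use super::distinctive_path_tokens;

    #[test]
    fn generic_stem_tokens_are_filtered() {
        let tokens = distinctive_path_tokens("docs/harbor-17-notes.md");
        assert!(tokens.contains("harbor"), "{tokens:?}");
        // "17" is shorter than four characters; "notes" is a generic stem.
        assert!(!tokens.contains("17"), "{tokens:?}");
        assert!(!tokens.contains("notes"), "{tokens:?}");
    }
}
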
/// Splits capitalized question phrases into individual lowercase tokens
/// suitable for matching against file paths.
fn target_tokens_from_question(question: &str) -> Vec<String> {
    let mut out = Vec::new();
    for phrase in extract_capitalized_phrases(question) {
        for token in signal_tokens(&phrase) {
            for part in token.split('-') {
                let part = part.to_ascii_lowercase();
                if part.len() >= 4 && !is_stopword(&part) && !out.contains(&part) {
                    out.push(part);
                }
            }
        }
    }
    out
}

/// Extracts named-entity phrases (those containing an uppercase letter or a
/// digit) from the question, stripping comparison filler words.
fn target_phrases_from_question(question: &str) -> Vec<String> {
    let mut out = Vec::new();
    for phrase in extract_capitalized_phrases(question) {
        if !phrase
            .chars()
            .any(|ch| ch.is_ascii_uppercase() || ch.is_ascii_digit())
        {
            continue;
        }
        let tokens = signal_tokens(&phrase)
            .into_iter()
            .filter(|token| {
                !matches!(
                    token.to_ascii_lowercase().as_str(),
                    "compare" | "contrast" | "across" | "between" | "which"
                )
            })
            .collect::<Vec<_>>();
        if tokens.is_empty() {
            continue;
        }
        let phrase = tokens.join(" ");
        if phrase.len() >= 4 && !out.iter().any(|existing| existing == &phrase) {
            out.push(phrase);
        }
    }
    out
}

#[cfg(test)]
mod scout_tests {
    use super::target_phrases_from_question;

    #[test]
    fn target_phrases_keep_hyphenated_entities() {
        let targets = target_phrases_from_question(
            "Across Harbor-17, Rainy Rail Depot, and Night Bus Stop, how do the docs treat reflected or glare-corrupted text?",
        );
        assert!(targets.contains(&"Harbor-17".to_string()), "{targets:?}");
        assert!(
            targets.contains(&"Rainy Rail Depot".to_string()),
            "{targets:?}"
        );
        assert!(
            targets.contains(&"Night Bus Stop".to_string()),
            "{targets:?}"
        );
    }
}

fn path_matches_any_target(path: &str, targets: &[String]) -> bool {
    let path_l = normalize_compact(path);
    targets
        .iter()
        .any(|target| path_l.contains(&normalize_compact(target)))
}

fn normalize_compact(text: &str) -> String {
    text.chars()
        .filter(|c| c.is_ascii_alphanumeric())
        .map(|c| c.to_ascii_lowercase())
        .collect()
}

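// Sketch test: normalization keeps only ASCII alphanumerics, lowercased, so
// "Harbor-17" and "harbor 17" compare equal. The cases are illustrative.
#[cfg(test)]
mod normalize_tests {
    use super::normalize_compact;

    #[test]
    fn punctuation_case_and_spaces_are_ignored() {
        assert_eq!(normalize_compact("Harbor-17"), "harbor17");
        assert_eq!(normalize_compact("harbor 17"), "harbor17");
        assert_eq!(normalize_compact("docs/Harbor_17.md"), "docsharbor17md");
    }
}
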
/// Renders the [files] block: one entry per candidate file with up to six
/// picked section ids and up to six other sections for navigation.
fn render_scout_file_maps(
    out: &mut String,
    candidates: &[ScoutCandidate],
    max_files: usize,
) -> Result<()> {
    let mut files = Vec::new();
    let mut seen = HashSet::new();
    for candidate in candidates {
        if seen.insert(candidate.path.clone()) {
            files.push(candidate.path.clone());
        }
        if files.len() >= max_files {
            break;
        }
    }
    out.push_str("[files]\n");
    for path in files {
        let summaries = get_doc_section_summaries(&path)?;
        let picked: HashSet<&str> = candidates
            .iter()
            .filter(|c| c.path == path)
            .map(|c| c.section_id.as_str())
            .collect();
        let sections = summaries
            .iter()
            .filter(|(id, title)| title != "<preamble>" && picked.contains(id.as_str()))
            .map(|(id, title)| format!("§{} {}", id, title))
            .take(6)
            .collect::<Vec<_>>();
        let also = summaries
            .iter()
            .filter(|(id, title)| title != "<preamble>" && !picked.contains(id.as_str()))
            .take(6)
            .map(|(id, title)| format!("§{} {}", id, title))
            .collect::<Vec<_>>();
        out.push_str(&format!("- {}\n", path));
        if !sections.is_empty() {
            out.push_str(&format!(" picked: {}\n", sections.join(" · ")));
        }
        if !also.is_empty() {
            out.push_str(&format!(" also: {}\n", also.join(" · ")));
        }
    }
    Ok(())
}

/// Renders the [highlights] block: scores individual candidate lines by
/// question-token overlap, flag/table signals, and rationale markers, then
/// emits the best-scoring lines up to `max_lines`.
fn render_scout_highlights(
    out: &mut String,
    candidates: &[ScoutCandidate],
    question: &str,
    max_lines: usize,
) -> Result<()> {
    let tokens: Vec<String> = signal_tokens(question)
        .into_iter()
        .map(|token| token.to_ascii_lowercase())
        .collect();
    let question_l = question.to_ascii_lowercase();
    let wants_code = ["cli", "command", "install", "invoke"]
        .iter()
        .any(|needle| question_l.contains(needle));
    let mut emitted = 0usize;
    let mut seen = HashSet::new();
    let mut highlights = Vec::new();
    let mut cache: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();

    for candidate in candidates {
        if !cache.contains_key(&candidate.path) {
            cache.insert(candidate.path.clone(), load_markdown(&candidate.path)?);
        }
        let parsed = cache.get(&candidate.path).expect("cached parsed markdown");
        let Some(section) = parsed.doc.find_section_by_id(&candidate.section_id) else {
            continue;
        };
        if is_low_value_section_for_question(section, &question_l) {
            continue;
        }
        let lines = section.extract_content(&parsed.lines);
        for (idx, line) in lines.iter().enumerate() {
            if emitted >= max_lines {
                break;
            }
            let trimmed = line.trim();
            let lower = trimmed.to_ascii_lowercase();
            if is_noisy_highlight_line(trimmed) && !is_relevant_table_line(trimmed, &tokens) {
                continue;
            }
            let token_hits = tokens.iter().filter(|token| lower.contains(*token)).count();
            let useful_code_line = trimmed.contains("--")
                || (wants_code
                    && (trimmed.contains('`')
                        || trimmed.starts_with("pip ")
                        || trimmed.starts_with("conda ")
                        || trimmed.starts_with("python ")
                        || trimmed.starts_with("git ")
                        || trimmed.starts_with("cmake ")
                        || trimmed.starts_with("make ")));
            let useful_table_line = is_relevant_table_line(trimmed, &tokens);
            if token_hits == 0 && !useful_code_line && !useful_table_line {
                continue;
            }
            let mut score = token_hits as i32 * 20;
            if useful_table_line {
                score += 80;
            }
            if wants_rationale_or_policy_evidence(&question_l) {
                score += scout_rationale_highlight_score(&lower, &question_l);
            }
            for (needle, weight) in [
                ("--", 70),
                ("cpu", 45),
                ("gpu", 45),
                ("warning", 45),
                ("disable", 45),
                ("configuration", 30),
                ("header", 30),
                ("human-readable", 30),
                ("supported formats", 30),
                ("convert", 30),
            ] {
                if lower.contains(needle) {
                    score += weight;
                }
            }
            highlights.push(ScoutHighlight {
                score,
                path: candidate.path.clone(),
                section_id: section.id.clone(),
                line_no: section.line_start + idx,
                line: if useful_table_line {
                    scout_table_context(lines, idx)
                } else {
                    scout_highlight_context(lines, idx, &lower)
                },
            });
        }
    }

    highlights.sort_by(|lhs, rhs| {
        rhs.score
            .cmp(&lhs.score)
            .then(lhs.path.cmp(&rhs.path))
            .then(lhs.line_no.cmp(&rhs.line_no))
    });
    for highlight in highlights {
        if emitted >= max_lines {
            break;
        }
        emit_scout_highlight(out, &mut seen, &mut emitted, &highlight);
    }

    if emitted == 0 {
        out.push_str("- no compact highlights; read evidence sections below\n");
    }
    Ok(())
}

fn scout_rationale_highlight_score(lower: &str, question_l: &str) -> i32 {
    let mut score = 0;
    score += scout_rationale_marker_score(lower) / 2;
    score += scout_question_token_overlap_score(lower, question_l, 18, 120);
    if !asks_for_metric_or_table(question_l) {
        for needle in ["| score |", "baseline", "current metric", "benchmark", "0."] {
            if lower.contains(needle) {
                score -= 140;
            }
        }
    }
    score
}

fn scout_rationale_marker_score(lower: &str) -> i32 {
    let mut score = 0;
    for (needles, weight) in [
        (
            &["rule:", "rule ", "policy", "guideline", "standard"][..],
            180,
        ),
        (
            &[
                "known risk",
                "risk",
                "unsafe",
                "wrong answer",
                "misread",
                "confus",
                "ambiguous",
            ][..],
            160,
        ),
        (
            &[
                "privacy",
                "personal data",
                "identifiable",
                "redact",
                "mask",
                "export",
                "leak",
            ][..],
            150,
        ),
        (
            &[
                "must",
                "should",
                "requires",
                "allow",
                "not enough",
                "do not",
                "rather than",
            ][..],
            100,
        ),
        (
            &["because", "reason", "rationale", "therefore", "so that"][..],
            80,
        ),
    ] {
        if needles.iter().any(|needle| lower.contains(needle)) {
            score += weight;
        }
    }
    score
}

fn scout_question_token_overlap_score(
    lower: &str,
    question_l: &str,
    per_token: i32,
    cap: i32,
) -> i32 {
    let hits = signal_tokens(question_l)
        .into_iter()
        .map(|token| token.to_ascii_lowercase())
        .filter(|token| lower.contains(token))
        .count() as i32;
    (hits * per_token).min(cap)
}

fn is_noisy_highlight_line(line: &str) -> bool {
    line.is_empty()
        || line.starts_with('|')
        || line == "```"
        || line == "```shell"
        || line.trim_matches('~') == "```"
        || line.trim_matches('~') == "```shell"
        || line.starts_with("<!--")
        || line.starts_with("[!")
        || line.starts_with("![")
        || line.starts_with("[![")
        || line.starts_with("@article")
        || line.starts_with("@inproceedings")
        || (line.starts_with('[') && line.contains("]: "))
        || line.len() > 1000
}

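// Sketch test for the noise filter; the sample lines are illustrative.
#[cfg(test)]
mod noisy_line_tests {
    use super::is_noisy_highlight_line;

    #[test]
    fn fences_comments_and_images_are_noisy() {
        assert!(is_noisy_highlight_line("```"));
        assert!(is_noisy_highlight_line("<!-- mdlens -->"));
        assert!(is_noisy_highlight_line("![diagram](img.png)"));
        assert!(!is_noisy_highlight_line("Use --threads to set parallelism."));
    }
}
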
fn is_relevant_table_line(line: &str, tokens: &[String]) -> bool {
    line.starts_with('|')
        && line.matches('|').count() >= 3
        && !is_table_separator_line(line)
        && tokens
            .iter()
            .any(|token| line.to_ascii_lowercase().contains(token))
}

fn is_table_separator_line(line: &str) -> bool {
    line.chars()
        .all(|ch| ch == '|' || ch == '-' || ch == ':' || ch.is_whitespace())
}

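// Sketch test: a data row counts as relevant only when it carries a query
// token, and separator rows never do. The rows below are illustrative.
#[cfg(test)]
mod table_line_tests {
    use super::{is_relevant_table_line, is_table_separator_line};

    #[test]
    fn separator_rows_are_never_relevant() {
        let tokens = vec!["latency".to_string()];
        assert!(is_relevant_table_line("| latency | 12ms | ok |", &tokens));
        assert!(is_table_separator_line("| --- | :---: | --- |"));
        assert!(!is_relevant_table_line("| --- | :---: | --- |", &tokens));
    }
}
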
fn scout_table_context(lines: &[String], idx: usize) -> String {
    let row = lines[idx].trim();
    let header = (1..idx).rev().find_map(|candidate_idx| {
        let separator = lines[candidate_idx].trim();
        if !separator.starts_with('|') || !is_table_separator_line(separator) {
            return None;
        }
        let header = lines[candidate_idx - 1].trim();
        header.starts_with('|').then_some(header)
    });

    match header {
        Some(header) if header != row => format!("{header} => {row}"),
        _ => row.to_string(),
    }
}

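// Sketch test: a matched table row is prefixed with its header row so the
// highlight stays interpretable. The table content is illustrative.
#[cfg(test)]
mod table_context_tests {
    use super::scout_table_context;

    #[test]
    fn rows_are_paired_with_their_header() {
        let lines: Vec<String> = ["| metric | score |", "| --- | --- |", "| accuracy | 0.91 |"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        assert_eq!(
            scout_table_context(&lines, 2),
            "| metric | score | => | accuracy | 0.91 |"
        );
    }
}
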
fn scout_highlight_context(lines: &[String], idx: usize, lower: &str) -> String {
    let radius = if lower.contains("disable") || lower.contains("warning") {
        5
    } else if lines[idx].trim().len() < 300 {
        2
    } else {
        0
    };
    let start = idx.saturating_sub(radius);
    let end = (idx + radius).min(lines.len().saturating_sub(1));
    let mut parts = Vec::new();
    for line in &lines[start..=end] {
        let trimmed = line.trim();
        if is_noisy_highlight_line(trimmed) && !trimmed.starts_with('|') {
            continue;
        }
        parts.push(trimmed);
    }
    let mut joined = parts.join(" ");
    if joined.len() > 900 {
        joined.truncate(900);
        joined.push_str("...");
    }
    joined
}

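// Sketch test: short matched lines pull two lines of context on each side,
// joined with spaces. The lines are illustrative.
#[cfg(test)]
mod highlight_context_tests {
    use super::scout_highlight_context;

    #[test]
    fn short_lines_pull_nearby_context() {
        let lines: Vec<String> = ["alpha", "beta", "gamma", "delta"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        assert_eq!(
            scout_highlight_context(&lines, 1, "beta"),
            "alpha beta gamma delta"
        );
    }
}
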
/// Skips citation/reference sections unless the question actually asks
/// about citations, DOIs, or papers.
fn is_low_value_section_for_question(section: &Section, question_l: &str) -> bool {
    let section_path = section.path.join(" ").to_ascii_lowercase();
    let citation_section = section_path.contains("citation")
        || section_path.contains("cite")
        || section_path.contains("references");
    citation_section
        && !["citation", "cite", "doi", "reference", "paper"]
            .iter()
            .any(|needle| question_l.contains(needle))
}

fn emit_scout_highlight(
    out: &mut String,
    seen: &mut HashSet<String>,
    emitted: &mut usize,
    highlight: &ScoutHighlight,
) {
    let key = format!(
        "{}:{}:{}",
        highlight.path, highlight.line_no, highlight.line
    );
    if !seen.insert(key) {
        return;
    }
    out.push_str(&format!(
        "- {} §{} l{}: {}\n",
        highlight.path, highlight.section_id, highlight.line_no, highlight.line
    ));
    *emitted += 1;
}

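// Sketch test for highlight dedupe, assuming ScoutHighlight has exactly the
// fields constructed in render_scout_highlights above.
#[cfg(test)]
mod emit_highlight_tests {
    use super::{emit_scout_highlight, ScoutHighlight};
    use std::collections::HashSet;

    #[test]
    fn duplicate_highlights_are_emitted_once() {
        let highlight = ScoutHighlight {
            score: 10,
            path: "a.md".to_string(),
            section_id: "1".to_string(),
            line_no: 3,
            line: "hit".to_string(),
        };
        let mut out = String::new();
        let mut seen = HashSet::new();
        let mut emitted = 0usize;
        emit_scout_highlight(&mut out, &mut seen, &mut emitted, &highlight);
        emit_scout_highlight(&mut out, &mut seen, &mut emitted, &highlight);
        assert_eq!(emitted, 1);
        assert_eq!(out, "- a.md §1 l3: hit\n");
    }
}
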
/// Renders the [evidence] block: bounded section bodies with parent-heading
/// context, skipping sections that strictly contain a range already emitted
/// for the same file.
fn render_scout_evidence(
    out: &mut String,
    candidates: &[ScoutCandidate],
    question: &str,
    max_tokens: usize,
) -> Result<()> {
    let mut total_tokens = 0usize;
    let mut cache: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();
    let mut emitted_ranges: HashMap<String, Vec<(usize, usize)>> = HashMap::new();
    let question_l = question.to_ascii_lowercase();
    for candidate in candidates {
        if total_tokens >= max_tokens {
            out.push_str("\n<!-- mdlens: scout budget exhausted -->\n");
            break;
        }
        if !cache.contains_key(&candidate.path) {
            cache.insert(candidate.path.clone(), load_markdown(&candidate.path)?);
        }
        let parsed = cache.get(&candidate.path).expect("cached parsed markdown");
        let Some(section) = parsed.doc.find_section_by_id(&candidate.section_id) else {
            continue;
        };
        if is_low_value_section_for_question(section, &question_l) {
            continue;
        }
        let ranges = emitted_ranges.entry(candidate.path.clone()).or_default();
        if ranges.iter().any(|(start, end)| {
            section.line_start <= *start
                && section.line_end >= *end
                && (section.line_end - section.line_start) > (*end - *start)
        }) {
            continue;
        }
        let remaining = max_tokens.saturating_sub(total_tokens);
        let section_budget = remaining.min(650);
        let ancestors = section_ancestors(&parsed.doc.sections, &section.id);
        let (content, truncated) =
            scout_section_content(section, &ancestors, &parsed.lines, question, section_budget);
        let emitted_tokens = estimate_tokens(&content);
        if emitted_tokens == 0 {
            continue;
        }
        out.push_str(&format!(
            "\n--- {} §{} {} l{}-{} ~{}t reason={} ---\n",
            candidate.path,
            section.id,
            section.path.join(" > "),
            section.line_start,
            section.line_end,
            section.token_estimate,
            candidate.reason
        ));
        out.push_str(&content);
        if !content.ends_with('\n') {
            out.push('\n');
        }
        ranges.push((section.line_start, section.line_end));
        total_tokens += emitted_tokens;
        if truncated {
            continue;
        }
    }
    Ok(())
}

/// Builds a section's evidence text: parent heading context plus the full
/// body if it fits, else a question-focused excerpt, else a hard truncation.
fn scout_section_content(
    section: &Section,
    ancestors: &[&Section],
    lines: &[String],
    question: &str,
    max_tokens: usize,
) -> (String, bool) {
    let parent_context = scout_parent_context(ancestors, lines, max_tokens.min(220));
    let content_lines = section.extract_content(lines);
    let full = content_lines.join("\n");
    let full_with_context = if parent_context.trim().is_empty() {
        full.clone()
    } else {
        format!("{parent_context}\n...\n{full}")
    };
    let full_tokens = estimate_tokens(&full_with_context);
    if full_tokens <= max_tokens {
        return (full_with_context, false);
    }

    let focused_budget = max_tokens
        .saturating_sub(estimate_tokens(&parent_context))
        .max(max_tokens / 2);
    let focused = scout_focused_excerpt(content_lines, question, focused_budget);
    if !focused.trim().is_empty() {
        if parent_context.trim().is_empty() {
            return (focused, true);
        }
        return (format!("{parent_context}\n...\n{focused}"), true);
    }

    (
        truncate_to_tokens(&full_with_context, max_tokens, TRUNCATION_NOTICE),
        true,
    )
}

/// Collects the chain of ancestor sections leading to `target_id`.
fn section_ancestors<'a>(sections: &'a [Section], target_id: &str) -> Vec<&'a Section> {
    let mut path = Vec::new();
    collect_section_ancestors(sections, target_id, &mut path);
    path
}

fn collect_section_ancestors<'a>(
    sections: &'a [Section],
    target_id: &str,
    path: &mut Vec<&'a Section>,
) -> bool {
    for section in sections {
        if section.id == target_id {
            return true;
        }
        path.push(section);
        if collect_section_ancestors(&section.children, target_id, path) {
            return true;
        }
        path.pop();
    }
    false
}

/// Joins the direct (non-child) content of each ancestor into a compact
/// context block, truncated to the given token budget.
fn scout_parent_context(ancestors: &[&Section], lines: &[String], max_tokens: usize) -> String {
    if ancestors.is_empty() || max_tokens == 0 {
        return String::new();
    }

    let mut parts = Vec::new();
    for ancestor in ancestors {
        let direct = ancestor.extract_direct_content(lines);
        let cleaned = direct
            .iter()
            .map(|line| line.trim_end())
            .filter(|line| !line.trim().is_empty() && !is_noisy_highlight_line(line.trim()))
            .collect::<Vec<_>>()
            .join("\n");
        if cleaned.trim().is_empty() {
            continue;
        }
        parts.push(cleaned);
    }

    let joined = parts.join("\n");
    if estimate_tokens(&joined) <= max_tokens {
        joined
    } else {
        truncate_to_tokens(&joined, max_tokens, TRUNCATION_NOTICE)
    }
}

/// Selects question-relevant lines (plus a small context radius) from a
/// section body, joining gaps with "..." and stopping at the token budget.
fn scout_focused_excerpt(lines: &[String], question: &str, max_tokens: usize) -> String {
    let tokens: Vec<String> = signal_tokens(question)
        .into_iter()
        .map(|token| token.to_ascii_lowercase())
        .collect();
    let question_l = question.to_ascii_lowercase();
    let wants_code = ["cli", "command", "install", "invoke"]
        .iter()
        .any(|needle| question_l.contains(needle));

    let mut selected = BTreeSet::new();
    for (idx, line) in lines.iter().enumerate() {
        let trimmed = line.trim();
        let lower = trimmed.to_ascii_lowercase();
        if is_noisy_highlight_line(trimmed) && !is_relevant_table_line(trimmed, &tokens) {
            continue;
        }
        let token_hits = tokens.iter().filter(|token| lower.contains(*token)).count();
        let code_hit = trimmed.contains("--")
            || (wants_code
                && (trimmed.contains('`')
                    || trimmed.starts_with("pip ")
                    || trimmed.starts_with("conda ")
                    || trimmed.starts_with("python ")
                    || trimmed.starts_with("git ")
                    || trimmed.starts_with("cmake ")
                    || trimmed.starts_with("make ")));
        let table_hit = is_relevant_table_line(trimmed, &tokens);
        if token_hits == 0 && !code_hit && !table_hit {
            continue;
        }
        let radius = if table_hit {
            2
        } else if lower.contains("disable") || lower.contains("warning") || code_hit {
            5
        } else if token_hits >= 2 {
            2
        } else {
            1
        };
        for context_idx in idx.saturating_sub(radius)..=(idx + radius).min(lines.len() - 1) {
            selected.insert(context_idx);
        }
    }

    let mut out = String::new();
    let mut last_idx = None;
    for idx in selected {
        let line = lines[idx].trim_end();
        if line.trim().is_empty() {
            continue;
        }
        if let Some(last) = last_idx {
            if idx > last + 1 && !out.ends_with("\n...\n") {
                out.push_str("...\n");
            }
        }
        let candidate = format!("{out}{line}\n");
        if estimate_tokens(&candidate) > max_tokens {
            out.push_str(TRUNCATION_NOTICE);
            break;
        }
        out = candidate;
        last_idx = Some(idx);
    }

    out
}

/// Entry point for `mdlens pack`: resolves sections by ids, heading paths,
/// or a search query, then renders the packed result as text or JSON.
fn cmd_pack(args: PackArgs) -> Result<()> {
    let dedupe = args.dedupe && !args.no_dedupe;
    let result = if let Some(ref ids_str) = args.ids {
        let ids: Vec<String> = ids_str.split(',').map(|s| s.trim().to_string()).collect();
        pack_by_ids(&args.path, &ids, args.max_tokens, args.parents, dedupe)?
    } else if let Some(ref paths_str) = args.paths {
        let doc = parse_markdown(&args.path)?;
        let path_list: Vec<&str> = paths_str.split(';').collect();
        let mut ids = Vec::new();
        for p in path_list {
            ids.push(find_unique_section_by_path(&doc, p)?.id.clone());
        }
        pack_by_ids(&args.path, &ids, args.max_tokens, args.parents, dedupe)?
    } else if let Some(ref query) = args.search {
        crate::pack::pack_by_search(
            &args.path,
            query,
            args.max_tokens,
            PackSearchOptions {
                include_parents: args.parents,
                dedupe,
                case_sensitive: args.case_sensitive,
                use_regex: args.regex,
                max_results: args.max_results,
                context_lines: args.context_lines,
            },
        )?
    } else {
        return Err(anyhow::anyhow!(
            "exactly one of --ids, --paths, or --search is required"
        ));
    };

    if args.json {
        let output = PackJsonOutput {
            schema_version: 1,
            token_budget: result.token_budget,
            token_estimate: result.token_estimate,
            truncated: result.truncated,
            included: result
                .included
                .iter()
                .map(|inc| PackJsonIncluded {
                    path: inc.path.clone(),
                    section_id: inc.section_id.clone(),
                    section_path: inc.section_path.clone(),
                    line_start: inc.line_start,
                    line_end: inc.line_end,
                    token_estimate: inc.token_estimate,
                    truncated: inc.truncated,
                })
                .collect(),
            content: result.content.clone(),
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        let included_render: Vec<PackIncluded> = result
            .included
            .iter()
            .map(|inc| PackIncluded {
                section_id: inc.section_id.clone(),
                section_title: inc.section_path.last().cloned().unwrap_or_default(),
                line_range: format!("{}-{}", inc.line_start, inc.line_end),
                token_estimate: inc.token_estimate,
            })
            .collect();
        println!(
            "{}",
            render_pack(
                &args.path,
                result.token_budget,
                &included_render,
                &result.content,
                result.truncated
            )
        );
    }

    Ok(())
}

/// Entry point for `mdlens stats`: per-file line/word/token counts, sorted
/// and optionally limited to the top N entries.
fn cmd_stats(args: StatsArgs) -> Result<()> {
    let files = crate::search::discover_markdown_files(&args.path)?;
    let mut entries = Vec::new();

    for file in &files {
        let doc = parse_markdown(file)?;
        entries.push(StatsEntry {
            path: doc.path,
            lines: doc.line_count,
            words: doc.word_count,
            tokens: doc.token_estimate,
        });
    }

    match args.sort {
        StatsSort::Tokens => entries.sort_by_key(|entry| Reverse(entry.tokens)),
        StatsSort::Lines => entries.sort_by_key(|entry| Reverse(entry.lines)),
        StatsSort::Path => entries.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path)),
    }

    let entries = if let Some(top) = args.top {
        &entries[..std::cmp::min(top, entries.len())]
    } else {
        &entries
    };

    if args.json {
        let output = StatsJsonOutput {
            schema_version: 1,
            entries: entries
                .iter()
                .map(|e| StatsJsonEntry {
                    path: e.path.clone(),
                    lines: e.lines,
                    words: e.words,
                    tokens: e.tokens,
                })
                .collect(),
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        println!("{}", render_stats(entries));
    }

    Ok(())
}

/// Entry point for `mdlens sections`: reads file paths or grep-style hits
/// (from args or stdin) and emits the matching sections within the limits.
fn cmd_sections(args: SectionsArgs) -> Result<()> {
    let stdin = io::stdin();
    let mut inputs: Vec<SectionInput> = Vec::new();

    // Explicit file arguments win; otherwise read paths/hits from stdin.
    if !args.files.is_empty() {
        for f in &args.files {
            let trimmed = f.trim().to_string();
            if !trimmed.is_empty() {
                inputs.push(SectionInput::File(trimmed));
            }
        }
    } else {
        for line in stdin.lock().lines() {
            let line = line?;
            if let Some(input) = parse_sections_input_line(&line) {
                inputs.push(input);
            }
        }
    }

    if inputs.is_empty() {
        return Ok(());
    }

    let dedupe = args.dedupe && !args.no_dedupe;
    let has_hit_input = inputs
        .iter()
        .any(|input| matches!(input, SectionInput::Hit(_)));

    if !has_hit_input {
        let mut paths: Vec<String> = inputs
            .into_iter()
            .filter_map(|input| match input {
                SectionInput::File(path) => Some(path),
                SectionInput::Hit(_) => None,
            })
            .collect();

        if dedupe {
            let mut seen = HashSet::new();
            paths.retain(|p| seen.insert(p.clone()));
        }

        return render_sections_from_paths(args, paths);
    }

    let mut file_order: Vec<String> = Vec::new();
    let mut file_hits: HashMap<String, Vec<usize>> = HashMap::new();

    for input in inputs {
        match input {
            SectionInput::File(path) => {
                if !file_order.iter().any(|existing| existing == &path) {
                    file_order.push(path.clone());
                }
                file_hits.entry(path).or_default();
            }
            SectionInput::Hit(hit) => {
                let entry = file_hits.entry(hit.path.clone()).or_default();
                if !dedupe || !entry.contains(&hit.line) {
                    entry.push(hit.line);
                }
                if !file_order.iter().any(|existing| existing == &hit.path) {
                    file_order.push(hit.path);
                }
            }
        }
    }

    if let Some(max_files) = args.max_files {
        if file_order.len() > max_files {
            anyhow::bail!(
                "[error] {} files exceed --max-files {}; narrow with a more specific grep or raise the limit",
                file_order.len(),
                max_files
            );
        }
    } else if args.max_tokens.is_none() && file_order.len() > 8 {
        eprintln!(
            "[warn] {} files piped without --max-tokens or --max-files; output may be large",
            file_order.len()
        );
    }

    let mut file_outputs: Vec<SectionsFileOutput> = Vec::new();
    let mut total_tokens: usize = 0;
    let mut omitted: usize = 0;

    for path in &file_order {
        let parsed = match load_markdown(path) {
            Ok(p) => p,
            Err(e) => {
                eprintln!("Warning: could not read {}: {}", path, e);
                continue;
            }
        };

        let doc = &parsed.doc;
        let lines = &parsed.lines;

        let mut sections: Vec<SectionsSectionOutput> =
            if let Some(hit_lines) = file_hits.get(path).filter(|lines| !lines.is_empty()) {
                collect_hit_sections(
                    &doc.sections,
                    lines,
                    hit_lines,
                    args.children,
                    args.preview,
                    dedupe,
                )
            } else {
                let mut collected = Vec::new();
                collect_all_sections(
                    &doc.sections,
                    lines,
                    args.children,
                    args.preview,
                    args.max_depth,
                    0,
                    &mut collected,
                );
                collected
            };

        if sections.is_empty() {
            continue;
        }

        if let Some(max_sections) = args.max_sections {
            if sections.len() > max_sections {
                omitted += sections.len() - max_sections;
                sections.truncate(max_sections);
            }
        }

        // Enforce the shared token budget across all files.
        if let Some(max_tokens) = args.max_tokens {
            let mut kept: Vec<SectionsSectionOutput> = Vec::new();
            for sec in sections {
                if total_tokens + sec.token_estimate > max_tokens {
                    omitted += 1;
                } else {
                    total_tokens += sec.token_estimate;
                    kept.push(sec);
                }
            }
            sections = kept;
        }

        if !sections.is_empty() {
            file_outputs.push(SectionsFileOutput {
                path: path.clone(),
                sections,
            });
        }
    }

    emit_sections_output(&args, file_outputs, omitted)
}

/// Whole-file mode for `mdlens sections`: emits sections for each path,
/// capping depth at 2 by default; an explicit --max-depth or full --content
/// output lifts the cap.
fn render_sections_from_paths(args: SectionsArgs, paths: Vec<String>) -> Result<()> {
    if paths.is_empty() {
        return Ok(());
    }

    let depth_capped = args.max_depth.is_none() && (!args.content || args.preview.is_some());
    let effective_depth = if depth_capped {
        Some(2)
    } else {
        args.max_depth
    };

    if let Some(max_files) = args.max_files {
        if paths.len() > max_files {
            anyhow::bail!(
                "[error] {} files exceed --max-files {}; narrow with a more specific grep or raise the limit",
                paths.len(),
                max_files
            );
        }
    } else if args.max_tokens.is_none() && paths.len() > 8 {
        eprintln!(
            "[warn] {} files piped without --max-tokens or --max-files; output may be large",
            paths.len()
        );
    }

    let mut file_outputs: Vec<SectionsFileOutput> = Vec::new();
    let mut total_tokens: usize = 0;
    let mut omitted: usize = 0;

    for path in &paths {
        let parsed = match load_markdown(path) {
            Ok(p) => p,
            Err(e) => {
                eprintln!("Warning: could not read {}: {}", path, e);
                continue;
            }
        };

        let doc = &parsed.doc;
        let lines = &parsed.lines;
        let mut sections: Vec<SectionsSectionOutput> = Vec::new();
        collect_all_sections(
            &doc.sections,
            lines,
            args.children,
            args.preview,
            effective_depth,
            0,
            &mut sections,
        );

        if sections.is_empty() {
            continue;
        }

        if let Some(max_sections) = args.max_sections {
            if sections.len() > max_sections {
                omitted += sections.len() - max_sections;
                sections.truncate(max_sections);
            }
        }

        if let Some(max_tokens) = args.max_tokens {
            let mut kept: Vec<SectionsSectionOutput> = Vec::new();
            for sec in sections {
                if total_tokens + sec.token_estimate > max_tokens {
                    omitted += 1;
                } else {
                    total_tokens += sec.token_estimate;
                    kept.push(sec);
                }
            }
            sections = kept;
        }

        if !sections.is_empty() {
            file_outputs.push(SectionsFileOutput {
                path: path.clone(),
                sections,
            });
        }
    }

    if depth_capped {
        eprintln!(
            "[sections] whole-file mode: showing depth ≤2 by default; use --max-depth N for more"
        );
    }

    emit_sections_output(&args, file_outputs, omitted)
}

/// Final output stage for `mdlens sections`: warns about omissions, then
/// prints either JSON or rendered text entries.
fn emit_sections_output(
    args: &SectionsArgs,
    file_outputs: Vec<SectionsFileOutput>,
    omitted: usize,
) -> Result<()> {
    if omitted > 0 {
        if let Some(max_tokens) = args.max_tokens {
            eprintln!(
                "[warn] {} sections omitted by limits (budget ~{}t)",
                omitted, max_tokens
            );
        } else {
            eprintln!("[warn] {} sections omitted by limits", omitted);
        }
    }

    if file_outputs.is_empty() {
        return Ok(());
    }

    if args.json {
        let output = SectionsJsonOutput {
            schema_version: 1,
            files: file_outputs
                .iter()
                .map(|fo| SectionsJsonFile {
                    path: fo.path.clone(),
                    sections: fo
                        .sections
                        .iter()
                        .map(|s| SectionsJsonSection {
                            id: s.id.clone(),
                            title: s.title.clone(),
                            heading_path: if args.heading_paths {
                                Some(s.heading_path.clone())
                            } else {
                                None
                            },
                            line_start: if args.lines { Some(s.line_start) } else { None },
                            line_end: if args.lines { Some(s.line_end) } else { None },
                            token_estimate: s.token_estimate,
                            body: if args.content {
                                Some(s.body.clone())
                            } else {
                                None
                            },
                            preview: s.preview.clone(),
                        })
                        .collect(),
                })
                .collect(),
        };
        println!("{}", serde_json::to_string_pretty(&output)?);
    } else {
        let entries: Vec<SectionsEntry> = file_outputs
            .iter()
            .flat_map(|fo| {
                fo.sections.iter().map(|s| SectionsEntry {
                    file_path: fo.path.clone(),
                    id: s.id.clone(),
                    title: s.title.clone(),
                    heading_path: if args.heading_paths {
                        Some(s.heading_path.clone())
                    } else {
                        None
                    },
                    line_start: if args.lines { Some(s.line_start) } else { None },
                    line_end: if args.lines { Some(s.line_end) } else { None },
                    token_estimate: s.token_estimate,
                    body: if args.content {
                        Some(s.body.clone())
                    } else {
                        None
                    },
                    preview: s.preview.clone(),
                })
            })
            .collect();
        println!("{}", render_sections(&entries, args.content));
    }

    Ok(())
}

struct SectionsSectionOutput {
    id: String,
    title: String,
    heading_path: Vec<String>,
    line_start: usize,
    line_end: usize,
    token_estimate: usize,
    body: String,
    preview: Option<String>,
}

struct SectionsFileOutput {
    path: String,
    sections: Vec<SectionsSectionOutput>,
}

#[derive(Clone)]
struct HitSectionAggregate<'a> {
    section: &'a Section,
    hit_count: usize,
    first_line: usize,
}

/// Classifies one input line as a grep-style hit ("path:line:...") or a
/// bare file path.
fn parse_sections_input_line(line: &str) -> Option<SectionInput> {
    let trimmed = line.trim();
    if trimmed.is_empty() {
        return None;
    }

    if let Some((path, line_num)) = parse_grep_hit(trimmed) {
        return Some(SectionInput::Hit(SectionHit {
            path: path.to_string(),
            line: line_num,
        }));
    }

    Some(SectionInput::File(trimmed.to_string()))
}

fn parse_grep_hit(line: &str) -> Option<(&str, usize)> {
    let first = line.find(':')?;
    let rest = &line[(first + 1)..];
    let second = rest.find(':')?;
    let path = &line[..first];
    let line_num = rest[..second].parse().ok()?;
    Some((path, line_num))
}

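// Sketch test for grep-prefix parsing; the inputs are illustrative.
#[cfg(test)]
mod grep_hit_tests {
    use super::parse_grep_hit;

    #[test]
    fn grep_prefixes_parse_to_path_and_line() {
        assert_eq!(
            parse_grep_hit("docs/a.md:42:some match"),
            Some(("docs/a.md", 42))
        );
        assert_eq!(parse_grep_hit("docs/a.md"), None);
        assert_eq!(parse_grep_hit("docs/a.md:not-a-number:x"), None);
    }
}
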
/// Maps grep hit lines to their deepest enclosing sections. With dedupe,
/// sections are ranked by hit count; otherwise hits keep input order.
fn collect_hit_sections(
    sections: &[Section],
    lines: &[String],
    hit_lines: &[usize],
    include_children: bool,
    preview_lines: Option<usize>,
    dedupe: bool,
) -> Vec<SectionsSectionOutput> {
    let mut by_section: HashMap<String, HitSectionAggregate<'_>> = HashMap::new();
    let mut ordered_hits: Vec<(usize, &Section)> = Vec::new();

    for line_num in hit_lines {
        let Some(section) = find_deepest_section_for_line(sections, *line_num) else {
            continue;
        };
        if dedupe {
            by_section
                .entry(section.id.clone())
                .and_modify(|entry| entry.hit_count += 1)
                .or_insert(HitSectionAggregate {
                    section,
                    hit_count: 1,
                    first_line: *line_num,
                });
        } else {
            ordered_hits.push((*line_num, section));
        }
    }

    let aggregates: Vec<HitSectionAggregate<'_>> = if dedupe {
        let mut ranked: Vec<HitSectionAggregate<'_>> = by_section.into_values().collect();
        ranked.sort_by(|lhs, rhs| {
            rhs.hit_count
                .cmp(&lhs.hit_count)
                .then(lhs.section.token_estimate.cmp(&rhs.section.token_estimate))
                .then(lhs.first_line.cmp(&rhs.first_line))
                .then(lhs.section.line_start.cmp(&rhs.section.line_start))
        });
        ranked
    } else {
        ordered_hits.sort_by(|lhs, rhs| {
            lhs.0
                .cmp(&rhs.0)
                .then(lhs.1.line_start.cmp(&rhs.1.line_start))
                .then(lhs.1.id.cmp(&rhs.1.id))
        });
        ordered_hits
            .into_iter()
            .map(|(first_line, section)| HitSectionAggregate {
                section,
                hit_count: 1,
                first_line,
            })
            .collect()
    };

    let mut collected = Vec::new();
    for aggregate in aggregates {
        let section = aggregate.section;
        let body_lines = if include_children {
            section.extract_content(lines)
        } else {
            section.extract_direct_content(lines)
        };
        let body = body_lines.join("\n");
        let preview = preview_lines.map(|n| {
            body_lines
                .iter()
                .filter(|l| !l.trim().is_empty())
                .take(n)
                .cloned()
                .collect::<Vec<_>>()
                .join("\n")
        });

        collected.push(SectionsSectionOutput {
            id: section.id.clone(),
            title: section.title.clone(),
            heading_path: section.path.clone(),
            line_start: section.line_start,
            line_end: section.line_end,
            token_estimate: estimate_tokens(&body),
            body,
            preview,
        });
    }

    collected
}

/// Walks the section tree depth-first, collecting every section (except the
/// synthetic preamble) down to `max_depth`.
fn collect_all_sections(
    sections: &[Section],
    lines: &[String],
    include_children: bool,
    preview_lines: Option<usize>,
    max_depth: Option<usize>,
    current_depth: usize,
    result: &mut Vec<SectionsSectionOutput>,
) {
    for section in sections {
        if section.title == "<preamble>" {
            continue;
        }
        if let Some(max) = max_depth {
            if current_depth >= max {
                continue;
            }
        }
        let body_lines = if include_children {
            section.extract_content(lines)
        } else {
            section.extract_direct_content(lines)
        };
        let body = body_lines.join("\n");
        let preview = preview_lines.map(|n| {
            body_lines
                .iter()
                .filter(|l| !l.trim().is_empty())
                .take(n)
                .cloned()
                .collect::<Vec<_>>()
                .join("\n")
        });
        result.push(SectionsSectionOutput {
            id: section.id.clone(),
            title: section.title.clone(),
            heading_path: section.path.clone(),
            line_start: section.line_start,
            line_end: section.line_end,
            token_estimate: estimate_tokens(&body),
            body,
            preview,
        });
        collect_all_sections(
            &section.children,
            lines,
            include_children,
            preview_lines,
            max_depth,
            current_depth + 1,
            result,
        );
    }
}

/// Attaches section bodies and/or previews to search results, parsing each
/// file at most once.
fn enrich_search_results(
    results: &mut [crate::render::SearchResult],
    with_content: bool,
    preview_lines: Option<usize>,
) -> Result<()> {
    let mut docs: HashMap<String, crate::parse::ParsedMarkdown> = HashMap::new();

    for result in results.iter_mut() {
        let parsed = if let Some(parsed) = docs.get(&result.path) {
            parsed
        } else {
            let loaded = load_markdown(&result.path)?;
            docs.insert(result.path.clone(), loaded);
            docs.get(&result.path).expect("inserted parsed markdown")
        };

        let Some(section) = parsed.doc.find_section_by_id(&result.section_id) else {
            continue;
        };
        let body_lines = section.extract_direct_content(&parsed.lines);
        if with_content {
            result.body = Some(body_lines.join("\n"));
        }
        if let Some(n) = preview_lines {
            result.preview = Some(
                body_lines
                    .iter()
                    .filter(|line| !line.trim().is_empty())
                    .take(n)
                    .cloned()
                    .collect::<Vec<_>>()
                    .join("\n"),
            );
        }
    }

    Ok(())
}

/// Finds the innermost section whose line range contains `line_num`.
fn find_deepest_section_for_line(sections: &[Section], line_num: usize) -> Option<&Section> {
    for section in sections {
        if line_num < section.line_start || line_num > section.line_end {
            continue;
        }
        if let Some(child) = find_deepest_section_for_line(&section.children, line_num) {
            return Some(child);
        }
        return Some(section);
    }
    None
}

#[derive(Serialize)]
struct TreeJsonOutput {
    schema_version: u32,
    path: String,
    line_count: usize,
    byte_count: usize,
    char_count: usize,
    word_count: usize,
    token_estimate: usize,
    sections: Vec<SectionJsonOutput>,
}

#[derive(Serialize)]
struct TreeFileJsonOutput {
    path: String,
    line_count: usize,
    byte_count: usize,
    char_count: usize,
    word_count: usize,
    token_estimate: usize,
    sections: Vec<SectionJsonOutput>,
}

#[derive(Serialize)]
struct TreeMultiJsonOutput {
    schema_version: u32,
    files: Vec<TreeFileJsonOutput>,
}

#[derive(Serialize)]
struct SectionJsonOutput {
    id: String,
    title: String,
    level: u8,
    path: Vec<String>,
    line_start: usize,
    line_end: usize,
    token_estimate: usize,
    #[serde(skip_serializing_if = "Vec::is_empty")]
    children: Vec<SectionJsonOutput>,
}

#[derive(Serialize)]
struct ReadJsonOutput {
    schema_version: u32,
    path: String,
    selector: ReadSelector,
    section: SectionJsonOutput,
    content: String,
    truncated: bool,
}

#[derive(Serialize)]
struct ReadSelector {
    #[serde(rename = "type")]
    r#type: String,
    value: String,
}

#[derive(Serialize)]
struct SearchJsonOutput {
    schema_version: u32,
    query: String,
    root: String,
    results: Vec<SearchJsonResult>,
}

#[derive(Serialize)]
struct SearchJsonResult {
    path: String,
    section_id: String,
    section_title: String,
    section_path: Vec<String>,
    line_start: usize,
    line_end: usize,
    token_estimate: usize,
    match_count: usize,
    body: Option<String>,
    preview: Option<String>,
    snippets: Vec<SearchJsonSnippet>,
}

#[derive(Serialize)]
struct SearchJsonSnippet {
    line_start: usize,
    line_end: usize,
    text: String,
}

#[derive(Serialize)]
struct ScoutJsonOutput {
    schema_version: u32,
    root: String,
    question: String,
    token_budget: usize,
    candidate_count: usize,
    queries: Vec<String>,
    candidates: Vec<ScoutCandidate>,
    rendered_text: String,
}

#[derive(Serialize)]
struct PackJsonOutput {
    schema_version: u32,
    token_budget: usize,
    token_estimate: usize,
    truncated: bool,
    included: Vec<PackJsonIncluded>,
    content: String,
}

#[derive(Serialize)]
struct PackJsonIncluded {
    path: String,
    section_id: String,
    section_path: Vec<String>,
    line_start: usize,
    line_end: usize,
    token_estimate: usize,
    truncated: bool,
}

#[derive(Serialize)]
struct StatsJsonOutput {
    schema_version: u32,
    entries: Vec<StatsJsonEntry>,
}

#[derive(Serialize)]
struct StatsJsonEntry {
    path: String,
    lines: usize,
    words: usize,
    tokens: usize,
}

#[derive(Serialize)]
struct SectionsJsonOutput {
    schema_version: u32,
    files: Vec<SectionsJsonFile>,
}

#[derive(Serialize)]
struct SectionsJsonFile {
    path: String,
    sections: Vec<SectionsJsonSection>,
}

#[derive(Serialize)]
struct SectionsJsonSection {
    id: String,
    title: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    heading_path: Option<Vec<String>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    line_start: Option<usize>,
    #[serde(skip_serializing_if = "Option::is_none")]
    line_end: Option<usize>,
    token_estimate: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    body: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    preview: Option<String>,
}

/// Converts the section tree into its JSON form, honoring `max_depth` and
/// the preamble flag.
fn serialize_sections(
    sections: &[Section],
    max_depth: Option<usize>,
    include_preamble: bool,
    current_depth: usize,
) -> Vec<SectionJsonOutput> {
    let mut result = Vec::new();
    for section in sections {
        if section.title == "<preamble>" && !include_preamble {
            continue;
        }
        let children = if let Some(max) = max_depth {
            if current_depth + 1 < max {
                serialize_sections(
                    &section.children,
                    max_depth,
                    include_preamble,
                    current_depth + 1,
                )
            } else {
                Vec::new()
            }
        } else {
            serialize_sections(
                &section.children,
                max_depth,
                include_preamble,
                current_depth + 1,
            )
        };

        result.push(SectionJsonOutput {
            id: section.id.clone(),
            title: section.title.clone(),
            level: section.level,
            path: section.path.clone(),
            line_start: section.line_start,
            line_end: section.line_end,
            token_estimate: section.token_estimate,
            children,
        });
    }
    result
}

fn truncate_content_to_tokens(content: &str, max_tokens: usize) -> String {
    truncate_to_tokens(content, max_tokens, TRUNCATION_NOTICE)
}