ast_doc_core/lib.rs
1//! Core library for [ast-doc](https://crates.io/crates/ast-doc): a four-stage pipeline
2//! for generating optimized `llms.txt` documentation from codebases.
3//!
4//! # Pipeline
5//!
6//! 1. **Ingestion** — File discovery, git metadata capture, directory tree generation.
7//! 2. **Parser** — tree-sitter AST extraction with pre-computed strategy variants.
8//! 3. **Scheduler** — Token budget optimization with intelligent degradation.
9//! 4. **Renderer** — Markdown assembly with anti-bloat rules.
10//!
11//! # Quick Start
12//!
13//! ```no_run
14//! use std::path::PathBuf;
15//!
16//! use ast_doc_core::{AstDocConfig, OutputStrategy};
17//!
18//! let config = AstDocConfig {
19//! path: PathBuf::from("."),
20//! output: None,
21//! max_tokens: 128_000,
22//! core_patterns: vec![],
23//! default_strategy: OutputStrategy::Full,
24//! include_patterns: vec![],
25//! exclude_patterns: vec![],
26//! no_git: false,
27//! no_tree: false,
28//! copy: false,
29//! verbose: false,
30//! };
31//!
32//! let result = ast_doc_core::run_pipeline(&config).expect("pipeline failed");
33//! println!("{}", result.output);
34//! ```
35
36#![allow(clippy::print_stdout, clippy::print_stderr)]
37
38pub mod config;
39pub mod error;
40pub mod ingestion;
41pub mod parser;
42pub mod renderer;
43pub mod scheduler;
44
45pub use config::{AstDocConfig, OutputStrategy};
46pub use error::AstDocError;
47pub use ingestion::{DiscoveredFile, GitContext, IngestionResult};
48pub use parser::{Language, ParsedFile, StrategyData};
49use rayon::prelude::*;
50pub use scheduler::{ScheduleResult, ScheduledFile};
51
/// Maximum tokens allowed for a git diff before truncation.
///
/// Diffs larger than this are accounted for at this capped size (plus a
/// truncation-marker suffix) when computing base overhead; see
/// [`compute_base_overhead`].
const MAX_DIFF_TOKENS: usize = 1000;
54
/// Result of running the full pipeline.
///
/// Returned by [`run_pipeline`]; bundles the final rendered document with
/// the scheduler's bookkeeping so callers can inspect how the token budget
/// was spent.
#[derive(Debug)]
pub struct PipelineResult {
    /// The rendered `llms.txt` output.
    pub output: String,
    /// The scheduling result with token breakdowns.
    pub schedule: ScheduleResult,
}
63
64/// Run the full ast-doc pipeline and return the rendered output plus scheduling metadata.
65///
66/// # Errors
67///
68/// Returns an error if any pipeline stage fails.
69#[cfg_attr(feature = "hotpath", allow(missing_docs))]
70#[cfg_attr(feature = "hotpath", hotpath::measure)]
71pub fn run_pipeline(config: &AstDocConfig) -> eyre::Result<PipelineResult> {
72 // Phase 1: Ingestion — file discovery, git metadata, directory tree
73 let ingestion = ingestion::run_ingestion(config)?;
74
75 // Phase 2: Parser — tree-sitter extraction + pre-compute all strategy variants
76 let parsed: Vec<ParsedFile> = ingestion
77 .files
78 .par_iter()
79 .filter_map(|f| f.language.as_ref().map(|lang| (f, lang)))
80 .map(|(f, lang)| parser::parse_file(f, lang).map_err(eyre::Report::from))
81 .collect::<eyre::Result<Vec<_>>>()?;
82
83 // Compute base overhead from ingestion non-file content
84 let base_overhead_tokens = compute_base_overhead(&ingestion);
85
86 // Phase 3: Scheduler — pure optimization using pre-computed token counts
87 let scheduled = scheduler::run_scheduler(&parsed, config, base_overhead_tokens)?;
88
89 // Phase 4: Renderer — assemble final markdown
90 let output = renderer::render_llms_txt(&scheduled, &ingestion, config)?;
91
92 Ok(PipelineResult { output, schedule: scheduled })
93}
94
95/// Compute token overhead from directory tree and git context.
96///
97/// If the git diff exceeds `MAX_DIFF_TOKENS`, it is truncated
98/// with a `"... (diff truncated)"` suffix.
99fn compute_base_overhead(ingestion: &IngestionResult) -> usize {
100 let mut overhead = count_tokens(&ingestion.directory_tree);
101
102 if let Some(ref git) = ingestion.git_context {
103 overhead += count_tokens(&git.branch);
104 overhead += count_tokens(&git.latest_commit);
105 if let Some(ref diff) = git.diff {
106 let diff_tokens = count_tokens(diff);
107 if diff_tokens > MAX_DIFF_TOKENS {
108 let suffix = "... (diff truncated)";
109 // Approximate: use MAX_DIFF_TOKENS + suffix token count
110 overhead += MAX_DIFF_TOKENS + count_tokens(suffix);
111 } else {
112 overhead += diff_tokens;
113 }
114 }
115 }
116
117 overhead
118}
119
120/// Count tokens in a string using `tiktoken-rs`.
121///
122/// Uses a cached BPE instance to avoid repeated initialization.
123fn count_tokens(text: &str) -> usize {
124 use std::sync::LazyLock;
125 static BPE: LazyLock<Option<tiktoken_rs::CoreBPE>> =
126 LazyLock::new(|| tiktoken_rs::cl100k_base().ok());
127
128 BPE.as_ref().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
129}
130
/// Initialize hotpath guard for tests.
///
/// This is called automatically when running tests with the hotpath feature enabled.
#[cfg(all(test, feature = "hotpath"))]
#[ctor::ctor]
fn init_hotpath_for_tests() {
    // Build the guard and leak it on purpose so profiling stays active for
    // the entire test run instead of ending when this ctor returns.
    let guard = hotpath::HotpathGuardBuilder::new("test").build();
    std::mem::forget(guard);
}