// ast_doc_core — crate root (lib.rs)
//! Core library for [ast-doc](https://crates.io/crates/ast-doc): a four-stage pipeline
//! for generating optimized `llms.txt` documentation from codebases.
//!
//! # Pipeline
//!
//! 1. **Ingestion** — File discovery, git metadata capture, directory tree generation.
//! 2. **Parser** — tree-sitter AST extraction with pre-computed strategy variants.
//! 3. **Scheduler** — Token budget optimization with intelligent degradation.
//! 4. **Renderer** — Markdown assembly with anti-bloat rules.
//!
//! # Quick Start
//!
//! ```no_run
//! use std::path::PathBuf;
//!
//! use ast_doc_core::{AstDocConfig, OutputStrategy};
//!
//! let config = AstDocConfig {
//!     path: PathBuf::from("."),
//!     output: None,
//!     max_tokens: 128_000,
//!     core_patterns: vec![],
//!     default_strategy: OutputStrategy::Full,
//!     include_patterns: vec![],
//!     exclude_patterns: vec![],
//!     no_git: false,
//!     no_tree: false,
//!     copy: false,
//!     verbose: false,
//! };
//!
//! let result = ast_doc_core::run_pipeline(&config).expect("pipeline failed");
//! println!("{}", result.output);
//! ```

#![allow(clippy::print_stdout, clippy::print_stderr)]
37
pub mod config;
pub mod error;
pub mod ingestion;
pub mod parser;
pub mod renderer;
pub mod scheduler;

pub use config::{AstDocConfig, OutputStrategy};
pub use error::AstDocError;
pub use ingestion::{DiscoveredFile, GitContext, IngestionResult};
pub use parser::{Language, ParsedFile, StrategyData};
use rayon::prelude::*;
pub use scheduler::{ScheduleResult, ScheduledFile};
51
/// Maximum number of tokens a git diff may contribute to the base overhead
/// before it is counted as truncated (see `compute_base_overhead`).
const MAX_DIFF_TOKENS: usize = 1000;
54
55/// Result of running the full pipeline.
56#[derive(Debug)]
57pub struct PipelineResult {
58    /// The rendered `llms.txt` output.
59    pub output: String,
60    /// The scheduling result with token breakdowns.
61    pub schedule: ScheduleResult,
62}
63
64/// Run the full ast-doc pipeline and return the rendered output plus scheduling metadata.
65///
66/// # Errors
67///
68/// Returns an error if any pipeline stage fails.
69#[cfg_attr(feature = "hotpath", allow(missing_docs))]
70#[cfg_attr(feature = "hotpath", hotpath::measure)]
71pub fn run_pipeline(config: &AstDocConfig) -> eyre::Result<PipelineResult> {
72    // Phase 1: Ingestion — file discovery, git metadata, directory tree
73    let ingestion = ingestion::run_ingestion(config)?;
74
75    // Phase 2: Parser — tree-sitter extraction + pre-compute all strategy variants
76    let parsed: Vec<ParsedFile> = ingestion
77        .files
78        .par_iter()
79        .filter_map(|f| f.language.as_ref().map(|lang| (f, lang)))
80        .map(|(f, lang)| parser::parse_file(f, lang).map_err(eyre::Report::from))
81        .collect::<eyre::Result<Vec<_>>>()?;
82
83    // Compute base overhead from ingestion non-file content
84    let base_overhead_tokens = compute_base_overhead(&ingestion);
85
86    // Phase 3: Scheduler — pure optimization using pre-computed token counts
87    let scheduled = scheduler::run_scheduler(&parsed, config, base_overhead_tokens)?;
88
89    // Phase 4: Renderer — assemble final markdown
90    let output = renderer::render_llms_txt(&scheduled, &ingestion, config)?;
91
92    Ok(PipelineResult { output, schedule: scheduled })
93}
94
95/// Compute token overhead from directory tree and git context.
96///
97/// If the git diff exceeds `MAX_DIFF_TOKENS`, it is truncated
98/// with a `"... (diff truncated)"` suffix.
99fn compute_base_overhead(ingestion: &IngestionResult) -> usize {
100    let mut overhead = count_tokens(&ingestion.directory_tree);
101
102    if let Some(ref git) = ingestion.git_context {
103        overhead += count_tokens(&git.branch);
104        overhead += count_tokens(&git.latest_commit);
105        if let Some(ref diff) = git.diff {
106            let diff_tokens = count_tokens(diff);
107            if diff_tokens > MAX_DIFF_TOKENS {
108                let suffix = "... (diff truncated)";
109                // Approximate: use MAX_DIFF_TOKENS + suffix token count
110                overhead += MAX_DIFF_TOKENS + count_tokens(suffix);
111            } else {
112                overhead += diff_tokens;
113            }
114        }
115    }
116
117    overhead
118}
119
120/// Count tokens in a string using `tiktoken-rs`.
121///
122/// Uses a cached BPE instance to avoid repeated initialization.
123fn count_tokens(text: &str) -> usize {
124    use std::sync::LazyLock;
125    static BPE: LazyLock<Option<tiktoken_rs::CoreBPE>> =
126        LazyLock::new(|| tiktoken_rs::cl100k_base().ok());
127
128    BPE.as_ref().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
129}
130
/// Initialize the hotpath guard for tests.
///
/// Runs once at process start (via `ctor`) when tests are built with the
/// `hotpath` feature enabled. The guard is intentionally leaked with
/// `mem::forget` so profiling stays active for the entire test run instead
/// of ending when this function returns.
#[cfg(all(test, feature = "hotpath"))]
#[ctor::ctor]
fn init_hotpath_for_tests() {
    let _guard = hotpath::HotpathGuardBuilder::new("test").build();
    std::mem::forget(_guard);
}