// ast_doc_core — crate root (src/lib.rs)
//! Core library for [ast-doc](https://crates.io/crates/ast-doc): a four-stage pipeline
//! for generating optimized `llms.txt` documentation from codebases.
//!
//! # Pipeline
//!
//! 1. **Ingestion** — File discovery, git metadata capture, directory tree generation.
//! 2. **Parser** — tree-sitter AST extraction with pre-computed strategy variants.
//! 3. **Scheduler** — Token budget optimization with intelligent degradation.
//! 4. **Renderer** — Markdown assembly with anti-bloat rules.
//!
//! # Quick Start
//!
//! ```no_run
//! use std::path::PathBuf;
//!
//! use ast_doc_core::{AstDocConfig, OutputStrategy};
//!
//! let config = AstDocConfig {
//!     path: PathBuf::from("."),
//!     output: None,
//!     max_tokens: 128_000,
//!     core_patterns: vec![],
//!     default_strategy: OutputStrategy::Full,
//!     include_patterns: vec![],
//!     exclude_patterns: vec![],
//!     no_git: false,
//!     no_tree: false,
//!     copy: false,
//!     verbose: false,
//! };
//!
//! let result = ast_doc_core::run_pipeline(&config).expect("pipeline failed");
//! println!("{}", result.output);
//! ```

// NOTE(review): blanket allow for direct stdout/stderr printing — presumably
// used by verbose/progress output somewhere in the crate; confirm it is still
// needed crate-wide rather than per-module.
#![allow(clippy::print_stdout, clippy::print_stderr)]

// One module per pipeline stage, plus shared config and error types.
pub mod config;
pub mod error;
pub mod ingestion;
pub mod parser;
pub mod renderer;
pub mod scheduler;

// Public re-exports: the flat API surface consumers use instead of deep paths.
// `rayon::prelude::*` is a private import (parallel iterators in `run_pipeline`);
// rustfmt keeps it sorted within this use block.
pub use config::{AstDocConfig, OutputStrategy};
pub use error::AstDocError;
pub use ingestion::{DiscoveredFile, GitContext, IngestionResult};
pub use parser::{Language, ParsedFile, StrategyData};
use rayon::prelude::*;
pub use scheduler::{ScheduleResult, ScheduledFile};
51
/// Maximum tokens allowed for a git diff before truncation.
///
/// Used by [`compute_base_overhead`] to cap the diff's contribution to the
/// base token overhead.
const MAX_DIFF_TOKENS: usize = 1000;
54
/// Result of running the full pipeline.
///
/// Returned by [`run_pipeline`]; bundles the rendered document together with
/// the scheduling metadata that produced it, so callers can inspect token
/// breakdowns without re-running the scheduler.
#[derive(Debug)]
pub struct PipelineResult {
    /// The rendered `llms.txt` output.
    pub output: String,
    /// The scheduling result with token breakdowns.
    pub schedule: ScheduleResult,
}
63
64/// Run the full ast-doc pipeline and return the rendered output plus scheduling metadata.
65///
66/// # Errors
67///
68/// Returns an error if any pipeline stage fails.
69pub fn run_pipeline(config: &AstDocConfig) -> eyre::Result<PipelineResult> {
70    // Phase 1: Ingestion — file discovery, git metadata, directory tree
71    let ingestion = ingestion::run_ingestion(config)?;
72
73    // Phase 2: Parser — tree-sitter extraction + pre-compute all strategy variants
74    let parsed: Vec<ParsedFile> = ingestion
75        .files
76        .par_iter()
77        .filter_map(|f| f.language.map(|lang| (f, lang)))
78        .map(|(f, lang)| parser::parse_file(f, lang).map_err(eyre::Report::from))
79        .collect::<eyre::Result<Vec<_>>>()?;
80
81    // Compute base overhead from ingestion non-file content
82    let base_overhead_tokens = compute_base_overhead(&ingestion);
83
84    // Phase 3: Scheduler — pure optimization using pre-computed token counts
85    let scheduled = scheduler::run_scheduler(&parsed, config, base_overhead_tokens)?;
86
87    // Phase 4: Renderer — assemble final markdown
88    let output = renderer::render_llms_txt(&scheduled, &ingestion, config)?;
89
90    Ok(PipelineResult { output, schedule: scheduled })
91}
92
93/// Compute token overhead from directory tree and git context.
94///
95/// If the git diff exceeds `MAX_DIFF_TOKENS`, it is truncated
96/// with a `"... (diff truncated)"` suffix.
97fn compute_base_overhead(ingestion: &IngestionResult) -> usize {
98    let mut overhead = count_tokens(&ingestion.directory_tree);
99
100    if let Some(ref git) = ingestion.git_context {
101        overhead += count_tokens(&git.branch);
102        overhead += count_tokens(&git.latest_commit);
103        if let Some(ref diff) = git.diff {
104            let diff_tokens = count_tokens(diff);
105            if diff_tokens > MAX_DIFF_TOKENS {
106                let suffix = "... (diff truncated)";
107                // Approximate: use MAX_DIFF_TOKENS + suffix token count
108                overhead += MAX_DIFF_TOKENS + count_tokens(suffix);
109            } else {
110                overhead += diff_tokens;
111            }
112        }
113    }
114
115    overhead
116}
117
118/// Count tokens in a string using `tiktoken-rs`.
119fn count_tokens(text: &str) -> usize {
120    tiktoken_rs::cl100k_base().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
121}