ast_doc_core/lib.rs
1//! Core library for [ast-doc](https://crates.io/crates/ast-doc): a four-stage pipeline
2//! for generating optimized `llms.txt` documentation from codebases.
3//!
4//! # Pipeline
5//!
6//! 1. **Ingestion** — File discovery, git metadata capture, directory tree generation.
7//! 2. **Parser** — tree-sitter AST extraction with pre-computed strategy variants.
8//! 3. **Scheduler** — Token budget optimization with intelligent degradation.
9//! 4. **Renderer** — Markdown assembly with anti-bloat rules.
10//!
11//! # Quick Start
12//!
13//! ```no_run
14//! use std::path::PathBuf;
15//!
16//! use ast_doc_core::{AstDocConfig, OutputStrategy};
17//!
18//! let config = AstDocConfig {
19//! path: PathBuf::from("."),
20//! output: None,
21//! max_tokens: 128_000,
22//! core_patterns: vec![],
23//! default_strategy: OutputStrategy::Full,
24//! include_patterns: vec![],
25//! exclude_patterns: vec![],
26//! no_git: false,
27//! no_tree: false,
28//! copy: false,
29//! verbose: false,
30//! };
31//!
32//! let result = ast_doc_core::run_pipeline(&config).expect("pipeline failed");
33//! println!("{}", result.output);
34//! ```
35
36#![allow(clippy::print_stdout, clippy::print_stderr)]
37
38pub mod config;
39pub mod error;
40pub mod ingestion;
41pub mod parser;
42pub mod renderer;
43pub mod scheduler;
44
45pub use config::{AstDocConfig, OutputStrategy};
46pub use error::AstDocError;
47pub use ingestion::{DiscoveredFile, GitContext, IngestionResult};
48pub use parser::{Language, ParsedFile, StrategyData};
49use rayon::prelude::*;
50pub use scheduler::{ScheduleResult, ScheduledFile};
51
/// Maximum number of tokens from a git diff that count toward the base
/// overhead; diffs larger than this are treated as truncated (see
/// [`compute_base_overhead`]).
const MAX_DIFF_TOKENS: usize = 1000;
54
/// Result of running the full pipeline via [`run_pipeline`].
#[derive(Debug)]
pub struct PipelineResult {
    /// The fully rendered `llms.txt` markdown output.
    pub output: String,
    /// The scheduling result, including per-file token breakdowns.
    pub schedule: ScheduleResult,
}
63
64/// Run the full ast-doc pipeline and return the rendered output plus scheduling metadata.
65///
66/// # Errors
67///
68/// Returns an error if any pipeline stage fails.
69pub fn run_pipeline(config: &AstDocConfig) -> eyre::Result<PipelineResult> {
70 // Phase 1: Ingestion — file discovery, git metadata, directory tree
71 let ingestion = ingestion::run_ingestion(config)?;
72
73 // Phase 2: Parser — tree-sitter extraction + pre-compute all strategy variants
74 let parsed: Vec<ParsedFile> = ingestion
75 .files
76 .par_iter()
77 .filter_map(|f| f.language.map(|lang| (f, lang)))
78 .map(|(f, lang)| parser::parse_file(f, lang).map_err(eyre::Report::from))
79 .collect::<eyre::Result<Vec<_>>>()?;
80
81 // Compute base overhead from ingestion non-file content
82 let base_overhead_tokens = compute_base_overhead(&ingestion);
83
84 // Phase 3: Scheduler — pure optimization using pre-computed token counts
85 let scheduled = scheduler::run_scheduler(&parsed, config, base_overhead_tokens)?;
86
87 // Phase 4: Renderer — assemble final markdown
88 let output = renderer::render_llms_txt(&scheduled, &ingestion, config)?;
89
90 Ok(PipelineResult { output, schedule: scheduled })
91}
92
93/// Compute token overhead from directory tree and git context.
94///
95/// If the git diff exceeds `MAX_DIFF_TOKENS`, it is truncated
96/// with a `"... (diff truncated)"` suffix.
97fn compute_base_overhead(ingestion: &IngestionResult) -> usize {
98 let mut overhead = count_tokens(&ingestion.directory_tree);
99
100 if let Some(ref git) = ingestion.git_context {
101 overhead += count_tokens(&git.branch);
102 overhead += count_tokens(&git.latest_commit);
103 if let Some(ref diff) = git.diff {
104 let diff_tokens = count_tokens(diff);
105 if diff_tokens > MAX_DIFF_TOKENS {
106 let suffix = "... (diff truncated)";
107 // Approximate: use MAX_DIFF_TOKENS + suffix token count
108 overhead += MAX_DIFF_TOKENS + count_tokens(suffix);
109 } else {
110 overhead += diff_tokens;
111 }
112 }
113 }
114
115 overhead
116}
117
118/// Count tokens in a string using `tiktoken-rs`.
119fn count_tokens(text: &str) -> usize {
120 tiktoken_rs::cl100k_base().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
121}