// ast_doc_core/ingestion/mod.rs
pub mod git;
7pub mod walker;
8
9use std::path::{Path, PathBuf};
10
11use git::extract_git_context;
12use tracing::{debug, info, warn};
13use walker::{build_globset, walk_directory};
14
15use crate::{config::AstDocConfig, error::AstDocError, parser::Language};
16
17#[derive(Debug, Clone)]
19pub struct DiscoveredFile {
20 pub path: PathBuf,
22 pub content: String,
24 pub language: Option<Language>,
26 pub raw_token_count: usize,
28}
29
/// Git metadata attached to an ingestion result.
///
/// The values are produced by the `git` submodule; this type only carries
/// them. `PartialEq`/`Eq` are derived so contexts can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GitContext {
    /// Branch name as reported by the git extraction step.
    pub branch: String,
    /// Description of the latest commit; the exact format is decided by the
    /// `git` submodule.
    pub latest_commit: String,
    /// Diff text, or `None` when the extraction step supplied none.
    pub diff: Option<String>,
}
40
41#[derive(Debug)]
43pub struct IngestionResult {
44 pub files: Vec<DiscoveredFile>,
46 pub directory_tree: String,
48 pub git_context: Option<GitContext>,
50}
51
52#[cfg_attr(feature = "hotpath", allow(missing_docs))]
58#[cfg_attr(feature = "hotpath", hotpath::measure)]
59pub fn run_ingestion(config: &AstDocConfig) -> Result<IngestionResult, AstDocError> {
60 let root = config
61 .path
62 .canonicalize()
63 .map_err(|e| AstDocError::FileRead { path: config.path.clone(), source: e })?;
64 info!(path = %root.display(), "starting ingestion");
65
66 let include = build_globset(&config.include_patterns)?;
68 let exclude = build_globset(&config.exclude_patterns)?;
69
70 let file_paths = walk_directory(&root, &include, &exclude, config)?;
72
73 let mut files = Vec::with_capacity(file_paths.len());
75 for rel_path in &file_paths {
76 let abs_path = root.join(rel_path);
77 match std::fs::read_to_string(&abs_path) {
78 Ok(content) => {
79 let lang = crate::parser::detect_language(rel_path);
80 let token_count = count_tokens(&content);
81 debug!(
82 path = %rel_path.display(),
83 language = ?lang,
84 tokens = token_count,
85 "discovered file"
86 );
87 files.push(DiscoveredFile {
88 path: rel_path.clone(),
89 content,
90 language: lang,
91 raw_token_count: token_count,
92 });
93 }
94 Err(e) => {
95 warn!(
96 path = %rel_path.display(),
97 error = %e,
98 "failed to read file, skipping"
99 );
100 }
101 }
102 }
103
104 let directory_tree =
106 if config.no_tree { String::new() } else { build_directory_tree(&root, &file_paths) };
107
108 let git_context = if config.no_git {
110 None
111 } else {
112 match extract_git_context(&root) {
113 Ok(Some(ctx)) => Some(ctx),
114 Ok(None) => None,
115 Err(e) => {
116 warn!(error = %e, "failed to extract git context");
117 None
118 }
119 }
120 };
121
122 info!(files = files.len(), git = git_context.is_some(), "ingestion complete");
123
124 Ok(IngestionResult { files, directory_tree, git_context })
125}
126
127fn count_tokens(text: &str) -> usize {
131 use std::sync::LazyLock;
132 static BPE: LazyLock<Option<tiktoken_rs::CoreBPE>> =
133 LazyLock::new(|| tiktoken_rs::cl100k_base().ok());
134
135 BPE.as_ref().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
136}
137
138fn build_directory_tree(root: &Path, files: &[PathBuf]) -> String {
142 use termtree::Tree;
143
144 let parent_name = root.file_name().unwrap_or_default().to_string_lossy().to_string();
145
146 let mut tree = Tree::new(parent_name);
147
148 for file_path in files {
149 let mut current = &mut tree;
150 let components: Vec<_> =
151 file_path.components().map(|c| c.as_os_str().to_string_lossy().to_string()).collect();
152
153 for (i, component) in components.iter().enumerate() {
154 if i == components.len() - 1 {
155 let lang = crate::parser::detect_language(file_path)
157 .map(|l| format!(" [{l}]"))
158 .unwrap_or_default();
159 current.push(Tree::new(format!("{component}{lang}")));
160 } else {
161 let idx = current.leaves.iter().position(|child| child.root == component.as_str());
163 if let Some(pos) = idx {
164 current = &mut current.leaves[pos];
165 } else {
166 current.push(Tree::new(component.clone()));
167 let last = current.leaves.len() - 1;
168 current = &mut current.leaves[last];
169 }
170 }
171 }
172 }
173
174 tree.to_string()
175}
176
177#[must_use]
181pub fn detect_language(path: &Path) -> Option<Language> {
182 crate::parser::detect_language(path)
183}
184
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
    use std::fs;

    use tempfile::TempDir;

    use super::*;

    /// Baseline configuration rooted at `root`: git disabled, tree enabled, no filters.
    fn make_config(root: &Path) -> AstDocConfig {
        AstDocConfig {
            path: root.to_path_buf(),
            output: None,
            max_tokens: 10_000,
            core_patterns: Vec::new(),
            default_strategy: crate::config::OutputStrategy::Full,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            no_git: true,
            no_tree: false,
            copy: false,
            verbose: false,
        }
    }

    /// Create a temporary project with `src/main.rs`, `src/lib.rs` and `Cargo.toml`.
    fn setup_rust_project() -> TempDir {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src")).unwrap();

        let fixtures = [
            ("src/main.rs", "fn main() {\n println!(\"hello\");\n}\n"),
            ("src/lib.rs", "/// Library docs\npub fn lib() -> i32 {\n 42\n}\n"),
            ("Cargo.toml", "[package]\nname = \"test\"\n"),
        ];
        for (rel, body) in fixtures {
            fs::write(base.join(rel), body).unwrap();
        }
        dir
    }

    /// Run ingestion and unwrap the result.
    fn ingest(config: &AstDocConfig) -> IngestionResult {
        run_ingestion(config).unwrap()
    }

    /// Whether any discovered file's path ends with `suffix`.
    fn has_file(result: &IngestionResult, suffix: &str) -> bool {
        result.files.iter().any(|f| f.path.ends_with(suffix))
    }

    /// The discovered file whose path ends with `suffix`; panics when absent.
    fn file_ending_with<'a>(result: &'a IngestionResult, suffix: &str) -> &'a DiscoveredFile {
        result.files.iter().find(|f| f.path.ends_with(suffix)).unwrap()
    }

    #[test]
    fn test_run_ingestion_discovers_files() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        assert!(!result.files.is_empty());
        assert!(has_file(&result, "src/main.rs"));
        assert!(has_file(&result, "src/lib.rs"));
    }

    #[test]
    fn test_run_ingestion_detects_languages() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert_eq!(main_file.language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_counts_tokens() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        assert!(
            result.files.iter().all(|file| file.raw_token_count > 0),
            "token count should be > 0"
        );
    }

    #[test]
    fn test_run_ingestion_with_include_patterns() {
        let dir = setup_rust_project();
        let config =
            AstDocConfig { include_patterns: vec!["*.rs".to_string()], ..make_config(dir.path()) };

        let result = ingest(&config);
        assert!(result
            .files
            .iter()
            .all(|f| f.path.extension().and_then(|ext| ext.to_str()) == Some("rs")));
    }

    #[test]
    fn test_run_ingestion_with_exclude_patterns() {
        let dir = setup_rust_project();
        let config = AstDocConfig {
            exclude_patterns: vec!["*.toml".to_string()],
            ..make_config(dir.path())
        };

        let result = ingest(&config);
        assert!(!has_file(&result, "Cargo.toml"));
    }

    #[test]
    fn test_run_ingestion_no_tree() {
        let dir = setup_rust_project();
        let config = AstDocConfig { no_tree: true, ..make_config(dir.path()) };

        let result = ingest(&config);
        assert!(result.directory_tree.is_empty());
    }

    #[test]
    fn test_run_ingestion_generates_tree() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        let tree = &result.directory_tree;
        assert!(!tree.is_empty());
        assert!(tree.contains("src"), "tree should contain 'src' directory");
        assert!(tree.contains("main.rs"), "tree should contain 'main.rs'");
    }

    #[test]
    fn test_run_ingestion_no_git_flag() {
        let dir = setup_rust_project();
        // Set explicitly even though the baseline config already disables git.
        let config = AstDocConfig { no_git: true, ..make_config(dir.path()) };

        let result = ingest(&config);
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_run_ingestion_reads_file_contents() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert!(main_file.content.contains("main"));
    }

    #[test]
    fn test_run_ingestion_with_python_files() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::write(base.join("app.py"), "def main():\n pass\n").unwrap();
        fs::write(base.join("main.rs"), "fn main() {}\n").unwrap();

        let result = ingest(&make_config(base));

        for (name, expected) in [("app.py", Language::Python), ("main.rs", Language::Rust)] {
            assert_eq!(file_ending_with(&result, name).language, Some(expected));
        }
    }

    #[test]
    fn test_run_ingestion_empty_directory() {
        let dir = TempDir::new().unwrap();
        let result = ingest(&make_config(dir.path()));

        assert!(result.files.is_empty());
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_build_directory_tree_basic() {
        let dir = TempDir::new().unwrap();
        let files: Vec<PathBuf> =
            ["src/main.rs", "src/lib.rs", "README.md"].into_iter().map(PathBuf::from).collect();

        let tree = build_directory_tree(dir.path(), &files);

        for name in ["src", "main.rs", "lib.rs", "README.md"] {
            assert!(tree.contains(name));
        }
    }

    #[test]
    fn test_run_ingestion_nested_directories() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src/utils/helpers")).unwrap();
        fs::write(base.join("src/utils/helpers/math.rs"), "pub fn add() {}").unwrap();
        fs::write(base.join("src/main.rs"), "fn main() {}").unwrap();

        let result = ingest(&make_config(base));

        assert_eq!(result.files.len(), 2);
        assert!(has_file(&result, "src/utils/helpers/math.rs"));

        let tree = &result.directory_tree;
        for name in ["utils", "helpers", "math.rs"] {
            assert!(tree.contains(name), "tree should contain '{name}'");
        }
    }
}