ast_doc_core/ingestion/
mod.rs1pub mod git;
7pub mod walker;
8
9use std::path::{Path, PathBuf};
10
11use git::extract_git_context;
12use tracing::{debug, info, warn};
13use walker::{build_globset, walk_directory};
14
15use crate::{config::AstDocConfig, error::AstDocError, parser::Language};
16
17#[derive(Debug, Clone)]
19pub struct DiscoveredFile {
20 pub path: PathBuf,
22 pub content: String,
24 pub language: Option<Language>,
26 pub raw_token_count: usize,
28}
29
/// Git metadata attached to an ingestion run.
///
/// Values are produced by `git::extract_git_context`; their exact formatting
/// is decided there and is not visible from this module.
///
/// `PartialEq`/`Eq` are derived so two contexts can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GitContext {
    /// Branch name reported for the repository.
    pub branch: String,
    /// Description of the most recent commit.
    pub latest_commit: String,
    /// Diff text, when one is available.
    pub diff: Option<String>,
}
40
41#[derive(Debug)]
43pub struct IngestionResult {
44 pub files: Vec<DiscoveredFile>,
46 pub directory_tree: String,
48 pub git_context: Option<GitContext>,
50}
51
52pub fn run_ingestion(config: &AstDocConfig) -> Result<IngestionResult, AstDocError> {
58 let root = config
59 .path
60 .canonicalize()
61 .map_err(|e| AstDocError::FileRead { path: config.path.clone(), source: e })?;
62 info!(path = %root.display(), "starting ingestion");
63
64 let include = build_globset(&config.include_patterns)?;
66 let exclude = build_globset(&config.exclude_patterns)?;
67
68 let file_paths = walk_directory(&root, &include, &exclude, config)?;
70
71 let mut files = Vec::with_capacity(file_paths.len());
73 for rel_path in &file_paths {
74 let abs_path = root.join(rel_path);
75 match std::fs::read_to_string(&abs_path) {
76 Ok(content) => {
77 let language = crate::parser::detect_language(rel_path);
78 let token_count = count_tokens(&content);
79 debug!(
80 path = %rel_path.display(),
81 language = ?language,
82 tokens = token_count,
83 "discovered file"
84 );
85 files.push(DiscoveredFile {
86 path: rel_path.clone(),
87 content,
88 language,
89 raw_token_count: token_count,
90 });
91 }
92 Err(e) => {
93 warn!(
94 path = %rel_path.display(),
95 error = %e,
96 "failed to read file, skipping"
97 );
98 }
99 }
100 }
101
102 let directory_tree =
104 if config.no_tree { String::new() } else { build_directory_tree(&root, &file_paths) };
105
106 let git_context = if config.no_git {
108 None
109 } else {
110 match extract_git_context(&root) {
111 Ok(Some(ctx)) => Some(ctx),
112 Ok(None) => None,
113 Err(e) => {
114 warn!(error = %e, "failed to extract git context");
115 None
116 }
117 }
118 };
119
120 info!(files = files.len(), git = git_context.is_some(), "ingestion complete");
121
122 Ok(IngestionResult { files, directory_tree, git_context })
123}
124
125fn count_tokens(text: &str) -> usize {
127 tiktoken_rs::cl100k_base().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
128}
129
130fn build_directory_tree(root: &Path, files: &[PathBuf]) -> String {
134 use termtree::Tree;
135
136 let parent_name = root.file_name().unwrap_or_default().to_string_lossy().to_string();
137
138 let mut tree = Tree::new(parent_name);
139
140 for file_path in files {
141 let mut current = &mut tree;
142 let components: Vec<_> =
143 file_path.components().map(|c| c.as_os_str().to_string_lossy().to_string()).collect();
144
145 for (i, component) in components.iter().enumerate() {
146 if i == components.len() - 1 {
147 let lang = crate::parser::detect_language(file_path)
149 .map(|l| format!(" [{l}]"))
150 .unwrap_or_default();
151 current.push(Tree::new(format!("{component}{lang}")));
152 } else {
153 let idx = current.leaves.iter().position(|child| child.root == component.as_str());
155 if let Some(pos) = idx {
156 current = &mut current.leaves[pos];
157 } else {
158 current.push(Tree::new(component.clone()));
159 let last = current.leaves.len() - 1;
160 current = &mut current.leaves[last];
161 }
162 }
163 }
164 }
165
166 tree.to_string()
167}
168
169#[must_use]
173pub fn detect_language(path: &Path) -> Option<Language> {
174 crate::parser::detect_language(path)
175}
176
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
    use std::fs;

    use tempfile::TempDir;

    use super::*;

    /// Baseline config rooted at `root`: no filters, git disabled, tree enabled.
    fn make_config(root: &Path) -> AstDocConfig {
        AstDocConfig {
            path: root.to_path_buf(),
            output: None,
            max_tokens: 10_000,
            core_patterns: Vec::new(),
            default_strategy: crate::config::OutputStrategy::Full,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            no_git: true,
            no_tree: false,
            copy: false,
            verbose: false,
        }
    }

    /// Creates a temp project containing `src/main.rs`, `src/lib.rs`, and `Cargo.toml`.
    fn setup_rust_project() -> TempDir {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src")).unwrap();

        let fixtures = [
            ("src/main.rs", "fn main() {\n println!(\"hello\");\n}\n"),
            ("src/lib.rs", "/// Library docs\npub fn lib() -> i32 {\n 42\n}\n"),
            ("Cargo.toml", "[package]\nname = \"test\"\n"),
        ];
        for (rel_path, body) in fixtures {
            fs::write(base.join(rel_path), body).unwrap();
        }

        dir
    }

    /// Runs ingestion and unwraps the result; a failure fails the calling test.
    fn ingest(config: &AstDocConfig) -> IngestionResult {
        run_ingestion(config).unwrap()
    }

    /// True when any ingested file's path ends with `suffix`.
    fn has_file(result: &IngestionResult, suffix: &str) -> bool {
        result.files.iter().any(|f| f.path.ends_with(suffix))
    }

    /// Returns the first ingested file whose path ends with `suffix`.
    fn file_ending_with<'a>(result: &'a IngestionResult, suffix: &str) -> &'a DiscoveredFile {
        result.files.iter().find(|f| f.path.ends_with(suffix)).unwrap()
    }

    #[test]
    fn test_run_ingestion_discovers_files() {
        let project = setup_rust_project();
        let result = ingest(&make_config(project.path()));

        assert!(!result.files.is_empty());
        assert!(has_file(&result, "src/main.rs"));
        assert!(has_file(&result, "src/lib.rs"));
    }

    #[test]
    fn test_run_ingestion_detects_languages() {
        let project = setup_rust_project();
        let result = ingest(&make_config(project.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert_eq!(main_file.language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_counts_tokens() {
        let project = setup_rust_project();
        let result = ingest(&make_config(project.path()));

        assert!(result.files.iter().all(|f| f.raw_token_count > 0), "token count should be > 0");
    }

    #[test]
    fn test_run_ingestion_with_include_patterns() {
        let project = setup_rust_project();
        let config = AstDocConfig {
            include_patterns: vec!["*.rs".to_string()],
            ..make_config(project.path())
        };

        let result = ingest(&config);
        assert!(result.files.iter().all(|f| f.path.extension().is_some_and(|e| e == "rs")));
    }

    #[test]
    fn test_run_ingestion_with_exclude_patterns() {
        let project = setup_rust_project();
        let config = AstDocConfig {
            exclude_patterns: vec!["*.toml".to_string()],
            ..make_config(project.path())
        };

        let result = ingest(&config);
        assert!(!has_file(&result, "Cargo.toml"));
    }

    #[test]
    fn test_run_ingestion_no_tree() {
        let project = setup_rust_project();
        let config = AstDocConfig { no_tree: true, ..make_config(project.path()) };

        let result = ingest(&config);
        assert!(result.directory_tree.is_empty());
    }

    #[test]
    fn test_run_ingestion_generates_tree() {
        let project = setup_rust_project();
        let result = ingest(&make_config(project.path()));

        let rendered = &result.directory_tree;
        assert!(!rendered.is_empty());
        assert!(rendered.contains("src"), "tree should contain 'src' directory");
        assert!(rendered.contains("main.rs"), "tree should contain 'main.rs'");
    }

    #[test]
    fn test_run_ingestion_no_git_flag() {
        let project = setup_rust_project();
        // Set explicitly so the test does not rely on the helper's default.
        let config = AstDocConfig { no_git: true, ..make_config(project.path()) };

        let result = ingest(&config);
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_run_ingestion_reads_file_contents() {
        let project = setup_rust_project();
        let result = ingest(&make_config(project.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert!(main_file.content.contains("main"));
    }

    #[test]
    fn test_run_ingestion_with_python_files() {
        let project = TempDir::new().unwrap();
        let base = project.path();
        fs::write(base.join("app.py"), "def main():\n pass\n").unwrap();
        fs::write(base.join("main.rs"), "fn main() {}\n").unwrap();

        let result = ingest(&make_config(base));

        assert_eq!(file_ending_with(&result, "app.py").language, Some(Language::Python));
        assert_eq!(file_ending_with(&result, "main.rs").language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_empty_directory() {
        let project = TempDir::new().unwrap();
        let result = ingest(&make_config(project.path()));

        assert!(result.files.is_empty());
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_build_directory_tree_basic() {
        let project = TempDir::new().unwrap();
        let files =
            [PathBuf::from("src/main.rs"), PathBuf::from("src/lib.rs"), PathBuf::from("README.md")];

        let rendered = build_directory_tree(project.path(), &files);
        for expected in ["src", "main.rs", "lib.rs", "README.md"] {
            assert!(rendered.contains(expected));
        }
    }

    #[test]
    fn test_run_ingestion_nested_directories() {
        let project = TempDir::new().unwrap();
        let base = project.path();
        fs::create_dir_all(base.join("src/utils/helpers")).unwrap();
        fs::write(base.join("src/utils/helpers/math.rs"), "pub fn add() {}").unwrap();
        fs::write(base.join("src/main.rs"), "fn main() {}").unwrap();

        let result = ingest(&make_config(base));

        assert_eq!(result.files.len(), 2);
        assert!(has_file(&result, "src/utils/helpers/math.rs"));

        let rendered = &result.directory_tree;
        assert!(rendered.contains("utils"), "tree should contain 'utils'");
        assert!(rendered.contains("helpers"), "tree should contain 'helpers'");
        assert!(rendered.contains("math.rs"), "tree should contain 'math.rs'");
    }
}