Skip to main content

ast_doc_core/ingestion/
mod.rs

1//! Phase 1: File discovery and ingestion.
2//!
3//! Walks the project directory, applies .gitignore/.astdocignore rules,
4//! captures git metadata, and produces a directory tree.
5
6pub mod git;
7pub mod walker;
8
9use std::path::{Path, PathBuf};
10
11use git::extract_git_context;
12use tracing::{debug, info, warn};
13use walker::{build_globset, walk_directory};
14
15use crate::{config::AstDocConfig, error::AstDocError, parser::Language};
16
17/// A discovered source file with its content and metadata.
18#[derive(Debug, Clone)]
19pub struct DiscoveredFile {
20    /// Relative path from the project root.
21    pub path: PathBuf,
22    /// Full file content.
23    pub content: String,
24    /// Detected language (None if unsupported).
25    pub language: Option<Language>,
26    /// Raw token count of the original content.
27    pub raw_token_count: usize,
28}
29
/// Snapshot of repository state gathered from git.
#[derive(Clone, Debug)]
pub struct GitContext {
    /// Name of the branch that is currently checked out.
    pub branch: String,
    /// Summary of the most recent commit.
    pub latest_commit: String,
    /// Diff of uncommitted changes, if any (the text may be truncated).
    pub diff: Option<String>,
}
40
41/// Result of the ingestion phase.
42#[derive(Debug)]
43pub struct IngestionResult {
44    /// All discovered source files.
45    pub files: Vec<DiscoveredFile>,
46    /// Directory tree string (with annotations).
47    pub directory_tree: String,
48    /// Git context (None if `--no-git` or not a git repo).
49    pub git_context: Option<GitContext>,
50}
51
52/// Run the ingestion phase.
53///
54/// # Errors
55///
56/// Returns an error if directory walking or git operations fail.
57pub fn run_ingestion(config: &AstDocConfig) -> Result<IngestionResult, AstDocError> {
58    let root = config
59        .path
60        .canonicalize()
61        .map_err(|e| AstDocError::FileRead { path: config.path.clone(), source: e })?;
62    info!(path = %root.display(), "starting ingestion");
63
64    // Build glob sets for include/exclude filtering
65    let include = build_globset(&config.include_patterns)?;
66    let exclude = build_globset(&config.exclude_patterns)?;
67
68    // Walk the directory to discover files
69    let file_paths = walk_directory(&root, &include, &exclude, config)?;
70
71    // Read file contents and detect languages
72    let mut files = Vec::with_capacity(file_paths.len());
73    for rel_path in &file_paths {
74        let abs_path = root.join(rel_path);
75        match std::fs::read_to_string(&abs_path) {
76            Ok(content) => {
77                let language = crate::parser::detect_language(rel_path);
78                let token_count = count_tokens(&content);
79                debug!(
80                    path = %rel_path.display(),
81                    language = ?language,
82                    tokens = token_count,
83                    "discovered file"
84                );
85                files.push(DiscoveredFile {
86                    path: rel_path.clone(),
87                    content,
88                    language,
89                    raw_token_count: token_count,
90                });
91            }
92            Err(e) => {
93                warn!(
94                    path = %rel_path.display(),
95                    error = %e,
96                    "failed to read file, skipping"
97                );
98            }
99        }
100    }
101
102    // Build directory tree
103    let directory_tree =
104        if config.no_tree { String::new() } else { build_directory_tree(&root, &file_paths) };
105
106    // Capture git context
107    let git_context = if config.no_git {
108        None
109    } else {
110        match extract_git_context(&root) {
111            Ok(Some(ctx)) => Some(ctx),
112            Ok(None) => None,
113            Err(e) => {
114                warn!(error = %e, "failed to extract git context");
115                None
116            }
117        }
118    };
119
120    info!(files = files.len(), git = git_context.is_some(), "ingestion complete");
121
122    Ok(IngestionResult { files, directory_tree, git_context })
123}
124
125/// Count tokens in a string using `tiktoken-rs`.
126fn count_tokens(text: &str) -> usize {
127    tiktoken_rs::cl100k_base().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
128}
129
130/// Build a directory tree string from discovered file paths.
131///
132/// Uses `termtree` to render a tree with annotations for detected languages.
133fn build_directory_tree(root: &Path, files: &[PathBuf]) -> String {
134    use termtree::Tree;
135
136    let parent_name = root.file_name().unwrap_or_default().to_string_lossy().to_string();
137
138    let mut tree = Tree::new(parent_name);
139
140    for file_path in files {
141        let mut current = &mut tree;
142        let components: Vec<_> =
143            file_path.components().map(|c| c.as_os_str().to_string_lossy().to_string()).collect();
144
145        for (i, component) in components.iter().enumerate() {
146            if i == components.len() - 1 {
147                // Leaf node - file with language annotation
148                let lang = crate::parser::detect_language(file_path)
149                    .map(|l| format!(" [{l}]"))
150                    .unwrap_or_default();
151                current.push(Tree::new(format!("{component}{lang}")));
152            } else {
153                // Directory node - find or create
154                let idx = current.leaves.iter().position(|child| child.root == component.as_str());
155                if let Some(pos) = idx {
156                    current = &mut current.leaves[pos];
157                } else {
158                    current.push(Tree::new(component.clone()));
159                    let last = current.leaves.len() - 1;
160                    current = &mut current.leaves[last];
161                }
162            }
163        }
164    }
165
166    tree.to_string()
167}
168
169/// Detect the language of a file from its extension.
170///
171/// Re-exports `parser::detect_language` for convenience.
172#[must_use]
173pub fn detect_language(path: &Path) -> Option<Language> {
174    crate::parser::detect_language(path)
175}
176
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
    use std::fs;

    use tempfile::TempDir;

    use super::*;

    /// Baseline configuration rooted at `root`: git capture off, tree on, no filters.
    fn make_config(root: &Path) -> AstDocConfig {
        AstDocConfig {
            path: root.to_path_buf(),
            output: None,
            max_tokens: 10_000,
            core_patterns: Vec::new(),
            default_strategy: crate::config::OutputStrategy::Full,
            include_patterns: Vec::new(),
            exclude_patterns: Vec::new(),
            no_git: true,
            no_tree: false,
            copy: false,
            verbose: false,
        }
    }

    /// Create a temporary project with `src/main.rs`, `src/lib.rs` and `Cargo.toml`.
    fn setup_rust_project() -> TempDir {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src")).unwrap();

        let fixtures = [
            ("src/main.rs", "fn main() {\n    println!(\"hello\");\n}\n"),
            ("src/lib.rs", "/// Library docs\npub fn lib() -> i32 {\n    42\n}\n"),
            ("Cargo.toml", "[package]\nname = \"test\"\n"),
        ];
        for (rel, body) in fixtures {
            fs::write(base.join(rel), body).unwrap();
        }
        dir
    }

    /// Run ingestion and unwrap the result.
    fn ingest(config: &AstDocConfig) -> IngestionResult {
        run_ingestion(config).unwrap()
    }

    /// Whether any discovered file's path ends with `suffix`.
    fn has_file(result: &IngestionResult, suffix: &str) -> bool {
        result.files.iter().any(|f| f.path.ends_with(suffix))
    }

    /// The discovered file whose path ends with `suffix`; panics if there is none.
    fn file_ending_with<'a>(result: &'a IngestionResult, suffix: &str) -> &'a DiscoveredFile {
        result.files.iter().find(|f| f.path.ends_with(suffix)).unwrap()
    }

    #[test]
    fn test_run_ingestion_discovers_files() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        assert!(!result.files.is_empty());
        assert!(has_file(&result, "src/main.rs"));
        assert!(has_file(&result, "src/lib.rs"));
    }

    #[test]
    fn test_run_ingestion_detects_languages() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert_eq!(main_file.language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_counts_tokens() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        for file in &result.files {
            assert!(file.raw_token_count > 0, "token count should be > 0");
        }
    }

    #[test]
    fn test_run_ingestion_with_include_patterns() {
        let dir = setup_rust_project();
        let mut config = make_config(dir.path());
        config.include_patterns = vec!["*.rs".to_string()];

        let result = ingest(&config);
        assert!(result.files.iter().all(|f| f.path.extension().is_some_and(|e| e == "rs")));
    }

    #[test]
    fn test_run_ingestion_with_exclude_patterns() {
        let dir = setup_rust_project();
        let mut config = make_config(dir.path());
        config.exclude_patterns = vec!["*.toml".to_string()];

        let result = ingest(&config);
        assert!(!has_file(&result, "Cargo.toml"));
    }

    #[test]
    fn test_run_ingestion_no_tree() {
        let dir = setup_rust_project();
        let mut config = make_config(dir.path());
        config.no_tree = true;

        let result = ingest(&config);
        assert!(result.directory_tree.is_empty());
    }

    #[test]
    fn test_run_ingestion_generates_tree() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        assert!(!result.directory_tree.is_empty());
        // The rendered tree must mention both the directory and the file inside it.
        let tree = &result.directory_tree;
        assert!(tree.contains("src"), "tree should contain 'src' directory");
        assert!(tree.contains("main.rs"), "tree should contain 'main.rs'");
    }

    #[test]
    fn test_run_ingestion_no_git_flag() {
        let dir = setup_rust_project();
        let mut config = make_config(dir.path());
        config.no_git = true;

        let result = ingest(&config);
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_run_ingestion_reads_file_contents() {
        let dir = setup_rust_project();
        let result = ingest(&make_config(dir.path()));

        let main_file = file_ending_with(&result, "src/main.rs");
        assert!(main_file.content.contains("main"));
    }

    #[test]
    fn test_run_ingestion_with_python_files() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::write(base.join("app.py"), "def main():\n    pass\n").unwrap();
        fs::write(base.join("main.rs"), "fn main() {}\n").unwrap();

        let result = ingest(&make_config(base));

        let py_file = file_ending_with(&result, "app.py");
        assert_eq!(py_file.language, Some(Language::Python));

        let rs_file = file_ending_with(&result, "main.rs");
        assert_eq!(rs_file.language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_empty_directory() {
        let dir = TempDir::new().unwrap();
        let result = ingest(&make_config(dir.path()));

        assert!(result.files.is_empty());
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_build_directory_tree_basic() {
        let dir = TempDir::new().unwrap();
        let files =
            vec![PathBuf::from("src/main.rs"), PathBuf::from("src/lib.rs"), PathBuf::from("README.md")];

        let tree = build_directory_tree(dir.path(), &files);
        for expected in ["src", "main.rs", "lib.rs", "README.md"] {
            assert!(tree.contains(expected));
        }
    }

    #[test]
    fn test_run_ingestion_nested_directories() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src/utils/helpers")).unwrap();
        fs::write(base.join("src/utils/helpers/math.rs"), "pub fn add() {}").unwrap();
        fs::write(base.join("src/main.rs"), "fn main() {}").unwrap();

        let result = ingest(&make_config(base));

        assert_eq!(result.files.len(), 2);
        assert!(has_file(&result, "src/utils/helpers/math.rs"));

        let tree = &result.directory_tree;
        assert!(tree.contains("utils"), "tree should contain 'utils'");
        assert!(tree.contains("helpers"), "tree should contain 'helpers'");
        assert!(tree.contains("math.rs"), "tree should contain 'math.rs'");
    }
}
370}