Skip to main content

ast_doc_core/ingestion/
mod.rs

1//! Phase 1: File discovery and ingestion.
2//!
3//! Walks the project directory, applies .gitignore/.astdocignore rules,
4//! captures git metadata, and produces a directory tree.
5
6pub mod git;
7pub mod walker;
8
9use std::path::{Path, PathBuf};
10
11use git::extract_git_context;
12use tracing::{debug, info, warn};
13use walker::{build_globset, walk_directory};
14
15use crate::{config::AstDocConfig, error::AstDocError, parser::Language};
16
17/// A discovered source file with its content and metadata.
18#[derive(Debug, Clone)]
19pub struct DiscoveredFile {
20    /// Relative path from the project root.
21    pub path: PathBuf,
22    /// Full file content.
23    pub content: String,
24    /// Detected language (None if unsupported).
25    pub language: Option<Language>,
26    /// Raw token count of the original content.
27    pub raw_token_count: usize,
28}
29
/// Snapshot of repository state gathered from git for the ingested project.
#[derive(Clone, Debug)]
pub struct GitContext {
    /// Name of the branch that is currently checked out.
    pub branch: String,
    /// Summary of the most recent commit.
    pub latest_commit: String,
    /// Diff of uncommitted changes, if any; the text may have been truncated.
    pub diff: Option<String>,
}
40
41/// Result of the ingestion phase.
42#[derive(Debug)]
43pub struct IngestionResult {
44    /// All discovered source files.
45    pub files: Vec<DiscoveredFile>,
46    /// Directory tree string (with annotations).
47    pub directory_tree: String,
48    /// Git context (None if `--no-git` or not a git repo).
49    pub git_context: Option<GitContext>,
50}
51
52/// Run the ingestion phase.
53///
54/// # Errors
55///
56/// Returns an error if directory walking or git operations fail.
57#[cfg_attr(feature = "hotpath", allow(missing_docs))]
58#[cfg_attr(feature = "hotpath", hotpath::measure)]
59pub fn run_ingestion(config: &AstDocConfig) -> Result<IngestionResult, AstDocError> {
60    let root = config
61        .path
62        .canonicalize()
63        .map_err(|e| AstDocError::FileRead { path: config.path.clone(), source: e })?;
64    info!(path = %root.display(), "starting ingestion");
65
66    // Build glob sets for include/exclude filtering
67    let include = build_globset(&config.include_patterns)?;
68    let exclude = build_globset(&config.exclude_patterns)?;
69
70    // Walk the directory to discover files
71    let file_paths = walk_directory(&root, &include, &exclude, config)?;
72
73    // Read file contents and detect languages
74    let mut files = Vec::with_capacity(file_paths.len());
75    for rel_path in &file_paths {
76        let abs_path = root.join(rel_path);
77        match std::fs::read_to_string(&abs_path) {
78            Ok(content) => {
79                let lang = crate::parser::detect_language(rel_path);
80                let token_count = count_tokens(&content);
81                debug!(
82                    path = %rel_path.display(),
83                    language = ?lang,
84                    tokens = token_count,
85                    "discovered file"
86                );
87                files.push(DiscoveredFile {
88                    path: rel_path.clone(),
89                    content,
90                    language: lang,
91                    raw_token_count: token_count,
92                });
93            }
94            Err(e) => {
95                warn!(
96                    path = %rel_path.display(),
97                    error = %e,
98                    "failed to read file, skipping"
99                );
100            }
101        }
102    }
103
104    // Build directory tree
105    let directory_tree =
106        if config.no_tree { String::new() } else { build_directory_tree(&root, &file_paths) };
107
108    // Capture git context
109    let git_context = if config.no_git {
110        None
111    } else {
112        match extract_git_context(&root) {
113            Ok(Some(ctx)) => Some(ctx),
114            Ok(None) => None,
115            Err(e) => {
116                warn!(error = %e, "failed to extract git context");
117                None
118            }
119        }
120    };
121
122    info!(files = files.len(), git = git_context.is_some(), "ingestion complete");
123
124    Ok(IngestionResult { files, directory_tree, git_context })
125}
126
127/// Count tokens in a string using `tiktoken-rs`.
128///
129/// Uses a cached BPE instance to avoid repeated initialization.
130fn count_tokens(text: &str) -> usize {
131    use std::sync::LazyLock;
132    static BPE: LazyLock<Option<tiktoken_rs::CoreBPE>> =
133        LazyLock::new(|| tiktoken_rs::cl100k_base().ok());
134
135    BPE.as_ref().map_or(0, |bpe| bpe.encode_with_special_tokens(text).len())
136}
137
138/// Build a directory tree string from discovered file paths.
139///
140/// Uses `termtree` to render a tree with annotations for detected languages.
141fn build_directory_tree(root: &Path, files: &[PathBuf]) -> String {
142    use termtree::Tree;
143
144    let parent_name = root.file_name().unwrap_or_default().to_string_lossy().to_string();
145
146    let mut tree = Tree::new(parent_name);
147
148    for file_path in files {
149        let mut current = &mut tree;
150        let components: Vec<_> =
151            file_path.components().map(|c| c.as_os_str().to_string_lossy().to_string()).collect();
152
153        for (i, component) in components.iter().enumerate() {
154            if i == components.len() - 1 {
155                // Leaf node - file with language annotation
156                let lang = crate::parser::detect_language(file_path)
157                    .map(|l| format!(" [{l}]"))
158                    .unwrap_or_default();
159                current.push(Tree::new(format!("{component}{lang}")));
160            } else {
161                // Directory node - find or create
162                let idx = current.leaves.iter().position(|child| child.root == component.as_str());
163                if let Some(pos) = idx {
164                    current = &mut current.leaves[pos];
165                } else {
166                    current.push(Tree::new(component.clone()));
167                    let last = current.leaves.len() - 1;
168                    current = &mut current.leaves[last];
169                }
170            }
171        }
172    }
173
174    tree.to_string()
175}
176
177/// Detect the language of a file from its extension.
178///
179/// Re-exports `parser::detect_language` for convenience.
180#[must_use]
181pub fn detect_language(path: &Path) -> Option<Language> {
182    crate::parser::detect_language(path)
183}
184
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
    use std::fs;

    use tempfile::TempDir;

    use super::*;

    /// Baseline configuration rooted at `root`: git capture off, tree on, no filters.
    fn make_config(root: &Path) -> AstDocConfig {
        AstDocConfig {
            path: root.to_path_buf(),
            output: None,
            max_tokens: 10_000,
            core_patterns: vec![],
            default_strategy: crate::config::OutputStrategy::Full,
            include_patterns: vec![],
            exclude_patterns: vec![],
            no_git: true,
            no_tree: false,
            copy: false,
            verbose: false,
        }
    }

    /// Create a throwaway project containing two Rust sources and a manifest.
    fn setup_rust_project() -> TempDir {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src")).unwrap();

        let fixtures = [
            ("src/main.rs", "fn main() {\n    println!(\"hello\");\n}\n"),
            ("src/lib.rs", "/// Library docs\npub fn lib() -> i32 {\n    42\n}\n"),
            ("Cargo.toml", "[package]\nname = \"test\"\n"),
        ];
        for (rel, body) in fixtures {
            fs::write(base.join(rel), body).unwrap();
        }
        dir
    }

    /// Run ingestion over `root` with the baseline configuration.
    fn ingest(root: &Path) -> IngestionResult {
        run_ingestion(&make_config(root)).unwrap()
    }

    /// Find the discovered file whose path ends with `suffix`.
    fn file_named<'a>(result: &'a IngestionResult, suffix: &str) -> Option<&'a DiscoveredFile> {
        result.files.iter().find(|f| f.path.ends_with(suffix))
    }

    #[test]
    fn test_run_ingestion_discovers_files() {
        let dir = setup_rust_project();
        let result = ingest(dir.path());

        assert!(!result.files.is_empty());
        for expected in ["src/main.rs", "src/lib.rs"] {
            assert!(file_named(&result, expected).is_some());
        }
    }

    #[test]
    fn test_run_ingestion_detects_languages() {
        let dir = setup_rust_project();
        let result = ingest(dir.path());

        let main_file = file_named(&result, "src/main.rs").unwrap();
        assert_eq!(main_file.language, Some(Language::Rust));
    }

    #[test]
    fn test_run_ingestion_counts_tokens() {
        let dir = setup_rust_project();
        let result = ingest(dir.path());

        assert!(result.files.iter().all(|f| f.raw_token_count > 0), "token count should be > 0");
    }

    #[test]
    fn test_run_ingestion_with_include_patterns() {
        let dir = setup_rust_project();
        let config =
            AstDocConfig { include_patterns: vec!["*.rs".to_string()], ..make_config(dir.path()) };

        let result = run_ingestion(&config).unwrap();
        for file in &result.files {
            assert_eq!(file.path.extension(), Some(std::ffi::OsStr::new("rs")));
        }
    }

    #[test]
    fn test_run_ingestion_with_exclude_patterns() {
        let dir = setup_rust_project();
        let config = AstDocConfig {
            exclude_patterns: vec!["*.toml".to_string()],
            ..make_config(dir.path())
        };

        let result = run_ingestion(&config).unwrap();
        assert!(file_named(&result, "Cargo.toml").is_none());
    }

    #[test]
    fn test_run_ingestion_no_tree() {
        let dir = setup_rust_project();
        let config = AstDocConfig { no_tree: true, ..make_config(dir.path()) };

        let result = run_ingestion(&config).unwrap();
        assert!(result.directory_tree.is_empty());
    }

    #[test]
    fn test_run_ingestion_generates_tree() {
        let dir = setup_rust_project();
        let result = ingest(dir.path());

        let tree = &result.directory_tree;
        assert!(!tree.is_empty());
        // The rendered tree must mention both the directory and the file inside it.
        let expectations = [
            ("src", "tree should contain 'src' directory"),
            ("main.rs", "tree should contain 'main.rs'"),
        ];
        for (needle, message) in expectations {
            assert!(tree.contains(needle), "{message}");
        }
    }

    #[test]
    fn test_run_ingestion_no_git_flag() {
        let dir = setup_rust_project();
        let config = AstDocConfig { no_git: true, ..make_config(dir.path()) };

        let result = run_ingestion(&config).unwrap();
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_run_ingestion_reads_file_contents() {
        let dir = setup_rust_project();
        let result = ingest(dir.path());

        let main_file = file_named(&result, "src/main.rs").unwrap();
        assert!(main_file.content.contains("main"));
    }

    #[test]
    fn test_run_ingestion_with_python_files() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::write(base.join("app.py"), "def main():\n    pass\n").unwrap();
        fs::write(base.join("main.rs"), "fn main() {}\n").unwrap();

        let result = ingest(base);

        for (name, language) in [("app.py", Language::Python), ("main.rs", Language::Rust)] {
            let file = file_named(&result, name).unwrap();
            assert_eq!(file.language, Some(language));
        }
    }

    #[test]
    fn test_run_ingestion_empty_directory() {
        let dir = TempDir::new().unwrap();
        let result = ingest(dir.path());

        assert!(result.files.is_empty());
        assert!(result.git_context.is_none());
    }

    #[test]
    fn test_build_directory_tree_basic() {
        let dir = TempDir::new().unwrap();
        let files: Vec<PathBuf> =
            ["src/main.rs", "src/lib.rs", "README.md"].into_iter().map(PathBuf::from).collect();

        let tree = build_directory_tree(dir.path(), &files);
        for needle in ["src", "main.rs", "lib.rs", "README.md"] {
            assert!(tree.contains(needle));
        }
    }

    #[test]
    fn test_run_ingestion_nested_directories() {
        let dir = TempDir::new().unwrap();
        let base = dir.path();
        fs::create_dir_all(base.join("src/utils/helpers")).unwrap();
        fs::write(base.join("src/utils/helpers/math.rs"), "pub fn add() {}").unwrap();
        fs::write(base.join("src/main.rs"), "fn main() {}").unwrap();

        let result = ingest(base);

        assert_eq!(result.files.len(), 2);
        assert!(file_named(&result, "src/utils/helpers/math.rs").is_some());

        let tree = &result.directory_tree;
        let expectations = [
            ("utils", "tree should contain 'utils'"),
            ("helpers", "tree should contain 'helpers'"),
            ("math.rs", "tree should contain 'math.rs'"),
        ];
        for (needle, message) in expectations {
            assert!(tree.contains(needle), "{message}");
        }
    }
}