project_rag/indexer/file_walker/
mod.rs

//! File walking for the indexer: traverses a root directory (honoring ignore files)
//! and collects text and PDF files as `FileInfo` records.
2
3use super::file_info::FileInfo;
4use super::language::detect_language;
5use super::pdf_extractor::extract_pdf_to_markdown;
6use anyhow::{Context, Result};
7use ignore::WalkBuilder;
8use sha2::{Digest, Sha256};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::sync::atomic::{AtomicBool, Ordering};
12use std::sync::Arc;
13
/// Configurable directory walker that collects indexable files under a root.
///
/// Built with [`FileWalker::new`] and refined through the `with_*` builder
/// methods. Cloning a walker shares the same cancellation flag (it is held in
/// an [`Arc`]), so cancelling one clone cancels them all.
#[derive(Debug, Clone)]
pub struct FileWalker {
    /// Directory the walk starts from; relative paths are computed against it.
    pub(crate) root: PathBuf,
    /// Optional project label copied onto every collected file.
    pub(crate) project: Option<String>,
    /// Files larger than this many bytes are skipped.
    pub(crate) max_file_size: usize,
    /// Substrings of which a path must contain at least one (ignored when empty).
    pub(crate) include_patterns: Vec<String>,
    /// Substrings that disqualify a path when it contains any of them.
    pub(crate) exclude_patterns: Vec<String>,
    /// Optional cancellation flag - if set to true, walk() will exit early
    cancelled: Option<Arc<AtomicBool>>,
}
23
24impl FileWalker {
25    pub fn new(root: impl AsRef<Path>, max_file_size: usize) -> Self {
26        Self {
27            root: root.as_ref().to_path_buf(),
28            project: None,
29            max_file_size,
30            include_patterns: vec![],
31            exclude_patterns: vec![],
32            cancelled: None,
33        }
34    }
35
36    /// Set a cancellation flag that will be checked during the walk.
37    /// If the flag is set to true, the walk will exit early.
38    pub fn with_cancellation_flag(mut self, cancelled: Arc<AtomicBool>) -> Self {
39        self.cancelled = Some(cancelled);
40        self
41    }
42
43    /// Check if cancellation has been requested
44    fn is_cancelled(&self) -> bool {
45        self.cancelled
46            .as_ref()
47            .is_some_and(|flag| flag.load(Ordering::Relaxed))
48    }
49
50    pub fn with_project(mut self, project: Option<String>) -> Self {
51        self.project = project;
52        self
53    }
54
55    pub fn with_patterns(
56        mut self,
57        include_patterns: Vec<String>,
58        exclude_patterns: Vec<String>,
59    ) -> Self {
60        self.include_patterns = include_patterns;
61        self.exclude_patterns = exclude_patterns;
62        self
63    }
64
65    /// Walk the directory and collect all eligible files
66    pub fn walk(&self) -> Result<Vec<FileInfo>> {
67        // Verify root directory exists
68        if !self.root.exists() {
69            anyhow::bail!("Root directory does not exist: {:?}", self.root);
70        }
71        if !self.root.is_dir() {
72            anyhow::bail!("Root path is not a directory: {:?}", self.root);
73        }
74
75        let mut files = Vec::new();
76
77        let walker = WalkBuilder::new(&self.root)
78            .standard_filters(true) // Respect .gitignore, .ignore, etc.
79            .hidden(false) // Don't skip hidden files by default
80            .git_ignore(true) // Respect .gitignore files
81            .git_exclude(true) // Respect .git/info/exclude
82            .git_global(true) // Respect global gitignore
83            .require_git(false) // Don't require a .git directory
84            .build();
85
86        for entry in walker {
87            // Check for cancellation at the start of each iteration
88            if self.is_cancelled() {
89                tracing::info!("File walk cancelled after {} files", files.len());
90                anyhow::bail!("Indexing was cancelled");
91            }
92
93            let entry = entry.context("Failed to read directory entry")?;
94            let path = entry.path();
95
96            // Skip directories
97            if path.is_dir() {
98                continue;
99            }
100
101            // Explicitly skip .git directory contents
102            if path.components().any(|c| c.as_os_str() == ".git") {
103                tracing::debug!("Skipping .git directory file: {:?}", path);
104                continue;
105            }
106
107            // Check file size
108            if let Ok(metadata) = fs::metadata(path)
109                && metadata.len() > self.max_file_size as u64
110            {
111                tracing::debug!("Skipping large file: {:?}", path);
112                continue;
113            }
114
115            // Check if file is text (binary detection), but allow PDFs
116            let is_pdf = path
117                .extension()
118                .and_then(|e| e.to_str())
119                .map(|e| e.to_lowercase() == "pdf")
120                .unwrap_or(false);
121
122            if !is_pdf && !self.is_text_file(path)? {
123                tracing::debug!("Skipping binary file: {:?}", path);
124                continue;
125            }
126
127            // Apply include/exclude patterns
128            if !self.matches_patterns(path) {
129                continue;
130            }
131
132            // Read file content - extract text from PDFs or read as UTF-8
133            let content = if is_pdf {
134                match extract_pdf_to_markdown(path) {
135                    Ok(c) => c,
136                    Err(e) => {
137                        tracing::warn!("Failed to extract PDF {:?}: {}", path, e);
138                        continue;
139                    }
140                }
141            } else {
142                match fs::read_to_string(path) {
143                    Ok(c) => c,
144                    Err(e) => {
145                        tracing::debug!(
146                            "Skipping file that can't be read as UTF-8: {:?}: {}",
147                            path,
148                            e
149                        );
150                        continue;
151                    }
152                }
153            };
154
155            // Calculate hash
156            let hash = self.calculate_hash(&content);
157
158            // Get relative path
159            let relative_path = path
160                .strip_prefix(&self.root)
161                .unwrap_or(path)
162                .to_string_lossy()
163                .to_string();
164
165            // Detect language
166            let extension = path.extension().and_then(|e| e.to_str()).map(String::from);
167            let language = extension.as_ref().and_then(|ext| detect_language(ext));
168
169            files.push(FileInfo {
170                path: path.to_path_buf(),
171                relative_path,
172                root_path: self.root.to_string_lossy().to_string(),
173                project: self.project.clone(),
174                extension,
175                language,
176                content,
177                hash,
178            });
179        }
180
181        tracing::info!("Found {} files to index", files.len());
182        Ok(files)
183    }
184
185    /// Check if a file is likely text (not binary)
186    pub(crate) fn is_text_file(&self, path: &Path) -> Result<bool> {
187        let content = fs::read(path).context("Failed to read file")?;
188
189        // Simple heuristic: if more than 30% of bytes are non-printable, it's binary
190        let non_printable = content
191            .iter()
192            .filter(|&&b| b < 0x20 && b != b'\n' && b != b'\r' && b != b'\t')
193            .count();
194
195        Ok((non_printable as f64 / content.len() as f64) < 0.3)
196    }
197
198    /// Check if file matches include/exclude patterns
199    pub(crate) fn matches_patterns(&self, path: &Path) -> bool {
200        let path_str = path.to_string_lossy();
201
202        // If include patterns are specified, file must match at least one
203        if !self.include_patterns.is_empty() {
204            let matches_include = self
205                .include_patterns
206                .iter()
207                .any(|pattern| path_str.contains(pattern));
208            if !matches_include {
209                return false;
210            }
211        }
212
213        // File must not match any exclude pattern
214        if self
215            .exclude_patterns
216            .iter()
217            .any(|pattern| path_str.contains(pattern))
218        {
219            return false;
220        }
221
222        true
223    }
224
225    pub(crate) fn calculate_hash(&self, content: &str) -> String {
226        let mut hasher = Sha256::new();
227        hasher.update(content.as_bytes());
228        format!("{:x}", hasher.finalize())
229    }
230}
231
232#[cfg(test)]
233mod tests;