project_rag/indexer/file_walker/
mod.rs1use super::file_info::FileInfo;
4use super::language::detect_language;
5use super::pdf_extractor::extract_pdf_to_markdown;
6use anyhow::{Context, Result};
7use ignore::WalkBuilder;
8use sha2::{Digest, Sha256};
9use std::fs;
10use std::path::{Path, PathBuf};
11use std::sync::atomic::{AtomicBool, Ordering};
12use std::sync::Arc;
13
/// Configuration for a recursive walk over a directory tree that collects
/// indexable files.
///
/// Instances are created with `FileWalker::new` and refined through the
/// `with_*` builder methods before calling `walk`.
pub struct FileWalker {
    /// Directory at which the walk starts.
    pub(crate) root: PathBuf,
    /// Optional project label copied into every collected file record.
    pub(crate) project: Option<String>,
    /// Upper bound, in bytes, on the size of a file that will be collected.
    pub(crate) max_file_size: usize,
    /// Substrings of which at least one must occur in a file's path; an empty
    /// list disables include filtering.
    pub(crate) include_patterns: Vec<String>,
    /// Substrings that reject a file when any of them occurs in its path.
    pub(crate) exclude_patterns: Vec<String>,
    /// Shared flag polled during the walk; when it reads `true` the walk
    /// stops with an error. `None` means the walk cannot be cancelled.
    cancelled: Option<Arc<AtomicBool>>,
}
23
24impl FileWalker {
25 pub fn new(root: impl AsRef<Path>, max_file_size: usize) -> Self {
26 Self {
27 root: root.as_ref().to_path_buf(),
28 project: None,
29 max_file_size,
30 include_patterns: vec![],
31 exclude_patterns: vec![],
32 cancelled: None,
33 }
34 }
35
36 pub fn with_cancellation_flag(mut self, cancelled: Arc<AtomicBool>) -> Self {
39 self.cancelled = Some(cancelled);
40 self
41 }
42
43 fn is_cancelled(&self) -> bool {
45 self.cancelled
46 .as_ref()
47 .is_some_and(|flag| flag.load(Ordering::Relaxed))
48 }
49
50 pub fn with_project(mut self, project: Option<String>) -> Self {
51 self.project = project;
52 self
53 }
54
55 pub fn with_patterns(
56 mut self,
57 include_patterns: Vec<String>,
58 exclude_patterns: Vec<String>,
59 ) -> Self {
60 self.include_patterns = include_patterns;
61 self.exclude_patterns = exclude_patterns;
62 self
63 }
64
65 pub fn walk(&self) -> Result<Vec<FileInfo>> {
67 if !self.root.exists() {
69 anyhow::bail!("Root directory does not exist: {:?}", self.root);
70 }
71 if !self.root.is_dir() {
72 anyhow::bail!("Root path is not a directory: {:?}", self.root);
73 }
74
75 let mut files = Vec::new();
76
77 let walker = WalkBuilder::new(&self.root)
78 .standard_filters(true) .hidden(false) .git_ignore(true) .git_exclude(true) .git_global(true) .require_git(false) .build();
85
86 for entry in walker {
87 if self.is_cancelled() {
89 tracing::info!("File walk cancelled after {} files", files.len());
90 anyhow::bail!("Indexing was cancelled");
91 }
92
93 let entry = entry.context("Failed to read directory entry")?;
94 let path = entry.path();
95
96 if path.is_dir() {
98 continue;
99 }
100
101 if path.components().any(|c| c.as_os_str() == ".git") {
103 tracing::debug!("Skipping .git directory file: {:?}", path);
104 continue;
105 }
106
107 if let Ok(metadata) = fs::metadata(path)
109 && metadata.len() > self.max_file_size as u64
110 {
111 tracing::debug!("Skipping large file: {:?}", path);
112 continue;
113 }
114
115 let is_pdf = path
117 .extension()
118 .and_then(|e| e.to_str())
119 .map(|e| e.to_lowercase() == "pdf")
120 .unwrap_or(false);
121
122 if !is_pdf && !self.is_text_file(path)? {
123 tracing::debug!("Skipping binary file: {:?}", path);
124 continue;
125 }
126
127 if !self.matches_patterns(path) {
129 continue;
130 }
131
132 let content = if is_pdf {
134 match extract_pdf_to_markdown(path) {
135 Ok(c) => c,
136 Err(e) => {
137 tracing::warn!("Failed to extract PDF {:?}: {}", path, e);
138 continue;
139 }
140 }
141 } else {
142 match fs::read_to_string(path) {
143 Ok(c) => c,
144 Err(e) => {
145 tracing::debug!(
146 "Skipping file that can't be read as UTF-8: {:?}: {}",
147 path,
148 e
149 );
150 continue;
151 }
152 }
153 };
154
155 let hash = self.calculate_hash(&content);
157
158 let relative_path = path
160 .strip_prefix(&self.root)
161 .unwrap_or(path)
162 .to_string_lossy()
163 .to_string();
164
165 let extension = path.extension().and_then(|e| e.to_str()).map(String::from);
167 let language = extension.as_ref().and_then(|ext| detect_language(ext));
168
169 files.push(FileInfo {
170 path: path.to_path_buf(),
171 relative_path,
172 root_path: self.root.to_string_lossy().to_string(),
173 project: self.project.clone(),
174 extension,
175 language,
176 content,
177 hash,
178 });
179 }
180
181 tracing::info!("Found {} files to index", files.len());
182 Ok(files)
183 }
184
185 pub(crate) fn is_text_file(&self, path: &Path) -> Result<bool> {
187 let content = fs::read(path).context("Failed to read file")?;
188
189 let non_printable = content
191 .iter()
192 .filter(|&&b| b < 0x20 && b != b'\n' && b != b'\r' && b != b'\t')
193 .count();
194
195 Ok((non_printable as f64 / content.len() as f64) < 0.3)
196 }
197
198 pub(crate) fn matches_patterns(&self, path: &Path) -> bool {
200 let path_str = path.to_string_lossy();
201
202 if !self.include_patterns.is_empty() {
204 let matches_include = self
205 .include_patterns
206 .iter()
207 .any(|pattern| path_str.contains(pattern));
208 if !matches_include {
209 return false;
210 }
211 }
212
213 if self
215 .exclude_patterns
216 .iter()
217 .any(|pattern| path_str.contains(pattern))
218 {
219 return false;
220 }
221
222 true
223 }
224
225 pub(crate) fn calculate_hash(&self, content: &str) -> String {
226 let mut hasher = Sha256::new();
227 hasher.update(content.as_bytes());
228 format!("{:x}", hasher.finalize())
229 }
230}
231
232#[cfg(test)]
233mod tests;