1use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use scribe_core::{Result, ScribeError};
8use std::collections::{HashMap, HashSet};
9use std::fs::File;
10use std::io::{BufReader, Read};
11use std::path::Path;
12
/// Common, low-signal file and directory names excluded from indexing.
///
/// These names occur so frequently across codebases (entry points, build
/// output directories, generic module names) that matching them would
/// produce mostly false-positive references. Note the check in
/// `is_valid_token` is case-sensitive, which is why both "README" and
/// "readme" appear here.
const STOPLIST: &[&str] = &[
    "README",
    "readme",
    "index",
    "test",
    "tests",
    "spec",
    "specs",
    "main",
    "app",
    "lib",
    "src",
    "build",
    "dist",
    "target",
    "node_modules",
    "package",
    "config",
    "util",
    "utils",
    "helper",
    "helpers",
    "common",
    "base",
    "core",
    "api",
    "client",
    "server",
    "data",
    "model",
    "view",
];
46
/// Tunable parameters controlling token extraction and automaton size.
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Shortest token (in bytes) eligible for indexing.
    pub min_token_length: usize,
    /// Longest token (in bytes) eligible for indexing.
    pub max_token_length: usize,
    /// Upper bound on the number of patterns compiled into the automaton.
    pub max_patterns: usize,
    /// Size (in bytes) of each buffer chunk read while scanning files.
    pub chunk_size: usize,
    /// Also index file stems (the filename without its extension).
    pub include_stems: bool,
    /// Also index the names of ancestor directory components.
    pub include_dirs: bool,
}
63
64impl Default for IndexConfig {
65 fn default() -> Self {
66 Self {
67 min_token_length: 3,
68 max_token_length: 64,
69 max_patterns: 10_000,
70 chunk_size: 64 * 1024, include_stems: true,
72 include_dirs: false,
73 }
74 }
75}
76
/// A single searchable token derived from a file path.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FileToken {
    /// The token text exactly as it will be matched in file contents.
    pub text: String,
    /// Which part of the path this token was extracted from.
    pub token_type: TokenType,
}
83
/// Classifies which part of a path a token was extracted from.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
    /// The full file name, extension included (e.g. `user.rs`).
    Basename,
    /// The file name without its extension (e.g. `user`).
    Stem,
    /// The name of an ancestor directory component.
    Directory,
}
90
/// Reverse index mapping filename-derived tokens to the files they
/// identify, backed by an Aho-Corasick automaton for multi-pattern
/// scanning of file contents.
#[derive(Debug)]
pub struct AhoCorasickReferenceIndex {
    /// Maps each indexed token to the ids (positions in the input slice
    /// passed to `new`) of the files that produced it.
    token_to_files: HashMap<String, Vec<usize>>,
    /// Tokens extracted per file, indexed by file id.
    file_tokens: Vec<Vec<FileToken>>,
    /// Compiled multi-pattern matcher; `None` until built, or when there
    /// are no patterns at all.
    automaton: Option<AhoCorasick>,
    /// Pattern strings in automaton order; a match's pattern id indexes
    /// into this vector.
    patterns: Vec<String>,
    /// Extraction and build configuration.
    config: IndexConfig,
    /// Counters collected during build and scanning.
    pub metrics: IndexMetrics,
}
107
/// Counters describing index construction and scan activity.
#[derive(Debug, Default, Clone)]
pub struct IndexMetrics {
    /// Number of files the index was built over.
    pub total_files: usize,
    /// Total tokens extracted across all files (duplicates included).
    pub total_tokens: usize,
    /// Number of distinct patterns compiled into the automaton.
    pub unique_tokens: usize,
    /// Wall-clock time spent in `new` (tokenization plus automaton
    /// construction), in milliseconds.
    pub automaton_build_ms: u64,
    /// Bytes of file content scanned so far.
    pub bytes_scanned: u64,
    /// Raw automaton matches encountered during scans.
    pub matches_found: usize,
    /// Matches submitted to the word-boundary check.
    pub boundary_checks: usize,
    /// Matches that passed the boundary check.
    pub valid_matches: usize,
}
120
121impl AhoCorasickReferenceIndex {
122 pub fn new(file_paths: &[impl AsRef<Path>], config: IndexConfig) -> Result<Self> {
124 let start_time = std::time::Instant::now();
125
126 let mut index = Self {
127 token_to_files: HashMap::new(),
128 file_tokens: Vec::with_capacity(file_paths.len()),
129 automaton: None,
130 patterns: Vec::new(),
131 config,
132 metrics: IndexMetrics {
133 total_files: file_paths.len(),
134 ..Default::default()
135 },
136 };
137
138 for (file_id, path) in file_paths.iter().enumerate() {
140 let tokens = index.extract_tokens(path.as_ref());
141
142 for token in &tokens {
143 index
144 .token_to_files
145 .entry(token.text.clone())
146 .or_insert_with(Vec::new)
147 .push(file_id);
148 }
149
150 index.metrics.total_tokens += tokens.len();
151 index.file_tokens.push(tokens);
152 }
153
154 index.build_automaton()?;
156
157 index.metrics.automaton_build_ms = start_time.elapsed().as_millis() as u64;
158
159 Ok(index)
160 }
161
162 fn extract_tokens(&self, path: &Path) -> Vec<FileToken> {
164 let mut tokens = Vec::new();
165
166 if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
167 if self.is_valid_token(filename) {
169 tokens.push(FileToken {
170 text: filename.to_string(),
171 token_type: TokenType::Basename,
172 });
173 }
174
175 if self.config.include_stems {
177 if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
178 if self.is_valid_token(stem) && stem != filename {
179 tokens.push(FileToken {
180 text: stem.to_string(),
181 token_type: TokenType::Stem,
182 });
183 }
184 }
185 }
186 }
187
188 if self.config.include_dirs {
190 if let Some(parent) = path.parent() {
191 for component in parent.components() {
192 if let Some(dir_name) = component.as_os_str().to_str() {
193 if self.is_valid_token(dir_name) {
194 tokens.push(FileToken {
195 text: dir_name.to_string(),
196 token_type: TokenType::Directory,
197 });
198 }
199 }
200 }
201 }
202 }
203
204 tokens
205 }
206
207 fn is_valid_token(&self, token: &str) -> bool {
209 let len = token.len();
210
211 if len < self.config.min_token_length || len > self.config.max_token_length {
213 return false;
214 }
215
216 if STOPLIST.contains(&token) {
218 return false;
219 }
220
221 if token.chars().all(|c| c.is_ascii_digit()) {
223 return false;
224 }
225
226 if !token.chars().any(|c| c.is_alphanumeric()) {
228 return false;
229 }
230
231 true
232 }
233
234 fn build_automaton(&mut self) -> Result<()> {
236 let mut patterns: Vec<String> = self.token_to_files.keys().cloned().collect();
238 patterns.sort_by_key(|p| std::cmp::Reverse(self.token_to_files[p].len())); patterns.truncate(self.config.max_patterns);
240
241 self.metrics.unique_tokens = patterns.len();
242
243 if patterns.is_empty() {
244 return Ok(());
245 }
246
247 let automaton = AhoCorasickBuilder::new()
249 .match_kind(MatchKind::Standard) .build(&patterns)
251 .map_err(|e| {
252 ScribeError::io(
253 format!("Failed to build Aho-Corasick automaton: {}", e),
254 std::io::Error::new(std::io::ErrorKind::Other, e),
255 )
256 })?;
257
258 self.patterns = patterns;
259 self.automaton = Some(automaton);
260
261 Ok(())
262 }
263
264 pub fn scan_file_for_references<P: AsRef<Path>>(&mut self, file_path: P) -> Result<Vec<usize>> {
266 let Some(ref automaton) = self.automaton else {
267 return Ok(Vec::new());
268 };
269
270 let file = File::open(file_path.as_ref()).map_err(|e| {
271 ScribeError::io(
272 format!("Failed to open file: {}", file_path.as_ref().display()),
273 e,
274 )
275 })?;
276
277 let mut reader = BufReader::new(file);
278 let mut referenced_files = HashSet::new();
279
280 let max_pattern_len = self.patterns.iter().map(|p| p.len()).max().unwrap_or(0);
282 let overlap = max_pattern_len.saturating_sub(1);
283
284 let mut buffer = vec![0u8; self.config.chunk_size + overlap];
285 let mut carried_bytes = 0;
286
287 loop {
288 let bytes_read = reader
290 .read(&mut buffer[carried_bytes..])
291 .map_err(|e| ScribeError::io("Failed to read file chunk".to_string(), e))?;
292
293 if bytes_read == 0 {
294 break; }
296
297 let total_bytes = carried_bytes + bytes_read;
298 let chunk = &buffer[..total_bytes];
299 self.metrics.bytes_scanned += bytes_read as u64;
300
301 for mat in automaton.find_iter(chunk) {
303 self.metrics.matches_found += 1;
304
305 if self.is_boundary_valid(chunk, mat.start(), mat.end()) {
307 self.metrics.boundary_checks += 1;
308 self.metrics.valid_matches += 1;
309
310 let pattern = &self.patterns[mat.pattern()];
312 if let Some(file_ids) = self.token_to_files.get(pattern) {
313 referenced_files.extend(file_ids.iter().copied());
314 }
315 }
316 }
317
318 if bytes_read == self.config.chunk_size {
320 if overlap > 0 && total_bytes > overlap {
322 buffer.copy_within(total_bytes - overlap.., 0);
323 carried_bytes = overlap;
324 } else {
325 carried_bytes = 0;
326 }
327 } else {
328 break; }
330 }
331
332 Ok(referenced_files.into_iter().collect())
333 }
334
335 fn is_boundary_valid(&self, text: &[u8], start: usize, end: usize) -> bool {
337 let left_ok = start == 0 || !self.is_identifier_char(text[start - 1]);
339
340 let right_ok = end == text.len() || !self.is_identifier_char(text[end]);
342
343 left_ok && right_ok
344 }
345
346 fn is_identifier_char(&self, byte: u8) -> bool {
348 byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'.' || byte == b'/' || byte == b'-'
349 }
350
    /// Returns how many references to `file_id` have been observed.
    ///
    /// NOTE(review): currently a stub — this struct stores no per-file
    /// reference counts, so the method ignores `file_id` and always
    /// returns 0. Callers needing real counts must aggregate the results
    /// of `scan_file_for_references` themselves until this is implemented.
    pub fn get_reference_count(&self, file_id: usize) -> usize {
        0
    }
357
    /// Borrows the counters accumulated during index build and scanning.
    pub fn metrics(&self) -> &IndexMetrics {
        &self.metrics
    }
362}
363
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Builds an empty index directly (bypassing `new`) so the extraction
    /// helpers can be unit-tested in isolation.
    fn bare_index(config: IndexConfig) -> AhoCorasickReferenceIndex {
        AhoCorasickReferenceIndex {
            token_to_files: HashMap::new(),
            file_tokens: Vec::new(),
            automaton: None,
            patterns: Vec::new(),
            config,
            metrics: IndexMetrics::default(),
        }
    }

    #[test]
    fn test_token_extraction() {
        let index = bare_index(IndexConfig::default());

        let path = Path::new("src/auth/user.rs");
        let tokens = index.extract_tokens(path);

        assert!(tokens
            .iter()
            .any(|t| t.text == "user.rs" && t.token_type == TokenType::Basename));
        assert!(tokens
            .iter()
            .any(|t| t.text == "user" && t.token_type == TokenType::Stem));
    }

    #[test]
    fn test_stoplist_filtering() {
        let index = bare_index(IndexConfig::default());

        assert!(!index.is_valid_token("README"));
        assert!(!index.is_valid_token("test"));
        assert!(!index.is_valid_token("123"));
        assert!(index.is_valid_token("auth"));
        assert!(index.is_valid_token("user"));
    }

    #[test]
    fn test_boundary_detection() {
        // `is_boundary_valid` takes `&self`, so no `mut` binding is needed.
        let index = bare_index(IndexConfig::default());

        let text = b"import user from './user.js'";
        // "user" at 7..11 is flanked by spaces: a whole-token match.
        assert!(index.is_boundary_valid(text, 7, 11));
        // "ser" at 8..11 is preceded by 'u': an in-identifier substring.
        assert!(!index.is_boundary_valid(text, 8, 11));
    }

    // Plain #[test]: this function performs no async work (no `.await`
    // anywhere), so the previous #[tokio::test] wrapper and its runtime
    // were unnecessary.
    #[test]
    fn test_file_scanning() -> Result<()> {
        let temp_dir = TempDir::new().unwrap();

        let file1_path = temp_dir.path().join("auth.rs");
        let file2_path = temp_dir.path().join("user.rs");

        fs::write(&file1_path, "use user;\nmod user_service;")?;
        fs::write(&file2_path, "struct User { name: String }")?;

        let config = IndexConfig::default();
        let paths = vec![file1_path.clone(), file2_path.clone()];
        let mut index = AhoCorasickReferenceIndex::new(&paths, config)?;

        // file1 mentions "user", so it must reference file id 1 (user.rs).
        let references = index.scan_file_for_references(&file1_path)?;

        assert!(references.contains(&1));

        Ok(())
    }
}