scribe_scanner/
aho_corasick_reference_index.rs

1//! High-performance file reference indexing using Aho-Corasick multi-pattern search.
2//!
3//! This module replaces the quadratic O(files × bytes) string matching with
4//! linear O(total_bytes + matches) scanning using a single compiled automaton.
5
6use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
7use scribe_core::{Result, ScribeError};
8use std::collections::{HashMap, HashSet};
9use std::fs::File;
10use std::io::{BufReader, Read};
11use std::path::Path;
12
13/// Global stoplist for tokens that are too common to be useful
14const STOPLIST: &[&str] = &[
15    "README",
16    "readme",
17    "index",
18    "test",
19    "tests",
20    "spec",
21    "specs",
22    "main",
23    "app",
24    "lib",
25    "src",
26    "build",
27    "dist",
28    "target",
29    "node_modules",
30    "package",
31    "config",
32    "util",
33    "utils",
34    "helper",
35    "helpers",
36    "common",
37    "base",
38    "core",
39    "api",
40    "client",
41    "server",
42    "data",
43    "model",
44    "view",
45];
46
47/// Configuration for the reference indexing
48#[derive(Debug, Clone)]
49pub struct IndexConfig {
50    /// Minimum token length to consider
51    pub min_token_length: usize,
52    /// Maximum token length to consider  
53    pub max_token_length: usize,
54    /// Maximum number of patterns to compile into automaton
55    pub max_patterns: usize,
56    /// Chunk size for streaming reads
57    pub chunk_size: usize,
58    /// Whether to include file stems (filename without extension)
59    pub include_stems: bool,
60    /// Whether to include directory components
61    pub include_dirs: bool,
62}
63
64impl Default for IndexConfig {
65    fn default() -> Self {
66        Self {
67            min_token_length: 3,
68            max_token_length: 64,
69            max_patterns: 10_000,
70            chunk_size: 64 * 1024, // 64 KiB
71            include_stems: true,
72            include_dirs: false,
73        }
74    }
75}
76
77/// A token extracted from a file path for reference matching
78#[derive(Debug, Clone, PartialEq, Eq, Hash)]
79pub struct FileToken {
80    pub text: String,
81    pub token_type: TokenType,
82}
83
84#[derive(Debug, Clone, PartialEq, Eq, Hash)]
85pub enum TokenType {
86    Basename,  // file.ext
87    Stem,      // file (without extension)
88    Directory, // parent directory name
89}
90
91/// High-performance file reference index using Aho-Corasick
92#[derive(Debug)]
93pub struct AhoCorasickReferenceIndex {
94    /// Maps tokens to the files that contain them (as IDs)
95    token_to_files: HashMap<String, Vec<usize>>,
96    /// Maps file IDs to their generated tokens
97    file_tokens: Vec<Vec<FileToken>>,
98    /// Compiled Aho-Corasick automaton for pattern matching
99    automaton: Option<AhoCorasick>,
100    /// Patterns used in the automaton (for result mapping)
101    patterns: Vec<String>,
102    /// Configuration
103    config: IndexConfig,
104    /// Performance metrics
105    pub metrics: IndexMetrics,
106}
107
108/// Performance metrics for the indexing process
109#[derive(Debug, Default, Clone)]
110pub struct IndexMetrics {
111    pub total_files: usize,
112    pub total_tokens: usize,
113    pub unique_tokens: usize,
114    pub automaton_build_ms: u64,
115    pub bytes_scanned: u64,
116    pub matches_found: usize,
117    pub boundary_checks: usize,
118    pub valid_matches: usize,
119}
120
121impl AhoCorasickReferenceIndex {
122    /// Create a new index from file paths
123    pub fn new(file_paths: &[impl AsRef<Path>], config: IndexConfig) -> Result<Self> {
124        let start_time = std::time::Instant::now();
125
126        let mut index = Self {
127            token_to_files: HashMap::new(),
128            file_tokens: Vec::with_capacity(file_paths.len()),
129            automaton: None,
130            patterns: Vec::new(),
131            config,
132            metrics: IndexMetrics {
133                total_files: file_paths.len(),
134                ..Default::default()
135            },
136        };
137
138        // Step 1: Extract tokens from all file paths
139        for (file_id, path) in file_paths.iter().enumerate() {
140            let tokens = index.extract_tokens(path.as_ref());
141
142            for token in &tokens {
143                index
144                    .token_to_files
145                    .entry(token.text.clone())
146                    .or_insert_with(Vec::new)
147                    .push(file_id);
148            }
149
150            index.metrics.total_tokens += tokens.len();
151            index.file_tokens.push(tokens);
152        }
153
154        // Step 2: Build Aho-Corasick automaton
155        index.build_automaton()?;
156
157        index.metrics.automaton_build_ms = start_time.elapsed().as_millis() as u64;
158
159        Ok(index)
160    }
161
162    /// Extract relevant tokens from a file path
163    fn extract_tokens(&self, path: &Path) -> Vec<FileToken> {
164        let mut tokens = Vec::new();
165
166        if let Some(filename) = path.file_name().and_then(|n| n.to_str()) {
167            // Add basename (full filename)
168            if self.is_valid_token(filename) {
169                tokens.push(FileToken {
170                    text: filename.to_string(),
171                    token_type: TokenType::Basename,
172                });
173            }
174
175            // Add stem (filename without extension) if enabled
176            if self.config.include_stems {
177                if let Some(stem) = path.file_stem().and_then(|s| s.to_str()) {
178                    if self.is_valid_token(stem) && stem != filename {
179                        tokens.push(FileToken {
180                            text: stem.to_string(),
181                            token_type: TokenType::Stem,
182                        });
183                    }
184                }
185            }
186        }
187
188        // Add directory components if enabled
189        if self.config.include_dirs {
190            if let Some(parent) = path.parent() {
191                for component in parent.components() {
192                    if let Some(dir_name) = component.as_os_str().to_str() {
193                        if self.is_valid_token(dir_name) {
194                            tokens.push(FileToken {
195                                text: dir_name.to_string(),
196                                token_type: TokenType::Directory,
197                            });
198                        }
199                    }
200                }
201            }
202        }
203
204        tokens
205    }
206
207    /// Check if a token meets the criteria for inclusion
208    fn is_valid_token(&self, token: &str) -> bool {
209        let len = token.len();
210
211        // Length check
212        if len < self.config.min_token_length || len > self.config.max_token_length {
213            return false;
214        }
215
216        // Stoplist check
217        if STOPLIST.contains(&token) {
218            return false;
219        }
220
221        // All-numeric check
222        if token.chars().all(|c| c.is_ascii_digit()) {
223            return false;
224        }
225
226        // Must contain at least one alphanumeric character
227        if !token.chars().any(|c| c.is_alphanumeric()) {
228            return false;
229        }
230
231        true
232    }
233
234    /// Build the Aho-Corasick automaton from collected tokens
235    fn build_automaton(&mut self) -> Result<()> {
236        // Collect unique patterns, limited by max_patterns
237        let mut patterns: Vec<String> = self.token_to_files.keys().cloned().collect();
238        patterns.sort_by_key(|p| std::cmp::Reverse(self.token_to_files[p].len())); // Sort by frequency
239        patterns.truncate(self.config.max_patterns);
240
241        self.metrics.unique_tokens = patterns.len();
242
243        if patterns.is_empty() {
244            return Ok(());
245        }
246
247        // Build automaton with optimal settings for our use case
248        let automaton = AhoCorasickBuilder::new()
249            .match_kind(MatchKind::Standard) // Find all matches
250            .build(&patterns)
251            .map_err(|e| {
252                ScribeError::io(
253                    format!("Failed to build Aho-Corasick automaton: {}", e),
254                    std::io::Error::new(std::io::ErrorKind::Other, e),
255                )
256            })?;
257
258        self.patterns = patterns;
259        self.automaton = Some(automaton);
260
261        Ok(())
262    }
263
264    /// Scan a file for references to other files in the index
265    pub fn scan_file_for_references<P: AsRef<Path>>(&mut self, file_path: P) -> Result<Vec<usize>> {
266        let Some(ref automaton) = self.automaton else {
267            return Ok(Vec::new());
268        };
269
270        let file = File::open(file_path.as_ref()).map_err(|e| {
271            ScribeError::io(
272                format!("Failed to open file: {}", file_path.as_ref().display()),
273                e,
274            )
275        })?;
276
277        let mut reader = BufReader::new(file);
278        let mut referenced_files = HashSet::new();
279
280        // Stream the file in chunks with overlap for boundary handling
281        let max_pattern_len = self.patterns.iter().map(|p| p.len()).max().unwrap_or(0);
282        let overlap = max_pattern_len.saturating_sub(1);
283
284        let mut buffer = vec![0u8; self.config.chunk_size + overlap];
285        let mut carried_bytes = 0;
286
287        loop {
288            // Read chunk, preserving overlap from previous chunk
289            let bytes_read = reader
290                .read(&mut buffer[carried_bytes..])
291                .map_err(|e| ScribeError::io("Failed to read file chunk".to_string(), e))?;
292
293            if bytes_read == 0 {
294                break; // EOF
295            }
296
297            let total_bytes = carried_bytes + bytes_read;
298            let chunk = &buffer[..total_bytes];
299            self.metrics.bytes_scanned += bytes_read as u64;
300
301            // Find all matches in this chunk
302            for mat in automaton.find_iter(chunk) {
303                self.metrics.matches_found += 1;
304
305                // Perform boundary check
306                if self.is_boundary_valid(chunk, mat.start(), mat.end()) {
307                    self.metrics.boundary_checks += 1;
308                    self.metrics.valid_matches += 1;
309
310                    // Map pattern back to file IDs
311                    let pattern = &self.patterns[mat.pattern()];
312                    if let Some(file_ids) = self.token_to_files.get(pattern) {
313                        referenced_files.extend(file_ids.iter().copied());
314                    }
315                }
316            }
317
318            // Prepare overlap for next iteration
319            if bytes_read == self.config.chunk_size {
320                // Copy overlap bytes to start of buffer
321                if overlap > 0 && total_bytes > overlap {
322                    buffer.copy_within(total_bytes - overlap.., 0);
323                    carried_bytes = overlap;
324                } else {
325                    carried_bytes = 0;
326                }
327            } else {
328                break; // Last chunk, no more data
329            }
330        }
331
332        Ok(referenced_files.into_iter().collect())
333    }
334
335    /// Check if a match occurs at a valid token boundary
336    fn is_boundary_valid(&self, text: &[u8], start: usize, end: usize) -> bool {
337        // Check left boundary
338        let left_ok = start == 0 || !self.is_identifier_char(text[start - 1]);
339
340        // Check right boundary
341        let right_ok = end == text.len() || !self.is_identifier_char(text[end]);
342
343        left_ok && right_ok
344    }
345
346    /// Check if a byte is part of an identifier (alphanumeric, underscore, dot, slash, dash)
347    fn is_identifier_char(&self, byte: u8) -> bool {
348        byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'.' || byte == b'/' || byte == b'-'
349    }
350
351    /// Get the number of files that reference the given file ID
352    pub fn get_reference_count(&self, file_id: usize) -> usize {
353        // This would need to be computed by scanning all files
354        // For now, return 0 as this is a placeholder for compatibility
355        0
356    }
357
358    /// Get performance metrics
359    pub fn metrics(&self) -> &IndexMetrics {
360        &self.metrics
361    }
362}
363
364#[cfg(test)]
365mod tests {
366    use super::*;
367    use std::fs;
368    use tempfile::TempDir;
369
370    #[test]
371    fn test_token_extraction() {
372        let config = IndexConfig::default();
373        let index = AhoCorasickReferenceIndex {
374            token_to_files: HashMap::new(),
375            file_tokens: Vec::new(),
376            automaton: None,
377            patterns: Vec::new(),
378            config,
379            metrics: IndexMetrics::default(),
380        };
381
382        let path = Path::new("src/auth/user.rs");
383        let tokens = index.extract_tokens(path);
384
385        assert!(tokens
386            .iter()
387            .any(|t| t.text == "user.rs" && t.token_type == TokenType::Basename));
388        assert!(tokens
389            .iter()
390            .any(|t| t.text == "user" && t.token_type == TokenType::Stem));
391    }
392
393    #[test]
394    fn test_stoplist_filtering() {
395        let config = IndexConfig::default();
396        let index = AhoCorasickReferenceIndex {
397            token_to_files: HashMap::new(),
398            file_tokens: Vec::new(),
399            automaton: None,
400            patterns: Vec::new(),
401            config,
402            metrics: IndexMetrics::default(),
403        };
404
405        assert!(!index.is_valid_token("README"));
406        assert!(!index.is_valid_token("test"));
407        assert!(!index.is_valid_token("123"));
408        assert!(index.is_valid_token("auth"));
409        assert!(index.is_valid_token("user"));
410    }
411
412    #[test]
413    fn test_boundary_detection() {
414        let mut index = AhoCorasickReferenceIndex {
415            token_to_files: HashMap::new(),
416            file_tokens: Vec::new(),
417            automaton: None,
418            patterns: Vec::new(),
419            config: IndexConfig::default(),
420            metrics: IndexMetrics::default(),
421        };
422
423        let text = b"import user from './user.js'";
424        assert!(index.is_boundary_valid(text, 7, 11)); // "user" with spaces around
425        assert!(!index.is_boundary_valid(text, 8, 11)); // "ser" inside "user"
426    }
427
428    #[tokio::test]
429    async fn test_file_scanning() -> Result<()> {
430        let temp_dir = TempDir::new().unwrap();
431
432        // Create test files
433        let file1_path = temp_dir.path().join("auth.rs");
434        let file2_path = temp_dir.path().join("user.rs");
435
436        fs::write(&file1_path, "use user;\nmod user_service;")?;
437        fs::write(&file2_path, "struct User { name: String }")?;
438
439        // Create index
440        let config = IndexConfig::default();
441        let paths = vec![file1_path.clone(), file2_path.clone()];
442        let mut index = AhoCorasickReferenceIndex::new(&paths, config)?;
443
444        // Scan auth.rs for references
445        let references = index.scan_file_for_references(&file1_path)?;
446
447        // Should find reference to user.rs (file ID 1)
448        assert!(references.contains(&1));
449
450        Ok(())
451    }
452}