cs/parse/
key_extractor.rs

1// src/parse/key_extractor.rs
2
3use crate::cache::SearchResultCache;
4use crate::error::Result;
5use std::path::Path;
6use walkdir::WalkDir;
7
8use super::json_parser::JsonParser;
9use super::translation::TranslationEntry;
10use super::yaml_parser::YamlParser;
11
12/// `KeyExtractor` provides functionality to search translation entries across
13/// multiple YAML translation files, returning the full dot‑notation key path,
14/// associated file path and line number for each match.
15pub struct KeyExtractor {
16    exclusions: Vec<String>,
17    verbose: bool,
18    quiet: bool,          // Suppress progress indicators (for --simple mode)
19    case_sensitive: bool, // Case-sensitive matching
20    cache: Option<SearchResultCache>,
21    progress_count: std::cell::Cell<usize>, // Track progress for better indicator
22}
23
24impl Default for KeyExtractor {
25    fn default() -> Self {
26        Self::new()
27    }
28}
29
30impl KeyExtractor {
31    /// Create a new `KeyExtractor`.
32    pub fn new() -> Self {
33        let cache = SearchResultCache::new().ok(); // Silently disable cache on error
34        Self {
35            exclusions: Vec::new(),
36            verbose: false,
37            quiet: false,
38            case_sensitive: false,
39            cache,
40            progress_count: std::cell::Cell::new(0),
41        }
42    }
43
44    /// Set exclusion patterns (e.g., directories or files to ignore)
45    pub fn set_exclusions(&mut self, exclusions: Vec<String>) {
46        self.exclusions = exclusions;
47    }
48
49    /// Set verbose mode for detailed error messages
50    pub fn set_verbose(&mut self, verbose: bool) {
51        self.verbose = verbose;
52    }
53
54    /// Set quiet mode to suppress progress indicators
55    pub fn set_quiet(&mut self, quiet: bool) {
56        self.quiet = quiet;
57    }
58
59    /// Set case-sensitive matching
60    pub fn set_case_sensitive(&mut self, case_sensitive: bool) {
61        self.case_sensitive = case_sensitive;
62    }
63
64    /// Print progress indicator with proper formatting
65    /// Only shows meaningful progress - no useless dashes
66    fn print_progress(&self, indicator_type: char) {
67        if self.quiet {
68            return;
69        }
70
71        let count = self.progress_count.get();
72
73        // Only show meaningful progress indicators
74        match indicator_type {
75            '-' => {
76                // Don't show skipped files at all - they're just noise
77                return;
78            }
79            'C' => {
80                // Show cache hits - indicates good performance
81            }
82            '.' => {
83                // Show successful parses - indicates progress
84            }
85            'S' => {
86                // Show parse errors - important for debugging
87            }
88            _ => return,
89        }
90
91        // Print the colored indicator
92        use colored::Colorize;
93        let indicator = match indicator_type {
94            'C' => "C".cyan(),
95            '.' => ".".green(),
96            'S' => "S".yellow(),
97            _ => return,
98        };
99        eprint!("{}", indicator);
100
101        // Update count and add newline + reset every 30 characters
102        let new_count = count + 1;
103        if new_count >= 30 {
104            eprintln!(); // Newline after 30 characters
105            self.progress_count.set(0);
106        } else {
107            self.progress_count.set(new_count);
108        }
109    }
110
111    /// Recursively walk `base_dir` for `*.yml` (or `*.yaml`) files, parse each,
112    /// and return entries whose **value** contains `query`.
113    ///
114    /// Matching respects case sensitivity setting.
115    pub fn extract(&self, base_dir: &Path, query: &str) -> Result<Vec<TranslationEntry>> {
116        let mut matches = Vec::new();
117        let search_query = if self.case_sensitive {
118            query.to_string()
119        } else {
120            query.to_lowercase()
121        };
122        let mut skipped_files = 0;
123
124        let walker = WalkDir::new(base_dir).into_iter();
125        for entry in walker
126            .filter_entry(|e| {
127                if is_ignored(e) {
128                    return false;
129                }
130                let name = e.file_name().to_string_lossy();
131                for excl in &self.exclusions {
132                    if name == excl.as_str() {
133                        return false;
134                    }
135                }
136                true
137            })
138            .filter_map(|e| e.ok())
139            .filter(|e| e.file_type().is_file())
140        {
141            let path = entry.path();
142            if let Some(ext) = path.extension() {
143                let ext_str = ext.to_string_lossy();
144                if ext_str == "yml" || ext_str == "yaml" {
145                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
146                    // This avoids expensive YAML parsing for files without matches
147                    match YamlParser::contains_query(path, query) {
148                        Ok(false) => {
149                            // No match in file, skip it entirely
150                            self.print_progress('-');
151                            continue;
152                        }
153                        Err(_e) => {
154                            // ripgrep failed, fall back to full parsing
155                            // (don't skip the file, just proceed with parsing)
156                        }
157                        Ok(true) => {
158                            // Match found, proceed with parsing below
159                        }
160                    }
161
162                    // Try cache first
163                    let metadata = std::fs::metadata(path).ok();
164                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
165                    {
166                        let mtime = meta.modified().ok();
167                        let size = meta.len();
168                        if let Some(mt) = mtime {
169                            cache.get(path, query, false, mt, size)
170                        } else {
171                            None
172                        }
173                    } else {
174                        None
175                    };
176
177                    let all_entries = if let Some(cached) = cached_results {
178                        if !self.quiet {
179                            self.print_progress('C');
180                        } else {
181                            eprintln!("[cache] hit {} (yaml)", path.display());
182                        }
183                        cached
184                    } else {
185                        // Cache miss - parse file with query for optimization
186                        match YamlParser::parse_file_with_query(path, Some(query)) {
187                            Ok(entries) => {
188                                self.print_progress('.');
189
190                                // Store in cache
191                                if let (Some(cache), Ok(meta)) =
192                                    (&self.cache, std::fs::metadata(path))
193                                {
194                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
195                                        let _ =
196                                            cache.set(path, query, false, mtime, size, &entries);
197                                    }
198                                }
199
200                                entries
201                            }
202                            Err(e) => {
203                                skipped_files += 1;
204                                self.print_progress('S');
205                                if self.verbose {
206                                    eprintln!(
207                                        "\nWarning: Failed to parse YAML file {}: {}",
208                                        path.display(),
209                                        e
210                                    );
211                                }
212                                continue;
213                            }
214                        }
215                    };
216
217                    // Filter for matching entries
218                    for e in all_entries {
219                        let value_to_check = if self.case_sensitive {
220                            e.value.clone()
221                        } else {
222                            e.value.to_lowercase()
223                        };
224
225                        if value_to_check.contains(&search_query) {
226                            matches.push(e);
227                        }
228                    }
229                } else if ext_str == "json" {
230                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
231                    // Note: We don't have a contains_query for JSON yet, so we use YAML's
232                    match YamlParser::contains_query(path, query) {
233                        Ok(false) => {
234                            // No match in file, skip it entirely
235                            self.print_progress('-');
236                            continue;
237                        }
238                        Err(_e) => {
239                            // ripgrep failed, fall back to full parsing
240                        }
241                        Ok(true) => {
242                            // Match found, proceed with parsing below
243                        }
244                    }
245
246                    // Try cache first
247                    let metadata = std::fs::metadata(path).ok();
248                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
249                    {
250                        let mtime = meta.modified().ok();
251                        let size = meta.len();
252                        if let Some(mt) = mtime {
253                            cache.get(path, query, false, mt, size)
254                        } else {
255                            None
256                        }
257                    } else {
258                        None
259                    };
260
261                    let all_entries = if let Some(cached) = cached_results {
262                        if !self.quiet {
263                            self.print_progress('C');
264                        } else {
265                            eprintln!("[cache] hit {} (json)", path.display());
266                        }
267                        cached
268                    } else {
269                        // Cache miss - parse file with query for optimization
270                        match JsonParser::parse_file_with_query(path, Some(query)) {
271                            Ok(entries) => {
272                                self.print_progress('.');
273
274                                // Store in cache
275                                if let (Some(cache), Ok(meta)) =
276                                    (&self.cache, std::fs::metadata(path))
277                                {
278                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
279                                        let _ =
280                                            cache.set(path, query, false, mtime, size, &entries);
281                                    }
282                                }
283
284                                entries
285                            }
286                            Err(e) => {
287                                skipped_files += 1;
288                                self.print_progress('S');
289                                if self.verbose {
290                                    eprintln!(
291                                        "\nWarning: Failed to parse JSON file {}: {}",
292                                        path.display(),
293                                        e
294                                    );
295                                }
296                                continue;
297                            }
298                        }
299                    };
300
301                    // Filter for matching entries
302                    for e in all_entries {
303                        let value_to_check = if self.case_sensitive {
304                            e.value.clone()
305                        } else {
306                            e.value.to_lowercase()
307                        };
308
309                        if value_to_check.contains(&search_query) {
310                            matches.push(e);
311                        }
312                    }
313                }
314            }
315        }
316
317        // Print final newline and summary if files were skipped (only in verbose mode)
318        // Note: Skipped files are typically config files (package.json, tsconfig.json, etc.)
319        // that aren't translation files, which is expected behavior.
320        if !self.quiet {
321            // Always print final newline if we showed any progress
322            if self.progress_count.get() > 0 {
323                eprintln!();
324            }
325
326            if skipped_files > 0 && self.verbose {
327                eprintln!(
328                    "(Skipped {} non-translation file{})",
329                    skipped_files,
330                    if skipped_files == 1 { "" } else { "s" }
331                );
332            }
333        }
334
335        Ok(matches)
336    }
337}
338
339fn is_ignored(entry: &walkdir::DirEntry) -> bool {
340    // Always allow the root directory of the search
341    if entry.depth() == 0 {
342        return false;
343    }
344
345    entry
346        .file_name()
347        .to_str()
348        .map(|s| {
349            s.starts_with('.') // Hidden files/dirs
350                || s == "node_modules"
351                || s == "target"
352                || s == "dist"
353                || s == "build"
354                || s == "vendor"
355        })
356        .unwrap_or(false)
357}
358
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    use tempfile::tempdir;

    /// Write every `(file name, contents)` pair into `dir`.
    fn write_fixtures(dir: &Path, fixtures: &[(&str, &str)]) -> Result<()> {
        for &(name, contents) in fixtures {
            fs::write(dir.join(name), contents)?;
        }
        Ok(())
    }

    #[test]
    fn test_key_extractor_simple() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[
                (
                    "en.yml",
                    "greeting:\n  hello: \"Hello World\"\n  goodbye: \"Goodbye\"",
                ),
                (
                    "fr.yml",
                    "greeting:\n  hello: \"Bonjour World\"\n  goodbye: \"Au revoir\"",
                ),
            ],
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "world")?;

        // One hit per language file.
        assert_eq!(found.len(), 2);
        assert!(found.iter().any(|entry| entry.key == "greeting.hello"));
        Ok(())
    }

    #[test]
    fn test_key_extractor_case_insensitive() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[(
                "test.yml",
                "app:\n  title: \"My Application\"\n  description: \"A great APP for everyone\"",
            )],
        )?;

        // An upper-case query must match both "Application" and "APP".
        let found = KeyExtractor::new().extract(tmp.path(), "APP")?;
        assert_eq!(found.len(), 2);

        for expected in &["My Application", "A great APP for everyone"] {
            assert!(found.iter().any(|entry| entry.value == *expected));
        }

        Ok(())
    }

    #[test]
    fn test_key_extractor_multiple_files() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[
                ("en.yml", "common:\n  action: \"Save Data\""),
                ("fr.yml", "common:\n  action: \"Sauvegarder Data\""),
                ("de.yml", "common:\n  action: \"Speichern Data\""),
            ],
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "data")?;

        // Every language file contributes one (case-insensitive) match.
        assert_eq!(found.len(), 3);

        let file_names: Vec<String> = found
            .iter()
            .map(|entry| {
                entry
                    .file
                    .file_name()
                    .unwrap()
                    .to_string_lossy()
                    .into_owned()
            })
            .collect();
        for expected in &["en.yml", "fr.yml", "de.yml"] {
            assert!(file_names.iter().any(|name| name.as_str() == *expected));
        }

        Ok(())
    }

    #[test]
    fn test_key_extractor_deep_nested() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[(
                "nested.yml",
                "level1:\n  level2:\n    level3:\n      deep_key: \"Deep nested value\"\n      another: \"test value\"",
            )],
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "deep")?;

        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.key, "level1.level2.level3.deep_key");
        assert_eq!(only.value, "Deep nested value");

        Ok(())
    }

    #[test]
    fn test_key_extractor_no_matches() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[(
                "test.yml",
                "greeting:\n  hello: \"Hello\"\n  goodbye: \"Goodbye\"",
            )],
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "nonexistent")?;

        assert_eq!(found.len(), 0);

        Ok(())
    }

    #[test]
    fn test_key_extractor_supports_json_and_yaml() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[
                // Searched: YAML translation file.
                ("test.yml", "key: \"test value\""),
                // Ignored: unsupported extension.
                ("test.txt", "key: test value"),
                // Searched: JSON translation file.
                ("test.json", "{\"key\": \"test value\"}"),
            ],
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "test")?;

        // One hit from the YAML file and one from the JSON file.
        assert_eq!(found.len(), 2);
        let extensions: Vec<String> = found
            .iter()
            .map(|entry| {
                entry
                    .file
                    .extension()
                    .unwrap()
                    .to_string_lossy()
                    .into_owned()
            })
            .collect();
        for expected in &["yml", "json"] {
            assert!(extensions.iter().any(|ext| ext.as_str() == *expected));
        }

        Ok(())
    }

    #[test]
    fn test_key_extractor_malformed_file() -> Result<()> {
        let tmp = tempdir()?;
        write_fixtures(
            tmp.path(),
            &[
                ("good.yml", "key: \"value\""),
                // Malformed YAML.
                ("bad.yml", "key: value: invalid: yaml"),
            ],
        )?;

        // A malformed file must be skipped, not turned into an error.
        let found = KeyExtractor::new().extract(tmp.path(), "value")?;

        // Only the well-formed file contributes a match.
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].value, "value");

        Ok(())
    }
}