cs/parse/
key_extractor.rs

1// src/parse/key_extractor.rs
2
3use crate::cache::SearchResultCache;
4use crate::error::Result;
5use std::path::Path;
6use walkdir::WalkDir;
7
8use super::js_parser::JsParser;
9use super::json_parser::JsonParser;
10use super::translation::TranslationEntry;
11use super::yaml_parser::YamlParser;
12
13/// `KeyExtractor` provides functionality to search translation entries across
14/// multiple YAML translation files, returning the full dot‑notation key path,
15/// associated file path and line number for each match.
16pub struct KeyExtractor {
17    exclusions: Vec<String>,
18    verbose: bool,
19    quiet: bool,          // Suppress progress indicators (for --simple mode)
20    case_sensitive: bool, // Case-sensitive matching
21    cache: Option<SearchResultCache>,
22    progress_count: std::cell::Cell<usize>, // Track progress for better indicator
23}
24
25impl Default for KeyExtractor {
26    fn default() -> Self {
27        Self::new()
28    }
29}
30
31impl KeyExtractor {
32    /// Create a new `KeyExtractor`.
33    pub fn new() -> Self {
34        let cache = SearchResultCache::new().ok(); // Silently disable cache on error
35        Self {
36            exclusions: Vec::new(),
37            verbose: false,
38            quiet: false,
39            case_sensitive: false,
40            cache,
41            progress_count: std::cell::Cell::new(0),
42        }
43    }
44
45    /// Set exclusion patterns (e.g., directories or files to ignore)
46    pub fn set_exclusions(&mut self, exclusions: Vec<String>) {
47        self.exclusions = exclusions;
48    }
49
50    /// Set verbose mode for detailed error messages
51    pub fn set_verbose(&mut self, verbose: bool) {
52        self.verbose = verbose;
53    }
54
55    /// Set quiet mode to suppress progress indicators
56    pub fn set_quiet(&mut self, quiet: bool) {
57        self.quiet = quiet;
58    }
59
60    /// Set case-sensitive matching
61    pub fn set_case_sensitive(&mut self, case_sensitive: bool) {
62        self.case_sensitive = case_sensitive;
63    }
64
65    /// Print progress indicator with proper formatting
66    /// Only shows meaningful progress - no useless dashes
67    fn print_progress(&self, indicator_type: char) {
68        if self.quiet {
69            return;
70        }
71
72        let count = self.progress_count.get();
73
74        // Only show meaningful progress indicators
75        match indicator_type {
76            '-' => {
77                // Don't show skipped files at all - they're just noise
78                return;
79            }
80            'C' => {
81                // Show cache hits - indicates good performance
82            }
83            '.' => {
84                // Show successful parses - indicates progress
85            }
86            'S' => {
87                // Show parse errors - important for debugging
88            }
89            _ => return,
90        }
91
92        // Print the colored indicator
93        use colored::Colorize;
94        let indicator = match indicator_type {
95            'C' => "C".cyan(),
96            '.' => ".".green(),
97            'S' => "S".yellow(),
98            _ => return,
99        };
100        eprint!("{}", indicator);
101
102        // Update count and add newline + reset every 30 characters
103        let new_count = count + 1;
104        if new_count >= 30 {
105            eprintln!(); // Newline after 30 characters
106            self.progress_count.set(0);
107        } else {
108            self.progress_count.set(new_count);
109        }
110    }
111
112    /// Recursively walk `base_dir` for `*.yml` (or `*.yaml`) files, parse each,
113    /// and return entries whose **value** contains `query`.
114    ///
115    /// Matching respects case sensitivity setting.
116    pub fn extract(&self, base_dir: &Path, query: &str) -> Result<Vec<TranslationEntry>> {
117        let mut matches = Vec::new();
118        let search_query = if self.case_sensitive {
119            query.to_string()
120        } else {
121            query.to_lowercase()
122        };
123        let mut skipped_files = 0;
124
125        let walker = WalkDir::new(base_dir).into_iter();
126        for entry in walker
127            .filter_entry(|e| {
128                if is_ignored(e) {
129                    return false;
130                }
131                let name = e.file_name().to_string_lossy();
132                for excl in &self.exclusions {
133                    if name == excl.as_str() {
134                        return false;
135                    }
136                }
137                true
138            })
139            .filter_map(|e| e.ok())
140            .filter(|e| e.file_type().is_file())
141        {
142            let path = entry.path();
143            if let Some(ext) = path.extension() {
144                let ext_str = ext.to_string_lossy();
145
146                if ext_str == "yml" || ext_str == "yaml" {
147                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
148                    // This avoids expensive YAML parsing for files without matches
149                    match YamlParser::contains_query(path, query) {
150                        Ok(false) => {
151                            // No match in file, skip it entirely
152                            self.print_progress('-');
153                            continue;
154                        }
155                        Err(_e) => {
156                            // ripgrep failed, fall back to full parsing
157                            // (don't skip the file, just proceed with parsing)
158                        }
159                        Ok(true) => {
160                            // Match found, proceed with parsing below
161                        }
162                    }
163
164                    // Try cache first
165                    let metadata = std::fs::metadata(path).ok();
166                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
167                    {
168                        let mtime = meta.modified().ok();
169                        let size = meta.len();
170                        if let Some(mt) = mtime {
171                            cache.get(path, query, false, mt, size)
172                        } else {
173                            None
174                        }
175                    } else {
176                        None
177                    };
178
179                    let all_entries = if let Some(cached) = cached_results {
180                        if !self.quiet {
181                            self.print_progress('C');
182                        }
183                        cached
184                    } else {
185                        // Cache miss - parse file with query for optimization
186                        match YamlParser::parse_file_with_query(path, Some(query)) {
187                            Ok(entries) => {
188                                self.print_progress('.');
189
190                                // Store in cache
191                                if let (Some(cache), Ok(meta)) =
192                                    (&self.cache, std::fs::metadata(path))
193                                {
194                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
195                                        let _ =
196                                            cache.set(path, query, false, mtime, size, &entries);
197                                    }
198                                }
199
200                                entries
201                            }
202                            Err(e) => {
203                                skipped_files += 1;
204                                self.print_progress('S');
205                                if self.verbose {
206                                    eprintln!(
207                                        "\nWarning: Failed to parse YAML file {}: {}",
208                                        path.display(),
209                                        e
210                                    );
211                                }
212                                continue;
213                            }
214                        }
215                    };
216
217                    // Filter for matching entries
218                    for e in all_entries {
219                        let value_to_check = if self.case_sensitive {
220                            e.value.clone()
221                        } else {
222                            e.value.to_lowercase()
223                        };
224
225                        if value_to_check.contains(&search_query) {
226                            matches.push(e);
227                        }
228                    }
229                } else if ext_str == "json" {
230                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
231                    // Note: We don't have a contains_query for JSON yet, so we use YAML's
232                    match YamlParser::contains_query(path, query) {
233                        Ok(false) => {
234                            // No match in file, skip it entirely
235                            self.print_progress('-');
236                            continue;
237                        }
238                        Err(_e) => {
239                            // ripgrep failed, fall back to full parsing
240                        }
241                        Ok(true) => {
242                            // Match found, proceed with parsing below
243                        }
244                    }
245
246                    // Try cache first
247                    let metadata = std::fs::metadata(path).ok();
248                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
249                    {
250                        let mtime = meta.modified().ok();
251                        let size = meta.len();
252                        if let Some(mt) = mtime {
253                            cache.get(path, query, false, mt, size)
254                        } else {
255                            None
256                        }
257                    } else {
258                        None
259                    };
260
261                    let all_entries = if let Some(cached) = cached_results {
262                        if !self.quiet {
263                            self.print_progress('C');
264                        }
265                        cached
266                    } else {
267                        // Cache miss - parse file with query for optimization
268                        match JsonParser::parse_file_with_query(path, Some(query)) {
269                            Ok(entries) => {
270                                self.print_progress('.');
271
272                                // Store in cache
273                                if let (Some(cache), Ok(meta)) =
274                                    (&self.cache, std::fs::metadata(path))
275                                {
276                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
277                                        let _ =
278                                            cache.set(path, query, false, mtime, size, &entries);
279                                    }
280                                }
281
282                                entries
283                            }
284                            Err(e) => {
285                                skipped_files += 1;
286                                self.print_progress('S');
287                                if self.verbose {
288                                    eprintln!(
289                                        "\nWarning: Failed to parse JSON file {}: {}",
290                                        path.display(),
291                                        e
292                                    );
293                                }
294                                continue;
295                            }
296                        }
297                    };
298
299                    // Filter for matching entries
300                    for e in all_entries {
301                        let value_to_check = if self.case_sensitive {
302                            e.value.clone()
303                        } else {
304                            e.value.to_lowercase()
305                        };
306
307                        if value_to_check.contains(&search_query) {
308                            matches.push(e);
309                        }
310                    }
311                } else if ext_str == "js" {
312                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
313                    match JsParser::contains_query(path, query) {
314                        Ok(false) => {
315                            // No match in file, skip it entirely
316                            self.print_progress('-');
317                            continue;
318                        }
319                        Err(_e) => {
320                            // ripgrep failed, fall back to full parsing
321                        }
322                        Ok(true) => {
323                            // Match found, proceed with parsing below
324                        }
325                    }
326
327                    // Try cache first
328                    let metadata = std::fs::metadata(path).ok();
329                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
330                    {
331                        let mtime = meta.modified().ok();
332                        let size = meta.len();
333                        if let Some(mt) = mtime {
334                            cache.get(path, query, false, mt, size)
335                        } else {
336                            None
337                        }
338                    } else {
339                        None
340                    };
341
342                    let all_entries = if let Some(cached) = cached_results {
343                        if !self.quiet {
344                            self.print_progress('C');
345                        }
346                        cached
347                    } else {
348                        // Cache miss - parse file with query for optimization
349                        match JsParser::parse_file_with_query(path, Some(query)) {
350                            Ok(entries) => {
351                                self.print_progress('.');
352
353                                // Store in cache
354                                if let (Some(cache), Ok(meta)) =
355                                    (&self.cache, std::fs::metadata(path))
356                                {
357                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
358                                        let _ =
359                                            cache.set(path, query, false, mtime, size, &entries);
360                                    }
361                                }
362
363                                entries
364                            }
365                            Err(e) => {
366                                skipped_files += 1;
367                                self.print_progress('S');
368                                if self.verbose {
369                                    eprintln!(
370                                        "\nWarning: Failed to parse JavaScript file {}: {}",
371                                        path.display(),
372                                        e
373                                    );
374                                }
375                                continue;
376                            }
377                        }
378                    };
379
380                    // Filter for matching entries
381                    for e in all_entries {
382                        let value_to_check = if self.case_sensitive {
383                            e.value.clone()
384                        } else {
385                            e.value.to_lowercase()
386                        };
387
388                        if value_to_check.contains(&search_query) {
389                            matches.push(e);
390                        }
391                    }
392                }
393            }
394        }
395
396        // Print final newline and summary if files were skipped (only in verbose mode)
397        // Note: Skipped files are typically config files (package.json, tsconfig.json, etc.)
398        // that aren't translation files, which is expected behavior.
399        if !self.quiet {
400            // Always print final newline if we showed any progress
401            if self.progress_count.get() > 0 {
402                eprintln!();
403            }
404
405            if skipped_files > 0 && self.verbose {
406                eprintln!(
407                    "(Skipped {} non-translation file{})",
408                    skipped_files,
409                    if skipped_files == 1 { "" } else { "s" }
410                );
411            }
412        }
413
414        Ok(matches)
415    }
416}
417
418fn is_ignored(entry: &walkdir::DirEntry) -> bool {
419    // Always allow the root directory of the search
420    if entry.depth() == 0 {
421        return false;
422    }
423
424    entry
425        .file_name()
426        .to_str()
427        .map(|s| {
428            s.starts_with('.') // Hidden files/dirs
429                || s == "node_modules"
430                || s == "target"
431                || s == "dist"
432                || s == "build"
433                || s == "vendor"
434        })
435        .unwrap_or(false)
436}
437
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    use tempfile::tempdir;

    /// Write `contents` to `name` inside `dir`.
    fn write_fixture(dir: &std::path::Path, name: &str, contents: &str) -> Result<()> {
        fs::write(dir.join(name), contents)?;
        Ok(())
    }

    /// A query present in two YAML files yields one entry per file.
    #[test]
    fn test_key_extractor_simple() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(
            tmp.path(),
            "en.yml",
            "greeting:\n  hello: \"Hello World\"\n  goodbye: \"Goodbye\"",
        )?;
        write_fixture(
            tmp.path(),
            "fr.yml",
            "greeting:\n  hello: \"Bonjour World\"\n  goodbye: \"Au revoir\"",
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "world")?;

        // One hit in en.yml and one in fr.yml.
        assert_eq!(found.len(), 2);
        assert!(found.iter().any(|entry| entry.key == "greeting.hello"));
        Ok(())
    }

    /// By default the query matches regardless of letter case.
    #[test]
    fn test_key_extractor_case_insensitive() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(
            tmp.path(),
            "test.yml",
            "app:\n  title: \"My Application\"\n  description: \"A great APP for everyone\"",
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "APP")?;

        // Both "Application" and "APP" contain the query case-insensitively.
        assert_eq!(found.len(), 2);
        for expected in ["My Application", "A great APP for everyone"] {
            assert!(found.iter().any(|entry| entry.value == expected));
        }

        Ok(())
    }

    /// Every file in the directory contributes its matching entry.
    #[test]
    fn test_key_extractor_multiple_files() -> Result<()> {
        let tmp = tempdir()?;
        let fixtures = [
            ("en.yml", "common:\n  action: \"Save Data\""),
            ("fr.yml", "common:\n  action: \"Sauvegarder Data\""),
            ("de.yml", "common:\n  action: \"Speichern Data\""),
        ];
        for (name, contents) in fixtures {
            write_fixture(tmp.path(), name, contents)?;
        }

        let found = KeyExtractor::new().extract(tmp.path(), "data")?;

        // All three files match (case-insensitive).
        assert_eq!(found.len(), 3);

        let file_names: Vec<String> = found
            .iter()
            .map(|entry| {
                entry
                    .file
                    .file_name()
                    .unwrap()
                    .to_string_lossy()
                    .to_string()
            })
            .collect();
        for (name, _) in fixtures {
            assert!(file_names.contains(&name.to_string()));
        }

        Ok(())
    }

    /// Nested mappings are reported with their full dot-notation key.
    #[test]
    fn test_key_extractor_deep_nested() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(
            tmp.path(),
            "nested.yml",
            "level1:\n  level2:\n    level3:\n      deep_key: \"Deep nested value\"\n      another: \"test value\"",
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "deep")?;

        assert_eq!(found.len(), 1);
        let only = &found[0];
        assert_eq!(only.key, "level1.level2.level3.deep_key");
        assert_eq!(only.value, "Deep nested value");

        Ok(())
    }

    /// A query that appears nowhere yields an empty result, not an error.
    #[test]
    fn test_key_extractor_no_matches() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(
            tmp.path(),
            "test.yml",
            "greeting:\n  hello: \"Hello\"\n  goodbye: \"Goodbye\"",
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "nonexistent")?;

        assert_eq!(found.len(), 0);

        Ok(())
    }

    /// YAML, JSON and JS files are searched; other extensions are ignored.
    #[test]
    fn test_key_extractor_supports_yaml_json_and_js() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(tmp.path(), "test.yml", "key: \"test value\"")?;
        write_fixture(tmp.path(), "test.txt", "key: test value")?; // This should be ignored
        write_fixture(tmp.path(), "test.json", "{\"key\": \"test value\"}")?;
        write_fixture(tmp.path(), "test.js", "export default { key: 'test value' };")?;

        let found = KeyExtractor::new().extract(tmp.path(), "test")?;

        // One hit each from the YAML, JSON and JS files.
        assert_eq!(found.len(), 3);
        let seen_extensions: Vec<String> = found
            .iter()
            .map(|entry| {
                entry
                    .file
                    .extension()
                    .unwrap()
                    .to_string_lossy()
                    .to_string()
            })
            .collect();
        for ext in ["yml", "json", "js"] {
            assert!(seen_extensions.contains(&ext.to_string()));
        }

        Ok(())
    }

    /// A malformed file is skipped instead of failing the whole search.
    #[test]
    fn test_key_extractor_malformed_file() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(tmp.path(), "good.yml", "key: \"value\"")?;
        write_fixture(tmp.path(), "bad.yml", "key: value: invalid: yaml")?; // Malformed YAML

        // This should NOT return an error, but just skip the bad file
        let found = KeyExtractor::new().extract(tmp.path(), "value")?;

        // Only the well-formed file contributes.
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].value, "value");

        Ok(())
    }

    /// Entries inside a JS `export default` object are found with nested keys.
    #[test]
    fn test_key_extractor_with_js_file() -> Result<()> {
        let tmp = tempdir()?;
        write_fixture(
            tmp.path(),
            "en.js",
            r#"
export default {
  table: {
    emptyText: 'No Data',
    confirmFilter: 'Confirm'
  }
};
"#,
        )?;

        let found = KeyExtractor::new().extract(tmp.path(), "No Data")?;

        println!("Found {} translation entries:", found.len());
        for item in &found {
            println!(
                "  {} = {} ({}:{})",
                item.key,
                item.value,
                item.file.display(),
                item.line
            );
        }

        assert!(
            !found.is_empty(),
            "Should find translation entries in JS file"
        );

        let no_data_entry = found.iter().find(|item| item.value == "No Data");
        assert!(no_data_entry.is_some(), "Should find 'No Data' entry");

        let hit = no_data_entry.unwrap();
        assert_eq!(hit.key, "table.emptyText");
        assert_eq!(hit.value, "No Data");

        Ok(())
    }
}