cs/parse/
key_extractor.rs

1// src/parse/key_extractor.rs
2
3use crate::cache::SearchResultCache;
4use crate::error::Result;
5use std::path::Path;
6use walkdir::WalkDir;
7
8use super::json_parser::JsonParser;
9use super::translation::TranslationEntry;
10use super::yaml_parser::YamlParser;
11
/// `KeyExtractor` provides functionality to search translation entries across
/// multiple YAML and JSON translation files, returning the full dot‑notation
/// key path, associated file path and line number for each match.
///
/// Files are discovered by recursively walking a base directory; see
/// [`KeyExtractor::extract`].
pub struct KeyExtractor {
    /// File or directory names to skip while walking. Each pattern is compared
    /// for exact equality against the entry's file name (not its full path).
    exclusions: Vec<String>,
    /// When `true`, a warning is written to stderr for every file that fails
    /// to parse.
    verbose: bool,
    quiet: bool, // Suppress progress indicators (for --simple mode)
    /// Per-file result cache; `None` when the cache could not be created, in
    /// which case every file is parsed from scratch.
    cache: Option<SearchResultCache>,
}
21
22impl Default for KeyExtractor {
23    fn default() -> Self {
24        Self::new()
25    }
26}
27
28impl KeyExtractor {
29    /// Create a new `KeyExtractor`.
30    pub fn new() -> Self {
31        let cache = SearchResultCache::new().ok(); // Silently disable cache on error
32        Self {
33            exclusions: Vec::new(),
34            verbose: false,
35            quiet: false,
36            cache,
37        }
38    }
39
40    /// Set exclusion patterns (e.g., directories or files to ignore)
41    pub fn set_exclusions(&mut self, exclusions: Vec<String>) {
42        self.exclusions = exclusions;
43    }
44
45    /// Set verbose mode for detailed error messages
46    pub fn set_verbose(&mut self, verbose: bool) {
47        self.verbose = verbose;
48    }
49
50    /// Set quiet mode to suppress progress indicators
51    pub fn set_quiet(&mut self, quiet: bool) {
52        self.quiet = quiet;
53    }
54
55    /// Recursively walk `base_dir` for `*.yml` (or `*.yaml`) files, parse each,
56    /// and return entries whose **value** contains `query`.
57    ///
58    /// Matching is case‑insensitive by default.
59    pub fn extract(&self, base_dir: &Path, query: &str) -> Result<Vec<TranslationEntry>> {
60        let mut matches = Vec::new();
61        let lowered = query.to_lowercase();
62        let mut skipped_files = 0;
63
64        let walker = WalkDir::new(base_dir).into_iter();
65        for entry in walker
66            .filter_entry(|e| {
67                if is_ignored(e) {
68                    return false;
69                }
70                let name = e.file_name().to_string_lossy();
71                for excl in &self.exclusions {
72                    if name == excl.as_str() {
73                        return false;
74                    }
75                }
76                true
77            })
78            .filter_map(|e| e.ok())
79            .filter(|e| e.file_type().is_file())
80        {
81            let path = entry.path();
82            if let Some(ext) = path.extension() {
83                let ext_str = ext.to_string_lossy();
84                if ext_str == "yml" || ext_str == "yaml" {
85                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
86                    // This avoids expensive YAML parsing for files without matches
87                    match YamlParser::contains_query(path, query) {
88                        Ok(false) => {
89                            // No match in file, skip it entirely
90                            if !self.quiet {
91                                use colored::Colorize;
92                                eprint!("{}", "-".dimmed()); // Skipped (no match)
93                            }
94                            continue;
95                        }
96                        Err(_e) => {
97                            // ripgrep failed, fall back to full parsing
98                            // (don't skip the file, just proceed with parsing)
99                        }
100                        Ok(true) => {
101                            // Match found, proceed with parsing below
102                        }
103                    }
104
105                    // Try cache first
106                    let metadata = std::fs::metadata(path).ok();
107                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
108                    {
109                        let mtime = meta.modified().ok();
110                        let size = meta.len();
111                        if let Some(mt) = mtime {
112                            cache.get(path, query, false, mt, size)
113                        } else {
114                            None
115                        }
116                    } else {
117                        None
118                    };
119
120                    let all_entries = if let Some(cached) = cached_results {
121                        if !self.quiet {
122                            use colored::Colorize;
123                            eprint!("{}", "C".cyan()); // Cache hit!
124                        } else {
125                            eprintln!("[cache] hit {} (yaml)", path.display());
126                        }
127                        cached
128                    } else {
129                        // Cache miss - parse file with query for optimization
130                        match YamlParser::parse_file_with_query(path, Some(query)) {
131                            Ok(entries) => {
132                                if !self.quiet {
133                                    use colored::Colorize;
134                                    eprint!("{}", ".".green()); // Successfully parsed
135                                }
136
137                                // Store in cache
138                                if let (Some(cache), Ok(meta)) =
139                                    (&self.cache, std::fs::metadata(path))
140                                {
141                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
142                                        let _ =
143                                            cache.set(path, query, false, mtime, size, &entries);
144                                    }
145                                }
146
147                                entries
148                            }
149                            Err(e) => {
150                                skipped_files += 1;
151                                if !self.quiet {
152                                    use colored::Colorize;
153                                    eprint!("{}", "S".yellow()); // Skipped due to parse error
154                                }
155                                if self.verbose {
156                                    eprintln!(
157                                        "\nWarning: Failed to parse YAML file {}: {}",
158                                        path.display(),
159                                        e
160                                    );
161                                }
162                                continue;
163                            }
164                        }
165                    };
166
167                    // Filter for matching entries
168                    for e in all_entries {
169                        if e.value.to_lowercase().contains(&lowered) {
170                            matches.push(e);
171                        }
172                    }
173                } else if ext_str == "json" {
174                    // OPTIMIZATION: Use ripgrep to pre-filter files before parsing
175                    // Note: We don't have a contains_query for JSON yet, so we use YAML's
176                    match YamlParser::contains_query(path, query) {
177                        Ok(false) => {
178                            // No match in file, skip it entirely
179                            if !self.quiet {
180                                use colored::Colorize;
181                                eprint!("{}", "-".dimmed()); // Skipped (no match)
182                            }
183                            continue;
184                        }
185                        Err(_e) => {
186                            // ripgrep failed, fall back to full parsing
187                        }
188                        Ok(true) => {
189                            // Match found, proceed with parsing below
190                        }
191                    }
192
193                    // Try cache first
194                    let metadata = std::fs::metadata(path).ok();
195                    let cached_results = if let (Some(cache), Some(meta)) = (&self.cache, metadata)
196                    {
197                        let mtime = meta.modified().ok();
198                        let size = meta.len();
199                        if let Some(mt) = mtime {
200                            cache.get(path, query, false, mt, size)
201                        } else {
202                            None
203                        }
204                    } else {
205                        None
206                    };
207
208                    let all_entries = if let Some(cached) = cached_results {
209                        if !self.quiet {
210                            use colored::Colorize;
211                            eprint!("{}", "C".cyan()); // Cache hit!
212                        } else {
213                            eprintln!("[cache] hit {} (json)", path.display());
214                        }
215                        cached
216                    } else {
217                        // Cache miss - parse file with query for optimization
218                        match JsonParser::parse_file_with_query(path, Some(query)) {
219                            Ok(entries) => {
220                                if !self.quiet {
221                                    use colored::Colorize;
222                                    eprint!("{}", ".".green()); // Successfully parsed
223                                }
224
225                                // Store in cache
226                                if let (Some(cache), Ok(meta)) =
227                                    (&self.cache, std::fs::metadata(path))
228                                {
229                                    if let (Ok(mtime), size) = (meta.modified(), meta.len()) {
230                                        let _ =
231                                            cache.set(path, query, false, mtime, size, &entries);
232                                    }
233                                }
234
235                                entries
236                            }
237                            Err(e) => {
238                                skipped_files += 1;
239                                if !self.quiet {
240                                    use colored::Colorize;
241                                    eprint!("{}", "S".yellow()); // Skipped due to parse error
242                                }
243                                if self.verbose {
244                                    eprintln!(
245                                        "\nWarning: Failed to parse JSON file {}: {}",
246                                        path.display(),
247                                        e
248                                    );
249                                }
250                                continue;
251                            }
252                        }
253                    };
254
255                    // Filter for matching entries
256                    for e in all_entries {
257                        if e.value.to_lowercase().contains(&lowered) {
258                            matches.push(e);
259                        }
260                    }
261                }
262            }
263        }
264
265        // Print newline and summary if files were skipped (only in verbose mode)
266        // Note: Skipped files are typically config files (package.json, tsconfig.json, etc.)
267        // that aren't translation files, which is expected behavior.
268        if !self.quiet {
269            if skipped_files > 0 && self.verbose {
270                eprintln!(); // Newline after the S indicators
271                eprintln!(
272                    "(Skipped {} non-translation file{})",
273                    skipped_files,
274                    if skipped_files == 1 { "" } else { "s" }
275                );
276            } else if skipped_files > 0 {
277                eprintln!(); // Just newline, no message in non-verbose mode
278            }
279        }
280
281        Ok(matches)
282    }
283}
284
285fn is_ignored(entry: &walkdir::DirEntry) -> bool {
286    // Always allow the root directory of the search
287    if entry.depth() == 0 {
288        return false;
289    }
290
291    entry
292        .file_name()
293        .to_str()
294        .map(|s| {
295            s.starts_with('.') // Hidden files/dirs
296                || s == "node_modules"
297                || s == "target"
298                || s == "dist"
299                || s == "build"
300                || s == "vendor"
301        })
302        .unwrap_or(false)
303}
304
// Tests for `KeyExtractor::extract`. Each test writes fixture files into a
// fresh temporary directory and runs the extractor over that directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    use tempfile::tempdir;

    // A lower-case query matches capitalised values in two separate files.
    #[test]
    fn test_key_extractor_simple() -> Result<()> {
        let dir = tempdir()?;
        let en_path = dir.path().join("en.yml");
        let fr_path = dir.path().join("fr.yml");

        // Write simple yaml files with proper format
        fs::write(
            &en_path,
            "greeting:\n  hello: \"Hello World\"\n  goodbye: \"Goodbye\"",
        )?;
        fs::write(
            &fr_path,
            "greeting:\n  hello: \"Bonjour World\"\n  goodbye: \"Au revoir\"",
        )?;

        let extractor = KeyExtractor::new();
        let results = extractor.extract(dir.path(), "world")?;

        // Should find two entries (en and fr)
        assert_eq!(results.len(), 2);
        let keys: Vec<_> = results.iter().map(|e| e.key.clone()).collect();
        assert!(keys.contains(&"greeting.hello".to_string()));
        Ok(())
    }

    // An upper-case query matches values regardless of their casing.
    #[test]
    fn test_key_extractor_case_insensitive() -> Result<()> {
        let dir = tempdir()?;
        let yaml_path = dir.path().join("test.yml");

        fs::write(
            &yaml_path,
            "app:\n  title: \"My Application\"\n  description: \"A great APP for everyone\"",
        )?;

        let extractor = KeyExtractor::new();

        // Test case insensitive search
        let results = extractor.extract(dir.path(), "APP")?;
        assert_eq!(results.len(), 2); // Should match both "Application" and "APP"

        let values: Vec<_> = results.iter().map(|e| e.value.clone()).collect();
        assert!(values.contains(&"My Application".to_string()));
        assert!(values.contains(&"A great APP for everyone".to_string()));

        Ok(())
    }

    // Every matching file in the directory contributes its own entry, and each
    // entry records the file it came from.
    #[test]
    fn test_key_extractor_multiple_files() -> Result<()> {
        let dir = tempdir()?;

        // Create multiple language files
        let en_path = dir.path().join("en.yml");
        let fr_path = dir.path().join("fr.yml");
        let de_path = dir.path().join("de.yml");

        fs::write(&en_path, "common:\n  action: \"Save Data\"")?;
        fs::write(&fr_path, "common:\n  action: \"Sauvegarder Data\"")?;
        fs::write(&de_path, "common:\n  action: \"Speichern Data\"")?;

        let extractor = KeyExtractor::new();
        let results = extractor.extract(dir.path(), "data")?;

        // Should find all three files (case-insensitive)
        assert_eq!(results.len(), 3);

        let files: Vec<_> = results
            .iter()
            .map(|e| e.file.file_name().unwrap().to_string_lossy().to_string())
            .collect();
        assert!(files.contains(&"en.yml".to_string()));
        assert!(files.contains(&"fr.yml".to_string()));
        assert!(files.contains(&"de.yml".to_string()));

        Ok(())
    }

    // Nested mappings are flattened into a dot-separated key path.
    #[test]
    fn test_key_extractor_deep_nested() -> Result<()> {
        let dir = tempdir()?;
        let yaml_path = dir.path().join("nested.yml");

        fs::write(
            &yaml_path,
            "level1:\n  level2:\n    level3:\n      deep_key: \"Deep nested value\"\n      another: \"test value\"",
        )?;

        let extractor = KeyExtractor::new();
        let results = extractor.extract(dir.path(), "deep")?;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].key, "level1.level2.level3.deep_key");
        assert_eq!(results[0].value, "Deep nested value");

        Ok(())
    }

    // A query that occurs nowhere yields an empty result, not an error.
    #[test]
    fn test_key_extractor_no_matches() -> Result<()> {
        let dir = tempdir()?;
        let yaml_path = dir.path().join("test.yml");

        fs::write(
            &yaml_path,
            "greeting:\n  hello: \"Hello\"\n  goodbye: \"Goodbye\"",
        )?;

        let extractor = KeyExtractor::new();
        let results = extractor.extract(dir.path(), "nonexistent")?;

        assert_eq!(results.len(), 0);

        Ok(())
    }

    // Both `.yml` and `.json` files are searched; other extensions are not.
    #[test]
    fn test_key_extractor_supports_json_and_yaml() -> Result<()> {
        let dir = tempdir()?;
        let yaml_path = dir.path().join("test.yml");
        let txt_path = dir.path().join("test.txt");
        let json_path = dir.path().join("test.json");

        fs::write(&yaml_path, "key: \"test value\"")?;
        fs::write(&txt_path, "key: test value")?; // This should be ignored
        fs::write(&json_path, "{\"key\": \"test value\"}")?; // JSON is supported and should be found

        let extractor = KeyExtractor::new();
        let results = extractor.extract(dir.path(), "test")?;

        // Should find both YAML and JSON files
        assert_eq!(results.len(), 2);
        let extensions: Vec<_> = results
            .iter()
            .map(|e| e.file.extension().unwrap().to_string_lossy().to_string())
            .collect();
        assert!(extensions.contains(&"yml".to_string()));
        assert!(extensions.contains(&"json".to_string()));

        Ok(())
    }

    // A file that fails to parse is skipped without failing the whole search.
    #[test]
    fn test_key_extractor_malformed_file() -> Result<()> {
        let dir = tempdir()?;
        let good_path = dir.path().join("good.yml");
        let bad_path = dir.path().join("bad.yml");

        fs::write(&good_path, "key: \"value\"")?;
        fs::write(&bad_path, "key: value: invalid: yaml")?; // Malformed YAML

        let extractor = KeyExtractor::new();
        // This should NOT return an error, but just skip the bad file
        let results = extractor.extract(dir.path(), "value")?;

        // Should find the good file
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].value, "value");

        Ok(())
    }
}
474}