cs/parse/
yaml_parser.rs

1use crate::error::{Result, SearchError};
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5use yaml_rust::{Yaml, YamlLoader};
6
7use super::translation::TranslationEntry;
8
/// Parser for YAML translation files.
///
/// This is a stateless unit type: all functionality is exposed through associated
/// functions such as `YamlParser::parse_file`, so the value itself carries no data.
#[derive(Debug, Clone, Copy, Default)]
pub struct YamlParser;
11
12impl YamlParser {
13    /// Fast pre-check: does this file contain the search query?
14    /// Uses grep library for exact match before expensive YAML parsing.
15    /// Returns true if the file contains the query (case-insensitive).
16    pub fn contains_query(path: &Path, query: &str) -> Result<bool> {
17        use grep_regex::RegexMatcherBuilder;
18        use grep_searcher::sinks::UTF8;
19        use grep_searcher::SearcherBuilder;
20
21        // Build matcher for case-insensitive fixed-string search
22        let matcher = RegexMatcherBuilder::new()
23            .case_insensitive(true)
24            .fixed_strings(true) // Treat as literal string, not regex
25            .build(query)
26            .map_err(|e| {
27                SearchError::yaml_parse_error(path, format!("Failed to build matcher: {}", e))
28            })?;
29
30        // Use searcher to check if file contains the query
31        let mut searcher = SearcherBuilder::new().build();
32        let mut found = false;
33
34        searcher
35            .search_path(
36                &matcher,
37                path,
38                UTF8(|_line_num, _line_content| {
39                    found = true;
40                    Ok(false) // Stop searching after first match
41                }),
42            )
43            .map_err(|e| SearchError::yaml_parse_error(path, format!("Search failed: {}", e)))?;
44
45        Ok(found)
46    }
47
48    pub fn parse_file(path: &Path) -> Result<Vec<TranslationEntry>> {
49        Self::parse_file_with_query(path, None)
50    }
51
52    /// Parse YAML file, optionally filtering by query for better performance.
53    /// If query is provided, uses bottom-up approach: finds exact matches with grep,
54    /// then traces keys upward WITHOUT parsing the entire YAML structure.
55    pub fn parse_file_with_query(
56        path: &Path,
57        query: Option<&str>,
58    ) -> Result<Vec<TranslationEntry>> {
59        let content = fs::read_to_string(path).map_err(|e| {
60            SearchError::yaml_parse_error(path, format!("Failed to read file: {}", e))
61        })?;
62
63        // Strip ERB templates to support Rails-style YAML fixtures
64        let cleaned_content = Self::strip_erb_templates(&content);
65
66        // If query is provided, use bottom-up approach
67        // FIXME: Bottom-up trace is buggy (returns leaf keys), disabled for now.
68        // if let Some(q) = query {
69        //     return Self::parse_with_bottom_up_trace(path, &cleaned_content, q);
70        // }
71
72        // No query - parse entire file (fallback to old method)
73        let mut value_to_line: HashMap<String, usize> = HashMap::new();
74        for (line_num, line) in cleaned_content.lines().enumerate() {
75            if let Some(colon_pos) = line.find(':') {
76                let value = line[colon_pos + 1..].trim();
77                if !value.is_empty() && !value.starts_with('#') {
78                    let clean_value = value.trim_matches('"').trim_matches('\'');
79                    if !clean_value.is_empty() {
80                        value_to_line
81                            .entry(clean_value.to_string())
82                            .or_insert(line_num + 1);
83                    }
84                }
85            }
86        }
87
88        let docs = YamlLoader::load_from_str(&cleaned_content).map_err(|e| {
89            SearchError::yaml_parse_error(path, format!("Invalid YAML syntax: {}", e))
90        })?;
91
92        let mut entries = Vec::new();
93        for doc in docs {
94            Self::flatten_yaml(doc, String::new(), path, &value_to_line, &mut entries, true);
95        }
96
97        // Filter by query if provided (since bottom-up trace is disabled)
98        if let Some(q) = query {
99            let q_lower = q.to_lowercase();
100            entries.retain(|e| e.value.to_lowercase().contains(&q_lower));
101        }
102
103        Ok(entries)
104    }
105
106    /*
107    /// Bottom-up approach: Find matching lines with grep, then trace keys upward.
108    /// This avoids parsing the entire YAML structure.
109    fn parse_with_bottom_up_trace(
110        path: &Path,
111        content: &str,
112        query: &str,
113    ) -> Result<Vec<TranslationEntry>> {
114        use grep_regex::RegexMatcherBuilder;
115        use grep_searcher::sinks::UTF8;
116        use grep_searcher::SearcherBuilder;
117        use std::collections::HashMap;
118
119        // Use grep to find exact line numbers with matches
120        let matcher = RegexMatcherBuilder::new()
121            .case_insensitive(true)
122            .fixed_strings(true)
123            .build(query)
124            .map_err(|e| SearchError::yaml_parse_error(path, format!("Matcher error: {}", e)))?;
125
126        let mut searcher = SearcherBuilder::new().line_number(true).build();
127        let mut matched_lines: Vec<(usize, String)> = Vec::new();
128
129        searcher
130            .search_path(
131                &matcher,
132                path,
133                UTF8(|line_num, line_content| {
134                    matched_lines.push((line_num as usize, line_content.to_string()));
135                    Ok(true) // Continue searching
136                }),
137            )
138            .map_err(|e| SearchError::yaml_parse_error(path, format!("Search error: {}", e)))?;
139
140        if matched_lines.is_empty() {
141            return Ok(Vec::new());
142        }
143
144        // For each matched line, trace the key path bottom-up
145        let lines: Vec<&str> = content.lines().collect();
146        let mut entries = Vec::new();
147
148        // Optimization: tree is non-tangled, later matches appear after earlier ones.
149        // Maintain a cutoff and ancestor cache to stop climbing once we cross earlier paths.
150        let mut cutoff_line: usize = 0;
151        let mut ancestor_cache: HashMap<usize, Vec<String>> = HashMap::new();
152
153        for (line_num, _line_content) in matched_lines {
154            if let Some(trace) =
155                Self::trace_key_from_line(&lines, line_num, path, cutoff_line, &ancestor_cache)
156            {
157                // Register ancestors for future lookups (so later matches can stop early)
158                for (line_idx, prefix) in trace.parent_prefixes {
159                    ancestor_cache.entry(line_idx).or_insert(prefix);
160                }
161
162                entries.push(trace.entry);
163            }
164
165            // Monotonic guarantee: subsequent matches start after the previous leaf
166            cutoff_line = line_num;
167        }
168
169        Ok(entries)
170    }
171
172    /// Binary search for parent key with indent less than target_indent.
173    /// Returns (line_index, key, indent) if found.
174    /// Handles empty lines and comments by moving up one line.
175    fn binary_search_parent(
176        lines: &[&str],
177        end_line: usize,
178        target_indent: usize,
179        cutoff_line: usize,
180        _ancestor_cache: &HashMap<usize, Vec<String>>,
181    ) -> Option<(usize, String, usize)> {
182        let mut left = 0;
183        let mut right = end_line;
184        let mut best_match: Option<(usize, String, usize)> = None;
185
186        while left <= right {
187            let mid = (left + right) / 2;
188            let mut check_line = mid;
189
190            // Skip empty lines and comments by moving up
191            while check_line > 0 {
192                let line = lines[check_line];
193                if !line.trim().is_empty() && !line.trim().starts_with('#') {
194                    break;
195                }
196                check_line -= 1;
197            }
198
199            if check_line == 0 && (lines[0].trim().is_empty() || lines[0].trim().starts_with('#')) {
200                // Couldn't find valid line, search left half
201                if mid == 0 {
202                    break;
203                }
204                right = mid - 1;
205                continue;
206            }
207
208            let line = lines[check_line];
209            let line_indent = line.len() - line.trim_start().len();
210            let line_idx = check_line + 1; // Convert to 1-based
211
212            // Check if we hit cutoff line (ancestor cache boundary)
213            if line_idx <= cutoff_line {
214                // Stop searching in this region
215                if mid == 0 {
216                    break;
217                }
218                right = mid - 1;
219                continue;
220            }
221
222            // Check if this line has a key (contains ':')
223            if let Some(colon_pos) = line.find(':') {
224                let key = line[..colon_pos].trim().to_string();
225
226                if line_indent < target_indent {
227                    // Found a parent! But keep searching for the closest one
228                    best_match = Some((check_line, key, line_indent));
229                    // Search right half for closer parent
230                    left = mid + 1;
231                } else if line_indent >= target_indent {
232                    // Too indented or same level, search left half
233                    if mid == 0 {
234                        break;
235                    }
236                    right = mid - 1;
237                } else {
238                    // Exact match shouldn't happen, search left
239                    if mid == 0 {
240                        break;
241                    }
242                    right = mid - 1;
243                }
244            } else {
245                // No colon, not a key line, search left
246                if mid == 0 {
247                    break;
248                }
249                right = mid - 1;
250            }
251
252            if left > right {
253                break;
254            }
255        }
256
257        best_match
258    }
259
260    /// Trace the YAML key path from a specific line number bottom-up.
261    /// Uses binary search to find parents efficiently (O(log n) instead of O(n)).
262    fn trace_key_from_line(
263        lines: &[&str],
264        line_num: usize,
265        path: &Path,
266        cutoff_line: usize,
267        ancestor_cache: &HashMap<usize, Vec<String>>,
268    ) -> Option<TraceResult> {
269        if line_num == 0 || line_num > lines.len() {
270            return None;
271        }
272
273        let target_line = lines[line_num - 1]; // Convert to 0-indexed
274
275        // Extract the key and value from the target line
276        let colon_pos = target_line.find(':')?;
277        let key_part = target_line[..colon_pos].trim();
278        let value_part = target_line[colon_pos + 1..].trim();
279
280        // Check for malformed YAML: multiple colons without quotes
281        // e.g., "key: value: invalid: yaml" should be rejected
282        if value_part.contains(':') && !value_part.starts_with('"') && !value_part.starts_with('\'')
283        {
284            return None; // Skip malformed lines
285        }
286
287        let value = value_part.trim_matches('"').trim_matches('\'').to_string();
288
289        // Skip empty values
290        if value.is_empty() {
291            return None;
292        }
293
294        // Get the indentation level of the target line
295        let target_indent = target_line.len() - target_line.trim_start().len();
296
297        // Build the key path by walking up the tree using binary search
298        let mut key_parts = vec![key_part.to_string()];
299        let mut current_indent = target_indent;
300        let mut parent_lines: Vec<usize> = Vec::new();
301        let mut search_end = line_num - 1; // Start searching from line before target
302
303        // Find parents by binary searching for each indent level
304        while current_indent > 0 && search_end > 0 {
305            // Binary search for parent with indent < current_indent
306            if let Some((parent_idx, parent_key, parent_indent)) = Self::binary_search_parent(
307                lines,
308                search_end,
309                current_indent,
310                cutoff_line,
311                ancestor_cache,
312            ) {
313                let line_idx = parent_idx + 1; // Convert to 1-based
314
315                // Check if we hit cached ancestor
316                if let Some(prefix) = ancestor_cache.get(&line_idx) {
317                    let mut combined = prefix.clone();
318                    combined.extend(key_parts);
319                    return Some(TraceResult::new(
320                        combined,
321                        value,
322                        line_num,
323                        path,
324                        parent_lines,
325                    ));
326                }
327
328                // Skip locale root keys (en, fr, de, etc.)
329                if parent_indent == 0
330                    && (parent_key == "en"
331                        || parent_key == "fr"
332                        || parent_key == "de"
333                        || parent_key == "es"
334                        || parent_key == "ja"
335                        || parent_key == "zh")
336                {
337                    break;
338                }
339
340                key_parts.insert(0, parent_key);
341                parent_lines.push(line_idx);
342                current_indent = parent_indent;
343                search_end = parent_idx; // Next search ends at this parent
344
345                if parent_indent == 0 {
346                    break; // Reached root
347                }
348            } else {
349                break; // No more parents found
350            }
351        }
352
353        Some(TraceResult::new(
354            key_parts,
355            value,
356            line_num,
357            path,
358            parent_lines,
359        ))
360    }
361    */
362    /// Strip ERB templates (<%= ... %> and <% ... %>) from YAML
363    /// This enables parsing of Rails fixture files
364    fn strip_erb_templates(content: &str) -> String {
365        let mut result = String::with_capacity(content.len());
366        let mut chars = content.chars().peekable();
367
368        while let Some(ch) = chars.next() {
369            if ch == '<' {
370                if let Some(&'%') = chars.peek() {
371                    chars.next(); // consume '%'
372
373                    // Check for <%= or <%
374                    let _has_equals = if let Some(&'=') = chars.peek() {
375                        chars.next(); // consume '='
376                        true
377                    } else {
378                        false
379                    };
380
381                    // Skip until we find %>
382                    let mut prev = ' ';
383                    for c in chars.by_ref() {
384                        if prev == '%' && c == '>' {
385                            break;
386                        }
387                        if c == '\n' {
388                            result.push('\n'); // preserve newlines
389                        }
390                        prev = c;
391                    }
392
393                    // Replace ERB tag with empty string (already skipped)
394                    continue;
395                }
396            }
397
398            result.push(ch);
399        }
400
401        result
402    }
403
404    fn flatten_yaml(
405        yaml: Yaml,
406        prefix: String,
407        file_path: &Path,
408        value_to_line: &HashMap<String, usize>,
409        entries: &mut Vec<TranslationEntry>,
410        is_root: bool,
411    ) {
412        match yaml {
413            Yaml::Hash(hash) => {
414                for (key, value) in hash {
415                    if let Some(key_str) = key.as_str() {
416                        // Check if this is a locale root BEFORE building prefix
417                        let is_locale_root = is_root
418                            && prefix.is_empty()
419                            && (key_str == "en"
420                                || key_str == "fr"
421                                || key_str == "de"
422                                || key_str == "es"
423                                || key_str == "ja"
424                                || key_str == "zh");
425
426                        // For locale roots, skip the locale prefix entirely
427                        let new_prefix = if is_locale_root {
428                            String::new()
429                        } else if prefix.is_empty() {
430                            key_str.to_string()
431                        } else {
432                            format!("{}.{}", prefix, key_str)
433                        };
434
435                        // Only flatten once, not twice!
436                        Self::flatten_yaml(
437                            value,
438                            new_prefix,
439                            file_path,
440                            value_to_line,
441                            entries,
442                            false,
443                        );
444                    }
445                }
446            }
447            Yaml::String(value) => {
448                let line = value_to_line.get(&value).copied().unwrap_or(0);
449
450                entries.push(TranslationEntry {
451                    key: prefix,
452                    value,
453                    line,
454                    file: PathBuf::from(file_path),
455                });
456            }
457            Yaml::Integer(value) => {
458                let value_str = value.to_string();
459                let line = value_to_line.get(&value_str).copied().unwrap_or(0);
460
461                entries.push(TranslationEntry {
462                    key: prefix,
463                    value: value_str,
464                    line,
465                    file: PathBuf::from(file_path),
466                });
467            }
468            Yaml::Boolean(value) => {
469                let value_str = value.to_string();
470                let line = value_to_line.get(&value_str).copied().unwrap_or(0);
471
472                entries.push(TranslationEntry {
473                    key: prefix,
474                    value: value_str,
475                    line,
476                    file: PathBuf::from(file_path),
477                });
478            }
479            Yaml::Array(arr) => {
480                for (index, val) in arr.into_iter().enumerate() {
481                    let new_prefix = if prefix.is_empty() {
482                        index.to_string()
483                    } else {
484                        format!("{}.{}", prefix, index)
485                    };
486                    Self::flatten_yaml(val, new_prefix, file_path, value_to_line, entries, false);
487                }
488            }
489            _ => {
490                // Ignore other types for now
491            }
492        }
493    }
494}
495
496/*
497/// Result of a trace with ancestor bookkeeping so future traces can short-circuit.
498struct TraceResult {
499    entry: TranslationEntry,
500    parent_prefixes: Vec<(usize, Vec<String>)>,
501}
502
503impl TraceResult {
504    fn new(
505        key_parts: Vec<String>,
506        value: String,
507        line_num: usize,
508        path: &Path,
509        parent_lines: Vec<usize>,
510    ) -> Self {
511        let entry = TranslationEntry {
512            key: key_parts.join("."),
513            value,
514            line: line_num,
515            file: PathBuf::from(path),
516        };
517
518        // Build prefix cache for each ancestor line (root first) so later traces can stop early.
519        let mut parent_prefixes = Vec::new();
520        for (idx, line_idx) in parent_lines.iter().rev().enumerate() {
521            // idx corresponds to prefix length in key_parts
522            let prefix_len = idx + 1;
523            if prefix_len <= key_parts.len() {
524                parent_prefixes.push((*line_idx, key_parts[..prefix_len].to_vec()));
525            }
526        }
527
528        Self {
529            entry,
530            parent_prefixes,
531        }
532    }
533}
534*/
535
536#[cfg(test)]
537mod tests {
538    use super::*;
539    use std::io::Write;
540    use tempfile::NamedTempFile;
541
542    #[test]
543    fn test_parse_simple_yaml() {
544        let mut file = NamedTempFile::new().unwrap();
545        write!(file, "key: value").unwrap();
546
547        let entries = YamlParser::parse_file(file.path()).unwrap();
548        assert_eq!(entries.len(), 1);
549        assert_eq!(entries[0].key, "key");
550        assert_eq!(entries[0].value, "value");
551        assert_eq!(entries[0].line, 1);
552    }
553
554    #[test]
555    fn test_parse_nested_yaml() {
556        let mut file = NamedTempFile::new().unwrap();
557        write!(file, "parent:\n  child: value").unwrap();
558
559        let entries = YamlParser::parse_file(file.path()).unwrap();
560        assert_eq!(entries.len(), 1);
561        assert_eq!(entries[0].key, "parent.child");
562        assert_eq!(entries[0].value, "value");
563        assert_eq!(entries[0].line, 2);
564    }
565
566    #[test]
567    fn test_parse_multiple_keys() {
568        let mut file = NamedTempFile::new().unwrap();
569        write!(
570            file,
571            "
572key1: value1
573key2: value2
574nested:
575  key3: value3
576"
577        )
578        .unwrap();
579
580        let entries = YamlParser::parse_file(file.path()).unwrap();
581        assert_eq!(entries.len(), 3);
582
583        // Find entries by key
584        let entry1 = entries.iter().find(|e| e.key == "key1").unwrap();
585        assert_eq!(entry1.value, "value1");
586        assert_eq!(entry1.line, 2);
587
588        let entry2 = entries.iter().find(|e| e.key == "key2").unwrap();
589        assert_eq!(entry2.value, "value2");
590        assert_eq!(entry2.line, 3);
591
592        let entry3 = entries.iter().find(|e| e.key == "nested.key3").unwrap();
593        assert_eq!(entry3.value, "value3");
594        assert_eq!(entry3.line, 5);
595    }
596
597    #[test]
598    fn test_parse_yaml_array() {
599        let mut file = NamedTempFile::new().unwrap();
600        write!(file, "list:\n  - item1\n  - item2").unwrap();
601
602        let entries = YamlParser::parse_file(file.path()).unwrap();
603        assert_eq!(entries.len(), 2);
604
605        let item1 = entries.iter().find(|e| e.value == "item1").unwrap();
606        assert_eq!(item1.key, "list.0");
607
608        let item2 = entries.iter().find(|e| e.value == "item2").unwrap();
609        assert_eq!(item2.key, "list.1");
610    }
611
612    #[test]
613    fn test_bottom_up_trace() {
614        let mut file = NamedTempFile::new().unwrap();
615        write!(
616            file,
617            "en:
618  js:
619    user:
620      log_in: \"Log In\"
621      sign_up: \"Sign Up\"
622"
623        )
624        .unwrap();
625
626        let entries = YamlParser::parse_file_with_query(file.path(), Some("Log In")).unwrap();
627        assert_eq!(entries.len(), 1);
628        assert_eq!(entries[0].key, "js.user.log_in");
629        assert_eq!(entries[0].value, "Log In");
630        assert_eq!(entries[0].line, 4);
631    }
632}