cs/parse/yaml_parser.rs
1use crate::error::{Result, SearchError};
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5use yaml_rust::{Yaml, YamlLoader};
6
7use super::translation::TranslationEntry;
8
/// Parser for YAML translation files.
///
/// The type carries no state; all functionality is exposed through associated
/// functions such as `YamlParser::parse_file`.
#[derive(Debug, Clone, Copy, Default)]
pub struct YamlParser;
11
12impl YamlParser {
13 /// Fast pre-check: does this file contain the search query?
14 /// Uses grep library for exact match before expensive YAML parsing.
15 /// Returns true if the file contains the query (case-insensitive).
16 pub fn contains_query(path: &Path, query: &str) -> Result<bool> {
17 use grep_regex::RegexMatcherBuilder;
18 use grep_searcher::sinks::UTF8;
19 use grep_searcher::SearcherBuilder;
20
21 // Build matcher for case-insensitive fixed-string search
22 let matcher = RegexMatcherBuilder::new()
23 .case_insensitive(true)
24 .fixed_strings(true) // Treat as literal string, not regex
25 .build(query)
26 .map_err(|e| {
27 SearchError::yaml_parse_error(path, format!("Failed to build matcher: {}", e))
28 })?;
29
30 // Use searcher to check if file contains the query
31 let mut searcher = SearcherBuilder::new().build();
32 let mut found = false;
33
34 searcher
35 .search_path(
36 &matcher,
37 path,
38 UTF8(|_line_num, _line_content| {
39 found = true;
40 Ok(false) // Stop searching after first match
41 }),
42 )
43 .map_err(|e| SearchError::yaml_parse_error(path, format!("Search failed: {}", e)))?;
44
45 Ok(found)
46 }
47
48 pub fn parse_file(path: &Path) -> Result<Vec<TranslationEntry>> {
49 Self::parse_file_with_query(path, None)
50 }
51
52 /// Parse YAML file, optionally filtering by query for better performance.
53 /// If query is provided, uses bottom-up approach: finds exact matches with grep,
54 /// then traces keys upward WITHOUT parsing the entire YAML structure.
55 pub fn parse_file_with_query(
56 path: &Path,
57 query: Option<&str>,
58 ) -> Result<Vec<TranslationEntry>> {
59 let content = fs::read_to_string(path).map_err(|e| {
60 SearchError::yaml_parse_error(path, format!("Failed to read file: {}", e))
61 })?;
62
63 // Strip ERB templates to support Rails-style YAML fixtures
64 let cleaned_content = Self::strip_erb_templates(&content);
65
66 // If query is provided, use bottom-up approach
67 // FIXME: Bottom-up trace is buggy (returns leaf keys), disabled for now.
68 // if let Some(q) = query {
69 // return Self::parse_with_bottom_up_trace(path, &cleaned_content, q);
70 // }
71
72 // No query - parse entire file (fallback to old method)
73 let mut value_to_line: HashMap<String, usize> = HashMap::new();
74 for (line_num, line) in cleaned_content.lines().enumerate() {
75 if let Some(colon_pos) = line.find(':') {
76 let value = line[colon_pos + 1..].trim();
77 if !value.is_empty() && !value.starts_with('#') {
78 let clean_value = value.trim_matches('"').trim_matches('\'');
79 if !clean_value.is_empty() {
80 value_to_line
81 .entry(clean_value.to_string())
82 .or_insert(line_num + 1);
83 }
84 }
85 }
86 }
87
88 let docs = YamlLoader::load_from_str(&cleaned_content).map_err(|e| {
89 SearchError::yaml_parse_error(path, format!("Invalid YAML syntax: {}", e))
90 })?;
91
92 let mut entries = Vec::new();
93 for doc in docs {
94 Self::flatten_yaml(doc, String::new(), path, &value_to_line, &mut entries, true);
95 }
96
97 // Filter by query if provided (since bottom-up trace is disabled)
98 if let Some(q) = query {
99 let q_lower = q.to_lowercase();
100 entries.retain(|e| e.value.to_lowercase().contains(&q_lower));
101 }
102
103 Ok(entries)
104 }
105
106 /*
107 /// Bottom-up approach: Find matching lines with grep, then trace keys upward.
108 /// This avoids parsing the entire YAML structure.
109 fn parse_with_bottom_up_trace(
110 path: &Path,
111 content: &str,
112 query: &str,
113 ) -> Result<Vec<TranslationEntry>> {
114 use grep_regex::RegexMatcherBuilder;
115 use grep_searcher::sinks::UTF8;
116 use grep_searcher::SearcherBuilder;
117 use std::collections::HashMap;
118
119 // Use grep to find exact line numbers with matches
120 let matcher = RegexMatcherBuilder::new()
121 .case_insensitive(true)
122 .fixed_strings(true)
123 .build(query)
124 .map_err(|e| SearchError::yaml_parse_error(path, format!("Matcher error: {}", e)))?;
125
126 let mut searcher = SearcherBuilder::new().line_number(true).build();
127 let mut matched_lines: Vec<(usize, String)> = Vec::new();
128
129 searcher
130 .search_path(
131 &matcher,
132 path,
133 UTF8(|line_num, line_content| {
134 matched_lines.push((line_num as usize, line_content.to_string()));
135 Ok(true) // Continue searching
136 }),
137 )
138 .map_err(|e| SearchError::yaml_parse_error(path, format!("Search error: {}", e)))?;
139
140 if matched_lines.is_empty() {
141 return Ok(Vec::new());
142 }
143
144 // For each matched line, trace the key path bottom-up
145 let lines: Vec<&str> = content.lines().collect();
146 let mut entries = Vec::new();
147
148 // Optimization: tree is non-tangled, later matches appear after earlier ones.
149 // Maintain a cutoff and ancestor cache to stop climbing once we cross earlier paths.
150 let mut cutoff_line: usize = 0;
151 let mut ancestor_cache: HashMap<usize, Vec<String>> = HashMap::new();
152
153 for (line_num, _line_content) in matched_lines {
154 if let Some(trace) =
155 Self::trace_key_from_line(&lines, line_num, path, cutoff_line, &ancestor_cache)
156 {
157 // Register ancestors for future lookups (so later matches can stop early)
158 for (line_idx, prefix) in trace.parent_prefixes {
159 ancestor_cache.entry(line_idx).or_insert(prefix);
160 }
161
162 entries.push(trace.entry);
163 }
164
165 // Monotonic guarantee: subsequent matches start after the previous leaf
166 cutoff_line = line_num;
167 }
168
169 Ok(entries)
170 }
171
172 /// Binary search for parent key with indent less than target_indent.
173 /// Returns (line_index, key, indent) if found.
174 /// Handles empty lines and comments by moving up one line.
175 fn binary_search_parent(
176 lines: &[&str],
177 end_line: usize,
178 target_indent: usize,
179 cutoff_line: usize,
180 _ancestor_cache: &HashMap<usize, Vec<String>>,
181 ) -> Option<(usize, String, usize)> {
182 let mut left = 0;
183 let mut right = end_line;
184 let mut best_match: Option<(usize, String, usize)> = None;
185
186 while left <= right {
187 let mid = (left + right) / 2;
188 let mut check_line = mid;
189
190 // Skip empty lines and comments by moving up
191 while check_line > 0 {
192 let line = lines[check_line];
193 if !line.trim().is_empty() && !line.trim().starts_with('#') {
194 break;
195 }
196 check_line -= 1;
197 }
198
199 if check_line == 0 && (lines[0].trim().is_empty() || lines[0].trim().starts_with('#')) {
200 // Couldn't find valid line, search left half
201 if mid == 0 {
202 break;
203 }
204 right = mid - 1;
205 continue;
206 }
207
208 let line = lines[check_line];
209 let line_indent = line.len() - line.trim_start().len();
210 let line_idx = check_line + 1; // Convert to 1-based
211
212 // Check if we hit cutoff line (ancestor cache boundary)
213 if line_idx <= cutoff_line {
214 // Stop searching in this region
215 if mid == 0 {
216 break;
217 }
218 right = mid - 1;
219 continue;
220 }
221
222 // Check if this line has a key (contains ':')
223 if let Some(colon_pos) = line.find(':') {
224 let key = line[..colon_pos].trim().to_string();
225
226 if line_indent < target_indent {
227 // Found a parent! But keep searching for the closest one
228 best_match = Some((check_line, key, line_indent));
229 // Search right half for closer parent
230 left = mid + 1;
231 } else if line_indent >= target_indent {
232 // Too indented or same level, search left half
233 if mid == 0 {
234 break;
235 }
236 right = mid - 1;
237 } else {
238 // Exact match shouldn't happen, search left
239 if mid == 0 {
240 break;
241 }
242 right = mid - 1;
243 }
244 } else {
245 // No colon, not a key line, search left
246 if mid == 0 {
247 break;
248 }
249 right = mid - 1;
250 }
251
252 if left > right {
253 break;
254 }
255 }
256
257 best_match
258 }
259
260 /// Trace the YAML key path from a specific line number bottom-up.
261 /// Uses binary search to find parents efficiently (O(log n) instead of O(n)).
262 fn trace_key_from_line(
263 lines: &[&str],
264 line_num: usize,
265 path: &Path,
266 cutoff_line: usize,
267 ancestor_cache: &HashMap<usize, Vec<String>>,
268 ) -> Option<TraceResult> {
269 if line_num == 0 || line_num > lines.len() {
270 return None;
271 }
272
273 let target_line = lines[line_num - 1]; // Convert to 0-indexed
274
275 // Extract the key and value from the target line
276 let colon_pos = target_line.find(':')?;
277 let key_part = target_line[..colon_pos].trim();
278 let value_part = target_line[colon_pos + 1..].trim();
279
280 // Check for malformed YAML: multiple colons without quotes
281 // e.g., "key: value: invalid: yaml" should be rejected
282 if value_part.contains(':') && !value_part.starts_with('"') && !value_part.starts_with('\'')
283 {
284 return None; // Skip malformed lines
285 }
286
287 let value = value_part.trim_matches('"').trim_matches('\'').to_string();
288
289 // Skip empty values
290 if value.is_empty() {
291 return None;
292 }
293
294 // Get the indentation level of the target line
295 let target_indent = target_line.len() - target_line.trim_start().len();
296
297 // Build the key path by walking up the tree using binary search
298 let mut key_parts = vec![key_part.to_string()];
299 let mut current_indent = target_indent;
300 let mut parent_lines: Vec<usize> = Vec::new();
301 let mut search_end = line_num - 1; // Start searching from line before target
302
303 // Find parents by binary searching for each indent level
304 while current_indent > 0 && search_end > 0 {
305 // Binary search for parent with indent < current_indent
306 if let Some((parent_idx, parent_key, parent_indent)) = Self::binary_search_parent(
307 lines,
308 search_end,
309 current_indent,
310 cutoff_line,
311 ancestor_cache,
312 ) {
313 let line_idx = parent_idx + 1; // Convert to 1-based
314
315 // Check if we hit cached ancestor
316 if let Some(prefix) = ancestor_cache.get(&line_idx) {
317 let mut combined = prefix.clone();
318 combined.extend(key_parts);
319 return Some(TraceResult::new(
320 combined,
321 value,
322 line_num,
323 path,
324 parent_lines,
325 ));
326 }
327
328 // Skip locale root keys (en, fr, de, etc.)
329 if parent_indent == 0
330 && (parent_key == "en"
331 || parent_key == "fr"
332 || parent_key == "de"
333 || parent_key == "es"
334 || parent_key == "ja"
335 || parent_key == "zh")
336 {
337 break;
338 }
339
340 key_parts.insert(0, parent_key);
341 parent_lines.push(line_idx);
342 current_indent = parent_indent;
343 search_end = parent_idx; // Next search ends at this parent
344
345 if parent_indent == 0 {
346 break; // Reached root
347 }
348 } else {
349 break; // No more parents found
350 }
351 }
352
353 Some(TraceResult::new(
354 key_parts,
355 value,
356 line_num,
357 path,
358 parent_lines,
359 ))
360 }
361 */
/// Strip ERB template tags (`<%= ... %>` and `<% ... %>`) from YAML text so
/// Rails fixture files can be parsed.
///
/// Each complete tag is replaced by nothing, except that newlines inside the
/// tag are kept so line numbers in the output still match the input.
///
/// An opener with no matching `%>` is not a complete tag: it and everything
/// after it are kept verbatim rather than being discarded.
fn strip_erb_templates(content: &str) -> String {
    const OPEN: &str = "<%";
    const CLOSE: &str = "%>";

    let mut result = String::with_capacity(content.len());
    let mut remaining = content;

    while let Some(open_at) = remaining.find(OPEN) {
        let (before, tag) = remaining.split_at(open_at);
        let body = &tag[OPEN.len()..];

        let close_at = match body.find(CLOSE) {
            Some(idx) => idx,
            // Unterminated tag: no later opener can be terminated either, so
            // stop here and keep the rest of the text untouched.
            None => break,
        };

        result.push_str(before);
        // Preserve the line structure of multi-line tags.
        result.extend(body[..close_at].chars().filter(|&c| c == '\n'));
        remaining = &body[close_at + CLOSE.len()..];
    }

    result.push_str(remaining);
    result
}
403
404 fn flatten_yaml(
405 yaml: Yaml,
406 prefix: String,
407 file_path: &Path,
408 value_to_line: &HashMap<String, usize>,
409 entries: &mut Vec<TranslationEntry>,
410 is_root: bool,
411 ) {
412 match yaml {
413 Yaml::Hash(hash) => {
414 for (key, value) in hash {
415 if let Some(key_str) = key.as_str() {
416 // Check if this is a locale root BEFORE building prefix
417 let is_locale_root = is_root
418 && prefix.is_empty()
419 && (key_str == "en"
420 || key_str == "fr"
421 || key_str == "de"
422 || key_str == "es"
423 || key_str == "ja"
424 || key_str == "zh");
425
426 // For locale roots, skip the locale prefix entirely
427 let new_prefix = if is_locale_root {
428 String::new()
429 } else if prefix.is_empty() {
430 key_str.to_string()
431 } else {
432 format!("{}.{}", prefix, key_str)
433 };
434
435 // Only flatten once, not twice!
436 Self::flatten_yaml(
437 value,
438 new_prefix,
439 file_path,
440 value_to_line,
441 entries,
442 false,
443 );
444 }
445 }
446 }
447 Yaml::String(value) => {
448 let line = value_to_line.get(&value).copied().unwrap_or(0);
449
450 entries.push(TranslationEntry {
451 key: prefix,
452 value,
453 line,
454 file: PathBuf::from(file_path),
455 });
456 }
457 Yaml::Integer(value) => {
458 let value_str = value.to_string();
459 let line = value_to_line.get(&value_str).copied().unwrap_or(0);
460
461 entries.push(TranslationEntry {
462 key: prefix,
463 value: value_str,
464 line,
465 file: PathBuf::from(file_path),
466 });
467 }
468 Yaml::Boolean(value) => {
469 let value_str = value.to_string();
470 let line = value_to_line.get(&value_str).copied().unwrap_or(0);
471
472 entries.push(TranslationEntry {
473 key: prefix,
474 value: value_str,
475 line,
476 file: PathBuf::from(file_path),
477 });
478 }
479 Yaml::Array(arr) => {
480 for (index, val) in arr.into_iter().enumerate() {
481 let new_prefix = if prefix.is_empty() {
482 index.to_string()
483 } else {
484 format!("{}.{}", prefix, index)
485 };
486 Self::flatten_yaml(val, new_prefix, file_path, value_to_line, entries, false);
487 }
488 }
489 _ => {
490 // Ignore other types for now
491 }
492 }
493 }
494}
495
496/*
497/// Result of a trace with ancestor bookkeeping so future traces can short-circuit.
498struct TraceResult {
499 entry: TranslationEntry,
500 parent_prefixes: Vec<(usize, Vec<String>)>,
501}
502
503impl TraceResult {
504 fn new(
505 key_parts: Vec<String>,
506 value: String,
507 line_num: usize,
508 path: &Path,
509 parent_lines: Vec<usize>,
510 ) -> Self {
511 let entry = TranslationEntry {
512 key: key_parts.join("."),
513 value,
514 line: line_num,
515 file: PathBuf::from(path),
516 };
517
518 // Build prefix cache for each ancestor line (root first) so later traces can stop early.
519 let mut parent_prefixes = Vec::new();
520 for (idx, line_idx) in parent_lines.iter().rev().enumerate() {
521 // idx corresponds to prefix length in key_parts
522 let prefix_len = idx + 1;
523 if prefix_len <= key_parts.len() {
524 parent_prefixes.push((*line_idx, key_parts[..prefix_len].to_vec()));
525 }
526 }
527
528 Self {
529 entry,
530 parent_prefixes,
531 }
532 }
533}
534*/
535
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `contents` into a fresh temporary file and returns the handle so
    /// the test controls how long the file lives.
    fn yaml_fixture(contents: &str) -> NamedTempFile {
        let mut fixture = NamedTempFile::new().unwrap();
        write!(fixture, "{}", contents).unwrap();
        fixture
    }

    /// A single top-level pair yields one entry on line 1.
    #[test]
    fn test_parse_simple_yaml() {
        let fixture = yaml_fixture("key: value");

        let parsed = YamlParser::parse_file(fixture.path()).unwrap();
        assert_eq!(parsed.len(), 1);

        let only = &parsed[0];
        assert_eq!(only.key, "key");
        assert_eq!(only.value, "value");
        assert_eq!(only.line, 1);
    }

    /// Nested mappings are flattened into a dot-separated key.
    #[test]
    fn test_parse_nested_yaml() {
        let fixture = yaml_fixture("parent:\n child: value");

        let parsed = YamlParser::parse_file(fixture.path()).unwrap();
        assert_eq!(parsed.len(), 1);

        let only = &parsed[0];
        assert_eq!(only.key, "parent.child");
        assert_eq!(only.value, "value");
        assert_eq!(only.line, 2);
    }

    /// Several keys, one of them nested, each keep their own value and line.
    #[test]
    fn test_parse_multiple_keys() {
        let fixture = yaml_fixture(
            "
key1: value1
key2: value2
nested:
  key3: value3
",
        );

        let parsed = YamlParser::parse_file(fixture.path()).unwrap();
        assert_eq!(parsed.len(), 3);

        // (key, expected value, expected line); the fixture starts with a blank line.
        let expectations = [
            ("key1", "value1", 2),
            ("key2", "value2", 3),
            ("nested.key3", "value3", 5),
        ];
        for &(key, value, line) in expectations.iter() {
            let entry = parsed.iter().find(|entry| entry.key == key).unwrap();
            assert_eq!(entry.value, value);
            assert_eq!(entry.line, line);
        }
    }

    /// Sequence items are keyed by their index under the parent key.
    #[test]
    fn test_parse_yaml_array() {
        let fixture = yaml_fixture("list:\n - item1\n - item2");

        let parsed = YamlParser::parse_file(fixture.path()).unwrap();
        assert_eq!(parsed.len(), 2);

        let first = parsed.iter().find(|entry| entry.value == "item1").unwrap();
        assert_eq!(first.key, "list.0");

        let second = parsed.iter().find(|entry| entry.value == "item2").unwrap();
        assert_eq!(second.key, "list.1");
    }

    /// Querying for a value returns only the matching entry, with the locale
    /// root left out of its key.
    #[test]
    fn test_bottom_up_trace() {
        let fixture = yaml_fixture(
            "en:
  js:
    user:
      log_in: \"Log In\"
      sign_up: \"Sign Up\"
",
        );

        let matches =
            YamlParser::parse_file_with_query(fixture.path(), Some("Log In")).unwrap();
        assert_eq!(matches.len(), 1);

        let hit = &matches[0];
        assert_eq!(hit.key, "js.user.log_in");
        assert_eq!(hit.value, "Log In");
        assert_eq!(hit.line, 4);
    }
}