Skip to main content

xcstrings_mcp/service/
strings_parser.rs

1use std::path::{Path, PathBuf};
2
3use crate::error::XcStringsError;
4
/// A single `key = value` pair parsed from a `.strings` file.
///
/// Keys and values are stored with their escape sequences already decoded.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StringsEntry {
    /// The entry's key (quoted or unquoted in the source file).
    pub key: String,
    /// The entry's value.
    pub value: String,
    /// The most recent comment that preceded the entry, if any.
    pub comment: Option<String>,
}
11
/// A localization file found inside a `.lproj` directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveredStringsFile {
    /// Path of the file as produced by the directory walk.
    pub path: PathBuf,
    /// Name of the enclosing `.lproj` directory with the `.lproj` suffix removed.
    pub locale: String,
    /// File stem of the file (e.g. `Localizable` for `Localizable.strings`).
    pub table_name: String,
    /// Which of the two supported file kinds this is.
    pub file_type: StringsFileType,
}

/// The kind of localization file, determined by its extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StringsFileType {
    /// A `.strings` file.
    Strings,
    /// A `.stringsdict` file.
    Stringsdict,
}
23
24fn parse_err(line: usize, message: impl Into<String>) -> XcStringsError {
25    XcStringsError::StringsParse {
26        line,
27        message: message.into(),
28    }
29}
30
31/// Decode raw bytes detecting BOM: UTF-16LE/BE, UTF-8, with UTF-16LE fallback.
32pub fn decode_strings_content(raw: &[u8]) -> Result<String, XcStringsError> {
33    if raw.len() >= 2 && raw[0] == 0xFF && raw[1] == 0xFE {
34        return decode_utf16(raw, 2, u16::from_le_bytes);
35    }
36    if raw.len() >= 2 && raw[0] == 0xFE && raw[1] == 0xFF {
37        return decode_utf16(raw, 2, u16::from_be_bytes);
38    }
39    if raw.len() >= 3 && raw[0] == 0xEF && raw[1] == 0xBB && raw[2] == 0xBF {
40        return String::from_utf8(raw[3..].to_vec())
41            .map_err(|e| parse_err(0, format!("invalid UTF-8 after BOM: {e}")));
42    }
43    // Heuristic: UTF-16LE without BOM has null bytes at odd positions for ASCII content.
44    // String::from_utf8 accepts null bytes, so check for the pattern first.
45    if raw.len() >= 2 && raw.len().is_multiple_of(2) && looks_like_utf16le(raw) {
46        return decode_utf16(raw, 0, u16::from_le_bytes);
47    }
48    String::from_utf8(raw.to_vec()).map_err(|e| parse_err(0, format!("invalid encoding: {e}")))
49}
50
/// Guesses whether `raw` is BOM-less UTF-16LE text.
///
/// Only the first 20 bytes (at most) are inspected, as two-byte pairs. A pair
/// counts as a hit when its first byte is non-zero and its second byte is
/// zero. Returns `true` when at least half of the inspected pairs are hits,
/// and `false` when there is not even one complete pair.
fn looks_like_utf16le(raw: &[u8]) -> bool {
    let window = &raw[..raw.len().min(20)];
    let total_pairs = window.len() / 2;
    if total_pairs == 0 {
        return false;
    }
    let hits = window
        .chunks_exact(2)
        .filter(|pair| pair[0] != 0 && pair[1] == 0)
        .count();
    hits * 2 >= total_pairs
}
69
70fn decode_utf16(
71    raw: &[u8],
72    skip: usize,
73    conv: fn([u8; 2]) -> u16,
74) -> Result<String, XcStringsError> {
75    let data = &raw[skip..];
76    if !data.len().is_multiple_of(2) {
77        return Err(parse_err(0, "odd byte count for UTF-16 data"));
78    }
79    let units: Vec<u16> = data.chunks_exact(2).map(|c| conv([c[0], c[1]])).collect();
80    String::from_utf16(&units).map_err(|e| parse_err(0, format!("invalid UTF-16: {e}")))
81}
82
/// States of the `.strings` parser state machine.
#[derive(Clone, Copy)]
enum State {
    /// Between entries; skipping anything that does not start a comment or key.
    Idle,
    /// Inside a `/* ... */` comment.
    InBlockComment,
    /// Inside a `// ...` comment, up to the end of the line.
    InLineComment,
    /// Inside a double-quoted key.
    InQuotedKey,
    /// Inside a bare (unquoted) key.
    InUnquotedKey,
    /// A key has been read; waiting for `=`.
    ExpectingEquals,
    /// Inside a double-quoted value.
    InQuotedValue,
}
93
94/// Parse `.strings` file content into entries.
95pub fn parse_strings(content: &str) -> Result<Vec<StringsEntry>, XcStringsError> {
96    let mut entries = Vec::new();
97    let mut state = State::Idle;
98    let mut line: usize = 1;
99    let (mut key, mut value, mut comment_buf) = (String::new(), String::new(), String::new());
100    let mut pending_comment: Option<String> = None;
101    let mut escape = false;
102    let chars: Vec<char> = content.chars().collect();
103    let len = chars.len();
104    let mut i = 0;
105
106    while i < len {
107        let ch = chars[i];
108        if ch == '\n' {
109            line += 1;
110        }
111        match state {
112            State::Idle => {
113                if ch == '/' && i + 1 < len && chars[i + 1] == '*' {
114                    state = State::InBlockComment;
115                    comment_buf.clear();
116                    i += 2;
117                    continue;
118                }
119                if ch == '/' && i + 1 < len && chars[i + 1] == '/' {
120                    state = State::InLineComment;
121                    comment_buf.clear();
122                    i += 2;
123                    continue;
124                }
125                if ch == '"' {
126                    state = State::InQuotedKey;
127                    key.clear();
128                    escape = false;
129                    i += 1;
130                    continue;
131                }
132                if ch.is_alphanumeric() || ch == '_' {
133                    state = State::InUnquotedKey;
134                    key.clear();
135                    key.push(ch);
136                    i += 1;
137                    continue;
138                }
139                i += 1;
140            }
141            State::InBlockComment => {
142                if ch == '*' && i + 1 < len && chars[i + 1] == '/' {
143                    let trimmed = comment_buf.trim();
144                    pending_comment = if trimmed.is_empty() {
145                        None
146                    } else {
147                        Some(trimmed.to_owned())
148                    };
149                    state = State::Idle;
150                    i += 2;
151                    continue;
152                }
153                comment_buf.push(ch);
154                i += 1;
155            }
156            State::InLineComment => {
157                if ch == '\n' {
158                    let t = comment_buf.trim();
159                    pending_comment = if t.starts_with("MARK:") {
160                        None
161                    } else {
162                        Some(t.to_owned())
163                    };
164                    state = State::Idle;
165                    i += 1;
166                    continue;
167                }
168                comment_buf.push(ch);
169                i += 1;
170            }
171            State::InQuotedKey => {
172                if escape {
173                    push_esc(ch, &mut key, &mut i, &chars, line)?;
174                    escape = false;
175                    continue;
176                }
177                if ch == '\\' {
178                    escape = true;
179                    i += 1;
180                    continue;
181                }
182                if ch == '"' {
183                    state = State::ExpectingEquals;
184                    i += 1;
185                    continue;
186                }
187                key.push(ch);
188                i += 1;
189            }
190            State::InUnquotedKey => {
191                if ch.is_alphanumeric() || ch == '_' || ch == '.' || ch == '-' {
192                    key.push(ch);
193                    i += 1;
194                    continue;
195                }
196                state = State::ExpectingEquals;
197            }
198            State::ExpectingEquals => {
199                if ch.is_whitespace() {
200                    i += 1;
201                    continue;
202                }
203                if ch == '=' {
204                    i += 1;
205                    while i < len && chars[i].is_whitespace() {
206                        if chars[i] == '\n' {
207                            line += 1;
208                        }
209                        i += 1;
210                    }
211                    if i >= len || chars[i] != '"' {
212                        return Err(parse_err(line, "expected '\"' after '='"));
213                    }
214                    state = State::InQuotedValue;
215                    value.clear();
216                    escape = false;
217                    i += 1;
218                    continue;
219                }
220                return Err(parse_err(
221                    line,
222                    format!("expected '=' after key, found '{ch}'"),
223                ));
224            }
225            State::InQuotedValue => {
226                if escape {
227                    push_esc(ch, &mut value, &mut i, &chars, line)?;
228                    escape = false;
229                    continue;
230                }
231                if ch == '\\' {
232                    escape = true;
233                    i += 1;
234                    continue;
235                }
236                if ch == '"' {
237                    i += 1;
238                    while i < len && chars[i].is_whitespace() {
239                        if chars[i] == '\n' {
240                            line += 1;
241                        }
242                        i += 1;
243                    }
244                    if i >= len || chars[i] != ';' {
245                        return Err(parse_err(line, "missing ';' after value"));
246                    }
247                    entries.push(StringsEntry {
248                        key: key.clone(),
249                        value: value.clone(),
250                        comment: pending_comment.take(),
251                    });
252                    state = State::Idle;
253                    i += 1;
254                    continue;
255                }
256                value.push(ch);
257                i += 1;
258            }
259        }
260    }
261    if matches!(state, State::InLineComment) { /* trailing comment — ok */
262    } else if !matches!(state, State::Idle) {
263        return Err(parse_err(line, "unexpected end of input"));
264    }
265    Ok(entries)
266}
267
268fn push_esc(
269    ch: char,
270    buf: &mut String,
271    i: &mut usize,
272    chars: &[char],
273    line: usize,
274) -> Result<(), XcStringsError> {
275    match ch {
276        '"' => buf.push('"'),
277        '\\' => buf.push('\\'),
278        'n' => buf.push('\n'),
279        't' => buf.push('\t'),
280        'r' => buf.push('\r'),
281        'U' => {
282            *i += 1;
283            let code = hex4(chars, *i, line)?;
284            *i += 4;
285            if (0xD800..=0xDBFF).contains(&code) {
286                if *i + 1 < chars.len()
287                    && chars[*i] == '\\'
288                    && *i + 2 < chars.len()
289                    && chars[*i + 1] == 'U'
290                {
291                    let low = hex4(chars, *i + 2, line)?;
292                    if (0xDC00..=0xDFFF).contains(&low) {
293                        let cp = 0x10000 + ((code as u32 - 0xD800) << 10) + (low as u32 - 0xDC00);
294                        buf.push(char::from_u32(cp).ok_or_else(|| {
295                            parse_err(
296                                line,
297                                format!("invalid surrogate pair: U+{code:04X} U+{low:04X}"),
298                            )
299                        })?);
300                        *i += 6;
301                        return Ok(());
302                    }
303                }
304                return Err(parse_err(
305                    line,
306                    format!("high surrogate U+{code:04X} without low surrogate"),
307                ));
308            }
309            buf.push(
310                char::from_u32(code as u32)
311                    .ok_or_else(|| parse_err(line, format!("invalid unicode: U+{code:04X}")))?,
312            );
313            return Ok(());
314        }
315        _ => {
316            buf.push('\\');
317            buf.push(ch);
318        }
319    }
320    *i += 1;
321    Ok(())
322}
323
324fn hex4(chars: &[char], start: usize, line: usize) -> Result<u16, XcStringsError> {
325    if start + 4 > chars.len() {
326        return Err(parse_err(line, "incomplete \\U escape: need 4 hex digits"));
327    }
328    let h: String = chars[start..start + 4].iter().collect();
329    u16::from_str_radix(&h, 16)
330        .map_err(|_| parse_err(line, format!("invalid hex in \\U escape: {h}")))
331}
332
333/// Extract locale from `.lproj` parent directory in path.
334pub fn extract_locale_from_path(path: &Path) -> Result<String, XcStringsError> {
335    for comp in path.components().rev() {
336        if let std::path::Component::Normal(name) = comp
337            && let Some(locale) = name.to_string_lossy().strip_suffix(".lproj")
338        {
339            return Ok(locale.to_owned());
340        }
341    }
342    Err(parse_err(
343        0,
344        format!("no .lproj directory found in path: {}", path.display()),
345    ))
346}
347
348/// Recursively discover `.strings` and `.stringsdict` files under `.lproj` directories.
349pub fn discover_strings_files(root: &Path) -> Result<Vec<DiscoveredStringsFile>, XcStringsError> {
350    let mut results = Vec::new();
351    walk_lproj(root, &mut results, 0)?;
352    results.sort_by(|a, b| {
353        a.table_name
354            .cmp(&b.table_name)
355            .then(a.locale.cmp(&b.locale))
356    });
357    Ok(results)
358}
359
360fn walk_lproj(
361    dir: &Path,
362    out: &mut Vec<DiscoveredStringsFile>,
363    depth: usize,
364) -> Result<(), XcStringsError> {
365    const MAX_DEPTH: usize = 20;
366    if depth > MAX_DEPTH {
367        return Ok(()); // silently stop — deep nesting is not expected in iOS projects
368    }
369    for entry in std::fs::read_dir(dir)? {
370        let path = entry?.path();
371        if path.is_dir() {
372            let name = path
373                .file_name()
374                .map(|n| n.to_string_lossy().to_string())
375                .unwrap_or_default();
376            if name.ends_with(".lproj") && name != "Base.lproj" {
377                let locale = name.strip_suffix(".lproj").unwrap_or(&name).to_owned();
378                for f in std::fs::read_dir(&path)? {
379                    let fp = f?.path();
380                    if !fp.is_file() {
381                        continue;
382                    }
383                    let ft = match fp.extension().and_then(|e| e.to_str()) {
384                        Some("strings") => StringsFileType::Strings,
385                        Some("stringsdict") => StringsFileType::Stringsdict,
386                        _ => continue,
387                    };
388                    let tbl = fp
389                        .file_stem()
390                        .and_then(|s| s.to_str())
391                        .unwrap_or("Unknown")
392                        .to_owned();
393                    out.push(DiscoveredStringsFile {
394                        path: fp,
395                        locale: locale.clone(),
396                        table_name: tbl,
397                        file_type: ft,
398                    });
399                }
400            } else if !name.ends_with(".lproj") {
401                walk_lproj(&path, out, depth + 1)?;
402            }
403        }
404    }
405    Ok(())
406}
407
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Entry text shared by the UTF-16 decoding tests.
    const SAMPLE: &str = "\"k\" = \"v\";";

    /// Encodes `text` as UTF-16, converting each code unit with `to_bytes`
    /// and prepending `bom`.
    fn utf16_bytes(text: &str, bom: &[u8], to_bytes: fn(u16) -> [u8; 2]) -> Vec<u8> {
        let mut bytes = bom.to_vec();
        bytes.extend(text.encode_utf16().flat_map(to_bytes));
        bytes
    }

    /// Creates an empty file at `root/rel`, creating parent directories first.
    fn touch(root: &Path, rel: &str) {
        let target = root.join(rel);
        fs::create_dir_all(target.parent().unwrap()).unwrap();
        fs::write(target, "").unwrap();
    }

    // --- decode_strings_content ---

    #[test]
    fn decode_utf8_no_bom() {
        let decoded = decode_strings_content(b"\"k\" = \"v\";").unwrap();
        assert_eq!(decoded, "\"k\" = \"v\";");
    }

    #[test]
    fn decode_utf8_with_bom() {
        let mut bytes = vec![0xEF, 0xBB, 0xBF];
        bytes.extend_from_slice(b"\"k\"=\"v\";");
        assert_eq!(decode_strings_content(&bytes).unwrap(), "\"k\"=\"v\";");
    }

    #[test]
    fn decode_utf16le_with_bom() {
        let bytes = utf16_bytes(SAMPLE, &[0xFF, 0xFE], u16::to_le_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_utf16be_with_bom() {
        let bytes = utf16_bytes(SAMPLE, &[0xFE, 0xFF], u16::to_be_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_utf16le_no_bom_fallback() {
        let bytes = utf16_bytes(SAMPLE, &[], u16::to_le_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_invalid_encoding() {
        assert!(decode_strings_content(&[0xFF, 0xFF, 0xFF]).is_err());
    }

    // --- parse_strings ---

    #[test]
    fn parse_basic_key_value() {
        let entries = parse_strings("\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].key, "hello");
        assert_eq!(entries[0].value, "world");
        assert!(entries[0].comment.is_none());
    }

    #[test]
    fn parse_block_comment_attached() {
        let entries = parse_strings("/* A greeting */\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries[0].comment.as_deref(), Some("A greeting"));
    }

    #[test]
    fn parse_line_comment_attached() {
        let entries = parse_strings("// A greeting\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries[0].comment.as_deref(), Some("A greeting"));
    }

    #[test]
    fn parse_mark_comment_not_attached() {
        let entries = parse_strings("// MARK: Section\n\"hello\" = \"world\";").unwrap();
        assert!(entries[0].comment.is_none());
    }

    #[test]
    fn parse_escape_sequences() {
        let entries = parse_strings(r#""key" = "a\"b\\c\nd\te\rf";"#).unwrap();
        assert_eq!(entries[0].value, "a\"b\\c\nd\te\rf");
    }

    #[test]
    fn parse_unicode_escape() {
        let entries = parse_strings(r#""key" = "\U00E9";"#).unwrap();
        assert_eq!(entries[0].value, "é");
    }

    #[test]
    fn parse_unicode_surrogate_pair() {
        let entries = parse_strings(r#""key" = "\UD83D\UDE00";"#).unwrap();
        assert_eq!(entries[0].value, "\u{1F600}");
    }

    #[test]
    fn parse_empty_value() {
        let entries = parse_strings("\"key\" = \"\";").unwrap();
        assert_eq!(entries[0].value, "");
    }

    #[test]
    fn parse_multiple_entries_mixed_comments() {
        let input = "/* First */\n\"a\" = \"1\";\n// Second\n\"b\" = \"2\";\n\"c\" = \"3\";";
        let entries = parse_strings(input).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].comment.as_deref(), Some("First"));
        assert_eq!(entries[1].comment.as_deref(), Some("Second"));
        assert!(entries[2].comment.is_none());
    }

    #[test]
    fn parse_duplicate_keys() {
        let entries = parse_strings("\"key\" = \"first\";\n\"key\" = \"second\";").unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].value, "first");
        assert_eq!(entries[1].value, "second");
    }

    #[test]
    fn parse_missing_semicolon() {
        assert!(parse_strings("\"key\" = \"value\"").is_err());
    }

    #[test]
    fn parse_empty_input() {
        assert!(parse_strings("").unwrap().is_empty());
    }

    #[test]
    fn parse_unquoted_key() {
        let entries = parse_strings("myKey = \"value\";").unwrap();
        assert_eq!(entries[0].key, "myKey");
        assert_eq!(entries[0].value, "value");
    }

    #[test]
    fn parse_unquoted_key_with_dots() {
        let entries = parse_strings("my.key.name = \"value\";").unwrap();
        assert_eq!(entries[0].key, "my.key.name");
    }

    // --- extract_locale_from_path ---

    #[test]
    fn extract_locale_valid() {
        let locale = extract_locale_from_path(Path::new("/p/en.lproj/L.strings")).unwrap();
        assert_eq!(locale, "en");
    }

    #[test]
    fn extract_locale_invalid() {
        assert!(extract_locale_from_path(Path::new("/p/Resources/L.strings")).is_err());
    }

    // --- discover_strings_files ---

    #[test]
    fn discover_with_lproj_dirs() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/Localizable.strings");
        touch(tmp.path(), "es.lproj/Localizable.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].table_name, "Localizable");
    }

    #[test]
    fn discover_both_file_types() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/L.strings");
        touch(tmp.path(), "en.lproj/L.stringsdict");
        assert_eq!(discover_strings_files(tmp.path()).unwrap().len(), 2);
    }

    #[test]
    fn discover_multiple_tables() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/Localizable.strings");
        touch(tmp.path(), "en.lproj/InfoPlist.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found[0].table_name, "InfoPlist");
        assert_eq!(found[1].table_name, "Localizable");
    }

    #[test]
    fn discover_no_lproj() {
        let tmp = tempfile::tempdir().unwrap();
        assert!(discover_strings_files(tmp.path()).unwrap().is_empty());
    }

    #[test]
    fn discover_nested_directories() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "Resources/en.lproj/L.strings");
        assert_eq!(discover_strings_files(tmp.path()).unwrap().len(), 1);
    }

    // --- edge cases ---

    #[test]
    fn empty_block_comment_produces_none() {
        let entries = parse_strings("/**/\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        assert!(
            entries[0].comment.is_none(),
            "empty block comment should not attach as comment"
        );
    }

    #[test]
    fn whitespace_only_block_comment_produces_none() {
        let entries = parse_strings("/*   */\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        assert!(
            entries[0].comment.is_none(),
            "whitespace-only block comment should not attach as comment"
        );
    }

    #[test]
    fn test_unknown_escape_passthrough() {
        let entries = parse_strings(r#""key" = "hello\pworld";"#).unwrap();
        assert_eq!(entries[0].value, "hello\\pworld");
    }

    #[test]
    fn escape_error_reports_correct_line() {
        // The third line holds a \U escape whose digits are not valid hex.
        let input = "\"a\" = \"ok\";\n\"b\" = \"ok\";\n\"c\" = \"\\U00G\";";
        let Err(err) = parse_strings(input) else {
            panic!("expected error on invalid \\U escape");
        };
        let msg = err.to_string();
        assert!(
            msg.contains("line 3"),
            "expected error on line 3, got: {msg}"
        );
    }

    #[test]
    fn discover_skips_base_lproj() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "Base.lproj/Main.strings");
        touch(tmp.path(), "en.lproj/Localizable.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 1, "Base.lproj should be skipped");
        assert_eq!(found[0].locale, "en");
    }

    #[test]
    fn discover_respects_max_depth() {
        let tmp = tempfile::tempdir().unwrap();
        // Build a chain of 22 nested directories with an lproj at the bottom.
        let deep = (0..22).fold(tmp.path().to_path_buf(), |acc, level| {
            acc.join(format!("d{level}"))
        });
        let lproj = deep.join("en.lproj");
        fs::create_dir_all(&lproj).unwrap();
        fs::write(lproj.join("L.strings"), "").unwrap();
        let found = discover_strings_files(tmp.path()).unwrap();
        assert!(
            found.is_empty(),
            "files beyond MAX_DEPTH should not be discovered"
        );
    }
}
663}