1use std::path::{Path, PathBuf};
2
3use crate::error::XcStringsError;
4
/// One `key = value` pair read from a `.strings` file.
///
/// Produced by `parse_strings`; keys and values hold the text after escape
/// sequences have been resolved.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StringsEntry {
    /// The entry's key (quoted or unquoted in the source file).
    pub key: String,
    /// The entry's value.
    pub value: String,
    /// The comment that preceded the entry, if one was attached by the parser.
    pub comment: Option<String>,
}
11
/// A localisation file found inside a `<locale>.lproj` directory.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveredStringsFile {
    /// Path of the file as produced by the directory walk.
    pub path: PathBuf,
    /// Name of the enclosing `.lproj` directory with the `.lproj` suffix removed.
    pub locale: String,
    /// File stem of the file (e.g. `Localizable` for `Localizable.strings`).
    pub table_name: String,
    /// Which of the supported file formats this file uses, judged by extension.
    pub file_type: StringsFileType,
}

/// The file formats recognised during discovery.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum StringsFileType {
    /// A file with the `.strings` extension.
    Strings,
    /// A file with the `.stringsdict` extension.
    Stringsdict,
}
23
24fn parse_err(line: usize, message: impl Into<String>) -> XcStringsError {
25 XcStringsError::StringsParse {
26 line,
27 message: message.into(),
28 }
29}
30
31pub fn decode_strings_content(raw: &[u8]) -> Result<String, XcStringsError> {
33 if raw.len() >= 2 && raw[0] == 0xFF && raw[1] == 0xFE {
34 return decode_utf16(raw, 2, u16::from_le_bytes);
35 }
36 if raw.len() >= 2 && raw[0] == 0xFE && raw[1] == 0xFF {
37 return decode_utf16(raw, 2, u16::from_be_bytes);
38 }
39 if raw.len() >= 3 && raw[0] == 0xEF && raw[1] == 0xBB && raw[2] == 0xBF {
40 return String::from_utf8(raw[3..].to_vec())
41 .map_err(|e| parse_err(0, format!("invalid UTF-8 after BOM: {e}")));
42 }
43 if raw.len() >= 2 && raw.len().is_multiple_of(2) && looks_like_utf16le(raw) {
46 return decode_utf16(raw, 0, u16::from_le_bytes);
47 }
48 String::from_utf8(raw.to_vec()).map_err(|e| parse_err(0, format!("invalid encoding: {e}")))
49}
50
/// Heuristically decides whether `raw` is BOM-less UTF-16 little-endian text.
///
/// Looks at the first 20 bytes as 2-byte pairs and counts the pairs whose
/// first byte is non-zero and whose second byte is zero. Returns `true` when
/// at least half of the inspected pairs have that shape, and `false` when
/// fewer than two bytes are available.
fn looks_like_utf16le(raw: &[u8]) -> bool {
    const SAMPLE_BYTES: usize = 20;

    let window = &raw[..raw.len().min(SAMPLE_BYTES)];
    let pairs = window.len() / 2;
    if pairs == 0 {
        return false;
    }
    let zero_high_byte = window
        .chunks_exact(2)
        .filter(|unit| unit[0] != 0 && unit[1] == 0)
        .count();
    zero_high_byte * 2 >= pairs
}
69
70fn decode_utf16(
71 raw: &[u8],
72 skip: usize,
73 conv: fn([u8; 2]) -> u16,
74) -> Result<String, XcStringsError> {
75 let data = &raw[skip..];
76 if !data.len().is_multiple_of(2) {
77 return Err(parse_err(0, "odd byte count for UTF-16 data"));
78 }
79 let units: Vec<u16> = data.chunks_exact(2).map(|c| conv([c[0], c[1]])).collect();
80 String::from_utf16(&units).map_err(|e| parse_err(0, format!("invalid UTF-16: {e}")))
81}
82
/// Parser states used by `parse_strings`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between entries; looking for a comment or the start of a key.
    Idle,
    /// Inside a `/* ... */` comment.
    InBlockComment,
    /// Inside a `// ...` comment, up to the next newline.
    InLineComment,
    /// Inside a double-quoted key.
    InQuotedKey,
    /// Inside a bare (unquoted) key.
    InUnquotedKey,
    /// A key has been read; waiting for `=`.
    ExpectingEquals,
    /// Inside a double-quoted value.
    InQuotedValue,
}
93
94pub fn parse_strings(content: &str) -> Result<Vec<StringsEntry>, XcStringsError> {
96 let mut entries = Vec::new();
97 let mut state = State::Idle;
98 let mut line: usize = 1;
99 let (mut key, mut value, mut comment_buf) = (String::new(), String::new(), String::new());
100 let mut pending_comment: Option<String> = None;
101 let mut escape = false;
102 let chars: Vec<char> = content.chars().collect();
103 let len = chars.len();
104 let mut i = 0;
105
106 while i < len {
107 let ch = chars[i];
108 if ch == '\n' {
109 line += 1;
110 }
111 match state {
112 State::Idle => {
113 if ch == '/' && i + 1 < len && chars[i + 1] == '*' {
114 state = State::InBlockComment;
115 comment_buf.clear();
116 i += 2;
117 continue;
118 }
119 if ch == '/' && i + 1 < len && chars[i + 1] == '/' {
120 state = State::InLineComment;
121 comment_buf.clear();
122 i += 2;
123 continue;
124 }
125 if ch == '"' {
126 state = State::InQuotedKey;
127 key.clear();
128 escape = false;
129 i += 1;
130 continue;
131 }
132 if ch.is_alphanumeric() || ch == '_' {
133 state = State::InUnquotedKey;
134 key.clear();
135 key.push(ch);
136 i += 1;
137 continue;
138 }
139 i += 1;
140 }
141 State::InBlockComment => {
142 if ch == '*' && i + 1 < len && chars[i + 1] == '/' {
143 let trimmed = comment_buf.trim();
144 pending_comment = if trimmed.is_empty() {
145 None
146 } else {
147 Some(trimmed.to_owned())
148 };
149 state = State::Idle;
150 i += 2;
151 continue;
152 }
153 comment_buf.push(ch);
154 i += 1;
155 }
156 State::InLineComment => {
157 if ch == '\n' {
158 let t = comment_buf.trim();
159 pending_comment = if t.starts_with("MARK:") {
160 None
161 } else {
162 Some(t.to_owned())
163 };
164 state = State::Idle;
165 i += 1;
166 continue;
167 }
168 comment_buf.push(ch);
169 i += 1;
170 }
171 State::InQuotedKey => {
172 if escape {
173 push_esc(ch, &mut key, &mut i, &chars, line)?;
174 escape = false;
175 continue;
176 }
177 if ch == '\\' {
178 escape = true;
179 i += 1;
180 continue;
181 }
182 if ch == '"' {
183 state = State::ExpectingEquals;
184 i += 1;
185 continue;
186 }
187 key.push(ch);
188 i += 1;
189 }
190 State::InUnquotedKey => {
191 if ch.is_alphanumeric() || ch == '_' || ch == '.' || ch == '-' {
192 key.push(ch);
193 i += 1;
194 continue;
195 }
196 state = State::ExpectingEquals;
197 }
198 State::ExpectingEquals => {
199 if ch.is_whitespace() {
200 i += 1;
201 continue;
202 }
203 if ch == '=' {
204 i += 1;
205 while i < len && chars[i].is_whitespace() {
206 if chars[i] == '\n' {
207 line += 1;
208 }
209 i += 1;
210 }
211 if i >= len || chars[i] != '"' {
212 return Err(parse_err(line, "expected '\"' after '='"));
213 }
214 state = State::InQuotedValue;
215 value.clear();
216 escape = false;
217 i += 1;
218 continue;
219 }
220 return Err(parse_err(
221 line,
222 format!("expected '=' after key, found '{ch}'"),
223 ));
224 }
225 State::InQuotedValue => {
226 if escape {
227 push_esc(ch, &mut value, &mut i, &chars, line)?;
228 escape = false;
229 continue;
230 }
231 if ch == '\\' {
232 escape = true;
233 i += 1;
234 continue;
235 }
236 if ch == '"' {
237 i += 1;
238 while i < len && chars[i].is_whitespace() {
239 if chars[i] == '\n' {
240 line += 1;
241 }
242 i += 1;
243 }
244 if i >= len || chars[i] != ';' {
245 return Err(parse_err(line, "missing ';' after value"));
246 }
247 entries.push(StringsEntry {
248 key: key.clone(),
249 value: value.clone(),
250 comment: pending_comment.take(),
251 });
252 state = State::Idle;
253 i += 1;
254 continue;
255 }
256 value.push(ch);
257 i += 1;
258 }
259 }
260 }
261 if matches!(state, State::InLineComment) { } else if !matches!(state, State::Idle) {
263 return Err(parse_err(line, "unexpected end of input"));
264 }
265 Ok(entries)
266}
267
268fn push_esc(
269 ch: char,
270 buf: &mut String,
271 i: &mut usize,
272 chars: &[char],
273 line: usize,
274) -> Result<(), XcStringsError> {
275 match ch {
276 '"' => buf.push('"'),
277 '\\' => buf.push('\\'),
278 'n' => buf.push('\n'),
279 't' => buf.push('\t'),
280 'r' => buf.push('\r'),
281 'U' => {
282 *i += 1;
283 let code = hex4(chars, *i, line)?;
284 *i += 4;
285 if (0xD800..=0xDBFF).contains(&code) {
286 if *i + 1 < chars.len()
287 && chars[*i] == '\\'
288 && *i + 2 < chars.len()
289 && chars[*i + 1] == 'U'
290 {
291 let low = hex4(chars, *i + 2, line)?;
292 if (0xDC00..=0xDFFF).contains(&low) {
293 let cp = 0x10000 + ((code as u32 - 0xD800) << 10) + (low as u32 - 0xDC00);
294 buf.push(char::from_u32(cp).ok_or_else(|| {
295 parse_err(
296 line,
297 format!("invalid surrogate pair: U+{code:04X} U+{low:04X}"),
298 )
299 })?);
300 *i += 6;
301 return Ok(());
302 }
303 }
304 return Err(parse_err(
305 line,
306 format!("high surrogate U+{code:04X} without low surrogate"),
307 ));
308 }
309 buf.push(
310 char::from_u32(code as u32)
311 .ok_or_else(|| parse_err(line, format!("invalid unicode: U+{code:04X}")))?,
312 );
313 return Ok(());
314 }
315 _ => {
316 buf.push('\\');
317 buf.push(ch);
318 }
319 }
320 *i += 1;
321 Ok(())
322}
323
324fn hex4(chars: &[char], start: usize, line: usize) -> Result<u16, XcStringsError> {
325 if start + 4 > chars.len() {
326 return Err(parse_err(line, "incomplete \\U escape: need 4 hex digits"));
327 }
328 let h: String = chars[start..start + 4].iter().collect();
329 u16::from_str_radix(&h, 16)
330 .map_err(|_| parse_err(line, format!("invalid hex in \\U escape: {h}")))
331}
332
333pub fn extract_locale_from_path(path: &Path) -> Result<String, XcStringsError> {
335 for comp in path.components().rev() {
336 if let std::path::Component::Normal(name) = comp
337 && let Some(locale) = name.to_string_lossy().strip_suffix(".lproj")
338 {
339 return Ok(locale.to_owned());
340 }
341 }
342 Err(parse_err(
343 0,
344 format!("no .lproj directory found in path: {}", path.display()),
345 ))
346}
347
348pub fn discover_strings_files(root: &Path) -> Result<Vec<DiscoveredStringsFile>, XcStringsError> {
350 let mut results = Vec::new();
351 walk_lproj(root, &mut results, 0)?;
352 results.sort_by(|a, b| {
353 a.table_name
354 .cmp(&b.table_name)
355 .then(a.locale.cmp(&b.locale))
356 });
357 Ok(results)
358}
359
360fn walk_lproj(
361 dir: &Path,
362 out: &mut Vec<DiscoveredStringsFile>,
363 depth: usize,
364) -> Result<(), XcStringsError> {
365 const MAX_DEPTH: usize = 20;
366 if depth > MAX_DEPTH {
367 return Ok(()); }
369 for entry in std::fs::read_dir(dir)? {
370 let path = entry?.path();
371 if path.is_dir() {
372 let name = path
373 .file_name()
374 .map(|n| n.to_string_lossy().to_string())
375 .unwrap_or_default();
376 if name.ends_with(".lproj") && name != "Base.lproj" {
377 let locale = name.strip_suffix(".lproj").unwrap_or(&name).to_owned();
378 for f in std::fs::read_dir(&path)? {
379 let fp = f?.path();
380 if !fp.is_file() {
381 continue;
382 }
383 let ft = match fp.extension().and_then(|e| e.to_str()) {
384 Some("strings") => StringsFileType::Strings,
385 Some("stringsdict") => StringsFileType::Stringsdict,
386 _ => continue,
387 };
388 let tbl = fp
389 .file_stem()
390 .and_then(|s| s.to_str())
391 .unwrap_or("Unknown")
392 .to_owned();
393 out.push(DiscoveredStringsFile {
394 path: fp,
395 locale: locale.clone(),
396 table_name: tbl,
397 file_type: ft,
398 });
399 }
400 } else if !name.ends_with(".lproj") {
401 walk_lproj(&path, out, depth + 1)?;
402 }
403 }
404 }
405 Ok(())
406}
407
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Entry text shared by the UTF-16 decoding tests.
    const SAMPLE: &str = "\"k\" = \"v\";";

    /// Encodes `text` as UTF-16 with `to_bytes` choosing the byte order,
    /// prefixed by `bom` (which may be empty).
    fn utf16_bytes(bom: &[u8], text: &str, to_bytes: fn(u16) -> [u8; 2]) -> Vec<u8> {
        let mut bytes = bom.to_vec();
        for unit in text.encode_utf16() {
            bytes.extend_from_slice(&to_bytes(unit));
        }
        bytes
    }

    /// Parses `input` and returns its first entry; panics when parsing fails
    /// or produces no entries.
    fn first_entry(input: &str) -> StringsEntry {
        parse_strings(input).unwrap().into_iter().next().unwrap()
    }

    /// Creates an empty file at `root/relative`, creating missing parent
    /// directories first.
    fn touch(root: &Path, relative: &str) {
        let file = root.join(relative);
        fs::create_dir_all(file.parent().unwrap()).unwrap();
        fs::write(file, "").unwrap();
    }

    // ---- decode_strings_content ----

    #[test]
    fn decode_utf8_no_bom() {
        let decoded = decode_strings_content(b"\"k\" = \"v\";").unwrap();
        assert_eq!(decoded, "\"k\" = \"v\";");
    }

    #[test]
    fn decode_utf8_with_bom() {
        let decoded = decode_strings_content(b"\xEF\xBB\xBF\"k\"=\"v\";").unwrap();
        assert_eq!(decoded, "\"k\"=\"v\";");
    }

    #[test]
    fn decode_utf16le_with_bom() {
        let bytes = utf16_bytes(&[0xFF, 0xFE], SAMPLE, u16::to_le_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_utf16be_with_bom() {
        let bytes = utf16_bytes(&[0xFE, 0xFF], SAMPLE, u16::to_be_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_utf16le_no_bom_fallback() {
        let bytes = utf16_bytes(&[], SAMPLE, u16::to_le_bytes);
        assert_eq!(decode_strings_content(&bytes).unwrap(), SAMPLE);
    }

    #[test]
    fn decode_invalid_encoding() {
        let outcome = decode_strings_content(&[0xFF, 0xFF, 0xFF]);
        assert!(outcome.is_err());
    }

    // ---- parse_strings ----

    #[test]
    fn parse_basic_key_value() {
        let entries = parse_strings("\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        let entry = &entries[0];
        assert_eq!(entry.key, "hello");
        assert_eq!(entry.value, "world");
        assert!(entry.comment.is_none());
    }

    #[test]
    fn parse_block_comment_attached() {
        let entry = first_entry("/* A greeting */\n\"hello\" = \"world\";");
        assert_eq!(entry.comment.as_deref(), Some("A greeting"));
    }

    #[test]
    fn parse_line_comment_attached() {
        let entry = first_entry("// A greeting\n\"hello\" = \"world\";");
        assert_eq!(entry.comment.as_deref(), Some("A greeting"));
    }

    #[test]
    fn parse_mark_comment_not_attached() {
        let entry = first_entry("// MARK: Section\n\"hello\" = \"world\";");
        assert!(entry.comment.is_none());
    }

    #[test]
    fn parse_escape_sequences() {
        let entry = first_entry(r#""key" = "a\"b\\c\nd\te\rf";"#);
        assert_eq!(entry.value, "a\"b\\c\nd\te\rf");
    }

    #[test]
    fn parse_unicode_escape() {
        let entry = first_entry(r#""key" = "\U00E9";"#);
        assert_eq!(entry.value, "é");
    }

    #[test]
    fn parse_unicode_surrogate_pair() {
        let entry = first_entry(r#""key" = "\UD83D\UDE00";"#);
        assert_eq!(entry.value, "\u{1F600}");
    }

    #[test]
    fn parse_empty_value() {
        let entry = first_entry("\"key\" = \"\";");
        assert_eq!(entry.value, "");
    }

    #[test]
    fn parse_multiple_entries_mixed_comments() {
        let input = "/* First */\n\"a\" = \"1\";\n// Second\n\"b\" = \"2\";\n\"c\" = \"3\";";
        let entries = parse_strings(input).unwrap();
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].comment.as_deref(), Some("First"));
        assert_eq!(entries[1].comment.as_deref(), Some("Second"));
        assert!(entries[2].comment.is_none());
    }

    #[test]
    fn parse_duplicate_keys() {
        let entries = parse_strings("\"key\" = \"first\";\n\"key\" = \"second\";").unwrap();
        assert_eq!(entries.len(), 2);
        assert_eq!(entries[0].value, "first");
        assert_eq!(entries[1].value, "second");
    }

    #[test]
    fn parse_missing_semicolon() {
        let outcome = parse_strings("\"key\" = \"value\"");
        assert!(outcome.is_err());
    }

    #[test]
    fn parse_empty_input() {
        let entries = parse_strings("").unwrap();
        assert!(entries.is_empty());
    }

    #[test]
    fn parse_unquoted_key() {
        let entry = first_entry("myKey = \"value\";");
        assert_eq!(entry.key, "myKey");
        assert_eq!(entry.value, "value");
    }

    #[test]
    fn parse_unquoted_key_with_dots() {
        let entry = first_entry("my.key.name = \"value\";");
        assert_eq!(entry.key, "my.key.name");
    }

    // ---- extract_locale_from_path ----

    #[test]
    fn extract_locale_valid() {
        let locale = extract_locale_from_path(Path::new("/p/en.lproj/L.strings")).unwrap();
        assert_eq!(locale, "en");
    }

    #[test]
    fn extract_locale_invalid() {
        let outcome = extract_locale_from_path(Path::new("/p/Resources/L.strings"));
        assert!(outcome.is_err());
    }

    // ---- discover_strings_files ----

    #[test]
    fn discover_with_lproj_dirs() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/Localizable.strings");
        touch(tmp.path(), "es.lproj/Localizable.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 2);
        assert_eq!(found[0].table_name, "Localizable");
    }

    #[test]
    fn discover_both_file_types() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/L.strings");
        touch(tmp.path(), "en.lproj/L.stringsdict");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 2);
    }

    #[test]
    fn discover_multiple_tables() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "en.lproj/Localizable.strings");
        touch(tmp.path(), "en.lproj/InfoPlist.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found[0].table_name, "InfoPlist");
        assert_eq!(found[1].table_name, "Localizable");
    }

    #[test]
    fn discover_no_lproj() {
        let tmp = tempfile::tempdir().unwrap();
        let found = discover_strings_files(tmp.path()).unwrap();
        assert!(found.is_empty());
    }

    #[test]
    fn discover_nested_directories() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "Resources/en.lproj/L.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 1);
    }

    // ---- regression tests ----

    #[test]
    fn empty_block_comment_produces_none() {
        let entries = parse_strings("/**/\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        assert!(
            entries[0].comment.is_none(),
            "empty block comment should not attach as comment"
        );
    }

    #[test]
    fn whitespace_only_block_comment_produces_none() {
        let entries = parse_strings("/* */\n\"hello\" = \"world\";").unwrap();
        assert_eq!(entries.len(), 1);
        assert!(
            entries[0].comment.is_none(),
            "whitespace-only block comment should not attach as comment"
        );
    }

    #[test]
    fn test_unknown_escape_passthrough() {
        let entry = first_entry(r#""key" = "hello\pworld";"#);
        assert_eq!(entry.value, "hello\\pworld");
    }

    #[test]
    fn escape_error_reports_correct_line() {
        // The malformed `\U` escape sits on the third line of the input.
        let input = "\"a\" = \"ok\";\n\"b\" = \"ok\";\n\"c\" = \"\\U00G\";";
        let Err(err) = parse_strings(input) else {
            panic!("expected error on invalid \\U escape");
        };
        let msg = err.to_string();
        assert!(
            msg.contains("line 3"),
            "expected error on line 3, got: {msg}"
        );
    }

    #[test]
    fn discover_skips_base_lproj() {
        let tmp = tempfile::tempdir().unwrap();
        touch(tmp.path(), "Base.lproj/Main.strings");
        touch(tmp.path(), "en.lproj/Localizable.strings");
        let found = discover_strings_files(tmp.path()).unwrap();
        assert_eq!(found.len(), 1, "Base.lproj should be skipped");
        assert_eq!(found[0].locale, "en");
    }

    #[test]
    fn discover_respects_max_depth() {
        let tmp = tempfile::tempdir().unwrap();
        // 22 nested directories put the .lproj folder past the depth limit.
        let deep = (0..22).fold(tmp.path().to_path_buf(), |dir, i| {
            dir.join(format!("d{i}"))
        });
        let lproj = deep.join("en.lproj");
        fs::create_dir_all(&lproj).unwrap();
        fs::write(lproj.join("L.strings"), "").unwrap();
        let found = discover_strings_files(tmp.path()).unwrap();
        assert!(
            found.is_empty(),
            "files beyond MAX_DEPTH should not be discovered"
        );
    }
}