1use std::borrow::Cow;
2
3use memchr::memchr_iter;
4
5use crate::scan::{
6 CommentKind, Keyword, LineKind, LineScanner, classify_line, parse_plural_index,
7 split_once_byte, trim_ascii,
8};
9use crate::text::{extract_quoted_bytes_cow, split_reference_comment};
10use crate::utf8::input_slice_as_str;
11use crate::{Header, MsgStr, ParseError, PoFile, PoItem};
12
/// Identifies which field of the entry under construction the most recent
/// keyword line wrote to.
///
/// Continuation lines (bare quoted strings) are appended to the field this
/// value points at.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Context {
    /// The last keyword line assigned the entry's `msgid`.
    Id,
    /// The last keyword line assigned the entry's `msgid_plural`.
    IdPlural,
    /// The last keyword line assigned a translation (`msgstr`) slot.
    Str,
    /// The last keyword line assigned the entry's `msgctxt`.
    Ctxt,
}
20
21#[derive(Debug)]
22struct ParserState {
23 item: PoItem,
24 msgstr: MsgStr,
25 context: Option<Context>,
26 plural_index: usize,
27 obsolete_line_count: usize,
28 content_line_count: usize,
29 has_keyword: bool,
30}
31
32impl ParserState {
33 fn new(nplurals: usize) -> Self {
34 Self {
35 item: PoItem::new(nplurals),
36 msgstr: MsgStr::None,
37 context: None,
38 plural_index: 0,
39 obsolete_line_count: 0,
40 content_line_count: 0,
41 has_keyword: false,
42 }
43 }
44
45 fn reset(&mut self, nplurals: usize) {
46 self.item.clear_for_reuse(nplurals);
47 self.reset_after_take(nplurals);
48 }
49
50 fn reset_after_take(&mut self, nplurals: usize) {
51 self.item.nplurals = nplurals;
52 self.msgstr = MsgStr::None;
53 self.context = None;
54 self.plural_index = 0;
55 self.obsolete_line_count = 0;
56 self.content_line_count = 0;
57 self.has_keyword = false;
58 }
59
60 fn set_msgstr(&mut self, plural_index: usize, value: String) {
61 match (&mut self.msgstr, plural_index) {
62 (MsgStr::None, 0) => self.msgstr = MsgStr::Singular(value),
63 (MsgStr::Singular(existing), 0) => *existing = value,
64 (MsgStr::Plural(values), 0) => {
65 if values.is_empty() {
66 values.push(String::new());
67 }
68 values[0] = value;
69 }
70 _ => {
71 let msgstr = self.promote_plural_msgstr(plural_index);
72 msgstr[plural_index] = value;
73 }
74 }
75 }
76
77 fn append_msgstr(&mut self, plural_index: usize, value: &str) {
78 match (&mut self.msgstr, plural_index) {
79 (MsgStr::None, 0) => self.msgstr = MsgStr::Singular(value.to_owned()),
80 (MsgStr::Singular(existing), 0) => existing.push_str(value),
81 (MsgStr::Plural(values), 0) => {
82 if values.is_empty() {
83 values.push(String::new());
84 }
85 values[0].push_str(value);
86 }
87 _ => {
88 let msgstr = self.promote_plural_msgstr(plural_index);
89 msgstr[plural_index].push_str(value);
90 }
91 }
92 }
93
94 fn header_msgstr(&self) -> &str {
95 self.msgstr.first_str().unwrap_or_default()
96 }
97
98 fn materialize_msgstr(&mut self) {
99 debug_assert!(self.item.msgstr.is_empty());
100 self.item.msgstr = core::mem::take(&mut self.msgstr);
101 }
102
103 fn promote_plural_msgstr(&mut self, plural_index: usize) -> &mut Vec<String> {
104 if !matches!(self.msgstr, MsgStr::Plural(_)) {
105 self.msgstr = match core::mem::take(&mut self.msgstr) {
106 MsgStr::None => MsgStr::Plural(Vec::with_capacity(2)),
107 MsgStr::Singular(value) => {
108 let mut values = Vec::with_capacity(2);
109 values.push(value);
110 MsgStr::Plural(values)
111 }
112 MsgStr::Plural(values) => MsgStr::Plural(values),
113 };
114 }
115 let MsgStr::Plural(msgstr) = &mut self.msgstr else {
116 unreachable!("plural msgstr promotion must yield plural storage");
117 };
118 if msgstr.len() <= plural_index {
119 msgstr.resize(plural_index + 1, String::new());
120 }
121 msgstr
122 }
123}
124
/// One scanned input line, borrowed from the input buffer.
#[derive(Clone, Copy, Debug)]
struct BorrowedLine<'a> {
    /// The line's bytes, taken from the scanner's `trimmed` field.
    trimmed: &'a [u8],
    /// The scanner's `obsolete` flag for this line.
    obsolete: bool,
}
130
131pub fn parse_po(input: &str) -> Result<PoFile, ParseError> {
140 let input = strip_utf8_bom(input);
141 let normalized;
142 let input = if input.as_bytes().contains(&b'\r') {
143 normalized = input.replace("\r\n", "\n").replace('\r', "\n");
144 normalized.as_str()
145 } else {
146 input
147 };
148
149 let mut file = PoFile::default();
150 file.items.reserve((input.len() / 96).max(1));
151 let mut current_nplurals = 2;
152 let mut state = ParserState::new(current_nplurals);
153
154 for line in LineScanner::new(input.as_bytes()) {
155 parse_line(
156 BorrowedLine {
157 trimmed: line.trimmed,
158 obsolete: line.obsolete,
159 },
160 &mut state,
161 &mut file,
162 &mut current_nplurals,
163 )?;
164 }
165
166 finish_item(&mut state, &mut file, &mut current_nplurals);
167
168 Ok(file)
169}
170
/// Returns `input` without a single leading U+FEFF byte-order mark, or
/// `input` unchanged when it does not start with one.
#[inline]
fn strip_utf8_bom(input: &str) -> &str {
    match input.strip_prefix('\u{feff}') {
        Some(rest) => rest,
        None => input,
    }
}
175
176fn parse_line(
177 line: BorrowedLine<'_>,
178 state: &mut ParserState,
179 file: &mut PoFile,
180 current_nplurals: &mut usize,
181) -> Result<(), ParseError> {
182 match classify_line(line.trimmed) {
183 LineKind::Continuation => {
184 append_continuation(line.trimmed, line.obsolete, state)?;
185 Ok(())
186 }
187 LineKind::Comment(kind) => {
188 parse_comment_line(line.trimmed, kind, state, file, current_nplurals);
189 Ok(())
190 }
191 LineKind::Keyword(keyword) => parse_keyword_line(
192 line.trimmed,
193 line.obsolete,
194 keyword,
195 state,
196 file,
197 current_nplurals,
198 ),
199 LineKind::Other => Ok(()),
200 }
201}
202
203fn parse_comment_line(
204 line_bytes: &[u8],
205 kind: CommentKind,
206 state: &mut ParserState,
207 file: &mut PoFile,
208 current_nplurals: &mut usize,
209) {
210 finish_item(state, file, current_nplurals);
211
212 match kind {
213 CommentKind::Reference => {
214 let reference_line = trimmed_str(&line_bytes[2..]);
215 state.item.references.extend(
216 split_reference_comment(reference_line)
217 .into_iter()
218 .map(Cow::into_owned),
219 );
220 }
221 CommentKind::Flags => {
222 for flag in trimmed_str(&line_bytes[2..]).split(',') {
223 state.item.flags.push(flag.trim().to_owned());
224 }
225 }
226 CommentKind::Extracted => state
227 .item
228 .extracted_comments
229 .push(trimmed_string(&line_bytes[2..])),
230 CommentKind::Metadata => {
231 let trimmed = trim_ascii(&line_bytes[2..]);
232 if let Some((key_bytes, value_bytes)) = split_once_byte(trimmed, b':') {
233 let key = trimmed_str(key_bytes);
234 if !key.is_empty() {
235 let value = trimmed_str(value_bytes);
236 state.item.metadata.push((key.to_owned(), value.to_owned()));
237 }
238 }
239 }
240 CommentKind::Translator => state.item.comments.push(trimmed_string(&line_bytes[1..])),
241 CommentKind::Other => {}
242 }
243}
244
245fn parse_keyword_line(
246 line_bytes: &[u8],
247 obsolete: bool,
248 keyword: Keyword,
249 state: &mut ParserState,
250 file: &mut PoFile,
251 current_nplurals: &mut usize,
252) -> Result<(), ParseError> {
253 match keyword {
254 Keyword::IdPlural => {
255 state.obsolete_line_count += usize::from(obsolete);
256 state.item.msgid_plural = Some(extract_quoted_bytes_cow(line_bytes)?.into_owned());
257 state.context = Some(Context::IdPlural);
258 state.content_line_count += 1;
259 state.has_keyword = true;
260 }
261 Keyword::Id => {
262 finish_item(state, file, current_nplurals);
263 state.obsolete_line_count += usize::from(obsolete);
264 state.item.msgid = extract_quoted_bytes_cow(line_bytes)?.into_owned();
265 state.context = Some(Context::Id);
266 state.content_line_count += 1;
267 state.has_keyword = true;
268 }
269 Keyword::Str => {
270 let plural_index = parse_plural_index(line_bytes).unwrap_or(0);
271 state.plural_index = plural_index;
272 state.obsolete_line_count += usize::from(obsolete);
273 state.set_msgstr(
274 plural_index,
275 extract_quoted_bytes_cow(line_bytes)?.into_owned(),
276 );
277 state.context = Some(Context::Str);
278 state.content_line_count += 1;
279 state.has_keyword = true;
280 }
281 Keyword::Ctxt => {
282 finish_item(state, file, current_nplurals);
283 state.obsolete_line_count += usize::from(obsolete);
284 state.item.msgctxt = Some(extract_quoted_bytes_cow(line_bytes)?.into_owned());
285 state.context = Some(Context::Ctxt);
286 state.content_line_count += 1;
287 state.has_keyword = true;
288 }
289 }
290
291 Ok(())
292}
293
294fn append_continuation(
295 line_bytes: &[u8],
296 obsolete: bool,
297 state: &mut ParserState,
298) -> Result<(), ParseError> {
299 state.obsolete_line_count += usize::from(obsolete);
300 state.content_line_count += 1;
301 let value = extract_quoted_bytes_cow(line_bytes)?;
302
303 match state.context {
304 Some(Context::Str) => {
305 state.append_msgstr(state.plural_index, value.as_ref());
306 }
307 Some(Context::Id) => state.item.msgid.push_str(value.as_ref()),
308 Some(Context::IdPlural) => {
309 let target = state.item.msgid_plural.get_or_insert_with(String::new);
310 target.push_str(value.as_ref());
311 }
312 Some(Context::Ctxt) => {
313 let target = state.item.msgctxt.get_or_insert_with(String::new);
314 target.push_str(value.as_ref());
315 }
316 None => {}
317 }
318
319 Ok(())
320}
321
322fn finish_item(state: &mut ParserState, file: &mut PoFile, current_nplurals: &mut usize) {
323 if !state.has_keyword {
324 return;
325 }
326
327 if state.item.msgid.is_empty() && !is_header_state(state) {
328 return;
329 }
330
331 if state.obsolete_line_count >= state.content_line_count && state.content_line_count > 0 {
332 state.item.obsolete = true;
333 }
334
335 if is_header_state(state) && file.headers.is_empty() && file.items.is_empty() {
336 file.comments = core::mem::take(&mut state.item.comments);
337 file.extracted_comments = core::mem::take(&mut state.item.extracted_comments);
338 parse_headers(state.header_msgstr(), &mut file.headers);
339 *current_nplurals = parse_nplurals(&file.headers).unwrap_or(2);
340 state.reset(*current_nplurals);
341 return;
342 }
343
344 state.materialize_msgstr();
345
346 if state.item.msgstr.is_empty() {
347 state.item.msgstr = MsgStr::Singular(String::new());
348 }
349 if state.item.msgid_plural.is_some() && state.item.msgstr.len() == 1 {
350 let mut values = state.item.msgstr.clone().into_vec();
351 values.resize(state.item.nplurals.max(1), String::new());
352 state.item.msgstr = MsgStr::Plural(values);
353 }
354
355 state.item.nplurals = *current_nplurals;
356 file.items.push(core::mem::take(&mut state.item));
357 state.reset_after_take(*current_nplurals);
358}
359
360fn is_header_state(state: &ParserState) -> bool {
361 state.item.msgid.is_empty()
362 && state.item.msgctxt.is_none()
363 && state.item.msgid_plural.is_none()
364 && !state.msgstr.is_empty()
365}
366
367fn parse_headers(raw: &str, out: &mut Vec<Header>) {
368 let bytes = raw.as_bytes();
369 out.reserve(memchr_iter(b'\n', bytes).count() + 1);
370
371 for line in LineScanner::new(bytes) {
372 if let Some((key_bytes, value_bytes)) = split_once_byte(line.trimmed, b':') {
373 out.push(Header {
374 key: trimmed_string(key_bytes),
375 value: trimmed_string(value_bytes),
376 });
377 }
378 }
379}
380
381fn parse_nplurals(headers: &[Header]) -> Option<usize> {
382 let plural_forms = headers
383 .iter()
384 .find(|header| header.key == "Plural-Forms")?
385 .value
386 .as_bytes();
387 let mut rest = plural_forms;
388
389 while !rest.is_empty() {
390 let (part, next) = match split_once_byte(rest, b';') {
391 Some((part, tail)) => (part, tail),
392 None => (rest, &b""[..]),
393 };
394 let trimmed = trim_ascii(part);
395 if let Some((key, value)) = split_once_byte(trimmed, b'=')
396 && trim_ascii(key) == b"nplurals"
397 && let value = bytes_to_str(trim_ascii(value))
398 && let Ok(parsed) = value.parse::<usize>()
399 {
400 return Some(parsed);
401 }
402 rest = next;
403 }
404
405 None
406}
407
408fn bytes_to_str(bytes: &[u8]) -> &str {
409 input_slice_as_str(bytes)
410}
411
412fn trimmed_str(bytes: &[u8]) -> &str {
413 bytes_to_str(trim_ascii(bytes))
414}
415
416fn trimmed_string(bytes: &[u8]) -> String {
417 trimmed_str(bytes).to_owned()
418}
419
#[cfg(test)]
mod tests {
    use super::parse_po;

    // Fixture: header whose translation spans many continuation lines,
    // followed by one entry with multi-line `msgid` and `msgstr`.
    const MULTI_LINE: &str = r#"# French translation of Link (6.x-2.9)
# Copyright (c) 2011 by the French translation team
#
## Plural-Forms by polish translation team to demonstrate multi-line ##
#
msgid ""
msgstr ""
"Project-Id-Version: Link (6.x-2.9)\n"
"POT-Creation-Date: 2011-12-31 23:39+0000\n"
"PO-Revision-Date: 2013-12-17 14:21+0100\n"
"Language-Team: French\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 "
"|| n%100>=20) ? 1 : 2;\n"
"Last-Translator: Ruben Vermeersch <ruben@rocketeer.be>\n"
"Language: fr\n"
"X-Generator: Poedit 1.6.2\n"

msgid ""
"The following placeholder tokens can be used in both paths and titles. When "
"used in a path or title, they will be replaced with the appropriate values."
msgstr ""
"Les ébauches de jetons suivantes peuvent être utilisées à la fois dans les "
"chemins et in the titles. Lorsqu'elles sont utilisées dans un chemin ou un "
"titre, elles seront remplacées par les valeurs appropriées."
"#;

    // Fixture: one live entry followed by three obsolete (`#~`) entries.
    const COMMENTED: &str = r#"msgid ""
msgstr ""
"Project-Id-Version: Test\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"

#: .tmp/ui/settings/views/console-modal.html
msgid "{{dataLoader.data.length}} results"
msgstr "{{dataLoader.data.length}} resultaten"

#~ msgid "Add order"
#~ msgstr "Order toevoegen"

#~ # commented obsolete item
#~ #, fuzzy
#~ msgid "Commented item"
#~ msgstr "not sure"

# commented obsolete item
#, fuzzy
#~ msgid "Second commented item"
#~ msgstr "also not sure"
"#;

    // Fixture: C-style escapes and multi-line `msgid` values.
    const C_STRINGS: &str = r#"msgid ""
msgstr ""
"Plural-Forms: nplurals=2; plural=(n > 1);\n"

msgid "The name field must not contain characters like \" or \\"
msgstr ""

msgid ""
"%1$s\n"
"%2$s %3$s\n"
"%4$s\n"
"%5$s"
msgstr ""

msgid ""
"define('some/test/module', function () {\n"
"\t'use strict';\n"
"\treturn {};\n"
"});\n"
""
msgstr ""
"#;

    /// Parses `input`, failing the calling test with the parser's message
    /// when parsing does not succeed.
    fn parse_fixture(input: &str) -> crate::PoFile {
        parse_po(input).unwrap_or_else(|error| panic!("parse failed: {error}"))
    }

    #[test]
    fn parses_multiline_headers_and_items() {
        let po = parse_fixture(MULTI_LINE);

        assert_eq!(po.headers[6].key, "Content-Transfer-Encoding");

        let plural_forms = po
            .headers
            .iter()
            .find(|header| header.key == "Plural-Forms")
            .map(|header| header.value.as_str());
        assert_eq!(
            plural_forms,
            Some(
                "nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;"
            )
        );

        assert_eq!(po.items.len(), 1);
        assert_eq!(
            po.items[0].msgid,
            "The following placeholder tokens can be used in both paths and titles. When used in a path or title, they will be replaced with the appropriate values."
        );
    }

    #[test]
    fn parses_c_string_escapes_and_multiline_values() {
        let po = parse_fixture(C_STRINGS);

        assert_eq!(
            po.items[0].msgid,
            "The name field must not contain characters like \" or \\"
        );
        assert_eq!(po.items[1].msgid, "%1$s\n%2$s %3$s\n%4$s\n%5$s");
        assert_eq!(
            po.items[2].msgid,
            "define('some/test/module', function () {\n\t'use strict';\n\treturn {};\n});\n"
        );
    }

    #[test]
    fn parses_obsolete_items() {
        let po = parse_fixture(COMMENTED);

        assert_eq!(po.items.len(), 4);
        let obsolete_flags: Vec<bool> = po.items.iter().map(|item| item.obsolete).collect();
        assert_eq!(obsolete_flags, [false, true, true, true]);

        let last = &po.items[3];
        assert_eq!(last.comments, vec!["commented obsolete item".to_owned()]);
        assert_eq!(last.flags, vec!["fuzzy".to_owned()]);
    }

    #[test]
    fn parses_context_without_creating_phantom_items() {
        let input = r#"msgid ""
msgstr ""
"Language: de\n"

msgctxt "menu"
msgid "File"
msgstr "Datei"
"#;

        let po = parse_fixture(input);

        assert_eq!(po.items.len(), 1);
        let item = &po.items[0];
        assert_eq!(item.msgctxt.as_deref(), Some("menu"));
        assert_eq!(item.msgid, "File");
    }

    #[test]
    fn strips_utf8_bom_prefix() {
        let input = "\u{feff}msgid \"foo\"\nmsgstr \"bar\"\n";
        let po = parse_po(input).expect("parse");

        assert_eq!(po.items.len(), 1);
        let item = &po.items[0];
        assert_eq!(item.msgid, "foo");
        assert_eq!(item.msgstr[0], "bar");
    }

    #[test]
    fn rejects_unescaped_quote_sequences() {
        let input = "msgid \"Some msgid with \\\"double\\\" quotes\"\nmsgstr \"\"\n\"Some msgstr with \"double\\\" quotes\"\n";
        let error = parse_po(input).expect_err("invalid quote pattern should fail");

        let message = error.to_string();
        assert!(message.contains("unescaped"));
    }
}