Skip to main content

ferrocat_po/
parse.rs

1use std::borrow::Cow;
2
3use memchr::memchr_iter;
4
5use crate::scan::{
6    CommentKind, Keyword, LineKind, LineScanner, classify_line, parse_plural_index,
7    split_once_byte, trim_ascii,
8};
9use crate::text::{extract_quoted_bytes_cow, split_reference_comment};
10use crate::utf8::input_slice_as_str;
11use crate::{Header, MsgStr, ParseError, PoFile, PoItem};
12
/// Identifies which entry field the most recent keyword line started, so that
/// quoted continuation lines that follow are appended to that same field.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Context {
    /// Continuations extend `msgid`.
    Id,
    /// Continuations extend `msgid_plural`.
    IdPlural,
    /// Continuations extend the currently selected `msgstr` slot.
    Str,
    /// Continuations extend `msgctxt`.
    Ctxt,
}
20
21#[derive(Debug)]
22struct ParserState {
23    item: PoItem,
24    msgstr: MsgStr,
25    context: Option<Context>,
26    plural_index: usize,
27    obsolete_line_count: usize,
28    content_line_count: usize,
29    has_keyword: bool,
30}
31
32impl ParserState {
33    fn new(nplurals: usize) -> Self {
34        Self {
35            item: PoItem::new(nplurals),
36            msgstr: MsgStr::None,
37            context: None,
38            plural_index: 0,
39            obsolete_line_count: 0,
40            content_line_count: 0,
41            has_keyword: false,
42        }
43    }
44
45    fn reset(&mut self, nplurals: usize) {
46        self.item.clear_for_reuse(nplurals);
47        self.reset_after_take(nplurals);
48    }
49
50    fn reset_after_take(&mut self, nplurals: usize) {
51        self.item.nplurals = nplurals;
52        self.msgstr = MsgStr::None;
53        self.context = None;
54        self.plural_index = 0;
55        self.obsolete_line_count = 0;
56        self.content_line_count = 0;
57        self.has_keyword = false;
58    }
59
60    fn set_msgstr(&mut self, plural_index: usize, value: String) {
61        match (&mut self.msgstr, plural_index) {
62            (MsgStr::None, 0) => self.msgstr = MsgStr::Singular(value),
63            (MsgStr::Singular(existing), 0) => *existing = value,
64            (MsgStr::Plural(values), 0) => {
65                if values.is_empty() {
66                    values.push(String::new());
67                }
68                values[0] = value;
69            }
70            _ => {
71                let msgstr = self.promote_plural_msgstr(plural_index);
72                msgstr[plural_index] = value;
73            }
74        }
75    }
76
77    fn append_msgstr(&mut self, plural_index: usize, value: &str) {
78        match (&mut self.msgstr, plural_index) {
79            (MsgStr::None, 0) => self.msgstr = MsgStr::Singular(value.to_owned()),
80            (MsgStr::Singular(existing), 0) => existing.push_str(value),
81            (MsgStr::Plural(values), 0) => {
82                if values.is_empty() {
83                    values.push(String::new());
84                }
85                values[0].push_str(value);
86            }
87            _ => {
88                let msgstr = self.promote_plural_msgstr(plural_index);
89                msgstr[plural_index].push_str(value);
90            }
91        }
92    }
93
94    fn header_msgstr(&self) -> &str {
95        self.msgstr.first_str().unwrap_or_default()
96    }
97
98    fn materialize_msgstr(&mut self) {
99        debug_assert!(self.item.msgstr.is_empty());
100        self.item.msgstr = core::mem::take(&mut self.msgstr);
101    }
102
103    fn promote_plural_msgstr(&mut self, plural_index: usize) -> &mut Vec<String> {
104        if !matches!(self.msgstr, MsgStr::Plural(_)) {
105            self.msgstr = match core::mem::take(&mut self.msgstr) {
106                MsgStr::None => MsgStr::Plural(Vec::with_capacity(2)),
107                MsgStr::Singular(value) => {
108                    let mut values = Vec::with_capacity(2);
109                    values.push(value);
110                    MsgStr::Plural(values)
111                }
112                MsgStr::Plural(values) => MsgStr::Plural(values),
113            };
114        }
115        let MsgStr::Plural(msgstr) = &mut self.msgstr else {
116            unreachable!("plural msgstr promotion must yield plural storage");
117        };
118        if msgstr.len() <= plural_index {
119            msgstr.resize(plural_index + 1, String::new());
120        }
121        msgstr
122    }
123}
124
/// One scanned input line, borrowed from the buffer being parsed.
#[derive(Clone, Copy, Debug)]
struct BorrowedLine<'src> {
    /// Line bytes as produced by the scanner's `trimmed` field.
    trimmed: &'src [u8],
    /// Obsolete marker reported by the scanner for this line.
    obsolete: bool,
}
130
131/// Parses PO content into the owned [`PoFile`] representation.
132///
133/// Line endings are normalized before parsing, and the UTF-8 BOM is ignored
134/// when present.
135///
136/// # Errors
137///
138/// Returns [`ParseError`] when the input is not valid PO syntax.
139pub fn parse_po(input: &str) -> Result<PoFile, ParseError> {
140    let input = strip_utf8_bom(input);
141    let normalized;
142    let input = if input.as_bytes().contains(&b'\r') {
143        normalized = input.replace("\r\n", "\n").replace('\r', "\n");
144        normalized.as_str()
145    } else {
146        input
147    };
148
149    let mut file = PoFile::default();
150    file.items.reserve((input.len() / 96).max(1));
151    let mut current_nplurals = 2;
152    let mut state = ParserState::new(current_nplurals);
153
154    for line in LineScanner::new(input.as_bytes()) {
155        parse_line(
156            BorrowedLine {
157                trimmed: line.trimmed,
158                obsolete: line.obsolete,
159            },
160            &mut state,
161            &mut file,
162            &mut current_nplurals,
163        )?;
164    }
165
166    finish_item(&mut state, &mut file, &mut current_nplurals);
167
168    Ok(file)
169}
170
/// Returns `input` without a single leading U+FEFF byte-order mark, or
/// `input` unchanged when it does not start with one.
#[inline]
fn strip_utf8_bom(input: &str) -> &str {
    match input.strip_prefix('\u{feff}') {
        Some(rest) => rest,
        None => input,
    }
}
175
176fn parse_line(
177    line: BorrowedLine<'_>,
178    state: &mut ParserState,
179    file: &mut PoFile,
180    current_nplurals: &mut usize,
181) -> Result<(), ParseError> {
182    match classify_line(line.trimmed) {
183        LineKind::Continuation => {
184            append_continuation(line.trimmed, line.obsolete, state)?;
185            Ok(())
186        }
187        LineKind::Comment(kind) => {
188            parse_comment_line(line.trimmed, kind, state, file, current_nplurals);
189            Ok(())
190        }
191        LineKind::Keyword(keyword) => parse_keyword_line(
192            line.trimmed,
193            line.obsolete,
194            keyword,
195            state,
196            file,
197            current_nplurals,
198        ),
199        LineKind::Other => Ok(()),
200    }
201}
202
203fn parse_comment_line(
204    line_bytes: &[u8],
205    kind: CommentKind,
206    state: &mut ParserState,
207    file: &mut PoFile,
208    current_nplurals: &mut usize,
209) {
210    finish_item(state, file, current_nplurals);
211
212    match kind {
213        CommentKind::Reference => {
214            let reference_line = trimmed_str(&line_bytes[2..]);
215            state.item.references.extend(
216                split_reference_comment(reference_line)
217                    .into_iter()
218                    .map(Cow::into_owned),
219            );
220        }
221        CommentKind::Flags => {
222            for flag in trimmed_str(&line_bytes[2..]).split(',') {
223                state.item.flags.push(flag.trim().to_owned());
224            }
225        }
226        CommentKind::Extracted => state
227            .item
228            .extracted_comments
229            .push(trimmed_string(&line_bytes[2..])),
230        CommentKind::Metadata => {
231            let trimmed = trim_ascii(&line_bytes[2..]);
232            if let Some((key_bytes, value_bytes)) = split_once_byte(trimmed, b':') {
233                let key = trimmed_str(key_bytes);
234                if !key.is_empty() {
235                    let value = trimmed_str(value_bytes);
236                    state.item.metadata.push((key.to_owned(), value.to_owned()));
237                }
238            }
239        }
240        CommentKind::Translator => state.item.comments.push(trimmed_string(&line_bytes[1..])),
241        CommentKind::Other => {}
242    }
243}
244
245fn parse_keyword_line(
246    line_bytes: &[u8],
247    obsolete: bool,
248    keyword: Keyword,
249    state: &mut ParserState,
250    file: &mut PoFile,
251    current_nplurals: &mut usize,
252) -> Result<(), ParseError> {
253    match keyword {
254        Keyword::IdPlural => {
255            state.obsolete_line_count += usize::from(obsolete);
256            state.item.msgid_plural = Some(extract_quoted_bytes_cow(line_bytes)?.into_owned());
257            state.context = Some(Context::IdPlural);
258            state.content_line_count += 1;
259            state.has_keyword = true;
260        }
261        Keyword::Id => {
262            finish_item(state, file, current_nplurals);
263            state.obsolete_line_count += usize::from(obsolete);
264            state.item.msgid = extract_quoted_bytes_cow(line_bytes)?.into_owned();
265            state.context = Some(Context::Id);
266            state.content_line_count += 1;
267            state.has_keyword = true;
268        }
269        Keyword::Str => {
270            let plural_index = parse_plural_index(line_bytes).unwrap_or(0);
271            state.plural_index = plural_index;
272            state.obsolete_line_count += usize::from(obsolete);
273            state.set_msgstr(
274                plural_index,
275                extract_quoted_bytes_cow(line_bytes)?.into_owned(),
276            );
277            state.context = Some(Context::Str);
278            state.content_line_count += 1;
279            state.has_keyword = true;
280        }
281        Keyword::Ctxt => {
282            finish_item(state, file, current_nplurals);
283            state.obsolete_line_count += usize::from(obsolete);
284            state.item.msgctxt = Some(extract_quoted_bytes_cow(line_bytes)?.into_owned());
285            state.context = Some(Context::Ctxt);
286            state.content_line_count += 1;
287            state.has_keyword = true;
288        }
289    }
290
291    Ok(())
292}
293
294fn append_continuation(
295    line_bytes: &[u8],
296    obsolete: bool,
297    state: &mut ParserState,
298) -> Result<(), ParseError> {
299    state.obsolete_line_count += usize::from(obsolete);
300    state.content_line_count += 1;
301    let value = extract_quoted_bytes_cow(line_bytes)?;
302
303    match state.context {
304        Some(Context::Str) => {
305            state.append_msgstr(state.plural_index, value.as_ref());
306        }
307        Some(Context::Id) => state.item.msgid.push_str(value.as_ref()),
308        Some(Context::IdPlural) => {
309            let target = state.item.msgid_plural.get_or_insert_with(String::new);
310            target.push_str(value.as_ref());
311        }
312        Some(Context::Ctxt) => {
313            let target = state.item.msgctxt.get_or_insert_with(String::new);
314            target.push_str(value.as_ref());
315        }
316        None => {}
317    }
318
319    Ok(())
320}
321
322fn finish_item(state: &mut ParserState, file: &mut PoFile, current_nplurals: &mut usize) {
323    if !state.has_keyword {
324        return;
325    }
326
327    if state.item.msgid.is_empty() && !is_header_state(state) {
328        return;
329    }
330
331    if state.obsolete_line_count >= state.content_line_count && state.content_line_count > 0 {
332        state.item.obsolete = true;
333    }
334
335    if is_header_state(state) && file.headers.is_empty() && file.items.is_empty() {
336        file.comments = core::mem::take(&mut state.item.comments);
337        file.extracted_comments = core::mem::take(&mut state.item.extracted_comments);
338        parse_headers(state.header_msgstr(), &mut file.headers);
339        *current_nplurals = parse_nplurals(&file.headers).unwrap_or(2);
340        state.reset(*current_nplurals);
341        return;
342    }
343
344    state.materialize_msgstr();
345
346    if state.item.msgstr.is_empty() {
347        state.item.msgstr = MsgStr::Singular(String::new());
348    }
349    if state.item.msgid_plural.is_some() && state.item.msgstr.len() == 1 {
350        let mut values = state.item.msgstr.clone().into_vec();
351        values.resize(state.item.nplurals.max(1), String::new());
352        state.item.msgstr = MsgStr::Plural(values);
353    }
354
355    state.item.nplurals = *current_nplurals;
356    file.items.push(core::mem::take(&mut state.item));
357    state.reset_after_take(*current_nplurals);
358}
359
360fn is_header_state(state: &ParserState) -> bool {
361    state.item.msgid.is_empty()
362        && state.item.msgctxt.is_none()
363        && state.item.msgid_plural.is_none()
364        && !state.msgstr.is_empty()
365}
366
367fn parse_headers(raw: &str, out: &mut Vec<Header>) {
368    let bytes = raw.as_bytes();
369    out.reserve(memchr_iter(b'\n', bytes).count() + 1);
370
371    for line in LineScanner::new(bytes) {
372        if let Some((key_bytes, value_bytes)) = split_once_byte(line.trimmed, b':') {
373            out.push(Header {
374                key: trimmed_string(key_bytes),
375                value: trimmed_string(value_bytes),
376            });
377        }
378    }
379}
380
381fn parse_nplurals(headers: &[Header]) -> Option<usize> {
382    let plural_forms = headers
383        .iter()
384        .find(|header| header.key == "Plural-Forms")?
385        .value
386        .as_bytes();
387    let mut rest = plural_forms;
388
389    while !rest.is_empty() {
390        let (part, next) = match split_once_byte(rest, b';') {
391            Some((part, tail)) => (part, tail),
392            None => (rest, &b""[..]),
393        };
394        let trimmed = trim_ascii(part);
395        if let Some((key, value)) = split_once_byte(trimmed, b'=')
396            && trim_ascii(key) == b"nplurals"
397            && let value = bytes_to_str(trim_ascii(value))
398            && let Ok(parsed) = value.parse::<usize>()
399        {
400            return Some(parsed);
401        }
402        rest = next;
403    }
404
405    None
406}
407
/// Views `bytes` as `&str` by delegating to [`input_slice_as_str`].
///
/// NOTE(review): presumably relies on `bytes` being a sub-slice of parser
/// input that is already valid UTF-8 — confirm against `crate::utf8`.
fn bytes_to_str(bytes: &[u8]) -> &str {
    input_slice_as_str(bytes)
}
411
412fn trimmed_str(bytes: &[u8]) -> &str {
413    bytes_to_str(trim_ascii(bytes))
414}
415
416fn trimmed_string(bytes: &[u8]) -> String {
417    trimmed_str(bytes).to_owned()
418}
419
#[cfg(test)]
mod tests {
    use super::parse_po;

    /// Header with a `Plural-Forms` value wrapped over two lines, followed by
    /// one entry whose msgid and msgstr are both multi-line.
    const MULTI_LINE: &str = r#"# French translation of Link (6.x-2.9)
# Copyright (c) 2011 by the French translation team
#
## Plural-Forms by polish translation team to demonstrate multi-line ##
#
msgid ""
msgstr ""
"Project-Id-Version: Link (6.x-2.9)\n"
"POT-Creation-Date: 2011-12-31 23:39+0000\n"
"PO-Revision-Date: 2013-12-17 14:21+0100\n"
"Language-Team: French\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 "
"|| n%100>=20) ? 1 : 2;\n"
"Last-Translator: Ruben Vermeersch <ruben@rocketeer.be>\n"
"Language: fr\n"
"X-Generator: Poedit 1.6.2\n"

msgid ""
"The following placeholder tokens can be used in both paths and titles. When "
"used in a path or title, they will be replaced with the appropriate values."
msgstr ""
"Les ébauches de jetons suivantes peuvent être utilisées à la fois dans les "
"chemins et in the titles. Lorsqu'elles sont utilisées dans un chemin ou un "
"titre, elles seront remplacées par les valeurs appropriées."
"#;

    /// One live entry followed by three obsolete (`#~`) entries, two of which
    /// carry comments and flags.
    const COMMENTED: &str = r#"msgid ""
msgstr ""
"Project-Id-Version: Test\n"
"Plural-Forms: nplurals=2; plural=(n != 1);\n"

#: .tmp/ui/settings/views/console-modal.html
msgid "{{dataLoader.data.length}} results"
msgstr "{{dataLoader.data.length}} resultaten"

#~ msgid "Add order"
#~ msgstr "Order toevoegen"

#~ # commented obsolete item
#~ #, fuzzy
#~ msgid "Commented item"
#~ msgstr "not sure"

# commented obsolete item
#, fuzzy
#~ msgid "Second commented item"
#~ msgstr "also not sure"
"#;

    /// Entries whose msgids exercise C-style escapes and multi-line values.
    const C_STRINGS: &str = r#"msgid ""
msgstr ""
"Plural-Forms: nplurals=2; plural=(n > 1);\n"

msgid "The name field must not contain characters like \" or \\"
msgstr ""

msgid ""
"%1$s\n"
"%2$s %3$s\n"
"%4$s\n"
"%5$s"
msgstr ""

msgid ""
"define('some/test/module', function () {\n"
"\t'use strict';\n"
"\treturn {};\n"
"});\n"
""
msgstr ""
"#;

    /// Parses `input`, failing the test with the parser's message on error.
    fn parse_ok(input: &str) -> crate::PoFile {
        parse_po(input).unwrap_or_else(|error| panic!("parse failed: {error}"))
    }

    #[test]
    fn parses_multiline_headers_and_items() {
        let po = parse_ok(MULTI_LINE);

        assert_eq!(po.headers[6].key, "Content-Transfer-Encoding");

        let plural_forms = po
            .headers
            .iter()
            .find(|header| header.key == "Plural-Forms")
            .map(|header| header.value.as_str());
        assert_eq!(
            plural_forms,
            Some(
                "nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;"
            )
        );

        assert_eq!(po.items.len(), 1);
        assert_eq!(
            po.items[0].msgid,
            "The following placeholder tokens can be used in both paths and titles. When used in a path or title, they will be replaced with the appropriate values."
        );
    }

    #[test]
    fn parses_c_string_escapes_and_multiline_values() {
        let po = parse_ok(C_STRINGS);

        assert_eq!(
            po.items[0].msgid,
            "The name field must not contain characters like \" or \\"
        );
        assert_eq!(po.items[1].msgid, "%1$s\n%2$s %3$s\n%4$s\n%5$s");
        assert_eq!(
            po.items[2].msgid,
            "define('some/test/module', function () {\n\t'use strict';\n\treturn {};\n});\n"
        );
    }

    #[test]
    fn parses_obsolete_items() {
        let po = parse_ok(COMMENTED);

        assert_eq!(po.items.len(), 4);
        assert!(!po.items[0].obsolete);
        assert!(po.items[1].obsolete);
        assert!(po.items[2].obsolete);
        assert!(po.items[3].obsolete);

        let last = &po.items[3];
        assert_eq!(last.comments, vec!["commented obsolete item".to_owned()]);
        assert_eq!(last.flags, vec!["fuzzy".to_owned()]);
    }

    #[test]
    fn parses_context_without_creating_phantom_items() {
        let input = r#"msgid ""
msgstr ""
"Language: de\n"

msgctxt "menu"
msgid "File"
msgstr "Datei"
"#;

        let po = parse_ok(input);

        assert_eq!(po.items.len(), 1);
        let item = &po.items[0];
        assert_eq!(item.msgctxt.as_deref(), Some("menu"));
        assert_eq!(item.msgid, "File");
    }

    #[test]
    fn strips_utf8_bom_prefix() {
        let input = "\u{feff}msgid \"foo\"\nmsgstr \"bar\"\n";
        let po = parse_po(input).expect("parse");

        assert_eq!(po.items.len(), 1);
        let item = &po.items[0];
        assert_eq!(item.msgid, "foo");
        assert_eq!(item.msgstr[0], "bar");
    }

    #[test]
    fn rejects_unescaped_quote_sequences() {
        let input = "msgid \"Some msgid with \\\"double\\\" quotes\"\nmsgstr \"\"\n\"Some msgstr with \"double\\\" quotes\"\n";
        let error = parse_po(input).expect_err("invalid quote pattern should fail");

        let message = error.to_string();
        assert!(message.contains("unescaped"));
    }
}