Skip to main content

ferrocat_icu/
parser.rs

1use crate::ast::{IcuMessage, IcuNode, IcuOption, IcuPluralKind};
2use crate::error::IcuParseError;
3
/// Switches that adjust how an ICU message string is parsed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IcuParserOptions {
    /// When `true`, `<tag>` / `</tag>` markup is not interpreted and is kept as literal text.
    pub ignore_tag: bool,
    /// When `true`, `select`, `plural` and `selectordinal` arguments must contain an
    /// `other` option.
    pub requires_other_clause: bool,
}
9
10impl Default for IcuParserOptions {
11    fn default() -> Self {
12        Self {
13            ignore_tag: false,
14            requires_other_clause: true,
15        }
16    }
17}
18
19pub fn parse_icu(input: &str) -> Result<IcuMessage, IcuParseError> {
20    parse_icu_with_options(input, &IcuParserOptions::default())
21}
22
23pub fn parse_icu_with_options(
24    input: &str,
25    options: &IcuParserOptions,
26) -> Result<IcuMessage, IcuParseError> {
27    let mut parser = Parser::new(input, options);
28    let nodes = parser.parse_nodes(None, 0)?;
29    if !parser.is_eof() {
30        return Err(parser.error("Unexpected trailing input"));
31    }
32    Ok(IcuMessage { nodes })
33}
34
/// Cursor state for one pass over a single ICU message string.
struct Parser<'a> {
    /// The message text; sliced when copying names out and handed to error constructors.
    input: &'a str,
    /// `input.as_bytes()`, kept alongside for byte-wise lookahead.
    input_bytes: &'a [u8],
    /// Byte offset into `input` of the next unread byte.
    pos: usize,
    /// Caller-supplied parsing switches.
    options: &'a IcuParserOptions,
}
41
42impl<'a> Parser<'a> {
43    const OFFSET_PREFIX: &'static [u8] = b"offset:";
44    const CLOSE_TAG_PREFIX: &'static [u8] = b"</";
45
46    fn new(input: &'a str, options: &'a IcuParserOptions) -> Self {
47        Self {
48            input,
49            input_bytes: input.as_bytes(),
50            pos: 0,
51            options,
52        }
53    }
54
55    fn parse_nodes(
56        &mut self,
57        until_tag: Option<&str>,
58        plural_depth: usize,
59    ) -> Result<Vec<IcuNode>, IcuParseError> {
60        let mut nodes = Vec::with_capacity(4);
61        let mut literal = String::with_capacity(16);
62
63        while let Some(byte) = self.byte_at() {
64            if byte == b'}' {
65                break;
66            }
67
68            if let Some(tag_name) = until_tag {
69                if self.starts_with_close_tag(tag_name) {
70                    break;
71                }
72                if !self.options.ignore_tag && self.peek_close_tag() {
73                    return Err(self.error("Mismatched closing tag"));
74                }
75            } else if !self.options.ignore_tag && self.peek_close_tag() {
76                return Err(self.error("Unexpected closing tag"));
77            }
78
79            match byte {
80                b'{' => {
81                    self.flush_literal(&mut literal, &mut nodes);
82                    nodes.push(self.parse_argument(plural_depth)?);
83                }
84                b'<' if !self.options.ignore_tag && self.peek_open_tag() => {
85                    self.flush_literal(&mut literal, &mut nodes);
86                    nodes.push(self.parse_tag(plural_depth)?);
87                }
88                b'#' if plural_depth > 0 => {
89                    self.flush_literal(&mut literal, &mut nodes);
90                    self.pos += 1;
91                    nodes.push(IcuNode::Pound);
92                }
93                b'\'' => literal.push_str(&self.parse_apostrophe_literal()?),
94                _ => literal.push(self.advance_char().expect("byte implies char")),
95            }
96        }
97
98        self.flush_literal(&mut literal, &mut nodes);
99        Ok(nodes)
100    }
101
102    fn parse_argument(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
103        self.expect_char('{')?;
104        self.skip_whitespace();
105        let name = self.parse_identifier()?;
106        self.skip_whitespace();
107
108        if self.consume_char('}') {
109            return Ok(IcuNode::Argument { name });
110        }
111
112        self.expect_char(',')?;
113        self.skip_whitespace();
114        let kind = self.parse_identifier()?;
115        self.skip_whitespace();
116
117        match kind.as_str() {
118            "number" => self.parse_simple_formatter(name, FormatterKind::Number),
119            "date" => self.parse_simple_formatter(name, FormatterKind::Date),
120            "time" => self.parse_simple_formatter(name, FormatterKind::Time),
121            "list" => self.parse_simple_formatter(name, FormatterKind::List),
122            "duration" => self.parse_simple_formatter(name, FormatterKind::Duration),
123            "ago" => self.parse_simple_formatter(name, FormatterKind::Ago),
124            "name" => self.parse_simple_formatter(name, FormatterKind::Name),
125            "select" => self.parse_select(name, plural_depth),
126            "plural" => self.parse_plural(name, plural_depth, IcuPluralKind::Cardinal),
127            "selectordinal" => self.parse_plural(name, plural_depth, IcuPluralKind::Ordinal),
128            _ => Err(self.error("Unsupported ICU argument type")),
129        }
130    }
131
132    fn parse_simple_formatter(
133        &mut self,
134        name: String,
135        kind: FormatterKind,
136    ) -> Result<IcuNode, IcuParseError> {
137        let style = if self.consume_char(',') {
138            let style = self.read_until_closing_brace()?.trim().to_owned();
139            Some(style).filter(|style| !style.is_empty())
140        } else {
141            None
142        };
143        self.expect_char('}')?;
144
145        Ok(match kind {
146            FormatterKind::Number => IcuNode::Number { name, style },
147            FormatterKind::Date => IcuNode::Date { name, style },
148            FormatterKind::Time => IcuNode::Time { name, style },
149            FormatterKind::List => IcuNode::List { name, style },
150            FormatterKind::Duration => IcuNode::Duration { name, style },
151            FormatterKind::Ago => IcuNode::Ago { name, style },
152            FormatterKind::Name => IcuNode::Name { name, style },
153        })
154    }
155
156    fn parse_select(
157        &mut self,
158        name: String,
159        plural_depth: usize,
160    ) -> Result<IcuNode, IcuParseError> {
161        if self.consume_char(',') {
162            self.skip_whitespace();
163        }
164        let options = self.parse_options(plural_depth)?;
165        if self.options.requires_other_clause && !has_other_clause(&options) {
166            return Err(self.error("Select argument requires an \"other\" clause"));
167        }
168        self.expect_char('}')?;
169        Ok(IcuNode::Select { name, options })
170    }
171
172    fn parse_plural(
173        &mut self,
174        name: String,
175        plural_depth: usize,
176        kind: IcuPluralKind,
177    ) -> Result<IcuNode, IcuParseError> {
178        let mut offset = 0u32;
179
180        if self.consume_char(',') {
181            self.skip_whitespace();
182        }
183
184        loop {
185            self.skip_whitespace();
186            if self.starts_with_bytes(Self::OFFSET_PREFIX) {
187                self.pos += Self::OFFSET_PREFIX.len();
188                self.skip_whitespace();
189                offset = self.parse_unsigned_int()? as u32;
190            } else {
191                break;
192            }
193        }
194
195        let options = self.parse_options(plural_depth + 1)?;
196        if self.options.requires_other_clause && !has_other_clause(&options) {
197            return Err(self.error("Plural argument requires an \"other\" clause"));
198        }
199        self.expect_char('}')?;
200
201        Ok(IcuNode::Plural {
202            name,
203            kind,
204            offset,
205            options,
206        })
207    }
208
209    fn parse_options(&mut self, plural_depth: usize) -> Result<Vec<IcuOption>, IcuParseError> {
210        let mut options = Vec::with_capacity(4);
211
212        loop {
213            self.skip_whitespace();
214            if self.byte_at() == Some(b'}') {
215                break;
216            }
217            let selector = self.parse_selector()?;
218            self.skip_whitespace();
219            self.expect_char('{')?;
220            let value = self.parse_nodes(None, plural_depth)?;
221            self.expect_char('}')?;
222            options.push(IcuOption { selector, value });
223        }
224
225        if options.is_empty() {
226            return Err(self.error("Expected at least one ICU option"));
227        }
228
229        Ok(options)
230    }
231
232    fn parse_tag(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
233        self.expect_char('<')?;
234        let name = self.parse_tag_name()?;
235        self.expect_char('>')?;
236        let children = self.parse_nodes(Some(&name), plural_depth)?;
237        self.expect_bytes(Self::CLOSE_TAG_PREFIX)?;
238        let close_name = self.parse_tag_name()?;
239        if close_name != name {
240            return Err(self.error("Mismatched closing tag"));
241        }
242        self.expect_char('>')?;
243        Ok(IcuNode::Tag { name, children })
244    }
245
246    fn parse_apostrophe_literal(&mut self) -> Result<String, IcuParseError> {
247        let start = self.pos;
248        self.expect_char('\'')?;
249
250        if self.consume_char('\'') {
251            return Ok("'".to_owned());
252        }
253
254        let mut out = String::with_capacity(8);
255        while let Some(byte) = self.byte_at() {
256            if byte == b'\'' {
257                self.pos += 1;
258                if self.consume_char('\'') {
259                    out.push('\'');
260                } else {
261                    return Ok(out);
262                }
263            } else {
264                out.push(self.advance_char().expect("byte implies char"));
265            }
266        }
267
268        Err(IcuParseError::syntax(
269            "Unterminated apostrophe escape",
270            self.input,
271            start,
272        ))
273    }
274
275    fn read_until_closing_brace(&mut self) -> Result<String, IcuParseError> {
276        let mut out = String::with_capacity(8);
277        while let Some(byte) = self.byte_at() {
278            if byte == b'}' {
279                return Ok(out);
280            }
281            if byte == b'\'' {
282                out.push_str(&self.parse_apostrophe_literal()?);
283            } else {
284                out.push(self.advance_char().expect("byte implies char"));
285            }
286        }
287        Err(self.error("Unterminated ICU argument"))
288    }
289
290    fn parse_selector(&mut self) -> Result<String, IcuParseError> {
291        let start = self.pos;
292        if self.consume_char('=') {
293            let number = self.parse_unsigned_int()?;
294            return Ok(format!("={number}"));
295        }
296
297        while let Some(byte) = self.byte_at() {
298            if byte.is_ascii_whitespace() || byte == b'{' {
299                break;
300            }
301            if byte.is_ascii() {
302                self.pos += 1;
303            } else {
304                self.advance_char();
305            }
306        }
307
308        if self.pos == start {
309            return Err(self.error("Expected ICU selector"));
310        }
311
312        Ok(self.input[start..self.pos].to_owned())
313    }
314
315    fn parse_identifier(&mut self) -> Result<String, IcuParseError> {
316        let start = self.pos;
317        while let Some(byte) = self.byte_at() {
318            if byte.is_ascii_whitespace() || matches!(byte, b'{' | b'}' | b',' | b'<' | b'>') {
319                break;
320            }
321            if byte.is_ascii() {
322                self.pos += 1;
323            } else {
324                self.advance_char();
325            }
326        }
327
328        if self.pos == start {
329            return Err(self.error("Expected ICU identifier"));
330        }
331
332        Ok(self.input[start..self.pos].to_owned())
333    }
334
335    fn parse_tag_name(&mut self) -> Result<String, IcuParseError> {
336        let start = self.pos;
337        while let Some(byte) = self.byte_at() {
338            if byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.') {
339                self.pos += 1;
340            } else {
341                break;
342            }
343        }
344
345        if self.pos == start {
346            return Err(self.error("Expected tag name"));
347        }
348
349        Ok(self.input[start..self.pos].to_owned())
350    }
351
352    fn parse_unsigned_int(&mut self) -> Result<usize, IcuParseError> {
353        let start = self.pos;
354        while let Some(byte) = self.byte_at() {
355            if byte.is_ascii_digit() {
356                self.pos += 1;
357            } else {
358                break;
359            }
360        }
361
362        if self.pos == start {
363            return Err(self.error("Expected integer"));
364        }
365
366        self.input[start..self.pos]
367            .parse::<usize>()
368            .map_err(|_| self.error("Invalid integer"))
369    }
370
371    fn skip_whitespace(&mut self) {
372        while let Some(byte) = self.byte_at() {
373            if byte.is_ascii_whitespace() {
374                self.pos += 1;
375            } else {
376                break;
377            }
378        }
379    }
380
381    fn flush_literal(&self, literal: &mut String, nodes: &mut Vec<IcuNode>) {
382        if !literal.is_empty() {
383            nodes.push(IcuNode::Literal(core::mem::take(literal)));
384        }
385    }
386
387    fn expect_char(&mut self, ch: char) -> Result<(), IcuParseError> {
388        if ch.is_ascii() {
389            if self.byte_at() == Some(ch as u8) {
390                self.pos += 1;
391                return Ok(());
392            }
393            return Err(self.error(format!("Expected '{ch}'")));
394        }
395
396        match self.peek_char() {
397            Some(current) if current == ch => {
398                self.pos += ch.len_utf8();
399                Ok(())
400            }
401            _ => Err(self.error(format!("Expected '{ch}'"))),
402        }
403    }
404
405    fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), IcuParseError> {
406        if self.starts_with_bytes(expected) {
407            self.pos += expected.len();
408            Ok(())
409        } else {
410            let expected = core::str::from_utf8(expected).unwrap_or("<bytes>");
411            Err(self.error(format!("Expected \"{expected}\"")))
412        }
413    }
414
415    fn consume_char(&mut self, ch: char) -> bool {
416        if ch.is_ascii() {
417            if self.byte_at() == Some(ch as u8) {
418                self.pos += 1;
419                return true;
420            }
421            return false;
422        }
423
424        if self.peek_char() == Some(ch) {
425            self.pos += ch.len_utf8();
426            true
427        } else {
428            false
429        }
430    }
431
432    fn peek_char(&self) -> Option<char> {
433        self.input[self.pos..].chars().next()
434    }
435
436    fn byte_at(&self) -> Option<u8> {
437        self.input_bytes.get(self.pos).copied()
438    }
439
440    fn advance_char(&mut self) -> Option<char> {
441        let ch = self.peek_char()?;
442        self.pos += ch.len_utf8();
443        Some(ch)
444    }
445
446    fn peek_open_tag(&self) -> bool {
447        let Some(rest) = self.input_bytes.get(self.pos..) else {
448            return false;
449        };
450        if !rest.starts_with(b"<") || rest.starts_with(b"</") {
451            return false;
452        }
453        rest.get(1).is_some_and(u8::is_ascii_alphanumeric)
454    }
455
456    fn peek_close_tag(&self) -> bool {
457        self.input_bytes[self.pos..].starts_with(b"</")
458    }
459
460    fn starts_with_close_tag(&self, name: &str) -> bool {
461        let Some(rest) = self.input_bytes.get(self.pos..) else {
462            return false;
463        };
464        rest.starts_with(Self::CLOSE_TAG_PREFIX)
465            && rest[2..].starts_with(name.as_bytes())
466            && rest.get(2 + name.len()) == Some(&b'>')
467    }
468
469    fn starts_with_bytes(&self, expected: &[u8]) -> bool {
470        self.input_bytes[self.pos..].starts_with(expected)
471    }
472
473    fn is_eof(&self) -> bool {
474        self.pos >= self.input.len()
475    }
476
477    fn error(&self, message: impl Into<String>) -> IcuParseError {
478        IcuParseError::syntax(message, self.input, self.pos)
479    }
480}
481
/// The argument kinds that share the `{name, kind[, style]}` shape and are all
/// handled by `Parser::parse_simple_formatter`.
#[derive(Copy, Clone)]
enum FormatterKind {
    /// `number`
    Number,
    /// `date`
    Date,
    /// `time`
    Time,
    /// `list`
    List,
    /// `duration`
    Duration,
    /// `ago`
    Ago,
    /// `name`
    Name,
}
492
493fn has_other_clause(options: &[IcuOption]) -> bool {
494    options.iter().any(|option| option.selector == "other")
495}
496
#[cfg(test)]
mod tests {
    use crate::{
        IcuNode, IcuParseError, IcuParserOptions, IcuPluralKind, parse_icu, parse_icu_with_options,
        validate_icu,
    };

    /// Plain text around a bare `{name}` splits into literal / argument / literal.
    #[test]
    fn parses_simple_argument_message() {
        let expected = vec![
            IcuNode::Literal("Hello ".to_owned()),
            IcuNode::Argument {
                name: "name".to_owned(),
            },
            IcuNode::Literal("!".to_owned()),
        ];
        let parsed = parse_icu("Hello {name}!").expect("parse");
        assert_eq!(parsed.nodes, expected);
    }

    /// Formatter styles are kept verbatim (after trimming) rather than interpreted.
    #[test]
    fn parses_formatter_styles_as_opaque_strings() {
        let nodes = parse_icu(
            "{n, number, currency} {d, date, short} {t, time, ::HHmm} {items, list, disjunction}",
        )
        .expect("parse")
        .nodes;

        // Odd indices hold the single-space literals between the arguments.
        let IcuNode::Number {
            style: Some(number_style),
            ..
        } = &nodes[0]
        else {
            panic!("expected a styled number node");
        };
        assert_eq!(number_style, "currency");

        let IcuNode::Date {
            style: Some(date_style),
            ..
        } = &nodes[2]
        else {
            panic!("expected a styled date node");
        };
        assert_eq!(date_style, "short");

        let IcuNode::Time {
            style: Some(time_style),
            ..
        } = &nodes[4]
        else {
            panic!("expected a styled time node");
        };
        assert_eq!(time_style, "::HHmm");

        let IcuNode::List {
            style: Some(list_style),
            ..
        } = &nodes[6]
        else {
            panic!("expected a styled list node");
        };
        assert_eq!(list_style, "disjunction");
    }

    /// Cardinal plural with offset, a nested select, and an ordinal plural all parse.
    #[test]
    fn parses_plural_select_and_selectordinal() {
        let nodes = parse_icu(
            "{count, plural, offset:1 =0 {none} one {# item} other {{gender, select, male {his} other {their}} items}} {rank, selectordinal, one {#st} other {#th}}",
        )
        .expect("parse")
        .nodes;

        let IcuNode::Plural {
            kind: IcuPluralKind::Cardinal,
            offset: 1,
            options: cardinal_options,
            ..
        } = &nodes[0]
        else {
            panic!("expected a cardinal plural with offset 1");
        };
        assert_eq!(cardinal_options.len(), 3);

        let IcuNode::Plural {
            kind: IcuPluralKind::Ordinal,
            options: ordinal_options,
            ..
        } = &nodes[2]
        else {
            panic!("expected an ordinal plural");
        };
        assert_eq!(ordinal_options.len(), 2);
    }

    /// Tags may wrap arguments and may themselves appear inside plural options.
    #[test]
    fn parses_tags_and_nested_content() {
        let nodes = parse_icu("<0>{count, plural, one {<b>#</b>} other {items}}</0>")
            .expect("parse")
            .nodes;

        let IcuNode::Tag { name, children } = &nodes[0] else {
            panic!("expected a tag node");
        };
        assert_eq!(name, "0");
        assert!(!children.is_empty());
    }

    /// With `ignore_tag`, markup passes through as ordinary literal text.
    #[test]
    fn ignore_tag_treats_tags_as_literal_text() {
        let options = IcuParserOptions {
            ignore_tag: true,
            ..IcuParserOptions::default()
        };
        let parsed = parse_icu_with_options("<b>Hello</b>", &options).expect("parse");

        let expected = vec![IcuNode::Literal("<b>Hello</b>".to_owned())];
        assert_eq!(parsed.nodes, expected);
    }

    /// Quoted braces become literal text and `''` becomes a single apostrophe.
    #[test]
    fn apostrophe_escaping_works() {
        let expected = vec![
            IcuNode::Literal("{".to_owned()),
            IcuNode::Argument {
                name: "name".to_owned(),
            },
            IcuNode::Literal("} '".to_owned()),
        ];
        let parsed = parse_icu("'{'{name}'}' ''").expect("parse");
        assert_eq!(parsed.nodes, expected);
    }

    /// The default options reject a plural without an `other` option.
    #[test]
    fn missing_other_clause_fails_by_default() {
        let failure = parse_icu("{count, plural, one {item}}").expect_err("missing other");
        assert!(failure.message.contains("other"));
    }

    /// Turning `requires_other_clause` off accepts the same message.
    #[test]
    fn missing_other_clause_can_be_disabled() {
        let options = IcuParserOptions {
            requires_other_clause: false,
            ..IcuParserOptions::default()
        };
        parse_icu_with_options("{count, plural, one {item}}", &options).expect("parse");
    }

    /// A closer that does not match the open tag is an error.
    #[test]
    fn mismatched_closing_tag_fails() {
        let failure = parse_icu("<a>hello</b>").expect_err("mismatch");
        assert!(failure.message.contains("Mismatched"));
    }

    /// A non-numeric plural offset is reported as an integer error.
    #[test]
    fn invalid_offset_fails() {
        let failure =
            parse_icu("{count, plural, offset:x other {#}}").expect_err("invalid offset");
        assert!(failure.message.contains("integer"));
    }

    /// `validate_icu` reports the same error value as `parse_icu` for the same input.
    #[test]
    fn validate_icu_uses_same_error_surface() {
        let from_validate = validate_icu("{unclosed").expect_err("validate");
        let from_parse = parse_icu("{unclosed").expect_err("parse");
        assert_eq!(from_parse, from_validate);
    }

    /// Errors carry a line/column position within the input.
    #[test]
    fn error_positions_are_reported() {
        let failure = parse_icu("Hello\n{unclosed").expect_err("parse");
        assert_eq!(failure.position.line, 2);
        assert!(2 <= failure.position.column);
    }

    /// Outside any plural, `#` is ordinary text.
    #[test]
    fn pound_outside_plural_is_literal() {
        let expected = vec![IcuNode::Literal("Total # items".to_owned())];
        let parsed = parse_icu("Total # items").expect("parse");
        assert_eq!(parsed.nodes, expected);
    }

    /// Malformed input surfaces as an `Err`, not a panic.
    #[test]
    fn parse_error_type_is_result_based() {
        let outcome: Result<_, IcuParseError> = parse_icu("{broken");
        assert!(outcome.is_err());
    }
}
680}