//! ICU `MessageFormat` parser (`ferrocat_icu/parser.rs`).

1use crate::ast::{IcuMessage, IcuNode, IcuOption, IcuPluralKind};
2use crate::error::IcuParseError;
3
/// Switches that adjust how ICU messages are parsed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IcuParserOptions {
    /// If `true`, rich-text style tags are kept as ordinary literal text
    /// instead of being parsed as tag nodes.
    pub ignore_tag: bool,
    /// If `true`, every select and plural argument has to provide an
    /// `other` clause.
    pub requires_other_clause: bool,
}
12
13impl Default for IcuParserOptions {
14    fn default() -> Self {
15        Self {
16            ignore_tag: false,
17            requires_other_clause: true,
18        }
19    }
20}
21
22/// Parses ICU `MessageFormat` input with the default parser options.
23///
24/// # Errors
25///
26/// Returns [`IcuParseError`] when the input is malformed.
27pub fn parse_icu(input: &str) -> Result<IcuMessage, IcuParseError> {
28    parse_icu_with_options(input, &IcuParserOptions::default())
29}
30
31/// Parses ICU `MessageFormat` input with explicit parser options.
32///
33/// # Errors
34///
35/// Returns [`IcuParseError`] when the input is malformed.
36pub fn parse_icu_with_options(
37    input: &str,
38    options: &IcuParserOptions,
39) -> Result<IcuMessage, IcuParseError> {
40    let mut parser = Parser::new(input, options);
41    let nodes = parser.parse_nodes(None, 0)?;
42    if !parser.is_eof() {
43        return Err(parser.error("Unexpected trailing input"));
44    }
45    Ok(IcuMessage { nodes })
46}
47
48struct Parser<'a> {
49    input: &'a str,
50    input_bytes: &'a [u8],
51    pos: usize,
52    options: &'a IcuParserOptions,
53}
54
55impl<'a> Parser<'a> {
56    const OFFSET_PREFIX: &'static [u8] = b"offset:";
57    const CLOSE_TAG_PREFIX: &'static [u8] = b"</";
58
59    const fn new(input: &'a str, options: &'a IcuParserOptions) -> Self {
60        Self {
61            input,
62            input_bytes: input.as_bytes(),
63            pos: 0,
64            options,
65        }
66    }
67
68    fn parse_nodes(
69        &mut self,
70        until_tag: Option<&str>,
71        plural_depth: usize,
72    ) -> Result<Vec<IcuNode>, IcuParseError> {
73        let mut nodes = Vec::with_capacity(4);
74        let mut literal = String::with_capacity(16);
75
76        while let Some(byte) = self.byte_at() {
77            if byte == b'}' {
78                break;
79            }
80
81            if let Some(tag_name) = until_tag {
82                if self.starts_with_close_tag(tag_name) {
83                    break;
84                }
85                if !self.options.ignore_tag && self.peek_close_tag() {
86                    return Err(self.error("Mismatched closing tag"));
87                }
88            } else if !self.options.ignore_tag && self.peek_close_tag() {
89                return Err(self.error("Unexpected closing tag"));
90            }
91
92            match byte {
93                b'{' => {
94                    Self::flush_literal(&mut literal, &mut nodes);
95                    nodes.push(self.parse_argument(plural_depth)?);
96                }
97                b'<' if !self.options.ignore_tag && self.peek_open_tag() => {
98                    Self::flush_literal(&mut literal, &mut nodes);
99                    nodes.push(self.parse_tag(plural_depth)?);
100                }
101                b'#' if plural_depth > 0 => {
102                    Self::flush_literal(&mut literal, &mut nodes);
103                    self.pos += 1;
104                    nodes.push(IcuNode::Pound);
105                }
106                b'\'' => literal.push_str(&self.parse_apostrophe_literal()?),
107                _ => literal.push(self.advance_char().expect("byte implies char")),
108            }
109        }
110
111        Self::flush_literal(&mut literal, &mut nodes);
112        Ok(nodes)
113    }
114
115    fn parse_argument(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
116        self.expect_char('{')?;
117        self.skip_whitespace();
118        let name = self.parse_identifier()?;
119        self.skip_whitespace();
120
121        if self.consume_char('}') {
122            return Ok(IcuNode::Argument { name });
123        }
124
125        self.expect_char(',')?;
126        self.skip_whitespace();
127        let kind = self.parse_identifier()?;
128        self.skip_whitespace();
129
130        match kind.as_str() {
131            "number" => self.parse_simple_formatter(name, FormatterKind::Number),
132            "date" => self.parse_simple_formatter(name, FormatterKind::Date),
133            "time" => self.parse_simple_formatter(name, FormatterKind::Time),
134            "list" => self.parse_simple_formatter(name, FormatterKind::List),
135            "duration" => self.parse_simple_formatter(name, FormatterKind::Duration),
136            "ago" => self.parse_simple_formatter(name, FormatterKind::Ago),
137            "name" => self.parse_simple_formatter(name, FormatterKind::Name),
138            "select" => self.parse_select(name, plural_depth),
139            "plural" => self.parse_plural(name, plural_depth, IcuPluralKind::Cardinal),
140            "selectordinal" => self.parse_plural(name, plural_depth, IcuPluralKind::Ordinal),
141            _ => Err(self.error("Unsupported ICU argument type")),
142        }
143    }
144
145    fn parse_simple_formatter(
146        &mut self,
147        name: String,
148        kind: FormatterKind,
149    ) -> Result<IcuNode, IcuParseError> {
150        let style = if self.consume_char(',') {
151            let style = self.read_until_closing_brace()?.trim().to_owned();
152            Some(style).filter(|style| !style.is_empty())
153        } else {
154            None
155        };
156        self.expect_char('}')?;
157
158        Ok(match kind {
159            FormatterKind::Number => IcuNode::Number { name, style },
160            FormatterKind::Date => IcuNode::Date { name, style },
161            FormatterKind::Time => IcuNode::Time { name, style },
162            FormatterKind::List => IcuNode::List { name, style },
163            FormatterKind::Duration => IcuNode::Duration { name, style },
164            FormatterKind::Ago => IcuNode::Ago { name, style },
165            FormatterKind::Name => IcuNode::Name { name, style },
166        })
167    }
168
169    fn parse_select(
170        &mut self,
171        name: String,
172        plural_depth: usize,
173    ) -> Result<IcuNode, IcuParseError> {
174        if self.consume_char(',') {
175            self.skip_whitespace();
176        }
177        let options = self.parse_options(plural_depth)?;
178        if self.options.requires_other_clause && !has_other_clause(&options) {
179            return Err(self.error("Select argument requires an \"other\" clause"));
180        }
181        self.expect_char('}')?;
182        Ok(IcuNode::Select { name, options })
183    }
184
185    fn parse_plural(
186        &mut self,
187        name: String,
188        plural_depth: usize,
189        kind: IcuPluralKind,
190    ) -> Result<IcuNode, IcuParseError> {
191        let mut offset = 0u32;
192
193        if self.consume_char(',') {
194            self.skip_whitespace();
195        }
196
197        loop {
198            self.skip_whitespace();
199            if self.starts_with_bytes(Self::OFFSET_PREFIX) {
200                self.pos += Self::OFFSET_PREFIX.len();
201                self.skip_whitespace();
202                offset = self.parse_unsigned_int()?;
203            } else {
204                break;
205            }
206        }
207
208        let options = self.parse_options(plural_depth + 1)?;
209        if self.options.requires_other_clause && !has_other_clause(&options) {
210            return Err(self.error("Plural argument requires an \"other\" clause"));
211        }
212        self.expect_char('}')?;
213
214        Ok(IcuNode::Plural {
215            name,
216            kind,
217            offset,
218            options,
219        })
220    }
221
222    fn parse_options(&mut self, plural_depth: usize) -> Result<Vec<IcuOption>, IcuParseError> {
223        let mut options = Vec::with_capacity(4);
224
225        loop {
226            self.skip_whitespace();
227            if self.byte_at() == Some(b'}') {
228                break;
229            }
230            let selector = self.parse_selector()?;
231            self.skip_whitespace();
232            self.expect_char('{')?;
233            let value = self.parse_nodes(None, plural_depth)?;
234            self.expect_char('}')?;
235            options.push(IcuOption { selector, value });
236        }
237
238        if options.is_empty() {
239            return Err(self.error("Expected at least one ICU option"));
240        }
241
242        Ok(options)
243    }
244
245    fn parse_tag(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
246        self.expect_char('<')?;
247        let name = self.parse_tag_name()?;
248        self.expect_char('>')?;
249        let children = self.parse_nodes(Some(&name), plural_depth)?;
250        self.expect_bytes(Self::CLOSE_TAG_PREFIX)?;
251        let close_name = self.parse_tag_name()?;
252        if close_name != name {
253            return Err(self.error("Mismatched closing tag"));
254        }
255        self.expect_char('>')?;
256        Ok(IcuNode::Tag { name, children })
257    }
258
259    fn parse_apostrophe_literal(&mut self) -> Result<String, IcuParseError> {
260        let start = self.pos;
261        self.expect_char('\'')?;
262
263        if self.consume_char('\'') {
264            return Ok("'".to_owned());
265        }
266
267        let mut out = String::with_capacity(8);
268        while let Some(byte) = self.byte_at() {
269            if byte == b'\'' {
270                self.pos += 1;
271                if self.consume_char('\'') {
272                    out.push('\'');
273                } else {
274                    return Ok(out);
275                }
276            } else {
277                out.push(self.advance_char().expect("byte implies char"));
278            }
279        }
280
281        Err(IcuParseError::syntax(
282            "Unterminated apostrophe escape",
283            self.input,
284            start,
285        ))
286    }
287
288    fn read_until_closing_brace(&mut self) -> Result<String, IcuParseError> {
289        let mut out = String::with_capacity(8);
290        while let Some(byte) = self.byte_at() {
291            if byte == b'}' {
292                return Ok(out);
293            }
294            if byte == b'\'' {
295                out.push_str(&self.parse_apostrophe_literal()?);
296            } else {
297                out.push(self.advance_char().expect("byte implies char"));
298            }
299        }
300        Err(self.error("Unterminated ICU argument"))
301    }
302
303    fn parse_selector(&mut self) -> Result<String, IcuParseError> {
304        let start = self.pos;
305        if self.consume_char('=') {
306            let number = self.parse_unsigned_int()?;
307            return Ok(format!("={number}"));
308        }
309
310        while let Some(byte) = self.byte_at() {
311            if byte.is_ascii_whitespace() || byte == b'{' {
312                break;
313            }
314            if byte.is_ascii() {
315                self.pos += 1;
316            } else {
317                self.advance_char();
318            }
319        }
320
321        if self.pos == start {
322            return Err(self.error("Expected ICU selector"));
323        }
324
325        Ok(self.input[start..self.pos].to_owned())
326    }
327
328    fn parse_identifier(&mut self) -> Result<String, IcuParseError> {
329        let start = self.pos;
330        while let Some(byte) = self.byte_at() {
331            if byte.is_ascii_whitespace() || matches!(byte, b'{' | b'}' | b',' | b'<' | b'>') {
332                break;
333            }
334            if byte.is_ascii() {
335                self.pos += 1;
336            } else {
337                self.advance_char();
338            }
339        }
340
341        if self.pos == start {
342            return Err(self.error("Expected ICU identifier"));
343        }
344
345        Ok(self.input[start..self.pos].to_owned())
346    }
347
348    fn parse_tag_name(&mut self) -> Result<String, IcuParseError> {
349        let start = self.pos;
350        while let Some(byte) = self.byte_at() {
351            if byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.') {
352                self.pos += 1;
353            } else {
354                break;
355            }
356        }
357
358        if self.pos == start {
359            return Err(self.error("Expected tag name"));
360        }
361
362        Ok(self.input[start..self.pos].to_owned())
363    }
364
365    fn parse_unsigned_int(&mut self) -> Result<u32, IcuParseError> {
366        let start = self.pos;
367        while let Some(byte) = self.byte_at() {
368            if byte.is_ascii_digit() {
369                self.pos += 1;
370            } else {
371                break;
372            }
373        }
374
375        if self.pos == start {
376            return Err(self.error("Expected integer"));
377        }
378
379        self.input[start..self.pos]
380            .parse::<u32>()
381            .map_err(|_| self.error("Invalid integer"))
382    }
383
384    fn skip_whitespace(&mut self) {
385        while let Some(byte) = self.byte_at() {
386            if byte.is_ascii_whitespace() {
387                self.pos += 1;
388            } else {
389                break;
390            }
391        }
392    }
393
394    fn flush_literal(literal: &mut String, nodes: &mut Vec<IcuNode>) {
395        if !literal.is_empty() {
396            nodes.push(IcuNode::Literal(core::mem::take(literal)));
397        }
398    }
399
400    fn expect_char(&mut self, ch: char) -> Result<(), IcuParseError> {
401        if ch.is_ascii() {
402            if self.byte_at() == Some(ch as u8) {
403                self.pos += 1;
404                return Ok(());
405            }
406            return Err(self.error(format!("Expected '{ch}'")));
407        }
408
409        match self.peek_char() {
410            Some(current) if current == ch => {
411                self.pos += ch.len_utf8();
412                Ok(())
413            }
414            _ => Err(self.error(format!("Expected '{ch}'"))),
415        }
416    }
417
418    fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), IcuParseError> {
419        if self.starts_with_bytes(expected) {
420            self.pos += expected.len();
421            Ok(())
422        } else {
423            let expected = core::str::from_utf8(expected).unwrap_or("<bytes>");
424            Err(self.error(format!("Expected \"{expected}\"")))
425        }
426    }
427
428    fn consume_char(&mut self, ch: char) -> bool {
429        if ch.is_ascii() {
430            if self.byte_at() == Some(ch as u8) {
431                self.pos += 1;
432                return true;
433            }
434            return false;
435        }
436
437        if self.peek_char() == Some(ch) {
438            self.pos += ch.len_utf8();
439            true
440        } else {
441            false
442        }
443    }
444
445    fn peek_char(&self) -> Option<char> {
446        self.input[self.pos..].chars().next()
447    }
448
449    fn byte_at(&self) -> Option<u8> {
450        self.input_bytes.get(self.pos).copied()
451    }
452
453    fn advance_char(&mut self) -> Option<char> {
454        let ch = self.peek_char()?;
455        self.pos += ch.len_utf8();
456        Some(ch)
457    }
458
459    fn peek_open_tag(&self) -> bool {
460        let Some(rest) = self.input_bytes.get(self.pos..) else {
461            return false;
462        };
463        if !rest.starts_with(b"<") || rest.starts_with(b"</") {
464            return false;
465        }
466        rest.get(1).is_some_and(u8::is_ascii_alphanumeric)
467    }
468
469    fn peek_close_tag(&self) -> bool {
470        self.input_bytes[self.pos..].starts_with(b"</")
471    }
472
473    fn starts_with_close_tag(&self, name: &str) -> bool {
474        let Some(rest) = self.input_bytes.get(self.pos..) else {
475            return false;
476        };
477        rest.starts_with(Self::CLOSE_TAG_PREFIX)
478            && rest[2..].starts_with(name.as_bytes())
479            && rest.get(2 + name.len()) == Some(&b'>')
480    }
481
482    fn starts_with_bytes(&self, expected: &[u8]) -> bool {
483        self.input_bytes[self.pos..].starts_with(expected)
484    }
485
486    const fn is_eof(&self) -> bool {
487        self.pos >= self.input.len()
488    }
489
490    fn error(&self, message: impl Into<String>) -> IcuParseError {
491        IcuParseError::syntax(message, self.input, self.pos)
492    }
493}
494
/// The simple formatter keywords that share the `{name, kind, style}` shape.
#[derive(Clone, Copy)]
enum FormatterKind {
    /// `number`
    Number,
    /// `date`
    Date,
    /// `time`
    Time,
    /// `list`
    List,
    /// `duration`
    Duration,
    /// `ago`
    Ago,
    /// `name`
    Name,
}
505
506fn has_other_clause(options: &[IcuOption]) -> bool {
507    options.iter().any(|option| option.selector == "other")
508}
509
#[cfg(test)]
mod tests {
    use crate::{
        IcuNode, IcuParseError, IcuParserOptions, IcuPluralKind, parse_icu, parse_icu_with_options,
        validate_icu,
    };

    /// Literal text around a bare argument is split into three nodes.
    #[test]
    fn parses_simple_argument_message() {
        let parsed = parse_icu("Hello {name}!").expect("parse");
        let expected = vec![
            IcuNode::Literal("Hello ".to_owned()),
            IcuNode::Argument {
                name: "name".to_owned(),
            },
            IcuNode::Literal("!".to_owned()),
        ];
        assert_eq!(parsed.nodes, expected);
    }

    /// Formatter styles are kept as trimmed, uninterpreted strings.
    #[test]
    fn parses_formatter_styles_as_opaque_strings() {
        let parsed = parse_icu(
            "{n, number, currency} {d, date, short} {t, time, ::HHmm} {items, list, disjunction}",
        )
        .expect("parse");
        let nodes = &parsed.nodes;

        let IcuNode::Number {
            style: Some(number_style),
            ..
        } = &nodes[0]
        else {
            panic!("expected a styled number node");
        };
        assert_eq!(number_style.as_str(), "currency");

        let IcuNode::Date {
            style: Some(date_style),
            ..
        } = &nodes[2]
        else {
            panic!("expected a styled date node");
        };
        assert_eq!(date_style.as_str(), "short");

        let IcuNode::Time {
            style: Some(time_style),
            ..
        } = &nodes[4]
        else {
            panic!("expected a styled time node");
        };
        assert_eq!(time_style.as_str(), "::HHmm");

        let IcuNode::List {
            style: Some(list_style),
            ..
        } = &nodes[6]
        else {
            panic!("expected a styled list node");
        };
        assert_eq!(list_style.as_str(), "disjunction");
    }

    /// Cardinal and ordinal plurals parse, including offset and nested select.
    #[test]
    fn parses_plural_select_and_selectordinal() {
        let parsed = parse_icu(
            "{count, plural, offset:1 =0 {none} one {# item} other {{gender, select, male {his} other {their}} items}} {rank, selectordinal, one {#st} other {#th}}",
        )
        .expect("parse");

        let IcuNode::Plural {
            kind: cardinal_kind,
            offset,
            options: cardinal_options,
            ..
        } = &parsed.nodes[0]
        else {
            panic!("expected a plural node first");
        };
        assert!(matches!(cardinal_kind, IcuPluralKind::Cardinal));
        assert_eq!(*offset, 1);
        assert_eq!(cardinal_options.len(), 3);

        let IcuNode::Plural {
            kind: ordinal_kind,
            options: ordinal_options,
            ..
        } = &parsed.nodes[2]
        else {
            panic!("expected a plural node third");
        };
        assert!(matches!(ordinal_kind, IcuPluralKind::Ordinal));
        assert_eq!(ordinal_options.len(), 2);
    }

    /// Tags may wrap arguments, and arguments may contain tags.
    #[test]
    fn parses_tags_and_nested_content() {
        let parsed =
            parse_icu("<0>{count, plural, one {<b>#</b>} other {items}}</0>").expect("parse");
        let IcuNode::Tag { name, children } = &parsed.nodes[0] else {
            panic!("expected a tag node");
        };
        assert_eq!(name.as_str(), "0");
        assert!(!children.is_empty());
    }

    /// With `ignore_tag`, markup is ordinary literal text.
    #[test]
    fn ignore_tag_treats_tags_as_literal_text() {
        let options = IcuParserOptions {
            ignore_tag: true,
            ..IcuParserOptions::default()
        };
        let parsed = parse_icu_with_options("<b>Hello</b>", &options).expect("parse");
        let expected = vec![IcuNode::Literal("<b>Hello</b>".to_owned())];
        assert_eq!(parsed.nodes, expected);
    }

    /// Quoted braces become literals and `''` becomes one apostrophe.
    #[test]
    fn apostrophe_escaping_works() {
        let parsed = parse_icu("'{'{name}'}' ''").expect("parse");
        let expected = vec![
            IcuNode::Literal("{".to_owned()),
            IcuNode::Argument {
                name: "name".to_owned(),
            },
            IcuNode::Literal("} '".to_owned()),
        ];
        assert_eq!(parsed.nodes, expected);
    }

    /// The default options insist on an `other` clause.
    #[test]
    fn missing_other_clause_fails_by_default() {
        let failure = parse_icu("{count, plural, one {item}}").expect_err("missing other");
        assert!(failure.message.contains("other"));
    }

    /// The `other` requirement can be switched off.
    #[test]
    fn missing_other_clause_can_be_disabled() {
        let lenient = IcuParserOptions {
            requires_other_clause: false,
            ..IcuParserOptions::default()
        };
        let outcome = parse_icu_with_options("{count, plural, one {item}}", &lenient);
        outcome.expect("parse");
    }

    /// A closing tag with a different name is rejected.
    #[test]
    fn mismatched_closing_tag_fails() {
        let failure = parse_icu("<a>hello</b>").expect_err("mismatch");
        assert!(failure.message.contains("Mismatched"));
    }

    /// A non-numeric plural offset is rejected.
    #[test]
    fn invalid_offset_fails() {
        let failure =
            parse_icu("{count, plural, offset:x other {#}}").expect_err("invalid offset");
        assert!(failure.message.contains("integer"));
    }

    /// Validation and parsing report the same error for the same input.
    #[test]
    fn validate_icu_uses_same_error_surface() {
        let source = "{unclosed";
        let from_parse = parse_icu(source).expect_err("parse");
        let from_validate = validate_icu(source).expect_err("validate");
        assert_eq!(from_parse, from_validate);
    }

    /// Errors carry a line and column.
    #[test]
    fn error_positions_are_reported() {
        let failure = parse_icu("Hello\n{unclosed").expect_err("parse");
        let position = &failure.position;
        assert_eq!(position.line, 2);
        assert!(position.column >= 2);
    }

    /// Outside a plural, `#` is plain text.
    #[test]
    fn pound_outside_plural_is_literal() {
        let parsed = parse_icu("Total # items").expect("parse");
        let expected = vec![IcuNode::Literal("Total # items".to_owned())];
        assert_eq!(parsed.nodes, expected);
    }

    /// Failures surface as `Err`, typed as `IcuParseError`.
    #[test]
    fn parse_error_type_is_result_based() {
        let outcome: Result<_, IcuParseError> = parse_icu("{broken");
        assert!(outcome.is_err());
    }

    /// Unknown argument types and stray closing braces are rejected.
    #[test]
    fn rejects_unsupported_types_and_unexpected_trailing_input() {
        let unsupported = parse_icu("{name, foo}").expect_err("unsupported type");
        let unsupported_text = &unsupported.message;
        assert!(unsupported_text.contains("Unsupported ICU argument type"));

        let trailing = parse_icu("hello}").expect_err("trailing input");
        let trailing_text = &trailing.message;
        assert!(trailing_text.contains("Unexpected trailing input"));
    }

    /// Open-ended quotes and orphan closing tags are rejected.
    #[test]
    fn rejects_unterminated_apostrophe_and_unexpected_closing_tag() {
        let apostrophe = parse_icu("'unterminated").expect_err("unterminated apostrophe");
        let apostrophe_text = &apostrophe.message;
        assert!(apostrophe_text.contains("Unterminated apostrophe escape"));

        let closing = parse_icu("</b>").expect_err("unexpected closing tag");
        let closing_text = &closing.message;
        assert!(closing_text.contains("Unexpected closing tag"));
    }

    /// A formatter without style yields `None`; a nameless closing tag fails.
    #[test]
    fn parses_formatters_without_style_and_invalid_tag_names_fail() {
        let parsed = parse_icu("{value, number}").expect("parse formatter without style");
        let first = &parsed.nodes[0];
        assert!(matches!(first, IcuNode::Number { style: None, .. }));

        let failure = parse_icu("<a>broken</>").expect_err("invalid closing tag");
        assert!(failure.message.contains("Mismatched closing tag"));
    }
}