1use crate::ast::{IcuMessage, IcuNode, IcuOption, IcuPluralKind};
2use crate::error::IcuParseError;
3
/// Switches that adjust how strictly an ICU message is parsed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IcuParserOptions {
    /// When `true`, `<` is never treated as the start of a tag: tag-like
    /// markup is kept as literal text and closing tags are not reported as
    /// errors.
    pub ignore_tag: bool,
    /// When `true`, `select`, `plural` and `selectordinal` arguments without
    /// an `other` option are rejected.
    pub requires_other_clause: bool,
}

impl Default for IcuParserOptions {
    /// Tags are parsed and the `other` clause is mandatory.
    fn default() -> Self {
        let ignore_tag = false;
        let requires_other_clause = true;
        Self {
            ignore_tag,
            requires_other_clause,
        }
    }
}
21
22pub fn parse_icu(input: &str) -> Result<IcuMessage, IcuParseError> {
28 parse_icu_with_options(input, &IcuParserOptions::default())
29}
30
31pub fn parse_icu_with_options(
37 input: &str,
38 options: &IcuParserOptions,
39) -> Result<IcuMessage, IcuParseError> {
40 let mut parser = Parser::new(input, options);
41 let nodes = parser.parse_nodes(None, 0)?;
42 if !parser.is_eof() {
43 return Err(parser.error("Unexpected trailing input"));
44 }
45 Ok(IcuMessage { nodes })
46}
47
/// Cursor-based recursive-descent parser over a single ICU message string.
struct Parser<'a> {
    /// The message being parsed; used for slicing and for error reporting.
    input: &'a str,
    /// Byte view of `input` (set to `input.as_bytes()` by `Parser::new`),
    /// used for single-byte lookahead.
    input_bytes: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Caller-supplied parsing switches.
    options: &'a IcuParserOptions,
}
54
55impl<'a> Parser<'a> {
56 const OFFSET_PREFIX: &'static [u8] = b"offset:";
57 const CLOSE_TAG_PREFIX: &'static [u8] = b"</";
58
59 const fn new(input: &'a str, options: &'a IcuParserOptions) -> Self {
60 Self {
61 input,
62 input_bytes: input.as_bytes(),
63 pos: 0,
64 options,
65 }
66 }
67
68 fn parse_nodes(
69 &mut self,
70 until_tag: Option<&str>,
71 plural_depth: usize,
72 ) -> Result<Vec<IcuNode>, IcuParseError> {
73 let mut nodes = Vec::with_capacity(4);
74 let mut literal = String::with_capacity(16);
75
76 while let Some(byte) = self.byte_at() {
77 if byte == b'}' {
78 break;
79 }
80
81 if let Some(tag_name) = until_tag {
82 if self.starts_with_close_tag(tag_name) {
83 break;
84 }
85 if !self.options.ignore_tag && self.peek_close_tag() {
86 return Err(self.error("Mismatched closing tag"));
87 }
88 } else if !self.options.ignore_tag && self.peek_close_tag() {
89 return Err(self.error("Unexpected closing tag"));
90 }
91
92 match byte {
93 b'{' => {
94 Self::flush_literal(&mut literal, &mut nodes);
95 nodes.push(self.parse_argument(plural_depth)?);
96 }
97 b'<' if !self.options.ignore_tag && self.peek_open_tag() => {
98 Self::flush_literal(&mut literal, &mut nodes);
99 nodes.push(self.parse_tag(plural_depth)?);
100 }
101 b'#' if plural_depth > 0 => {
102 Self::flush_literal(&mut literal, &mut nodes);
103 self.pos += 1;
104 nodes.push(IcuNode::Pound);
105 }
106 b'\'' => literal.push_str(&self.parse_apostrophe_literal()?),
107 _ => literal.push(self.advance_char().expect("byte implies char")),
108 }
109 }
110
111 Self::flush_literal(&mut literal, &mut nodes);
112 Ok(nodes)
113 }
114
115 fn parse_argument(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
116 self.expect_char('{')?;
117 self.skip_whitespace();
118 let name = self.parse_identifier()?;
119 self.skip_whitespace();
120
121 if self.consume_char('}') {
122 return Ok(IcuNode::Argument { name });
123 }
124
125 self.expect_char(',')?;
126 self.skip_whitespace();
127 let kind = self.parse_identifier()?;
128 self.skip_whitespace();
129
130 match kind.as_str() {
131 "number" => self.parse_simple_formatter(name, FormatterKind::Number),
132 "date" => self.parse_simple_formatter(name, FormatterKind::Date),
133 "time" => self.parse_simple_formatter(name, FormatterKind::Time),
134 "list" => self.parse_simple_formatter(name, FormatterKind::List),
135 "duration" => self.parse_simple_formatter(name, FormatterKind::Duration),
136 "ago" => self.parse_simple_formatter(name, FormatterKind::Ago),
137 "name" => self.parse_simple_formatter(name, FormatterKind::Name),
138 "select" => self.parse_select(name, plural_depth),
139 "plural" => self.parse_plural(name, plural_depth, IcuPluralKind::Cardinal),
140 "selectordinal" => self.parse_plural(name, plural_depth, IcuPluralKind::Ordinal),
141 _ => Err(self.error("Unsupported ICU argument type")),
142 }
143 }
144
145 fn parse_simple_formatter(
146 &mut self,
147 name: String,
148 kind: FormatterKind,
149 ) -> Result<IcuNode, IcuParseError> {
150 let style = if self.consume_char(',') {
151 let style = self.read_until_closing_brace()?.trim().to_owned();
152 Some(style).filter(|style| !style.is_empty())
153 } else {
154 None
155 };
156 self.expect_char('}')?;
157
158 Ok(match kind {
159 FormatterKind::Number => IcuNode::Number { name, style },
160 FormatterKind::Date => IcuNode::Date { name, style },
161 FormatterKind::Time => IcuNode::Time { name, style },
162 FormatterKind::List => IcuNode::List { name, style },
163 FormatterKind::Duration => IcuNode::Duration { name, style },
164 FormatterKind::Ago => IcuNode::Ago { name, style },
165 FormatterKind::Name => IcuNode::Name { name, style },
166 })
167 }
168
169 fn parse_select(
170 &mut self,
171 name: String,
172 plural_depth: usize,
173 ) -> Result<IcuNode, IcuParseError> {
174 if self.consume_char(',') {
175 self.skip_whitespace();
176 }
177 let options = self.parse_options(plural_depth)?;
178 if self.options.requires_other_clause && !has_other_clause(&options) {
179 return Err(self.error("Select argument requires an \"other\" clause"));
180 }
181 self.expect_char('}')?;
182 Ok(IcuNode::Select { name, options })
183 }
184
185 fn parse_plural(
186 &mut self,
187 name: String,
188 plural_depth: usize,
189 kind: IcuPluralKind,
190 ) -> Result<IcuNode, IcuParseError> {
191 let mut offset = 0u32;
192
193 if self.consume_char(',') {
194 self.skip_whitespace();
195 }
196
197 loop {
198 self.skip_whitespace();
199 if self.starts_with_bytes(Self::OFFSET_PREFIX) {
200 self.pos += Self::OFFSET_PREFIX.len();
201 self.skip_whitespace();
202 offset = self.parse_unsigned_int()?;
203 } else {
204 break;
205 }
206 }
207
208 let options = self.parse_options(plural_depth + 1)?;
209 if self.options.requires_other_clause && !has_other_clause(&options) {
210 return Err(self.error("Plural argument requires an \"other\" clause"));
211 }
212 self.expect_char('}')?;
213
214 Ok(IcuNode::Plural {
215 name,
216 kind,
217 offset,
218 options,
219 })
220 }
221
222 fn parse_options(&mut self, plural_depth: usize) -> Result<Vec<IcuOption>, IcuParseError> {
223 let mut options = Vec::with_capacity(4);
224
225 loop {
226 self.skip_whitespace();
227 if self.byte_at() == Some(b'}') {
228 break;
229 }
230 let selector = self.parse_selector()?;
231 self.skip_whitespace();
232 self.expect_char('{')?;
233 let value = self.parse_nodes(None, plural_depth)?;
234 self.expect_char('}')?;
235 options.push(IcuOption { selector, value });
236 }
237
238 if options.is_empty() {
239 return Err(self.error("Expected at least one ICU option"));
240 }
241
242 Ok(options)
243 }
244
245 fn parse_tag(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
246 self.expect_char('<')?;
247 let name = self.parse_tag_name()?;
248 self.expect_char('>')?;
249 let children = self.parse_nodes(Some(&name), plural_depth)?;
250 self.expect_bytes(Self::CLOSE_TAG_PREFIX)?;
251 let close_name = self.parse_tag_name()?;
252 if close_name != name {
253 return Err(self.error("Mismatched closing tag"));
254 }
255 self.expect_char('>')?;
256 Ok(IcuNode::Tag { name, children })
257 }
258
259 fn parse_apostrophe_literal(&mut self) -> Result<String, IcuParseError> {
260 let start = self.pos;
261 self.expect_char('\'')?;
262
263 if self.consume_char('\'') {
264 return Ok("'".to_owned());
265 }
266
267 let mut out = String::with_capacity(8);
268 while let Some(byte) = self.byte_at() {
269 if byte == b'\'' {
270 self.pos += 1;
271 if self.consume_char('\'') {
272 out.push('\'');
273 } else {
274 return Ok(out);
275 }
276 } else {
277 out.push(self.advance_char().expect("byte implies char"));
278 }
279 }
280
281 Err(IcuParseError::syntax(
282 "Unterminated apostrophe escape",
283 self.input,
284 start,
285 ))
286 }
287
288 fn read_until_closing_brace(&mut self) -> Result<String, IcuParseError> {
289 let mut out = String::with_capacity(8);
290 while let Some(byte) = self.byte_at() {
291 if byte == b'}' {
292 return Ok(out);
293 }
294 if byte == b'\'' {
295 out.push_str(&self.parse_apostrophe_literal()?);
296 } else {
297 out.push(self.advance_char().expect("byte implies char"));
298 }
299 }
300 Err(self.error("Unterminated ICU argument"))
301 }
302
303 fn parse_selector(&mut self) -> Result<String, IcuParseError> {
304 let start = self.pos;
305 if self.consume_char('=') {
306 let number = self.parse_unsigned_int()?;
307 return Ok(format!("={number}"));
308 }
309
310 while let Some(byte) = self.byte_at() {
311 if byte.is_ascii_whitespace() || byte == b'{' {
312 break;
313 }
314 if byte.is_ascii() {
315 self.pos += 1;
316 } else {
317 self.advance_char();
318 }
319 }
320
321 if self.pos == start {
322 return Err(self.error("Expected ICU selector"));
323 }
324
325 Ok(self.input[start..self.pos].to_owned())
326 }
327
328 fn parse_identifier(&mut self) -> Result<String, IcuParseError> {
329 let start = self.pos;
330 while let Some(byte) = self.byte_at() {
331 if byte.is_ascii_whitespace() || matches!(byte, b'{' | b'}' | b',' | b'<' | b'>') {
332 break;
333 }
334 if byte.is_ascii() {
335 self.pos += 1;
336 } else {
337 self.advance_char();
338 }
339 }
340
341 if self.pos == start {
342 return Err(self.error("Expected ICU identifier"));
343 }
344
345 Ok(self.input[start..self.pos].to_owned())
346 }
347
348 fn parse_tag_name(&mut self) -> Result<String, IcuParseError> {
349 let start = self.pos;
350 while let Some(byte) = self.byte_at() {
351 if byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.') {
352 self.pos += 1;
353 } else {
354 break;
355 }
356 }
357
358 if self.pos == start {
359 return Err(self.error("Expected tag name"));
360 }
361
362 Ok(self.input[start..self.pos].to_owned())
363 }
364
365 fn parse_unsigned_int(&mut self) -> Result<u32, IcuParseError> {
366 let start = self.pos;
367 while let Some(byte) = self.byte_at() {
368 if byte.is_ascii_digit() {
369 self.pos += 1;
370 } else {
371 break;
372 }
373 }
374
375 if self.pos == start {
376 return Err(self.error("Expected integer"));
377 }
378
379 self.input[start..self.pos]
380 .parse::<u32>()
381 .map_err(|_| self.error("Invalid integer"))
382 }
383
384 fn skip_whitespace(&mut self) {
385 while let Some(byte) = self.byte_at() {
386 if byte.is_ascii_whitespace() {
387 self.pos += 1;
388 } else {
389 break;
390 }
391 }
392 }
393
394 fn flush_literal(literal: &mut String, nodes: &mut Vec<IcuNode>) {
395 if !literal.is_empty() {
396 nodes.push(IcuNode::Literal(core::mem::take(literal)));
397 }
398 }
399
400 fn expect_char(&mut self, ch: char) -> Result<(), IcuParseError> {
401 if ch.is_ascii() {
402 if self.byte_at() == Some(ch as u8) {
403 self.pos += 1;
404 return Ok(());
405 }
406 return Err(self.error(format!("Expected '{ch}'")));
407 }
408
409 match self.peek_char() {
410 Some(current) if current == ch => {
411 self.pos += ch.len_utf8();
412 Ok(())
413 }
414 _ => Err(self.error(format!("Expected '{ch}'"))),
415 }
416 }
417
418 fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), IcuParseError> {
419 if self.starts_with_bytes(expected) {
420 self.pos += expected.len();
421 Ok(())
422 } else {
423 let expected = core::str::from_utf8(expected).unwrap_or("<bytes>");
424 Err(self.error(format!("Expected \"{expected}\"")))
425 }
426 }
427
428 fn consume_char(&mut self, ch: char) -> bool {
429 if ch.is_ascii() {
430 if self.byte_at() == Some(ch as u8) {
431 self.pos += 1;
432 return true;
433 }
434 return false;
435 }
436
437 if self.peek_char() == Some(ch) {
438 self.pos += ch.len_utf8();
439 true
440 } else {
441 false
442 }
443 }
444
445 fn peek_char(&self) -> Option<char> {
446 self.input[self.pos..].chars().next()
447 }
448
449 fn byte_at(&self) -> Option<u8> {
450 self.input_bytes.get(self.pos).copied()
451 }
452
453 fn advance_char(&mut self) -> Option<char> {
454 let ch = self.peek_char()?;
455 self.pos += ch.len_utf8();
456 Some(ch)
457 }
458
459 fn peek_open_tag(&self) -> bool {
460 let Some(rest) = self.input_bytes.get(self.pos..) else {
461 return false;
462 };
463 if !rest.starts_with(b"<") || rest.starts_with(b"</") {
464 return false;
465 }
466 rest.get(1).is_some_and(u8::is_ascii_alphanumeric)
467 }
468
469 fn peek_close_tag(&self) -> bool {
470 self.input_bytes[self.pos..].starts_with(b"</")
471 }
472
473 fn starts_with_close_tag(&self, name: &str) -> bool {
474 let Some(rest) = self.input_bytes.get(self.pos..) else {
475 return false;
476 };
477 rest.starts_with(Self::CLOSE_TAG_PREFIX)
478 && rest[2..].starts_with(name.as_bytes())
479 && rest.get(2 + name.len()) == Some(&b'>')
480 }
481
482 fn starts_with_bytes(&self, expected: &[u8]) -> bool {
483 self.input_bytes[self.pos..].starts_with(expected)
484 }
485
486 const fn is_eof(&self) -> bool {
487 self.pos >= self.input.len()
488 }
489
490 fn error(&self, message: impl Into<String>) -> IcuParseError {
491 IcuParseError::syntax(message, self.input, self.pos)
492 }
493}
494
/// The simple (style-only) formatter argument types. Each variant maps to the
/// `IcuNode` variant of the same name in `Parser::parse_simple_formatter`.
#[derive(Clone, Copy)]
enum FormatterKind {
    /// `{x, number ...}`
    Number,
    /// `{x, date ...}`
    Date,
    /// `{x, time ...}`
    Time,
    /// `{x, list ...}`
    List,
    /// `{x, duration ...}`
    Duration,
    /// `{x, ago ...}`
    Ago,
    /// `{x, name ...}`
    Name,
}
505
506fn has_other_clause(options: &[IcuOption]) -> bool {
507 options.iter().any(|option| option.selector == "other")
508}
509
#[cfg(test)]
mod tests {
    use crate::{
        IcuNode, IcuParseError, IcuParserOptions, IcuPluralKind, parse_icu, parse_icu_with_options,
        validate_icu,
    };

    /// Text around a bare `{name}` placeholder becomes literal / argument /
    /// literal.
    #[test]
    fn parses_simple_argument_message() {
        let parsed = parse_icu("Hello {name}!").expect("parse");
        let expected = vec![
            IcuNode::Literal(String::from("Hello ")),
            IcuNode::Argument {
                name: String::from("name"),
            },
            IcuNode::Literal(String::from("!")),
        ];
        assert_eq!(parsed.nodes, expected);
    }

    /// Formatter styles are captured as trimmed text. Odd indices hold the
    /// single-space literals between the arguments.
    #[test]
    fn parses_formatter_styles_as_opaque_strings() {
        let parsed = parse_icu(
            "{n, number, currency} {d, date, short} {t, time, ::HHmm} {items, list, disjunction}",
        )
        .expect("parse");
        let nodes = &parsed.nodes;

        let IcuNode::Number { style, .. } = &nodes[0] else {
            panic!("node 0 should be a number formatter");
        };
        assert_eq!(style.as_deref(), Some("currency"));

        let IcuNode::Date { style, .. } = &nodes[2] else {
            panic!("node 2 should be a date formatter");
        };
        assert_eq!(style.as_deref(), Some("short"));

        let IcuNode::Time { style, .. } = &nodes[4] else {
            panic!("node 4 should be a time formatter");
        };
        assert_eq!(style.as_deref(), Some("::HHmm"));

        let IcuNode::List { style, .. } = &nodes[6] else {
            panic!("node 6 should be a list formatter");
        };
        assert_eq!(style.as_deref(), Some("disjunction"));
    }

    /// A cardinal plural with an offset and a nested select, followed by an
    /// ordinal plural.
    #[test]
    fn parses_plural_select_and_selectordinal() {
        let parsed = parse_icu(
            "{count, plural, offset:1 =0 {none} one {# item} other {{gender, select, male {his} other {their}} items}} {rank, selectordinal, one {#st} other {#th}}",
        )
        .expect("parse");

        let IcuNode::Plural {
            kind,
            offset,
            options,
            ..
        } = &parsed.nodes[0]
        else {
            panic!("node 0 should be a plural");
        };
        assert!(matches!(kind, IcuPluralKind::Cardinal));
        assert_eq!(*offset, 1);
        assert_eq!(options.len(), 3);

        let IcuNode::Plural { kind, options, .. } = &parsed.nodes[2] else {
            panic!("node 2 should be a plural");
        };
        assert!(matches!(kind, IcuPluralKind::Ordinal));
        assert_eq!(options.len(), 2);
    }

    /// Tags may wrap arguments, and arguments may contain tags.
    #[test]
    fn parses_tags_and_nested_content() {
        let parsed =
            parse_icu("<0>{count, plural, one {<b>#</b>} other {items}}</0>").expect("parse");

        let IcuNode::Tag { name, children } = &parsed.nodes[0] else {
            panic!("node 0 should be a tag");
        };
        assert_eq!(name.as_str(), "0");
        assert!(!children.is_empty());
    }

    /// With `ignore_tag` set, markup is passed through as literal text.
    #[test]
    fn ignore_tag_treats_tags_as_literal_text() {
        let options = IcuParserOptions {
            ignore_tag: true,
            ..IcuParserOptions::default()
        };
        let parsed = parse_icu_with_options("<b>Hello</b>", &options).expect("parse");

        let expected = vec![IcuNode::Literal(String::from("<b>Hello</b>"))];
        assert_eq!(parsed.nodes, expected);
    }

    /// Quoted braces are literal text and `''` is a single apostrophe.
    #[test]
    fn apostrophe_escaping_works() {
        let parsed = parse_icu("'{'{name}'}' ''").expect("parse");
        let expected = vec![
            IcuNode::Literal(String::from("{")),
            IcuNode::Argument {
                name: String::from("name"),
            },
            IcuNode::Literal(String::from("} '")),
        ];
        assert_eq!(parsed.nodes, expected);
    }

    /// The default options reject a plural without an `other` option.
    #[test]
    fn missing_other_clause_fails_by_default() {
        let failure = parse_icu("{count, plural, one {item}}").expect_err("missing other");
        assert!(failure.message.contains("other"));
    }

    /// Clearing `requires_other_clause` lifts that requirement.
    #[test]
    fn missing_other_clause_can_be_disabled() {
        let options = IcuParserOptions {
            requires_other_clause: false,
            ..IcuParserOptions::default()
        };
        parse_icu_with_options("{count, plural, one {item}}", &options).expect("parse");
    }

    /// A tag closed under another name is an error.
    #[test]
    fn mismatched_closing_tag_fails() {
        let failure = parse_icu("<a>hello</b>").expect_err("mismatch");
        assert!(failure.message.contains("Mismatched"));
    }

    /// A non-numeric plural offset is an error.
    #[test]
    fn invalid_offset_fails() {
        let failure =
            parse_icu("{count, plural, offset:x other {#}}").expect_err("invalid offset");
        assert!(failure.message.contains("integer"));
    }

    /// `validate_icu` reports the same error as `parse_icu` for the same input.
    #[test]
    fn validate_icu_uses_same_error_surface() {
        let source = "{unclosed";
        let from_parse = parse_icu(source).expect_err("parse");
        let from_validate = validate_icu(source).expect_err("validate");
        assert_eq!(from_parse, from_validate);
    }

    /// Errors carry a line and column.
    #[test]
    fn error_positions_are_reported() {
        let failure = parse_icu("Hello\n{unclosed").expect_err("parse");
        let position = &failure.position;
        assert_eq!(position.line, 2);
        assert!(position.column >= 2);
    }

    /// Outside of a plural, `#` is ordinary text.
    #[test]
    fn pound_outside_plural_is_literal() {
        let parsed = parse_icu("Total # items").expect("parse");
        let expected = vec![IcuNode::Literal(String::from("Total # items"))];
        assert_eq!(parsed.nodes, expected);
    }

    /// Failures surface as `Err(IcuParseError)`.
    #[test]
    fn parse_error_type_is_result_based() {
        let outcome: Result<_, IcuParseError> = parse_icu("{broken");
        assert!(outcome.is_err());
    }

    /// Unknown argument types and a stray closing brace are both rejected.
    #[test]
    fn rejects_unsupported_types_and_unexpected_trailing_input() {
        let unsupported = parse_icu("{name, foo}").expect_err("unsupported type");
        let expected_message = "Unsupported ICU argument type";
        assert!(unsupported.message.contains(expected_message));

        let trailing = parse_icu("hello}").expect_err("trailing input");
        let expected_message = "Unexpected trailing input";
        assert!(trailing.message.contains(expected_message));
    }

    /// An unclosed apostrophe escape and a stray closing tag are both rejected.
    #[test]
    fn rejects_unterminated_apostrophe_and_unexpected_closing_tag() {
        let apostrophe = parse_icu("'unterminated").expect_err("unterminated apostrophe");
        let expected_message = "Unterminated apostrophe escape";
        assert!(apostrophe.message.contains(expected_message));

        let closing = parse_icu("</b>").expect_err("unexpected closing tag");
        let expected_message = "Unexpected closing tag";
        assert!(closing.message.contains(expected_message));
    }

    /// A formatter without a style has `style: None`; a nameless closing tag
    /// counts as a mismatch.
    #[test]
    fn parses_formatters_without_style_and_invalid_tag_names_fail() {
        let parsed = parse_icu("{value, number}").expect("parse formatter without style");
        let IcuNode::Number { style, .. } = &parsed.nodes[0] else {
            panic!("node 0 should be a number formatter");
        };
        assert!(style.is_none());

        let failure = parse_icu("<a>broken</>").expect_err("invalid closing tag");
        assert!(failure.message.contains("Mismatched closing tag"));
    }
}