1use crate::ast::{IcuMessage, IcuNode, IcuOption, IcuPluralKind};
2use crate::error::IcuParseError;
3
/// Switches that control how an ICU message string is parsed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IcuParserOptions {
    /// When `true`, `<tag>`-style markup is not recognised and its text is kept as
    /// ordinary literal content.
    pub ignore_tag: bool,
    /// When `true`, `select`, `plural` and `selectordinal` arguments must contain an
    /// `other` option; parsing fails otherwise.
    pub requires_other_clause: bool,
}
9
10impl Default for IcuParserOptions {
11 fn default() -> Self {
12 Self {
13 ignore_tag: false,
14 requires_other_clause: true,
15 }
16 }
17}
18
19pub fn parse_icu(input: &str) -> Result<IcuMessage, IcuParseError> {
20 parse_icu_with_options(input, &IcuParserOptions::default())
21}
22
23pub fn parse_icu_with_options(
24 input: &str,
25 options: &IcuParserOptions,
26) -> Result<IcuMessage, IcuParseError> {
27 let mut parser = Parser::new(input, options);
28 let nodes = parser.parse_nodes(None, 0)?;
29 if !parser.is_eof() {
30 return Err(parser.error("Unexpected trailing input"));
31 }
32 Ok(IcuMessage { nodes })
33}
34
35struct Parser<'a> {
36 input: &'a str,
37 input_bytes: &'a [u8],
38 pos: usize,
39 options: &'a IcuParserOptions,
40}
41
42impl<'a> Parser<'a> {
43 const OFFSET_PREFIX: &'static [u8] = b"offset:";
44 const CLOSE_TAG_PREFIX: &'static [u8] = b"</";
45
46 fn new(input: &'a str, options: &'a IcuParserOptions) -> Self {
47 Self {
48 input,
49 input_bytes: input.as_bytes(),
50 pos: 0,
51 options,
52 }
53 }
54
55 fn parse_nodes(
56 &mut self,
57 until_tag: Option<&str>,
58 plural_depth: usize,
59 ) -> Result<Vec<IcuNode>, IcuParseError> {
60 let mut nodes = Vec::with_capacity(4);
61 let mut literal = String::with_capacity(16);
62
63 while let Some(byte) = self.byte_at() {
64 if byte == b'}' {
65 break;
66 }
67
68 if let Some(tag_name) = until_tag {
69 if self.starts_with_close_tag(tag_name) {
70 break;
71 }
72 if !self.options.ignore_tag && self.peek_close_tag() {
73 return Err(self.error("Mismatched closing tag"));
74 }
75 } else if !self.options.ignore_tag && self.peek_close_tag() {
76 return Err(self.error("Unexpected closing tag"));
77 }
78
79 match byte {
80 b'{' => {
81 self.flush_literal(&mut literal, &mut nodes);
82 nodes.push(self.parse_argument(plural_depth)?);
83 }
84 b'<' if !self.options.ignore_tag && self.peek_open_tag() => {
85 self.flush_literal(&mut literal, &mut nodes);
86 nodes.push(self.parse_tag(plural_depth)?);
87 }
88 b'#' if plural_depth > 0 => {
89 self.flush_literal(&mut literal, &mut nodes);
90 self.pos += 1;
91 nodes.push(IcuNode::Pound);
92 }
93 b'\'' => literal.push_str(&self.parse_apostrophe_literal()?),
94 _ => literal.push(self.advance_char().expect("byte implies char")),
95 }
96 }
97
98 self.flush_literal(&mut literal, &mut nodes);
99 Ok(nodes)
100 }
101
102 fn parse_argument(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
103 self.expect_char('{')?;
104 self.skip_whitespace();
105 let name = self.parse_identifier()?;
106 self.skip_whitespace();
107
108 if self.consume_char('}') {
109 return Ok(IcuNode::Argument { name });
110 }
111
112 self.expect_char(',')?;
113 self.skip_whitespace();
114 let kind = self.parse_identifier()?;
115 self.skip_whitespace();
116
117 match kind.as_str() {
118 "number" => self.parse_simple_formatter(name, FormatterKind::Number),
119 "date" => self.parse_simple_formatter(name, FormatterKind::Date),
120 "time" => self.parse_simple_formatter(name, FormatterKind::Time),
121 "list" => self.parse_simple_formatter(name, FormatterKind::List),
122 "duration" => self.parse_simple_formatter(name, FormatterKind::Duration),
123 "ago" => self.parse_simple_formatter(name, FormatterKind::Ago),
124 "name" => self.parse_simple_formatter(name, FormatterKind::Name),
125 "select" => self.parse_select(name, plural_depth),
126 "plural" => self.parse_plural(name, plural_depth, IcuPluralKind::Cardinal),
127 "selectordinal" => self.parse_plural(name, plural_depth, IcuPluralKind::Ordinal),
128 _ => Err(self.error("Unsupported ICU argument type")),
129 }
130 }
131
132 fn parse_simple_formatter(
133 &mut self,
134 name: String,
135 kind: FormatterKind,
136 ) -> Result<IcuNode, IcuParseError> {
137 let style = if self.consume_char(',') {
138 let style = self.read_until_closing_brace()?.trim().to_owned();
139 Some(style).filter(|style| !style.is_empty())
140 } else {
141 None
142 };
143 self.expect_char('}')?;
144
145 Ok(match kind {
146 FormatterKind::Number => IcuNode::Number { name, style },
147 FormatterKind::Date => IcuNode::Date { name, style },
148 FormatterKind::Time => IcuNode::Time { name, style },
149 FormatterKind::List => IcuNode::List { name, style },
150 FormatterKind::Duration => IcuNode::Duration { name, style },
151 FormatterKind::Ago => IcuNode::Ago { name, style },
152 FormatterKind::Name => IcuNode::Name { name, style },
153 })
154 }
155
156 fn parse_select(
157 &mut self,
158 name: String,
159 plural_depth: usize,
160 ) -> Result<IcuNode, IcuParseError> {
161 if self.consume_char(',') {
162 self.skip_whitespace();
163 }
164 let options = self.parse_options(plural_depth)?;
165 if self.options.requires_other_clause && !has_other_clause(&options) {
166 return Err(self.error("Select argument requires an \"other\" clause"));
167 }
168 self.expect_char('}')?;
169 Ok(IcuNode::Select { name, options })
170 }
171
172 fn parse_plural(
173 &mut self,
174 name: String,
175 plural_depth: usize,
176 kind: IcuPluralKind,
177 ) -> Result<IcuNode, IcuParseError> {
178 let mut offset = 0u32;
179
180 if self.consume_char(',') {
181 self.skip_whitespace();
182 }
183
184 loop {
185 self.skip_whitespace();
186 if self.starts_with_bytes(Self::OFFSET_PREFIX) {
187 self.pos += Self::OFFSET_PREFIX.len();
188 self.skip_whitespace();
189 offset = self.parse_unsigned_int()? as u32;
190 } else {
191 break;
192 }
193 }
194
195 let options = self.parse_options(plural_depth + 1)?;
196 if self.options.requires_other_clause && !has_other_clause(&options) {
197 return Err(self.error("Plural argument requires an \"other\" clause"));
198 }
199 self.expect_char('}')?;
200
201 Ok(IcuNode::Plural {
202 name,
203 kind,
204 offset,
205 options,
206 })
207 }
208
209 fn parse_options(&mut self, plural_depth: usize) -> Result<Vec<IcuOption>, IcuParseError> {
210 let mut options = Vec::with_capacity(4);
211
212 loop {
213 self.skip_whitespace();
214 if self.byte_at() == Some(b'}') {
215 break;
216 }
217 let selector = self.parse_selector()?;
218 self.skip_whitespace();
219 self.expect_char('{')?;
220 let value = self.parse_nodes(None, plural_depth)?;
221 self.expect_char('}')?;
222 options.push(IcuOption { selector, value });
223 }
224
225 if options.is_empty() {
226 return Err(self.error("Expected at least one ICU option"));
227 }
228
229 Ok(options)
230 }
231
232 fn parse_tag(&mut self, plural_depth: usize) -> Result<IcuNode, IcuParseError> {
233 self.expect_char('<')?;
234 let name = self.parse_tag_name()?;
235 self.expect_char('>')?;
236 let children = self.parse_nodes(Some(&name), plural_depth)?;
237 self.expect_bytes(Self::CLOSE_TAG_PREFIX)?;
238 let close_name = self.parse_tag_name()?;
239 if close_name != name {
240 return Err(self.error("Mismatched closing tag"));
241 }
242 self.expect_char('>')?;
243 Ok(IcuNode::Tag { name, children })
244 }
245
246 fn parse_apostrophe_literal(&mut self) -> Result<String, IcuParseError> {
247 let start = self.pos;
248 self.expect_char('\'')?;
249
250 if self.consume_char('\'') {
251 return Ok("'".to_owned());
252 }
253
254 let mut out = String::with_capacity(8);
255 while let Some(byte) = self.byte_at() {
256 if byte == b'\'' {
257 self.pos += 1;
258 if self.consume_char('\'') {
259 out.push('\'');
260 } else {
261 return Ok(out);
262 }
263 } else {
264 out.push(self.advance_char().expect("byte implies char"));
265 }
266 }
267
268 Err(IcuParseError::syntax(
269 "Unterminated apostrophe escape",
270 self.input,
271 start,
272 ))
273 }
274
275 fn read_until_closing_brace(&mut self) -> Result<String, IcuParseError> {
276 let mut out = String::with_capacity(8);
277 while let Some(byte) = self.byte_at() {
278 if byte == b'}' {
279 return Ok(out);
280 }
281 if byte == b'\'' {
282 out.push_str(&self.parse_apostrophe_literal()?);
283 } else {
284 out.push(self.advance_char().expect("byte implies char"));
285 }
286 }
287 Err(self.error("Unterminated ICU argument"))
288 }
289
290 fn parse_selector(&mut self) -> Result<String, IcuParseError> {
291 let start = self.pos;
292 if self.consume_char('=') {
293 let number = self.parse_unsigned_int()?;
294 return Ok(format!("={number}"));
295 }
296
297 while let Some(byte) = self.byte_at() {
298 if byte.is_ascii_whitespace() || byte == b'{' {
299 break;
300 }
301 if byte.is_ascii() {
302 self.pos += 1;
303 } else {
304 self.advance_char();
305 }
306 }
307
308 if self.pos == start {
309 return Err(self.error("Expected ICU selector"));
310 }
311
312 Ok(self.input[start..self.pos].to_owned())
313 }
314
315 fn parse_identifier(&mut self) -> Result<String, IcuParseError> {
316 let start = self.pos;
317 while let Some(byte) = self.byte_at() {
318 if byte.is_ascii_whitespace() || matches!(byte, b'{' | b'}' | b',' | b'<' | b'>') {
319 break;
320 }
321 if byte.is_ascii() {
322 self.pos += 1;
323 } else {
324 self.advance_char();
325 }
326 }
327
328 if self.pos == start {
329 return Err(self.error("Expected ICU identifier"));
330 }
331
332 Ok(self.input[start..self.pos].to_owned())
333 }
334
335 fn parse_tag_name(&mut self) -> Result<String, IcuParseError> {
336 let start = self.pos;
337 while let Some(byte) = self.byte_at() {
338 if byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b'.') {
339 self.pos += 1;
340 } else {
341 break;
342 }
343 }
344
345 if self.pos == start {
346 return Err(self.error("Expected tag name"));
347 }
348
349 Ok(self.input[start..self.pos].to_owned())
350 }
351
352 fn parse_unsigned_int(&mut self) -> Result<usize, IcuParseError> {
353 let start = self.pos;
354 while let Some(byte) = self.byte_at() {
355 if byte.is_ascii_digit() {
356 self.pos += 1;
357 } else {
358 break;
359 }
360 }
361
362 if self.pos == start {
363 return Err(self.error("Expected integer"));
364 }
365
366 self.input[start..self.pos]
367 .parse::<usize>()
368 .map_err(|_| self.error("Invalid integer"))
369 }
370
371 fn skip_whitespace(&mut self) {
372 while let Some(byte) = self.byte_at() {
373 if byte.is_ascii_whitespace() {
374 self.pos += 1;
375 } else {
376 break;
377 }
378 }
379 }
380
381 fn flush_literal(&self, literal: &mut String, nodes: &mut Vec<IcuNode>) {
382 if !literal.is_empty() {
383 nodes.push(IcuNode::Literal(core::mem::take(literal)));
384 }
385 }
386
387 fn expect_char(&mut self, ch: char) -> Result<(), IcuParseError> {
388 if ch.is_ascii() {
389 if self.byte_at() == Some(ch as u8) {
390 self.pos += 1;
391 return Ok(());
392 }
393 return Err(self.error(format!("Expected '{ch}'")));
394 }
395
396 match self.peek_char() {
397 Some(current) if current == ch => {
398 self.pos += ch.len_utf8();
399 Ok(())
400 }
401 _ => Err(self.error(format!("Expected '{ch}'"))),
402 }
403 }
404
405 fn expect_bytes(&mut self, expected: &[u8]) -> Result<(), IcuParseError> {
406 if self.starts_with_bytes(expected) {
407 self.pos += expected.len();
408 Ok(())
409 } else {
410 let expected = core::str::from_utf8(expected).unwrap_or("<bytes>");
411 Err(self.error(format!("Expected \"{expected}\"")))
412 }
413 }
414
415 fn consume_char(&mut self, ch: char) -> bool {
416 if ch.is_ascii() {
417 if self.byte_at() == Some(ch as u8) {
418 self.pos += 1;
419 return true;
420 }
421 return false;
422 }
423
424 if self.peek_char() == Some(ch) {
425 self.pos += ch.len_utf8();
426 true
427 } else {
428 false
429 }
430 }
431
432 fn peek_char(&self) -> Option<char> {
433 self.input[self.pos..].chars().next()
434 }
435
436 fn byte_at(&self) -> Option<u8> {
437 self.input_bytes.get(self.pos).copied()
438 }
439
440 fn advance_char(&mut self) -> Option<char> {
441 let ch = self.peek_char()?;
442 self.pos += ch.len_utf8();
443 Some(ch)
444 }
445
446 fn peek_open_tag(&self) -> bool {
447 let Some(rest) = self.input_bytes.get(self.pos..) else {
448 return false;
449 };
450 if !rest.starts_with(b"<") || rest.starts_with(b"</") {
451 return false;
452 }
453 rest.get(1).is_some_and(u8::is_ascii_alphanumeric)
454 }
455
456 fn peek_close_tag(&self) -> bool {
457 self.input_bytes[self.pos..].starts_with(b"</")
458 }
459
460 fn starts_with_close_tag(&self, name: &str) -> bool {
461 let Some(rest) = self.input_bytes.get(self.pos..) else {
462 return false;
463 };
464 rest.starts_with(Self::CLOSE_TAG_PREFIX)
465 && rest[2..].starts_with(name.as_bytes())
466 && rest.get(2 + name.len()) == Some(&b'>')
467 }
468
469 fn starts_with_bytes(&self, expected: &[u8]) -> bool {
470 self.input_bytes[self.pos..].starts_with(expected)
471 }
472
473 fn is_eof(&self) -> bool {
474 self.pos >= self.input.len()
475 }
476
477 fn error(&self, message: impl Into<String>) -> IcuParseError {
478 IcuParseError::syntax(message, self.input, self.pos)
479 }
480}
481
/// The argument types that share the simple `{name, type[, style]}` form.
///
/// `parse_argument` maps the type keyword to one of these, and
/// `parse_simple_formatter` turns it into the matching `IcuNode` variant.
#[derive(Copy, Clone)]
enum FormatterKind {
    /// The `number` keyword.
    Number,
    /// The `date` keyword.
    Date,
    /// The `time` keyword.
    Time,
    /// The `list` keyword.
    List,
    /// The `duration` keyword.
    Duration,
    /// The `ago` keyword.
    Ago,
    /// The `name` keyword.
    Name,
}
492
493fn has_other_clause(options: &[IcuOption]) -> bool {
494 options.iter().any(|option| option.selector == "other")
495}
496
#[cfg(test)]
mod tests {
    use crate::{
        IcuNode, IcuParseError, IcuParserOptions, IcuPluralKind, parse_icu, parse_icu_with_options,
        validate_icu,
    };

    /// Builds a literal node from a string slice.
    fn literal(text: &str) -> IcuNode {
        IcuNode::Literal(text.to_owned())
    }

    /// Builds a bare argument node from a string slice.
    fn argument(name: &str) -> IcuNode {
        IcuNode::Argument {
            name: name.to_owned(),
        }
    }

    #[test]
    fn parses_simple_argument_message() {
        let parsed = parse_icu("Hello {name}!").expect("parse");
        let expected = vec![literal("Hello "), argument("name"), literal("!")];
        assert_eq!(parsed.nodes, expected);
    }

    #[test]
    fn parses_formatter_styles_as_opaque_strings() {
        let parsed = parse_icu(
            "{n, number, currency} {d, date, short} {t, time, ::HHmm} {items, list, disjunction}",
        )
        .expect("parse");
        let nodes = &parsed.nodes;

        // Odd indices hold the single-space literals between the arguments.
        let IcuNode::Number { style, .. } = &nodes[0] else {
            panic!("expected a number node");
        };
        assert_eq!(style.as_deref(), Some("currency"));

        let IcuNode::Date { style, .. } = &nodes[2] else {
            panic!("expected a date node");
        };
        assert_eq!(style.as_deref(), Some("short"));

        let IcuNode::Time { style, .. } = &nodes[4] else {
            panic!("expected a time node");
        };
        assert_eq!(style.as_deref(), Some("::HHmm"));

        let IcuNode::List { style, .. } = &nodes[6] else {
            panic!("expected a list node");
        };
        assert_eq!(style.as_deref(), Some("disjunction"));
    }

    #[test]
    fn parses_plural_select_and_selectordinal() {
        let parsed = parse_icu(
            "{count, plural, offset:1 =0 {none} one {# item} other {{gender, select, male {his} other {their}} items}} {rank, selectordinal, one {#st} other {#th}}",
        )
        .expect("parse");

        let IcuNode::Plural {
            kind,
            offset,
            options,
            ..
        } = &parsed.nodes[0]
        else {
            panic!("expected a plural node");
        };
        assert!(matches!(kind, IcuPluralKind::Cardinal));
        assert_eq!(*offset, 1);
        assert_eq!(options.len(), 3);

        let IcuNode::Plural { kind, options, .. } = &parsed.nodes[2] else {
            panic!("expected a selectordinal node");
        };
        assert!(matches!(kind, IcuPluralKind::Ordinal));
        assert_eq!(options.len(), 2);
    }

    #[test]
    fn parses_tags_and_nested_content() {
        let parsed =
            parse_icu("<0>{count, plural, one {<b>#</b>} other {items}}</0>").expect("parse");
        let IcuNode::Tag { name, children } = &parsed.nodes[0] else {
            panic!("expected a tag node");
        };
        assert_eq!(name.as_str(), "0");
        assert!(!children.is_empty());
    }

    #[test]
    fn ignore_tag_treats_tags_as_literal_text() {
        let options = IcuParserOptions {
            ignore_tag: true,
            ..IcuParserOptions::default()
        };
        let parsed = parse_icu_with_options("<b>Hello</b>", &options).expect("parse");
        assert_eq!(parsed.nodes, vec![literal("<b>Hello</b>")]);
    }

    #[test]
    fn apostrophe_escaping_works() {
        let parsed = parse_icu("'{'{name}'}' ''").expect("parse");
        let expected = vec![literal("{"), argument("name"), literal("} '")];
        assert_eq!(parsed.nodes, expected);
    }

    #[test]
    fn missing_other_clause_fails_by_default() {
        let failure = parse_icu("{count, plural, one {item}}").expect_err("missing other");
        assert!(failure.message.contains("other"));
    }

    #[test]
    fn missing_other_clause_can_be_disabled() {
        let options = IcuParserOptions {
            requires_other_clause: false,
            ..IcuParserOptions::default()
        };
        parse_icu_with_options("{count, plural, one {item}}", &options).expect("parse");
    }

    #[test]
    fn mismatched_closing_tag_fails() {
        let failure = parse_icu("<a>hello</b>").expect_err("mismatch");
        assert!(failure.message.contains("Mismatched"));
    }

    #[test]
    fn invalid_offset_fails() {
        let failure =
            parse_icu("{count, plural, offset:x other {#}}").expect_err("invalid offset");
        assert!(failure.message.contains("integer"));
    }

    #[test]
    fn validate_icu_uses_same_error_surface() {
        let source = "{unclosed";
        let from_parse = parse_icu(source).expect_err("parse");
        let from_validate = validate_icu(source).expect_err("validate");
        assert_eq!(from_parse, from_validate);
    }

    #[test]
    fn error_positions_are_reported() {
        let failure = parse_icu("Hello\n{unclosed").expect_err("parse");
        let position = &failure.position;
        assert_eq!(position.line, 2);
        assert!(position.column >= 2);
    }

    #[test]
    fn pound_outside_plural_is_literal() {
        let parsed = parse_icu("Total # items").expect("parse");
        assert_eq!(parsed.nodes, vec![literal("Total # items")]);
    }

    #[test]
    fn parse_error_type_is_result_based() {
        let outcome: Result<_, IcuParseError> = parse_icu("{broken");
        assert!(outcome.is_err());
    }
}