icu_pattern/parser/mod.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// Scanning state of the [`Parser`] state machine driven by `Parser::try_next`.
#[derive(PartialEq, Debug, Default)]
enum ParserState {
    /// Outside of any placeholder or quoted literal; this is the initial state.
    #[default]
    Default,
    /// Between an opening `{` and its closing `}`.
    Placeholder,
    /// Inside a quoted literal, i.e. after an opening `'` that has not been closed yet.
    QuotedLiteral,
    /// The first apostrophe of a doubled apostrophe (`''`) has just been consumed;
    /// the next byte is the second apostrophe, which is emitted as a literal `'`.
    Apostrophe {
        /// Whether the doubled apostrophe was encountered inside a quoted literal.
        quoted: bool,
    },
}
23
/// Closes the span currently being scanned and switches `$self` to `$next_state`.
///
/// Must be expanded inside the scanning loop of a function that returns
/// `Result<Option<ParsedPatternItem<..>>, _>`: a non-empty span is returned from
/// that function as a literal token flagged with `$quoted`, while an empty span
/// yields no token and simply `continue`s the loop.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let span = $self.advance_state($self.idx, $next_state);
        if span.is_empty() {
            // Nothing was accumulated between the two delimiters; keep scanning.
            continue;
        }
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[span]),
            quoted: $quoted,
        }));
    }};
}
37
/// Options passed to the constructor of [`Parser`].
///
/// An instance can be obtained through [`Default`] or converted from a
/// [`QuoteMode`] with `.into()`.
///
/// ✨ *Enabled with the `alloc` Cargo feature.*
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct ParserOptions {
    /// Controls how quotes (`'`) are interpreted.
    pub quote_mode: QuoteMode,
}
47
/// Controls how quotes (`'`) are interpreted.
#[derive(Debug, Default, PartialEq)]
#[non_exhaustive]
pub enum QuoteMode {
    /// Quotes are interpreted as literals, e.g. `{0} o'clock` will interpolate to `5 o'clock`.
    #[default]
    QuotesAreLiterals,
    /// Quotes can be used to quote ASCII letters, e.g. both `{0} World` and `{0} 'World'` will interpolate to `Hello World`.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} o''clock`.
    QuotingSupported,
    /// Quotes are required to quote ASCII letters, e.g. `{0} 'World'` will interpolate to `Hello World`, while `{0} World` is an error.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} 'o''clock'`.
    QuotingRequired,
}
64
65impl From<QuoteMode> for ParserOptions {
66 fn from(quote_mode: QuoteMode) -> Self {
67 Self { quote_mode }
68 }
69}
70
71/// Placeholder pattern parser.
72///
/// The parser allows for handling a flexible range of generic patterns
/// with placeholders.
75///
76/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
77/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
78/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
79/// characters in the input pattern string.
80///
81/// At the moment the parser is written as a custom fallible iterator.
82///
83/// ✨ *Enabled with the `alloc` Cargo feature.*
84///
85/// # Examples
86///
87/// ```
88/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
89///
90/// let input = "{0}, {1}";
91///
92/// let mut parser = Parser::new(input, ParserOptions::default());
93///
94/// let mut result = vec![];
95///
96/// while let Some(element) =
97/// parser.try_next().expect("Failed to advance iterator")
98/// {
99/// result.push(element);
100/// }
101///
102/// assert_eq!(
103/// result,
104/// &[
105/// ParsedPatternItem::Placeholder(0),
106/// ParsedPatternItem::Literal {
107/// content: ", ".into(),
108/// quoted: false
109/// },
110/// ParsedPatternItem::Placeholder(1),
111/// ]
112/// );
113/// ```
114///
115/// # Named placeholders
116///
117/// The parser is also capable of parsing different placeholder types such as strings.
118///
119/// ## Examples
120/// ```
121/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
122///
123/// let input = "{start}, {end}";
124///
125/// let mut parser = Parser::new(input, ParserOptions::default());
126///
127/// let mut result = vec![];
128///
129/// while let Some(element) =
130/// parser.try_next().expect("Failed to advance iterator")
131/// {
132/// result.push(element);
133/// }
134///
135/// assert_eq!(
136/// result,
137/// &[
138/// ParsedPatternItem::Placeholder("start".to_owned()),
139/// ParsedPatternItem::Literal {
140/// content: ", ".into(),
141/// quoted: false
142/// },
143/// ParsedPatternItem::Placeholder("end".to_owned()),
144/// ]
145/// );
146/// ```
147///
148/// # Type parameters
149///
150/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
151///
152/// # Lifetimes
153///
/// - `'p`: The lifetime of the input string slice to be parsed.
155///
156/// # Design Decisions
157///
/// The parser is written in an intentionally generic way to enable use against a wide range
/// of potential placeholder pattern models and use cases.
160///
/// Several design decisions have been made that the reader should be aware of when using the API.
162///
163/// ## Zero copy
164///
/// The parser is intended for runtime use and is optimized for performance and low memory overhead.
166///
167/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
168/// slices of the input without ever having to modify the input or copy from it.
169///
170/// In case of ICU patterns that decision brings a trade-off around handling of quoted literals.
171/// A parser that copies bytes from the input when generating the output can take a pattern literal
/// that contains a quoted portion and concatenate the parts, effectively generating a single
173/// literal out of a series of syntactical literal quoted and unquoted nodes.
174/// A zero copy parser sacrifices that convenience for marginal performance gains.
175///
176/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
177/// and therefore can benefit from this design decision.
178/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
179/// zero-copy design still maintains high performance, only increasing the number of tokens
180/// returned by the parser, but without increase to allocations.
181///
182/// ### Examples
183/// ```
184/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
185///
186/// let input = "{0} 'and' {1}";
187///
188/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
189///
190/// let mut result = vec![];
191///
192/// while let Some(element) =
193/// parser.try_next().expect("Failed to advance iterator")
194/// {
195/// result.push(element);
196/// }
197///
198/// assert_eq!(
199/// result,
200/// &[
201/// ParsedPatternItem::Placeholder(0),
202/// ParsedPatternItem::Literal {
203/// content: " ".into(),
204/// quoted: false
205/// },
206/// ParsedPatternItem::Literal {
207/// content: "and".into(),
208/// quoted: true
209/// },
210/// ParsedPatternItem::Literal {
211/// content: " ".into(),
212/// quoted: false
213/// },
214/// ParsedPatternItem::Placeholder(1),
215/// ]
216/// );
217/// ```
218///
219/// ## Fallible Iterator
220///
/// Rust provides strong support for iterators and iterator combinators, which
222/// fits very well into the design of this parser/interpolator model.
223///
/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inherently
225/// fallible. As such, the decision has been made to design the API in line with what
226/// we hope will become a trait signature of a fallible iterator in the future, rather
227/// than implementing a reversed infallible iterator (where the [`Item`] would be
228/// `Option<Result<Item>>`).
229///
230/// That decision impacts the ergonomics of operating on the parser, on one hand making
231/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
232/// range of Rust iterator traits.
233///
234/// ## Generic Placeholder
235///
/// To handle generic placeholder design, the only constraint necessary in the parser
237/// is that a placeholder must be parsed from a string slice.
238/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
239/// [`TryFrom<&str>`][`TryFrom`].
/// Among the many benefits of such a transition would be the auto-trait behavior of [`From`] and
241/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
242///
243/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
244/// impact the core use case of placeholder patterns.
245///
246/// In result, the decision has been made to use [`FromStr`] for the time being, until
247/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
248///
/// [`TR35 2.6.1`]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
250/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
251/// [`Item`]: core::iter::Iterator::Item
252/// [`TryFrom`]: core::convert::TryFrom
253/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
#[derive(Debug)]
pub struct Parser<'p, P> {
    /// The pattern string being parsed; emitted literal tokens borrow from it.
    input: &'p str,
    /// Byte length of `input`, cached at construction time.
    len: usize,

    /// How apostrophes (`'`) in `input` are interpreted.
    quote_mode: QuoteMode,

    /// Byte offset at which the token currently being scanned starts.
    start_idx: usize,
    /// Byte offset of the next byte to examine.
    idx: usize,

    /// Current state of the scanning state machine.
    state: ParserState,
    /// Ties the parser to its placeholder type `P` without storing a value of it.
    marker: PhantomData<P>,
}
267
268impl<'p, P> Parser<'p, P> {
269 /// Creates a new `Parser`.
270 ///
271 /// The `allow_raw_letters` controls whether the parser will support
272 /// ASCII letters without quotes.
273 ///
274 /// # Examples
275 /// ```
276 /// use icu_pattern::{Parser, ParserOptions};
277 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
278 /// ```
279 pub fn new(input: &'p str, options: ParserOptions) -> Self {
280 Self {
281 input,
282 len: input.len(),
283
284 quote_mode: options.quote_mode,
285
286 start_idx: 0,
287 idx: 0,
288
289 state: ParserState::default(),
290 marker: PhantomData,
291 }
292 }
293
294 /// An iterator method that advances the iterator and returns the result of an attempt to parse
295 /// the next token.
296 ///
297 /// # Examples
298 /// ```
299 /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
300 ///
301 /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
302 ///
303 /// // A call to try_next() returns the next value…
304 /// assert_eq!(
305 /// Ok(Some(ParsedPatternItem::Placeholder(0))),
306 /// parser.try_next()
307 /// );
308 /// assert_eq!(
309 /// Ok(Some(ParsedPatternItem::Literal {
310 /// content: ", ".into(),
311 /// quoted: false
312 /// })),
313 /// parser.try_next()
314 /// );
315 /// assert_eq!(
316 /// Ok(Some(ParsedPatternItem::Placeholder(1))),
317 /// parser.try_next()
318 /// );
319 ///
320 /// // … and then `None` once it's over.
321 /// assert_eq!(Ok(None), parser.try_next());
322 /// ```
323 pub fn try_next(
324 &mut self,
325 ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
326 where
327 P: FromStr,
328 P::Err: Debug,
329 {
330 while let Some(b) = self.input.as_bytes().get(self.idx) {
331 match self.state {
332 ParserState::Placeholder if *b == b'}' => {
333 let range = self.advance_state(self.idx, ParserState::Default);
334 return self.input[range]
335 .parse()
336 .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
337 .map_err(ParserError::InvalidPlaceholder);
338 }
339 ParserState::QuotedLiteral
340 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
341 {
342 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
343 handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
344 } else {
345 handle_literal!(self, true, ParserState::Default)
346 }
347 }
348 ParserState::Default if *b == b'{' => {
349 handle_literal!(self, false, ParserState::Placeholder)
350 }
351 ParserState::Default
352 if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
353 {
354 if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
355 handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
356 } else {
357 handle_literal!(self, false, ParserState::QuotedLiteral)
358 }
359 }
360 ParserState::Default
361 if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
362 {
363 return Err(ParserError::IllegalCharacter(*b as char));
364 }
365 ParserState::Apostrophe { quoted } => {
366 self.start_idx -= 1;
367 if quoted {
368 handle_literal!(self, true, ParserState::QuotedLiteral)
369 } else {
370 handle_literal!(self, false, ParserState::Default)
371 }
372 }
373 _ => self.idx += 1,
374 }
375 }
376 match self.state {
377 ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
378 ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
379 ParserState::Apostrophe { .. } => unreachable!(),
380 ParserState::Default => {
381 let range = self.start_idx..self.len;
382 if !range.is_empty() {
383 self.start_idx = self.len;
384 Ok(Some(ParsedPatternItem::Literal {
385 content: Cow::Borrowed(&self.input[range]),
386 quoted: false,
387 }))
388 } else {
389 Ok(None)
390 }
391 }
392 }
393 }
394
395 fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
396 let range = self.start_idx..idx;
397 self.idx = idx + 1;
398 self.start_idx = self.idx;
399 self.state = next_state;
400 range
401 }
402
403 /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
404 pub fn try_collect_into_vec(
405 mut self,
406 ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
407 where
408 P: FromStr,
409 P::Err: Debug,
410 {
411 let mut result = vec![];
412 while let Some(token) = self.try_next()? {
413 result.push(token);
414 }
415 Ok(result)
416 }
417}
418
#[cfg(test)]
mod tests {
    use super::*;
    use core::ops::Deref;

    /// Shorthand for an expected literal token.
    fn literal<P>(content: &'static str, quoted: bool) -> ParsedPatternItem<'static, P> {
        ParsedPatternItem::Literal {
            content: content.into(),
            quoted,
        }
    }

    /// Shorthand for an expected placeholder token.
    fn placeholder<P>(key: P) -> ParsedPatternItem<'static, P> {
        ParsedPatternItem::Placeholder(key)
    }

    /// Checks well-formed patterns under `QuotingSupported` and malformed
    /// patterns under `QuotingRequired`.
    #[test]
    fn pattern_parse_placeholders() {
        let samples = vec![
            ("{0}", vec![placeholder(0)]),
            ("{0}{1}", vec![placeholder(0), placeholder(1)]),
            (
                "{0} 'at' {1}",
                vec![
                    placeholder(0),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    placeholder(1),
                ],
            ),
            (
                "{0}'at'{1}",
                vec![placeholder(0), literal("at", true), placeholder(1)],
            ),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    literal("{0}", true),
                    literal(" ", false),
                    literal("at", true),
                    literal(" ", false),
                    literal("{1}", true),
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    literal("PRE", true),
                    literal(" ", false),
                    placeholder(0),
                    literal(" ", false),
                    literal("and", true),
                    literal(" ", false),
                    placeholder(1),
                    literal(" ", false),
                    literal("POST", true),
                ],
            ),
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    placeholder(0),
                    literal(" o", false),
                    literal("'", false),
                    literal("clock and ", false),
                    literal("o", true),
                    literal("'", true),
                    literal("clock", true),
                ],
            ),
        ];

        for (input, expected) in samples {
            let parsed = Parser::new(input, QuoteMode::QuotingSupported.into())
                .try_collect_into_vec()
                .expect("Failed to parse a pattern");
            assert_eq!(parsed.deref(), expected);
        }

        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, expected_error) in broken {
            let outcome = Parser::<usize>::new(input, QuoteMode::QuotingRequired.into())
                .try_collect_into_vec();
            match expected_error {
                // The exact error is known: compare it.
                Some(expected) => {
                    assert_eq!(outcome.expect_err("Should have failed."), expected);
                }
                // The exact error cannot be constructed here: only require a failure.
                None => assert!(outcome.is_err()),
            }
        }
    }
}
599}