Skip to main content

icu_pattern/parser/
mod.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod error;
6pub mod token;
7
8use alloc::{borrow::Cow, vec, vec::Vec};
9use core::{fmt::Debug, marker::PhantomData, str::FromStr};
10pub use error::ParserError;
11pub use token::ParsedPatternItem;
12
/// Internal state of the [`Parser`] state machine.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Default)]
enum ParserState {
    /// Scanning unquoted text; this is the initial state.
    #[default]
    Default,
    /// Inside a `{...}` placeholder, waiting for the closing `}`.
    Placeholder,
    /// Inside a `'...'` quoted literal, waiting for the closing quote.
    QuotedLiteral,
    /// The first quote of a doubled quote (`''`) has just been consumed; the
    /// next step emits a single literal apostrophe.
    Apostrophe {
        /// Whether the doubled quote occurred inside a quoted literal. This
        /// decides the `quoted` flag of the emitted apostrophe and the state
        /// the parser returns to afterwards.
        quoted: bool,
    },
}
23
/// Finishes the literal that ends at the parser's current index.
///
/// Calls `$self.advance_state($self.idx, $next_state)`, which steps past the
/// delimiter at the current index and switches to `$next_state`. If the text
/// accumulated before the delimiter is non-empty, the *enclosing function*
/// returns it as `Ok(Some(ParsedPatternItem::Literal { .. }))` flagged with
/// `$quoted`; otherwise the *enclosing loop* `continue`s.
///
/// Because it expands to `return`/`continue`, this macro must be invoked
/// inside a loop in a function returning
/// `Result<Option<ParsedPatternItem<..>>, _>`.
macro_rules! handle_literal {
    ($self:ident, $quoted:expr, $next_state:expr) => {{
        let range = $self.advance_state($self.idx, $next_state);
        if range.is_empty() {
            // Nothing was accumulated before the delimiter; keep scanning.
            continue;
        }
        return Ok(Some(ParsedPatternItem::Literal {
            content: Cow::Borrowed(&$self.input[range]),
            quoted: $quoted,
        }));
    }};
}
37
38/// Options passed to the constructor of [`Parser`].
39///
40/// ✨ *Enabled with the `alloc` Cargo feature.*
41#[derive(Debug, Default)]
42#[non_exhaustive]
43pub struct ParserOptions {
44    /// Controls how quotes (`'`) are interpreted.
45    pub quote_mode: QuoteMode,
46}
47
/// Controls how quotes (`'`) are interpreted.
#[derive(Debug, Default, PartialEq)]
#[non_exhaustive]
pub enum QuoteMode {
    /// Quotes are interpreted as literals, e.g. `{0} o'clock` will interpolate to `5 o'clock`.
    #[default]
    QuotesAreLiterals,
    /// Quotes can be used to quote ASCII characters, e.g. both `{0} World` and `{0} 'World'` will
    /// interpolate to `Hello World`.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} o''clock`.
    QuotingSupported,
    /// Quotes are required to quote ASCII characters, e.g. `{0} 'World'` will interpolate to
    /// `Hello World`, while `{0} World` is an error.
    ///
    /// A double quote can be used to create a quote literal, e.g. `{0} 'o''clock'`.
    QuotingRequired,
}
64
65impl From<QuoteMode> for ParserOptions {
66    fn from(quote_mode: QuoteMode) -> Self {
67        Self { quote_mode }
68    }
69}
70
71/// Placeholder pattern parser.
72///
73/// The parser allows for handling flexible range of generic patterns
74/// with placeholders.
75///
76/// The [`Parser`] is generic over any placeholder which implements [`FromStr`]
77/// allowing the consumer to parse placeholder patterns such as "{0}, {1}",
78/// "{date}, {time}" or any other. A placeholder must be enclosed in `{` and `}`
79/// characters in the input pattern string.
80///
81/// At the moment the parser is written as a custom fallible iterator.
82///
83/// ✨ *Enabled with the `alloc` Cargo feature.*
84///
85/// # Examples
86///
87/// ```
88/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
89///
90/// let input = "{0}, {1}";
91///
92/// let mut parser = Parser::new(input, ParserOptions::default());
93///
94/// let mut result = vec![];
95///
96/// while let Some(element) =
97///     parser.try_next().expect("Failed to advance iterator")
98/// {
99///     result.push(element);
100/// }
101///
102/// assert_eq!(
103///     result,
104///     &[
105///         ParsedPatternItem::Placeholder(0),
106///         ParsedPatternItem::Literal {
107///             content: ", ".into(),
108///             quoted: false
109///         },
110///         ParsedPatternItem::Placeholder(1),
111///     ]
112/// );
113/// ```
114///
115/// # Named placeholders
116///
117/// The parser is also capable of parsing different placeholder types such as strings.
118///
119/// ## Examples
120/// ```
121/// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
122///
123/// let input = "{start}, {end}";
124///
125/// let mut parser = Parser::new(input, ParserOptions::default());
126///
127/// let mut result = vec![];
128///
129/// while let Some(element) =
130///     parser.try_next().expect("Failed to advance iterator")
131/// {
132///     result.push(element);
133/// }
134///
135/// assert_eq!(
136///     result,
137///     &[
138///         ParsedPatternItem::Placeholder("start".to_owned()),
139///         ParsedPatternItem::Literal {
140///             content: ", ".into(),
141///             quoted: false
142///         },
143///         ParsedPatternItem::Placeholder("end".to_owned()),
144///     ]
145/// );
146/// ```
147///
148/// # Type parameters
149///
150/// - `P`: The type of the placeholder used as a key for the [`PlaceholderValueProvider`].
151///
152/// # Lifetimes
153///
154/// - `p`: The life time of an input string slice to be parsed.
155///
156/// # Design Decisions
157///
158/// The parser is written in an intentionally generic way to enable use against wide range
159/// of potential placeholder pattern models and use cases.
160///
161/// Serveral design decisions have been made that the reader should be aware of when using the API.
162///
163/// ## Zero copy
164///
165/// The parser is intended for runtime use and is optimized for performance and low memory overhad.
166///
167/// Zero copy parsing is a model which allows the parser to produce tokens that are de-facto
168/// slices of the input without ever having to modify the input or copy from it.
169///
170/// In case of ICU patterns that decision brings a trade-off around handling of quoted literals.
171/// A parser that copies bytes from the input when generating the output can take a pattern literal
172/// that contains a quoted portion and concatenace the parts, effectively generating a single
173/// literal out of a series of syntactical literal quoted and unquoted nodes.
174/// A zero copy parser sacrifices that convenience for marginal performance gains.
175///
176/// The rationale for the decision is that many placeholder patterns do not contain ASCII letters
177/// and therefore can benefit from this design decision.
178/// Secondly, even in scenarios where ASCII letters, or other quoted literals, are used, the
179/// zero-copy design still maintains high performance, only increasing the number of tokens
180/// returned by the parser, but without increase to allocations.
181///
182/// ### Examples
183/// ```
184/// use icu_pattern::{ParsedPatternItem, Parser, QuoteMode};
185///
186/// let input = "{0} 'and' {1}";
187///
188/// let mut parser = Parser::new(input, QuoteMode::QuotingSupported.into());
189///
190/// let mut result = vec![];
191///
192/// while let Some(element) =
193///     parser.try_next().expect("Failed to advance iterator")
194/// {
195///     result.push(element);
196/// }
197///
198/// assert_eq!(
199///     result,
200///     &[
201///         ParsedPatternItem::Placeholder(0),
202///         ParsedPatternItem::Literal {
203///             content: " ".into(),
204///             quoted: false
205///         },
206///         ParsedPatternItem::Literal {
207///             content: "and".into(),
208///             quoted: true
209///         },
210///         ParsedPatternItem::Literal {
211///             content: " ".into(),
212///             quoted: false
213///         },
214///         ParsedPatternItem::Placeholder(1),
215///     ]
216/// );
217/// ```
218///
219/// ## Fallible Iterator
220///
221/// Rust providers a strong support for iterators and iterator combinators, which
222/// fits very well into the design of this parser/interpolator model.
223///
224/// Unfortunately, Rust iterators at the moment are infallible, while parsers are inhereantely
225/// fallible. As such, the decision has been made to design the API in line with what
226/// we hope will become a trait signature of a fallible iterator in the future, rather
227/// than implementing a reversed infallible iterator (where the [`Item`] would be
228/// `Option<Result<Item>>`).
229///
230/// That decision impacts the ergonomics of operating on the parser, on one hand making
231/// the fallible iteration more ergonomic, at a trade-off of losing access to the wide
232/// range of Rust iterator traits.
233///
234/// ## Generic Placeholder
235///
236/// To handle generic placeholder design, the only constrain necessary in the parser
237/// is that a placeholder must be parsed from a string slice.
238/// At the moment of writing, Rust is [preparing to deprecate][`RFC 2924`] [`FromStr`] in favor of
239/// [`TryFrom<&str>`][`TryFrom`].
240/// Among many benfits of such transition would be the auto-trait behavior of [`From`] and
241/// a [`TryFrom<&str>`][`TryFrom`] for [`&str`] allowing for placeholders to be [`&str`] themselves.
242///
243/// Unfortunately, at the moment [`TryFrom<&str>`][`TryFrom`] for [`usize`] is not implemented, which would
244/// impact the core use case of placeholder patterns.
245///
246/// In result, the decision has been made to use [`FromStr`] for the time being, until
247/// [`TryFrom<&str>`][`TryFrom`] gets implemented on all types that support [`FromStr`].
248///
249/// [`TR35 2.6.1]: https://unicode.org/reports/tr35/tr35-dates.html#dateTimeFormat
250/// [`RFC 2924`]: https://github.com/rust-lang/rfcs/pull/2924
251/// [`Item`]: core::iter::Iterator::Item
252/// [`TryFrom`]: core::convert::TryFrom
253/// [`PlaceholderValueProvider`]: crate::PlaceholderValueProvider
254#[derive(Debug)]
255pub struct Parser<'p, P> {
256    input: &'p str,
257    len: usize,
258
259    quote_mode: QuoteMode,
260
261    start_idx: usize,
262    idx: usize,
263
264    state: ParserState,
265    marker: PhantomData<P>,
266}
267
268impl<'p, P> Parser<'p, P> {
269    /// Creates a new `Parser`.
270    ///
271    /// The `allow_raw_letters` controls whether the parser will support
272    /// ASCII letters without quotes.
273    ///
274    /// # Examples
275    /// ```
276    /// use icu_pattern::{Parser, ParserOptions};
277    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
278    /// ```
279    pub fn new(input: &'p str, options: ParserOptions) -> Self {
280        Self {
281            input,
282            len: input.len(),
283
284            quote_mode: options.quote_mode,
285
286            start_idx: 0,
287            idx: 0,
288
289            state: ParserState::default(),
290            marker: PhantomData,
291        }
292    }
293
294    /// An iterator method that advances the iterator and returns the result of an attempt to parse
295    /// the next token.
296    ///
297    /// # Examples
298    /// ```
299    /// use icu_pattern::{ParsedPatternItem, Parser, ParserOptions};
300    ///
301    /// let mut parser = Parser::<usize>::new("{0}, {1}", ParserOptions::default());
302    ///
303    /// // A call to try_next() returns the next value…
304    /// assert_eq!(
305    ///     Ok(Some(ParsedPatternItem::Placeholder(0))),
306    ///     parser.try_next()
307    /// );
308    /// assert_eq!(
309    ///     Ok(Some(ParsedPatternItem::Literal {
310    ///         content: ", ".into(),
311    ///         quoted: false
312    ///     })),
313    ///     parser.try_next()
314    /// );
315    /// assert_eq!(
316    ///     Ok(Some(ParsedPatternItem::Placeholder(1))),
317    ///     parser.try_next()
318    /// );
319    ///
320    /// // … and then `None` once it's over.
321    /// assert_eq!(Ok(None), parser.try_next());
322    /// ```
323    pub fn try_next(
324        &mut self,
325    ) -> Result<Option<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
326    where
327        P: FromStr,
328        P::Err: Debug,
329    {
330        while let Some(b) = self.input.as_bytes().get(self.idx) {
331            match self.state {
332                ParserState::Placeholder if *b == b'}' => {
333                    let range = self.advance_state(self.idx, ParserState::Default);
334                    return self.input[range]
335                        .parse()
336                        .map(|ret| Some(ParsedPatternItem::Placeholder(ret)))
337                        .map_err(ParserError::InvalidPlaceholder);
338                }
339                ParserState::QuotedLiteral
340                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
341                {
342                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
343                        handle_literal!(self, true, ParserState::Apostrophe { quoted: true })
344                    } else {
345                        handle_literal!(self, true, ParserState::Default)
346                    }
347                }
348                ParserState::Default if *b == b'{' => {
349                    handle_literal!(self, false, ParserState::Placeholder)
350                }
351                ParserState::Default
352                    if *b == b'\'' && self.quote_mode != QuoteMode::QuotesAreLiterals =>
353                {
354                    if self.input.as_bytes().get(self.idx + 1) == Some(&b'\'') {
355                        handle_literal!(self, false, ParserState::Apostrophe { quoted: false })
356                    } else {
357                        handle_literal!(self, false, ParserState::QuotedLiteral)
358                    }
359                }
360                ParserState::Default
361                    if self.quote_mode == QuoteMode::QuotingRequired && b.is_ascii_alphabetic() =>
362                {
363                    return Err(ParserError::IllegalCharacter(*b as char));
364                }
365                ParserState::Apostrophe { quoted } => {
366                    self.start_idx -= 1;
367                    if quoted {
368                        handle_literal!(self, true, ParserState::QuotedLiteral)
369                    } else {
370                        handle_literal!(self, false, ParserState::Default)
371                    }
372                }
373                _ => self.idx += 1,
374            }
375        }
376        match self.state {
377            ParserState::Placeholder => Err(ParserError::UnclosedPlaceholder),
378            ParserState::QuotedLiteral => Err(ParserError::UnclosedQuotedLiteral),
379            ParserState::Apostrophe { .. } => unreachable!(),
380            ParserState::Default => {
381                let range = self.start_idx..self.len;
382                if !range.is_empty() {
383                    self.start_idx = self.len;
384                    Ok(Some(ParsedPatternItem::Literal {
385                        content: Cow::Borrowed(&self.input[range]),
386                        quoted: false,
387                    }))
388                } else {
389                    Ok(None)
390                }
391            }
392        }
393    }
394
395    fn advance_state(&mut self, idx: usize, next_state: ParserState) -> core::ops::Range<usize> {
396        let range = self.start_idx..idx;
397        self.idx = idx + 1;
398        self.start_idx = self.idx;
399        self.state = next_state;
400        range
401    }
402
403    /// Mutates this parser and collects all [`ParsedPatternItem`]s into a vector.
404    pub fn try_collect_into_vec(
405        mut self,
406    ) -> Result<Vec<ParsedPatternItem<'p, P>>, ParserError<<P as FromStr>::Err>>
407    where
408        P: FromStr,
409        P::Err: Debug,
410    {
411        let mut result = vec![];
412        while let Some(token) = self.try_next()? {
413            result.push(token);
414        }
415        Ok(result)
416    }
417}
418
#[cfg(test)]
mod tests {
    use super::*;
    use core::ops::Deref;

    /// Checks well-formed patterns under `QuotingSupported` and malformed
    /// patterns under `QuotingRequired`.
    #[test]
    fn pattern_parse_placeholders() {
        let samples = vec![
            ("{0}", vec![ParsedPatternItem::Placeholder(0)]),
            (
                "{0}{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0} 'at' {1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "at".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "{0}'at'{1}",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Literal {
                        content: "at".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Placeholder(1),
                ],
            ),
            (
                "'{0}' 'at' '{1}'",
                vec![
                    ParsedPatternItem::Literal {
                        content: "{0}".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "at".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "{1}".into(),
                        quoted: true,
                    },
                ],
            ),
            (
                "'PRE' {0} 'and' {1} 'POST'",
                vec![
                    ParsedPatternItem::Literal {
                        content: "PRE".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "and".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Placeholder(1),
                    ParsedPatternItem::Literal {
                        content: " ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "POST".into(),
                        quoted: true,
                    },
                ],
            ),
            // Doubled quotes produce a standalone apostrophe literal, both
            // outside and inside a quoted literal.
            (
                "{0} o''clock and 'o''clock'",
                vec![
                    ParsedPatternItem::Placeholder(0),
                    ParsedPatternItem::Literal {
                        content: " o".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "'".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "clock and ".into(),
                        quoted: false,
                    },
                    ParsedPatternItem::Literal {
                        content: "o".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: "'".into(),
                        quoted: true,
                    },
                    ParsedPatternItem::Literal {
                        content: "clock".into(),
                        quoted: true,
                    },
                ],
            ),
        ];

        for (input, expected) in samples {
            let parser = Parser::new(input, QuoteMode::QuotingSupported.into());
            let result = parser
                .try_collect_into_vec()
                .expect("Failed to parse a pattern");
            assert_eq!(result.deref(), expected, "input: {:?}", input);
        }

        // `None` means "any error": the expected error value cannot be
        // constructed in the test (see the note on `{date}` below).
        let broken: Vec<(_, Option<ParserError<core::num::ParseIntError>>)> = vec![
            ("{", Some(ParserError::UnclosedPlaceholder)),
            ("{0", Some(ParserError::UnclosedPlaceholder)),
            ("{01", Some(ParserError::UnclosedPlaceholder)),
            (
                "{date}",
                // This should be:
                // ```
                // ParserError::InvalidPlaceholder(
                //     ParseIntError {
                //         kind: core::num::IntErrorKind::InvalidDigit
                //     }
                // ),
                // ```
                // Pending: https://github.com/rust-lang/rust/issues/22639
                //
                // Once that is fixed, we can stop using an `Option` here.
                None,
            ),
            ("{date} 'days'", None),
            ("'{00}", Some(ParserError::UnclosedQuotedLiteral)),
            ("d", Some(ParserError::IllegalCharacter('d'))),
        ];

        for (input, error) in broken {
            let parser = Parser::<usize>::new(input, QuoteMode::QuotingRequired.into());
            let result = parser.try_collect_into_vec();
            if let Some(error) = error {
                assert_eq!(
                    result.expect_err("Should have failed."),
                    error,
                    "input: {:?}",
                    input
                );
            } else {
                assert!(result.is_err(), "input: {:?}", input);
            }
        }
    }
}