mf1_parser/
parser.rs

1pub use logos::Span;
2use logos::{Lexer, Logos};
3use std::borrow::Cow;
4use std::ops::Deref;
5
6use crate::ast::{SelectCase, Token as AstToken};
7
8#[cfg(test)]
9mod test;
10
11type Result<T> = std::result::Result<T, (String, Span)>;
12pub use Span as LexerSpan;
13#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Logos)]
14#[logos()]
15enum BodyToken {
16    // Body
17    /// Escapes to a single `'` apostrophe.
18    #[token("''")]
19    DoubleApostrophe,
20
21    /// Used to escape `{ } #` when they would otherwise be interpreted as syntax.
22    #[regex(r"'[{}#]([^'])*'")]
23    // This is slightly incompatible with the JS regex
24    // which is `'[{}#](?:[^]*?[^'])?'(?!')`. This uses lazy matching and
25    // backtracking, neither of which Logos supports.
26    // This results in "I said '{''Wow!''}'", "I said {'Wow!'}" failing.
27    Quote,
28
29    /// Enters the 'argument' lexer context
30    #[token(r"{")]
31    Argument, // Enter argument context
32
33    /// In a select, is the key, otherwise just a literal `#`
34    #[token("#")]
35    Octothorpe,
36
37    /// Anything but `{ } # '`, may also match a single quote.
38    #[regex(r#"([^\{\}#']+|')"#)]
39    Content,
40
41    /// Exits the body context - parser should error if unexpected.
42    #[token("}")]
43    End, // Exit context
44}
45
46type PassLexer<'source, S, T> = (Result<S>, Lexer<'source, T>);
47
48fn parse_body<'source, 'a, T>(
49    mut lex: Lexer<'source, BodyToken>,
50) -> PassLexer<'source, (Vec<AstToken<'source, 'a, T>>, bool), BodyToken>
51where
52    T: Deref<Target = str> + Clone + From<&'source str>,
53{
54    let mut ast: Vec<AstToken<T>> = vec![];
55    // lex.extras.push(State::Body);
56
57    while let Some(Ok(token)) = lex.next() {
58        match token {
59            BodyToken::Argument => {
60                let (res, tlex) = parse_arg(lex.morph());
61                lex = tlex.morph();
62                match res {
63                    Ok(Some(t)) => ast.push(t),
64                    Ok(None) => {}
65                    Err(e) => return (Err(e), lex),
66                };
67            }
68            BodyToken::DoubleApostrophe => ast.push(AstToken::Content {
69                value: lex.slice()[0..1].into(),
70            }),
71            BodyToken::Quote => {
72                let slice = lex.slice();
73                ast.push(AstToken::Content {
74                    value: slice[1..slice.len() - 1].into(),
75                })
76            }
77            BodyToken::Octothorpe => ast.push(AstToken::Octothorpe {}),
78            BodyToken::Content => ast.push(AstToken::Content {
79                value: lex.slice().into(),
80            }),
81            BodyToken::End => {
82                return (Ok((ast, true)), lex);
83            }
84        }
85    }
86
87    (Ok((ast, false)), lex)
88}
89
// In the regexes below, `\p{...}` is a Unicode property escape. The names used
// here (`ID_Start`, `ID_Continue`, `Pattern_White_Space`) are Unicode properties.
// See:
// - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape
// - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Character_class (`v` mode)
// - https://www.unicode.org/reports/tr18/#Full_Properties
// - https://www.unicode.org/reports/tr44/
// - https://www.unicode.org/reports/tr31/
97
98// #[derive(Default, Debug)]
99// struct ArgTokenState {
100//     keywords: bool,
101// }
102
103// impl From<()> for ArgTokenState {
104//     fn from(value: ()) -> Self {
105//         Self::default()
106//     }
107// }
108// impl Into<()> for ArgTokenState {
109//     fn into(self) -> () {
110//         ()
111//     }
112// }
113
114#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Logos)]
115#[logos(skip r"\p{Pattern_White_Space}+")] // , extras = ArgTokenState
116enum ArgToken {
117    // Arguments
118    #[token("plural")]
119    Plural,
120    #[token("select")]
121    Select,
122    #[token("selectordinal")]
123    SelectOrdinal,
124
125    #[token(",")]
126    Comma,
127
128    /// An argument identifier
129    #[regex(r"[\d\p{ID_Start}][\p{ID_Continue}]*")]
130    Ident,
131
132    #[token("}")]
133    End, // Exit context
134}
135
136fn parse_arg<'source, 'a, T>(
137    mut lex: Lexer<'source, ArgToken>,
138) -> PassLexer<'source, Option<AstToken<'source, 'a, T>>, ArgToken>
139where
140    T: Deref<Target = str> + Clone + From<&'source str>,
141{
142    let mut arg = None;
143    let next = lex.next();
144    if let Some(Ok(token)) = next {
145        // First, we expect an identifier
146        match token {
147            ArgToken::Ident => arg = Some(lex.slice()),
148            // Keywords are identifiers in this context.
149            ArgToken::Plural | ArgToken::Select | ArgToken::SelectOrdinal => {
150                arg = Some(lex.slice())
151            }
152            // If we just get a close, we have something like ` { } `,
153            // which the user probs didn't mean, but we'll accept anyway
154            ArgToken::End => return (Ok(None), lex),
155            // Otherwise, we got something unexpected.
156            _ => {
157                return (
158                    Err(("Unexpected token in argument".to_owned(), lex.span())),
159                    lex,
160                )
161            }
162        };
163    } else {
164        // A stand-alone opening bracket?
165        dbg!(next, arg);
166        if next.is_some() {
167            return (
168                Err(("Unexpected token in argument".to_owned(), lex.span())),
169                lex,
170            );
171        } else {
172            return (
173                Err((
174                    "Message unexpectedly ended within argument".to_owned(),
175                    lex.span(),
176                )),
177                lex,
178            );
179        }
180    }
181    if let Some(Ok(token)) = lex.next() {
182        match token {
183            ArgToken::End => {
184                // Just a simple arg
185                if let Some(arg) = arg {
186                    return (Ok(Some(AstToken::PlainArg { arg: arg.into() })), lex);
187                } else {
188                    unreachable!() // At least, it should be...
189                }
190            }
191            ArgToken::Comma => {} // We got some more coming!
192            _ => {
193                return (
194                    Err((
195                        "Unexpected token in argument (expected comma or closing bracket)"
196                            .to_owned(),
197                        lex.span(),
198                    )),
199                    lex,
200                )
201            }
202        }
203    }
204
205    if let Some(Ok(token)) = lex.next() {
206        match token {
207            select @ (ArgToken::Plural | ArgToken::Select | ArgToken::SelectOrdinal) => {
208                let (res, tlex) = parse_select(select, arg.unwrap(), lex.morph());
209                lex = tlex.morph();
210                match res {
211                    Ok(t) => (Ok(Some(t)), lex),
212                    // Ok(None) => {}
213                    Err(e) => (Err(e), lex),
214                }
215            }
216
217            ArgToken::Ident => todo!(),
218
219            ArgToken::End => {
220                // Just a simple arg, but with an end comma
221                if let Some(arg) = arg {
222                    (Ok(Some(AstToken::PlainArg { arg: arg.into() })), lex)
223                } else {
224                    unreachable!() // At least, it should be...
225                }
226            }
227            // ArgToken::Comma => { }, // We got some more coming!
228            _ => (
229                Err(("Unexpected token in argument".to_owned(), lex.span())),
230                lex,
231            ),
232        }
233    } else {
234        (Err(("Unexpected end of input".to_owned(), lex.span())), lex)
235    }
236}
237
238#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Logos)]
239#[logos(skip r"\p{Pattern_White_Space}+")]
240enum SelectToken {
241    // Select
242    #[token("offset")]
243    Offset,
244
245    #[token(":")]
246    Colon,
247
248    #[regex(r"\d+", priority = 4)]
249    Int,
250
251    #[regex(r"[\d\p{ID_Start}][\p{ID_Continue}]*", priority = 2)]
252    Ident,
253
254    #[token(",")]
255    Comma,
256
257    #[token("{")]
258    Open,
259
260    #[token("}")]
261    End, // Exit context
262}
263
264fn parse_select<'source, 'a, T>(
265    parent_type: ArgToken,
266    arg: &'source str,
267    mut lex: Lexer<'source, SelectToken>,
268) -> PassLexer<'source, AstToken<'source, 'a, T>, SelectToken>
269where
270    T: Deref<Target = str> + Clone + From<&'source str>,
271{
272    let mut cases = vec![];
273    let mut offset = (None, None);
274    let mut expect_colon = false;
275    let mut expect_comma = true;
276    let mut key = None;
277
278    while let Some(Ok(token)) = lex.next() {
279        match token {
280            SelectToken::Offset => {
281                if offset.1.is_none() && !expect_comma {
282                    offset.1 = Some(lex.slice());
283                    expect_colon = true;
284                    expect_comma = true; // This may still be an ident
285                } else {
286                    return (
287                        Err(("Unexpected offset keyword".to_owned(), lex.span())),
288                        lex,
289                    );
290                }
291            }
292            SelectToken::Colon => {
293                if expect_colon {
294                    expect_colon = false;
295                    expect_comma = false; // Not an ident now
296                } else {
297                    return (Err(("Unexpected colon".to_owned(), lex.span())), lex);
298                }
299            }
300            SelectToken::Int => {
301                if offset.1.is_some() && !expect_colon {
302                    // We are expecting an offset
303                    match lex.slice().parse::<i32>() {
304                        Ok(i) => {
305                            offset.0 = Some(i);
306                            offset.1 = None
307                        }
308                        Err(e) => {
309                            return (
310                                Result::Err((format!("Bad integer: {}", e), lex.span())),
311                                lex,
312                            )
313                        }
314                    };
315                } else if offset.1.is_none() && !expect_comma && !expect_colon {
316                    // this is a key
317                    key = Some(lex.slice());
318                } else {
319                    return (Err(("Unexpected integer".to_owned(), lex.span())), lex);
320                }
321            }
322            SelectToken::Ident => {
323                if offset.1.is_none() && !expect_comma && !expect_colon {
324                    key = Some(lex.slice());
325                } else {
326                    return (Err(("Unexpected identifier".to_owned(), lex.span())), lex);
327                }
328            }
329            SelectToken::Comma => {
330                if expect_comma {
331                    expect_comma = false;
332                    expect_colon = false; // No longer expecting offset
333                } else {
334                    return (Err(("Unexpected comma".to_owned(), lex.span())), lex);
335                }
336            }
337            SelectToken::Open => {
338                if let Some(key_inner) = key {
339                    let (res, tlex) = parse_body(lex.morph());
340                    lex = tlex.morph();
341                    match res {
342                        Ok((t, true)) => {
343                            cases.push(SelectCase {
344                                key: key_inner.into(),
345                                tokens: Cow::Owned(t),
346                            });
347                            key = None
348                        }
349                        Ok((_, false)) => {
350                            return (Err(("Unexpected end of input".to_owned(), lex.span())), lex);
351                        }
352                        // Ok(None) => {}
353                        Err(e) => return (Err(e), lex),
354                    };
355                }
356            }
357            SelectToken::End => {
358                if !expect_colon {
359                    return (
360                        match parent_type {
361                            ArgToken::Plural => {
362                                let _token: std::result::Result<AstToken<T>, ()> =
363                                    Ok(AstToken::Plural {
364                                        arg: arg.into(),
365                                        cases: Cow::Owned(vec![]),
366                                        plural_offset: offset.0,
367                                    });
368                                todo!()
369                            }
370                            ArgToken::SelectOrdinal => {
371                                let _token: std::result::Result<AstToken<T>, ()> =
372                                    Ok(AstToken::SelectOrdinal {
373                                        arg: arg.into(),
374                                        cases: Cow::Owned(vec![]),
375                                        plural_offset: offset.0,
376                                    });
377                                todo!()
378                            }
379                            ArgToken::Select => Ok(AstToken::Select {
380                                arg: arg.into(),
381                                cases: Cow::Owned(cases),
382                                plural_offset: offset.0,
383                            }),
384                            _ => Err(("Unexpected parent token type".to_owned(), lex.span())),
385                        },
386                        lex,
387                    );
388                } else {
389                    return (
390                        Err(("Unexpected end of select".to_owned(), lex.span())),
391                        lex,
392                    );
393                }
394            }
395        }
396    }
397    todo! {}
398}
399
400// enum Modes<'source> {
401//     BodyToken(Lexer<'source, BodyToken>),
402//     ArgToken(Lexer<'source, ArgToken>),
403//     SelectToken(Lexer<'source, SelectToken>),
404// }
405
406// impl<'source> Modes<'source> {
407//     fn new(s: &'source str) -> Self {
408//         Self::BodyToken(BodyToken::lexer(s))
409//     }
410// }
411
412pub fn parse<'source, 'a, T>(src: &'source str) -> Result<Vec<AstToken<'source, 'a, T>>>
413where
414    T: Deref<Target = str> + Clone + From<&'source str>,
415{
416    let lex = BodyToken::lexer(src);
417
418    let (res, lex) = parse_body(lex);
419    match res {
420        Ok((_tok, true)) => Err(("Unexpected end of body".to_owned(), lex.span())),
421        Ok((tok, false)) => Ok(tok),
422
423        Err(e) => Err(e),
424    }
425}
426
#[cfg(test)]
mod inline_tests {
    // Pull in the shared test helpers and everything from the parser module.
    use super::test::*;
    use super::*;
    use crate::parser::SelectCase;
    use crate::Token;

    // Asserts that parsing `$src` yields exactly the listed tokens. String
    // literals in the list are shorthand for `Token::Content`; any other
    // expression is used as the expected token as-is.
    macro_rules! parse_assert {
        ( $src:literal, $( $i:expr ),* ) => {
            {
                assert_eq!(
                    parse_ui($src),
                    vec![
                        $(
                            parse_assert! (token, $i)
                        ),*
                    ]
                );
            }
        };
        ( token, $str:literal ) => {
            crate::ast::Token::Content {
                value: $str
            }
        };

        ( token, $tree:expr ) => {
            $tree
        }
    }

    // Asserts that `$src` parses to content tokens only, and that their
    // concatenation equals `$res`.
    macro_rules! parse_assert_concat {
        ( $src:literal, $res:literal ) => {{
            let res = parse::<&str>($src).unwrap();
            let text: String = res
                .iter()
                .map(|t| match t {
                    Token::Content { value } => *value,
                    _ => panic!(),
                })
                .collect();
            assert_eq!(&text, $res);
        }};
    }

    #[test]
    fn test_body_simple() {
        parse_assert!("This is a message", "This is a message");
    }

    // This test is wrong - it should all be content in the original impl.
    #[test]
    fn test_body_octothorpe() {
        parse_assert!(
            "This is # an octothorpe",
            "This is ",
            Token::Octothorpe {},
            " an octothorpe"
        );
    }

    #[test]
    fn test_body_doublequote() {
        parse_assert_concat!("This is a doublequote: ''", "This is a doublequote: '");
    }

    #[test]
    fn test_body_quote_escape() {
        parse_assert_concat!(
            "This is an '{escaped}' string, with some more escapes: '{', '}'",
            "This is an {escaped} string, with some more escapes: {, }"
        );
    }

    #[test]
    fn test_body_quote_no_escape() {
        parse_assert_concat!("This is a 'quoted' string", "This is a 'quoted' string");
    }

    // A stray `}` must be reported as an error. Asserting on the returned
    // `Result` (rather than `#[should_panic]` around `unwrap()`) keeps the test
    // from passing on an unrelated panic inside the parser.
    #[test]
    fn test_body_unexpected_close() {
        assert!(parse::<&str>("This is an unexpected close: }").is_err());
    }

    #[test]
    fn test_arg_simple() {
        parse_assert!(
            "This is a {simple} replace.",
            "This is a ",
            Token::PlainArg { arg: "simple" },
            " replace."
        );
    }

    #[test]
    fn test_arg_keyword() {
        parse_assert!(
            "This has a keyword {select} replace.",
            "This has a keyword ",
            Token::PlainArg { arg: "select" },
            " replace."
        );
    }

    #[test]
    fn test_arg_select() {
        parse_assert!(
            "This is a {varname, select, this{...} that{...} other{...}}",
            "This is a ",
            Token::Select {
                arg: "varname",
                plural_offset: None,
                cases: vec![
                    SelectCase {
                        key: "this",
                        tokens: vec![Token::Content { value: "..." }].into()
                    },
                    SelectCase {
                        key: "that",
                        tokens: vec![Token::Content { value: "..." }].into()
                    },
                    SelectCase {
                        key: "other",
                        tokens: vec![Token::Content { value: "..." }].into()
                    }
                ]
                .into()
            }
        );
    }
}
560}