Skip to main content

onepass_seed/expr/
parse.rs

1use core::str::{self, Utf8Error};
2
3use nom::{
4    Finish, IResult, Parser,
5    branch::alt,
6    bytes::complete::{is_not, tag, take_while_m_n},
7    character::complete::{self, anychar, char, none_of},
8    combinator::{map, map_res, opt, peek, value, verify},
9    error::{self, ErrorKind},
10    multi::{fold, many1},
11    sequence::{delimited, preceded, separated_pair},
12};
13
14use super::{Context, Expr, Node, chars::Chars, generator::Generator};
15
16enum StringFragment<'a> {
17    Verbatim(&'a str),
18    Escaped(char),
19}
20
21enum CharFragment {
22    Single((char, char)),
23    Multi(&'static [(char, char)]),
24}
25
26pub type Error = nom::error::Error<String>;
27
28impl Expr<'_> {
29    /// Expressions can be parsed from UTF-8 strings.
30    ///
31    /// The following syntax is supported:
32    ///
33    /// # String literals
34    /// Any literal string that does not otherwise consist of syntax characters stands for itself.
35    /// A schema consisting of a literal string generates itself as the single password. Other
36    /// characters may be escaped with `'\\'`; aside from newline, carriage return, and tab, any
37    /// non-alphanumeric character stands for itself as a literal value when preceded by a
38    /// backslash.
39    /// ```
40    /// # use {onepass_seed::expr::Node, core::str::FromStr};
41    /// assert_eq!(Node::Literal("test".into()), "test".parse().unwrap());
42    /// assert_eq!(Node::Literal("(escape){}[]".into()), r#"\(escape\)\{\}\[\]"#.parse().unwrap());
43    /// ```
44    ///
45    /// Arbitrary Unicode characters may also be insterted as `\uXXXX`, or hex sequences (so long
46    /// as they encode valid ASCII or UTF-8 byte sequences) as `\xXX`.
47    ///
48    /// # Character classes
49    /// The special character classes `\w` and `\d` stand for word (alphanumeric plus underscore)
50    /// and digit characters respectively. They may show up anywhere in an expression and stand for
51    /// a single character in their range.
52    ///
53    /// Square bracket character classes are also supported, including the following POSIX
54    /// character classes:
55    /// - `[:lower:]` - lowercase ASCII letters
56    /// - `[:upper:]` - uppercase ASCII letters
57    /// - `[:alpha:]` - upper or lowercase ASCII letters
58    /// - `[:digit:]` - decimal digits
59    /// - `[:xdigit:]` - lowercase hexadecimal digits
60    /// - `[:punct:]` - ASCII punctuation, aka special characters
61    /// - `[:print:]` - printable ASCII characters
62    ///
63    /// Single characters (`[a]`) and unicode character ranges (`[a-z]`) are also supported.
64    ///
65    /// Any of these ranges may be combined within square brackets; `[[:upper:][a-z]\d]`
66    /// corresponds to uppercase ASCII, lowercase ASCII, and decimal digits.
67    ///
68    /// ```
69    /// # use {onepass_seed::expr::Node, core::str::FromStr};
70    /// assert_eq!("[a-z]".parse::<Node>().unwrap(), "[[:lower:]]".parse().unwrap());
71    /// assert_eq!("[A-Za-z0-9_]".parse::<Node>().unwrap(), "\\w".parse().unwrap());
72    /// ```
73    ///
74    /// # Lists
75    /// A sequence of nodes is represented by its concatenation. A nested list may be created using
76    /// parentheses (`()`). This is of limited utility since the language does not support choices,
77    /// but does allow e.g. setting a count on a sequence, like:
78    /// `([[:lower:]][[:digit:]][[:lower:]]){3}`.
79    ///
80    /// ```
81    /// # use core::str::FromStr;
82    /// # use crypto_bigint::{NonZero, U256};
83    /// # use num_traits::pow;
84    /// # use onepass_seed::expr::{Eval, Expr};
85    /// assert_eq!(
86    ///     NonZero::new(U256::from_u64((26u64*10*26).pow(3))).unwrap(),
87    ///     Expr::new("([[:lower:]][[:digit:]][[:lower:]]){3}".parse().unwrap()).size()
88    /// );
89    /// ```
90    ///
91    /// # Counts
92    /// As alluded to, expressions may be repeated for specified counts. The syntax is
93    /// `expr{min,max}`. If `max` is omitted, i.e. `expr{min}`, then `max == min`. If `min` is
94    /// omitted, i.e. `expr{,max}`, then `min == 0`.
95    ///
96    /// **NB.** In the current revision of the schema language, a count after a literal applies to
97    /// the whole string, not just the last character; so `ab{2}` is equivalent to `(ab){2}`, not
98    /// `a(b){2}`:
99    /// ```
100    /// # use {onepass_seed::expr::Node, core::str::FromStr};
101    /// assert_eq!("(ab){2}".parse::<Node>().unwrap(), "ab{2}".parse().unwrap());
102    /// ```
103    ///
104    /// # Generators
105    /// Arbitrary library-suppliable generators may be called. The library includes two: `word` to
106    /// produce a single word, and `words` to produce a sequence of words. Generators are
107    /// surrounded by curly braces and must start with a lowercase ASCII letter, e.g. `{word}`.
108    /// (This rule is what differentiates them from counts, which must start with an ASCII digit.)
109    ///
110    /// Generators may take arguments. The first non–lowercase-ASCII character in a generator
111    /// expression is taken as an argument separator, so e.g. `{words:2:U}` calls generator `words`
112    /// with arguments `"2"` and `"U"`.
113    ///
114    /// # Reserved syntax
115    /// The `|` character may be used inside of generators as an argument separator, like
116    /// `{word|U}`, but may not be used unescaped anywhere else in an expression. This syntax is
117    /// reserved for possible future expansion.
118    ///
119    /// # Errors
120    /// It is an error to write a character class with the higher character before the lower
121    /// character, e.g. `[b-a]`.
122    /// ```
123    /// # use core::str::FromStr;
124    /// # use onepass_seed::expr::Node;
125    /// assert!("[b-a]".parse::<Node>().is_err());
126    /// ```
127    ///
128    /// Partial remainders, e.g. in the case of unbalanced delimiters, yield an error. (These can,
129    /// and should, simply be backslash-escaped.)
130    /// ```
131    /// # use core::str::FromStr;
132    /// # use onepass_seed::expr::Node;
133    /// assert!("abcd}".parse::<Node>().is_err());
134    /// assert!("abcd\\}".parse::<Node>().is_ok());
135    /// ```
136    ///
137    /// At present, hex sequences that do not encode valid UTF-8 encoded text are an error.
138    ///
139    /// The syntax `[:word:]` (and `[:Word:]`) used to be the way to generate a word from a
140    /// dictionary in `onepass` v2. In the current syntax, these would both parse to degenerate
141    /// character classes, e.g. `[:dorw]`. Since this is virtually never intended, those specific
142    /// strings are presently a parse error.
143    ///
144    /// # Context
145    /// This function returns an expression against the default context.
146    /// [`Self::parse_with_context`] may be used to parse an expression against a custom context.
147    pub fn parse(input: &str) -> Result<Self, Error> {
148        Ok(Expr::new(input.parse()?))
149    }
150}
151
152impl<'a> Expr<'a> {
153    /// [`parse`][Self::parse] an expression with the given [`Context`].
154    pub fn parse_with_context(input: &str, context: &'a Context<'a>) -> Result<Self, Error> {
155        Ok(Expr::with_context(input.parse()?, context))
156    }
157}
158
159/// Parse a [`Node`], returning an [`IResult`].
160///
161/// This function is used to implement the [`FromStr`][str::FromStr] instance on which
162/// [`Expr::parse`] is based.
163pub fn parse_node(input: &str) -> IResult<&str, Node> {
164    map(many1(parse_count), Node::from_iter).parse(input)
165}
166
167impl str::FromStr for Node {
168    type Err = Error;
169
170    fn from_str(s: &str) -> Result<Self, Self::Err> {
171        match parse_node(s).finish() {
172            Ok((remaining, node)) => {
173                if !remaining.is_empty() {
174                    return Err(Error::new(s.to_string(), ErrorKind::Complete));
175                }
176                Ok(node)
177            }
178            Err(error::Error { input, code }) => Err(Error {
179                input: input.to_string(),
180                code,
181            }),
182        }
183    }
184}
185
186fn parse_count(input: &str) -> IResult<&str, Node> {
187    let (input, node) = parse_single(input)?;
188    let (remaining, count) = opt(delimited(
189        char('{'),
190        alt((
191            separated_pair(complete::u32, char(','), complete::u32),
192            map(complete::u32, |n| (n, n)),
193            map(preceded(char(','), complete::u32), |n| (0, n)),
194        )),
195        char('}'),
196    ))
197    .parse(input)?;
198    match count {
199        None => Ok((remaining, node)),
200        Some((min, max)) if max >= min => Ok((remaining, Node::Count(Box::new(node), min, max))),
201        _ => Err(nom::Err::Failure(error::Error::new(
202            input,
203            ErrorKind::Verify,
204        ))),
205    }
206}
207
208fn parse_single(input: &str) -> IResult<&str, Node> {
209    alt((
210        map(parse_literal, Node::Literal),
211        map(parse_chars, Node::Chars),
212        map(parse_generator, Node::Generator),
213        parse_list,
214    ))
215    .parse(input)
216}
217
218fn parse_literal(input: &str) -> IResult<&str, Box<str>> {
219    map(
220        fold(
221            1..,
222            parse_literal_fragment,
223            String::new,
224            |mut string, fragment| {
225                match fragment {
226                    StringFragment::Escaped(c) => string.push(c),
227                    StringFragment::Verbatim(s) => string.push_str(s),
228                }
229                string
230            },
231        ),
232        Into::into,
233    )
234    .parse(input)
235}
236
237fn parse_literal_fragment(input: &str) -> IResult<&str, StringFragment<'_>> {
238    alt((
239        map(parse_literal_verbatim, StringFragment::Verbatim),
240        map(parse_literal_escaped, StringFragment::Escaped),
241    ))
242    .parse(input)
243}
244
245fn parse_literal_verbatim(input: &str) -> IResult<&str, &str> {
246    let (input, res) = verify(is_not("\\[](){}|"), |s: &str| !s.is_empty()).parse(input)?;
247    Ok((input, res))
248}
249
250fn parse_literal_escaped(input: &str) -> IResult<&str, char> {
251    alt((
252        preceded(
253            char('\\'),
254            alt((
255                value('\n', char('n')),
256                value('\r', char('r')),
257                value('\t', char('t')),
258                verify(anychar, |&c| !c.is_ascii_alphanumeric()),
259            )),
260        ),
261        parse_hex_char,
262        parse_unicode_char,
263    ))
264    .parse(input)
265}
266
267fn parse_unicode_char(input: &str) -> IResult<&str, char> {
268    map_res(parse_unicode_digits, char::try_from).parse(input)
269}
270
271fn parse_unicode_digits(input: &str) -> IResult<&str, u32> {
272    preceded(
273        tag("\\u"),
274        map_res(
275            alt((
276                take_while_m_n(4, 4, |c: char| c.is_ascii_hexdigit()),
277                delimited(
278                    char('{'),
279                    take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit()),
280                    char('}'),
281                ),
282            )),
283            |s| u32::from_str_radix(s, 16),
284        ),
285    )
286    .parse(input)
287}
288
289fn parse_hex_char(input: &str) -> IResult<&str, char> {
290    let (input, b) = parse_hex_byte(input)?;
291    if b < 0b1000_0000 {
292        return Ok((input, b as char));
293    }
294    if b & 0b1110_0000 == 0b1100_0000 {
295        map_res(parse_hex_byte, |b2| {
296            let bs = [b, b2];
297            str_to_char(&bs)
298        })
299        .parse(input)
300    } else if b & 0b1111_0000 == 0b1110_0000 {
301        map_res((parse_hex_byte, parse_hex_byte), |(b2, b3)| {
302            let bs = [b, b2, b3];
303            str_to_char(&bs)
304        })
305        .parse(input)
306    } else if b & 0b1111_1000 == 0b1111_0000 {
307        map_res(
308            (parse_hex_byte, parse_hex_byte, parse_hex_byte),
309            |(b2, b3, b4)| {
310                let bs = [b, b2, b3, b4];
311                str_to_char(&bs)
312            },
313        )
314        .parse(input)
315    } else {
316        Err(nom::Err::Error(error::Error::new(input, ErrorKind::Verify)))
317    }
318}
319
320fn str_to_char(bs: &[u8]) -> Result<char, Utf8Error> {
321    let s = str::from_utf8(bs)?;
322    let mut iter = s.chars();
323    let c = iter.next().expect(s);
324    assert!(iter.next().is_none());
325    Ok(c)
326}
327
328fn parse_hex_byte(input: &str) -> IResult<&str, u8> {
329    preceded(
330        tag("\\x"),
331        map_res(take_while_m_n(2, 2, |c: char| c.is_ascii_hexdigit()), |s| {
332            u8::from_str_radix(s, 16)
333        }),
334    )
335    .parse(input)
336}
337
338fn parse_chars(input: &str) -> IResult<&str, Chars> {
339    alt((
340        parse_legacy_words_err,
341        parse_chars_brackets,
342        map(parse_chars_special, |ps| {
343            Chars::from_ranges(ps.iter().copied())
344        }),
345    ))
346    .parse(input)
347}
348
349fn parse_legacy_words_err(input: &str) -> IResult<&str, Chars> {
350    let res = alt((tag("[:word:]"), tag("[:Word:]"))).parse(input);
351    match res {
352        Ok(_) => Err(nom::Err::Failure(error::Error::new(
353            input,
354            ErrorKind::Verify,
355        ))),
356        Err(e) => Err(e),
357    }
358}
359
360fn parse_chars_brackets(input: &str) -> IResult<&str, Chars> {
361    delimited(
362        char('['),
363        map(
364            fold(
365                1..,
366                alt((
367                    map(parse_chars_posix, CharFragment::Multi),
368                    map(parse_chars_special, CharFragment::Multi),
369                    map(parse_chars_range, CharFragment::Single),
370                )),
371                Vec::new,
372                |mut chars, fragment| {
373                    match fragment {
374                        CharFragment::Single(p) => chars.push(p),
375                        CharFragment::Multi(ps) => chars.extend(ps),
376                    }
377                    chars
378                },
379            ),
380            Chars::from_ranges,
381        ),
382        char(']'),
383    )
384    .parse(input)
385}
386
387static LOWER: &[(char, char)] = &[('a', 'z')];
388static UPPER: &[(char, char)] = &[('A', 'Z')];
389static ALPHA: &[(char, char)] = &[('A', 'Z'), ('a', 'z')];
390static ALNUM: &[(char, char)] = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
391static DIGIT: &[(char, char)] = &[('0', '9')];
392static XDIGIT: &[(char, char)] = &[('0', '9'), ('a', 'f')];
393static PUNCT: &[(char, char)] = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
394static PRINT: &[(char, char)] = &[(' ', '~')];
395static WORD: &[(char, char)] = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
396
397fn parse_chars_posix(input: &str) -> IResult<&str, &'static [(char, char)]> {
398    delimited(
399        tag("[:"),
400        alt((
401            value(LOWER, tag("lower")),
402            value(UPPER, tag("upper")),
403            value(ALPHA, tag("alpha")),
404            value(ALNUM, tag("alnum")),
405            value(DIGIT, tag("digit")),
406            value(XDIGIT, tag("xdigit")),
407            value(PUNCT, tag("punct")),
408            value(PRINT, tag("print")),
409        )),
410        tag(":]"),
411    )
412    .parse(input)
413}
414
415fn parse_chars_range(input: &str) -> IResult<&str, (char, char)> {
416    if let (remaining, Some((a, b))) = opt(separated_pair(
417        parse_chars_single,
418        char('-'),
419        parse_chars_single,
420    ))
421    .parse(input)?
422    {
423        if a <= b {
424            return Ok((remaining, (a, b)));
425        }
426        return Err(nom::Err::Failure(error::Error::new(
427            input,
428            ErrorKind::Verify,
429        )));
430    }
431    map(parse_chars_single, |c| (c, c)).parse(input)
432}
433
434fn parse_chars_single(input: &str) -> IResult<&str, char> {
435    alt((none_of("\\]"), parse_literal_escaped)).parse(input)
436}
437
438fn parse_chars_special(input: &str) -> IResult<&str, &'static [(char, char)]> {
439    preceded(
440        char('\\'),
441        alt((value(WORD, char('w')), value(DIGIT, char('d')))),
442    )
443    .parse(input)
444}
445
446fn parse_generator(input: &str) -> IResult<&str, Generator> {
447    let verify_inner = peek(verify(anychar, |c| c.is_ascii_lowercase()));
448    let parse_inner = map(
449        fold(
450            1..,
451            parse_generator_fragment,
452            String::new,
453            |mut string, fragment| {
454                match fragment {
455                    StringFragment::Escaped(c) => string.push(c),
456                    StringFragment::Verbatim(s) => string.push_str(s),
457                }
458                string
459            },
460        ),
461        Generator::from,
462    );
463    delimited(char('{'), preceded(verify_inner, parse_inner), char('}')).parse(input)
464}
465
466fn parse_generator_fragment(input: &str) -> IResult<&str, StringFragment<'_>> {
467    alt((
468        map(parse_generator_verbatim, StringFragment::Verbatim),
469        map(parse_literal_escaped, StringFragment::Escaped),
470    ))
471    .parse(input)
472}
473
474fn parse_generator_verbatim(input: &str) -> IResult<&str, &str> {
475    verify(is_not("\\}"), |s: &str| !s.is_empty()).parse(input)
476}
477
478fn parse_list(input: &str) -> IResult<&str, Node> {
479    delimited(char('('), parse_node, char(')')).parse(input)
480}
481
482#[cfg(test)]
483mod tests {
484    use super::*;
485
486    #[test]
487    fn test_literal() {
488        let node = "cats".parse().unwrap();
489        assert_eq!(Node::Literal("cats".into()), node);
490        let node = r#"\\cats\tand\[dogs\]\{woof\}"#.parse().unwrap();
491        assert_eq!(Node::Literal("\\cats\tand[dogs]{woof}".into()), node);
492    }
493
494    #[test]
495    fn test_chars() {
496        assert_eq!(
497            Node::Chars(unsafe {
498                Chars::from_ranges_unchecked([('0', '9'), ('A', 'Z'), ('a', 'z')])
499            }),
500            "[A-Za-z0123-9]".parse::<Node>().unwrap(),
501        );
502        let res = "[z-a]".parse::<Node>();
503        assert!(res.is_err(), "{res:?}");
504        assert_eq!("error Verify at: z-a]", &format!("{}", res.unwrap_err()));
505    }
506
507    #[test]
508    fn test_chars_table() {
509        let tests = [
510            (vec![('A', 'Z')], "[A-MD-Z]"),
511            (vec![('A', 'Z')], "[D-ZA-M]"),
512            (vec![('a', 'j')], "[a-cb-ea-fb-j]"),
513            (vec![('a', 'a'), ('c', 'c')], "[ac]"),
514            (vec![('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], "\\w"),
515            (vec![('a', 'z')], "[[:lower:]]"),
516            (
517                vec![('!', '/'), (':', '@'), ('[', '`'), ('{', '~')],
518                "[[:punct:]]",
519            ),
520            (vec![('!', '~')], "[[:punct:]\\w]"),
521        ];
522        for (ranges, inp) in tests {
523            assert_eq!(
524                Node::Chars(unsafe { Chars::from_ranges_unchecked(ranges) }),
525                inp.parse().unwrap(),
526            );
527        }
528    }
529
530    #[test]
531    fn test_generators() {
532        assert_eq!(
533            Node::Generator(Generator::new("word\tup}")),
534            "{word\\tup\\}}".parse().unwrap()
535        );
536    }
537
538    #[test]
539    fn test_multi() {
540        assert_eq!(
541            Node::List(
542                vec![
543                    Node::Generator(Generator::new("word")),
544                    Node::Count(
545                        Node::List(
546                            vec![
547                                Node::Literal("-".into()),
548                                Node::Generator(Generator::new("word")),
549                            ]
550                            .into()
551                        )
552                        .into(),
553                        4,
554                        4
555                    ),
556                ]
557                .into()
558            ),
559            "{word}(-{word}){4}".parse().unwrap(),
560        );
561    }
562
563    #[test]
564    fn test_legacy_words_err() {
565        let res = "[:word:]".parse::<Node>();
566        assert_eq!(
567            "error Verify at: [:word:]",
568            &format!("{}", res.unwrap_err())
569        );
570    }
571
572    #[test]
573    fn test_literal_digits() {
574        assert_eq!(
575            Node::Literal("—".into()),
576            r#"\xe2\x80\x94"#.parse().unwrap()
577        );
578        assert_eq!(Node::Literal("—".into()), "\\u2014".parse().unwrap());
579        assert_eq!(Node::Literal("—".into()), "\\u{002014}".parse().unwrap());
580        assert_eq!(
581            Err(error::Error {
582                input: "\\x80".into(),
583                code: ErrorKind::Char
584            }),
585            "\\x80".parse::<Node>(),
586        );
587        assert_eq!(
588            Err(error::Error {
589                input: "\\ud800".into(),
590                code: ErrorKind::Char
591            }),
592            "\\ud800".parse::<Node>(),
593        );
594    }
595
596    #[test]
597    fn test_remaining() {
598        assert_eq!(
599            Err(Error::new("a\\".into(), ErrorKind::Complete)),
600            "a\\".parse::<Node>()
601        );
602    }
603}