// df_ls_lexical_analysis 0.3.0-rc.1

use regex::Regex;

/// Bundle of compiled regular expressions, one named field per pattern.
///
/// All fields are populated by `RegexList::new`; see that constructor for the
/// exact expressions and the reasoning behind each of them.
// NOTE(review): `dead_code` is presumably allowed because not every expression
// is consumed yet — confirm before removing the attribute.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) struct RegexList {
    /// A line break: `\n` or `\r\n`.
    pub newline: Regex,
    /// One or more spaces and/or tabs.
    pub line_space: Regex,
    /// One or more characters that are neither `[` nor `]`.
    pub comment: Regex,

    /// A token name: an upper case latin letter followed by upper case letters,
    /// digits or `_`. The literal `VOLUME_mB` is accepted as a special case.
    pub token_name: Regex,
    /// Catch-all for a token argument: matches everything that is allowed
    /// inside an argument, i.e. one or more characters other than
    /// `[`, `]`, `:`, `\r` and `\n`.
    pub token_argument_all: Regex,

    /// An optional `-` followed by one or more digits.
    pub token_argument_integer: Regex,
    /// A single character wrapped in single quotes (for example `'a'`).
    /// The quoted character can not be `]`, `:`, `\r` or `\n`.
    pub token_argument_character: Regex,
    /// `ARG` followed by a number without leading zeros (for example `ARG0`, `ARG12`).
    pub token_argument_arg_n: Regex,
    /// An upper case identifier (letters, digits, `_`), optionally preceded by
    /// digits that may each be followed by underscores.
    pub token_argument_reference: Regex,
    /// One or more characters other than `|`, `[`, `]`, `:`, `\r` and `\n`.
    pub token_argument_string: Regex,
    /// Same as `token_argument_string`, except that `|` is accepted as well.
    pub token_argument_pipe_arguments: Regex,
    /// `!ARG` followed by a number without leading zeros (for example `!ARG1`).
    pub token_argument_bang_arg_n: Regex,

    /// A single `[`.
    pub token_open_bracket: Regex,
    /// A single `]`.
    pub token_close_bracket: Regex,
    /// A single `:`.
    pub token_separator: Regex,
    /// A single `|`.
    pub token_pipe_separator: Regex,

    /// One or more characters other than `[`, `]`, `\r` and `\n`.
    pub header: Regex,
}

impl RegexList {
    /// Compiles every expression and returns the populated `RegexList`.
    ///
    /// # Panics
    ///
    /// Panics if one of the hard-coded patterns fails to compile.
    // Some of the patterns below are deliberately trivial (a single literal
    // character), which is what the `trivial_regex` lint would complain about.
    #[allow(clippy::trivial_regex)]
    pub fn new() -> Self {
        // WARNING: Be VERY careful when editing any of these expressions.
        // Even a small change can have a strong effect on parsing speed and accuracy.
        // The `docs/Syntax_Highlighting.md` file has more info about all of them.

        // ----- Whitespace and comments -----

        // Line break: `\n`, optionally preceded by `\r`.
        let newline = Regex::new(r"\r?\n").unwrap();
        // Any mix of ` ` (space) and `\t` (tab), at least 1 character long.
        let line_space = Regex::new(r"[ \t]+").unwrap();
        // Any run of characters that does not contain `[` or `]`,
        // at least 1 character long.
        let comment = Regex::new(r"[^\[\]]+").unwrap();

        // ----- Token name and generic argument -----

        // Upper case latin characters, numbers and `_` (underscore).
        // The first character has to be an upper case latin character,
        // so it can never start with `_` or a number.
        //
        // `VOLUME_mB` is listed explicitly as a special case (issue #38).
        let token_name = Regex::new(r"(?:VOLUME_mB)|(?:[A-Z][A-Z_0-9]*)").unwrap();
        // Any run of characters without `[`, `]`, `:`, `\r` or `\n`,
        // at least 1 character long.
        let token_argument_all = Regex::new(r"[^\[\]:\r\n]+").unwrap();

        // ----- Typed arguments -----

        // One or more numbers, optionally prefixed by a `-` character.
        // Leading zeros are accepted and do not affect the parsing (issue #113).
        let token_argument_integer = Regex::new(r"-?[0-9]+").unwrap();
        // Exactly 1 character between single quotes, so always 3 characters in total.
        // Example `'a'` or `'?'`.
        // `]` and `:` are rejected (issue #111), as are `\r` and `\n`
        // because newlines are not allowed inside tokens.
        let token_argument_character = Regex::new(r"('[^\]:\r\n]')").unwrap();
        // The literal "ARG" followed by at least 1 number.
        // Leading zeros are not allowed.
        let token_argument_arg_n = Regex::new(r"ARG(?:0|[1-9][0-9]*)").unwrap();
        // Upper case latin characters, numbers and `_` (underscore),
        // containing at least 1 upper case latin character.
        // It can not start with `_`.
        //
        // It is allowed to start with a number (issue #93).
        let token_argument_reference = Regex::new(r"(?:[0-9]_*)*([A-Z][A-Z_0-9]*)").unwrap();
        // Any run of characters without `|`, `[`, `]`, `:`, `\r` or `\n`,
        // at least 1 character long.
        let token_argument_string = Regex::new(r"[^\|\[\]:\r\n]+").unwrap();
        // Any run of characters without `[`, `]`, `:`, `\r` or `\n`,
        // at least 1 character long.
        // NOTE: Unlike the string expression, the `|` character is accepted here.
        let token_argument_pipe_arguments = Regex::new(r"[^\[\]:\r\n]+").unwrap();
        // The literal "!ARG" followed by at least 1 number.
        // Leading zeros are not allowed.
        // A single string/reference can contain multiple matches of this expression.
        let token_argument_bang_arg_n = Regex::new(r"!ARG(?:0|[1-9][0-9]*)").unwrap();

        // ----- Single character punctuation -----

        // Exactly 1 `[` (open square bracket)
        let token_open_bracket = Regex::new(r"\[").unwrap();
        // Exactly 1 `]` (closing square bracket)
        let token_close_bracket = Regex::new(r"\]").unwrap();
        // Exactly 1 `:` (colon)
        let token_separator = Regex::new(r":").unwrap();
        // Exactly 1 `|` (pipe)
        let token_pipe_separator = Regex::new(r"\|").unwrap();

        // ----- Header -----

        // Any run of characters without `[`, `]`, `\r` or `\n`,
        // at least 1 character long.
        let header = Regex::new(r"[^\[\]\r\n]+").unwrap();

        Self {
            newline,
            line_space,
            comment,
            token_name,
            token_argument_all,
            token_argument_integer,
            token_argument_character,
            token_argument_arg_n,
            token_argument_reference,
            token_argument_string,
            token_argument_pipe_arguments,
            token_argument_bang_arg_n,
            token_open_bracket,
            token_close_bracket,
            token_separator,
            token_pipe_separator,
            header,
        }
    }
}