rulex 0.4.4

DEPRECATED: Use pomsky instead. A new regular expression language
Documentation
use crate::{parse::ParseErrorMsg, span::Span};

use super::Token;

macro_rules! consume_chain {
    (
        $input:ident, $c:ident;
        if $cond:expr => $result:expr ; $($rest:tt)*
    ) => {
        if $cond {
            $result
        } else {
            consume_chain!($input, $c; $($rest)*)
        }
    };
    (
        $input:ident, $c:ident;
        if let $pat:pat = $test:expr => $result:expr ; $($rest:tt)*
    ) => {
        if let $pat = $test {
            $result
        } else {
            consume_chain!($input, $c; $($rest)*)
        }
    };
    (
        $input:ident, $c:ident;
    ) => {
        {
            (($c.len_utf8(), Token::Error))
        }
    }
}

pub(crate) fn tokenize(mut input: &str) -> Vec<(Token, Span)> {
    let mut result = vec![];
    let mut offset = 0;

    loop {
        let input_len = input.len();
        input = input.trim_start();
        while input.starts_with('#') {
            input = input.trim_start_matches(|c| c != '\n').trim_start();
        }
        offset += input_len - input.len();

        match input.chars().next() {
            None => break,
            Some(c) => {
                let (len, token) = consume_chain! {
                    input, c;

                    if input.starts_with("<%") => (2, Token::BStart);
                    if input.starts_with("%>") => (2, Token::BEnd);
                    if input.starts_with(">>") => (2, Token::LookAhead);
                    if input.starts_with("<<") => (2, Token::LookBehind);
                    if input.starts_with("::") => (2, Token::Backref);

                    if c == '%' => (1, Token::BWord);
                    if c == '*' => (1, Token::Star);
                    if c == '+' => (1, Token::Plus);
                    if c == '?' => (1, Token::QuestionMark);
                    if c == '|' => (1, Token::Pipe);
                    if c == ':' => (1, Token::Colon);
                    if c == ')' => (1, Token::CloseParen);
                    if c == '{' => (1, Token::OpenBrace);
                    if c == '}' => (1, Token::CloseBrace);
                    if c == ',' => (1, Token::Comma);
                    if c == '!' => (1, Token::Not);
                    if c == '[' => (1, Token::OpenBracket);
                    if c == '-' => (1, Token::Dash);
                    if c == ']' => (1, Token::CloseBracket);
                    if c == '.' => (1, Token::Dot);
                    if c == ';' => (1, Token::Semicolon);
                    if c == '=' => (1, Token::Equals);

                    if c == '\'' => match input[1..].find('\'') {
                        Some(len_inner) => (len_inner + 2, Token::String),
                        None => (input.len(), Token::ErrorMsg(ParseErrorMsg::UnclosedString)),
                    };

                    if c == '"' => match find_unescaped_quote(&input[1..]) {
                        Some(len_inner) => (len_inner + 2, Token::String),
                        None => (input.len(), Token::ErrorMsg(ParseErrorMsg::UnclosedString)),
                    };

                    if let Some(rest) = input.strip_prefix("U+") => {
                        match rest.find(|c: char| !c.is_ascii_hexdigit()) {
                            Some(0) => (1, Token::Error),
                            Some(len_inner) => (len_inner + 2, Token::CodePoint),
                            None => (input.len(), Token::CodePoint),
                        }
                    };

                    if matches!(c, '0'..='9') => (
                        input.find(|c: char| !matches!(c, '0'..='9')).unwrap_or(input.len()),
                        Token::Number,
                    );

                    if c.is_alphabetic() || c == '_' => (
                        input.find(|c: char| !c.is_alphanumeric() && c != '_').unwrap_or(input.len()),
                        Token::Identifier,
                    );

                    if c == '^' => (1, Token::ErrorMsg(ParseErrorMsg::Caret));
                    if c == '$' => (1, Token::ErrorMsg(ParseErrorMsg::Dollar));

                    if let Some(rest) = input.strip_prefix("(?") => (
                        match rest.chars().next() {
                            Some('<') => {
                                let name_len = rest.chars()
                                    .skip(1)
                                    .take_while(char::is_ascii_alphanumeric)
                                    .count();

                                if name_len > 0 && matches!(rest.chars().nth(1 + name_len), Some('>')) {
                                    4 + name_len
                                } else if let Some('=' | '!') = rest.chars().nth(1) {
                                    4
                                } else {
                                    3
                                }
                            }
                            Some('P') if matches!(rest.chars().nth(1), Some('<')) => {
                                let name_len = rest.chars()
                                    .skip(2)
                                    .take_while(char::is_ascii_alphanumeric)
                                    .count();

                                if name_len > 0 && matches!(rest.chars().nth(2 + name_len), Some('>')) {
                                    5 + name_len
                                } else {
                                    4
                                }
                            },
                            Some('>' | '!' | ':' | '=' | '(' | '|') => 3,
                            _ => 2,
                        },
                        Token::ErrorMsg(ParseErrorMsg::SpecialGroup),
                    );
                    if c == '(' => (1, Token::OpenParen);

                    if c == '\\' => {
                        if input.starts_with("\\u{") || input.starts_with("\\x{") {
                            match input[3..].find('}') {
                                Some(len) => (len + 4, Token::ErrorMsg(ParseErrorMsg::BackslashUnicode)),
                                None => (2, Token::ErrorMsg(ParseErrorMsg::Backslash)),
                            }
                        } else if let Some(rest) = input.strip_prefix("\\u") {
                            match rest.find(|c: char| !c.is_ascii_hexdigit()).unwrap_or(rest.len()) {
                                4.. => (6, Token::ErrorMsg(ParseErrorMsg::BackslashU4)),
                                _ => (2, Token::ErrorMsg(ParseErrorMsg::Backslash)),
                            }
                        } else if let Some(rest) = input.strip_prefix("\\x") {
                            match rest.find(|c: char| !c.is_ascii_hexdigit()).unwrap_or(rest.len()) {
                                2.. => (4, Token::ErrorMsg(ParseErrorMsg::BackslashX2)),
                                _ => (2, Token::ErrorMsg(ParseErrorMsg::Backslash)),
                            }
                        } else if let Some(rest) = input.strip_prefix("\\k<") {
                            match rest.find('>') {
                                Some(len) => (len + 4, Token::ErrorMsg(ParseErrorMsg::BackslashK)),
                                _ => (2, Token::ErrorMsg(ParseErrorMsg::Backslash)),
                            }
                        } else if let Some(next) = input.chars().nth(1) {
                            (1 + next.len_utf8(), Token::ErrorMsg(ParseErrorMsg::Backslash))
                        } else {
                            (1, Token::Error)
                        }
                    };
                };

                let start = offset;
                offset += len;
                input = &input[len..];
                result.push((token, Span::new(start, offset)));
            }
        }
    }

    result
}

fn find_unescaped_quote(input: &str) -> Option<usize> {
    let mut s = input;

    loop {
        match s.find(|c| c == '\\' || c == '"') {
            Some(n) => {
                if s.as_bytes()[n] == b'"' {
                    return Some(n + (input.len() - s.len()));
                } else if let Some(next) = s[n + 1..].chars().next() {
                    s = &s[n + 1 + next.len_utf8()..];
                } else {
                    return None;
                }
            }
            None => return None,
        }
    }
}