// pomsky-syntax 0.12.0
//
// Parser for pomsky, a new regular expression language
// Documentation
use crate::Span;

use super::{
    LexErrorMsg, Token,
    micro_regex::{Capture, CharIs, Many0, Many1, MicroRegex},
};

/// Expands a list of lexing rules into one nested `if … else …` expression.
///
/// The invocation starts with `<input>, <c>;` followed by rules of the form
/// `if <cond> => <result>;` or `if let <pat> = <expr> => <result>;`. The rules
/// are tried in the order written and the first one that applies yields its
/// `<result>`. When no rule applies, the expansion evaluates to
/// `(<c>.len_utf8(), Token::Error)`.
///
/// `<input>` is only threaded through the recursion; it does not appear in the
/// generated code.
macro_rules! consume_chain {
    // Base case: all rules have been tried without success.
    ($src:ident, $ch:ident;) => {
        ($ch.len_utf8(), Token::Error)
    };

    // Pattern rule: `if let <pat> = <expr> => <result>;`
    (
        $src:ident, $ch:ident;
        if let $binding:pat = $scrutinee:expr => $outcome:expr ; $($tail:tt)*
    ) => {
        if let $binding = $scrutinee {
            $outcome
        } else {
            consume_chain!($src, $ch; $($tail)*)
        }
    };

    // Boolean rule: `if <cond> => <result>;`
    (
        $src:ident, $ch:ident;
        if $guard:expr => $outcome:expr ; $($tail:tt)*
    ) => {
        if $guard {
            $outcome
        } else {
            consume_chain!($src, $ch; $($tail)*)
        }
    };
}

/// Expands to an or-pattern of string literals listing every reserved word.
///
/// Intended for use in pattern position, e.g. as a `match` arm over a `&str`.
macro_rules! reserved_word_pattern {
    () => {
        "U"
            | "let"
            | "lazy"
            | "greedy"
            | "range"
            | "base"
            | "atomic"
            | "enable"
            | "disable"
            | "if"
            | "else"
            | "recursion"
            | "regex"
            | "test"
            | "call"
    };
}

/// Table indexed by ASCII byte value: `Some(token)` if that single character is
/// a token on its own, `None` otherwise.
static SINGLE_TOKEN_LOOKUP: [Option<Token>; 127] = const {
    // Every character that forms a complete token by itself.
    const SINGLE_CHAR_TOKENS: [(u8, Token); 22] = [
        (b'^', Token::Caret),
        (b'$', Token::Dollar),
        (b'%', Token::Percent),
        (b'<', Token::AngleLeft),
        (b'>', Token::AngleRight),
        (b'*', Token::Star),
        (b'+', Token::Plus),
        (b'?', Token::QuestionMark),
        (b'|', Token::Pipe),
        (b'&', Token::Ampersand),
        (b':', Token::Colon),
        (b')', Token::CloseParen),
        (b'{', Token::OpenBrace),
        (b'}', Token::CloseBrace),
        (b',', Token::Comma),
        (b'!', Token::Not),
        (b'[', Token::OpenBracket),
        (b']', Token::CloseBracket),
        (b'-', Token::Dash),
        (b'.', Token::Dot),
        (b';', Token::Semicolon),
        (b'=', Token::Equals),
    ];

    let mut table = [const { None }; 127];

    // `for` loops are not available in const context, so walk the list by index.
    let mut i = 0;
    while i < SINGLE_CHAR_TOKENS.len() {
        let (byte, token) = SINGLE_CHAR_TOKENS[i];
        table[byte as usize] = Some(token);
        i += 1;
    }

    table
};

/// Returns the token consisting of the single character `c`, if there is one.
///
/// Characters without an entry in `SINGLE_TOKEN_LOOKUP` — including every code
/// point at or beyond the end of the table — yield `None`.
fn lookup_single(c: char) -> Option<Token> {
    // Checked access: the table has 127 slots (indices 0..=126), so a hand-written
    // `c < 128` guard would still let U+007F (DEL) index out of bounds and panic.
    SINGLE_TOKEN_LOOKUP.get(c as usize).copied().flatten()
}

/// Splits `input` into tokens by calling [`next_token`] until it returns `None`.
///
/// Each token is paired with a [`Span`] whose bounds are byte offsets into the
/// full `input` string, not into the remainder that was being lexed.
pub(crate) fn tokenize(mut input: &str) -> Vec<(Token, Span)> {
    let mut tokens = Vec::new();
    // Number of bytes of the original input already consumed.
    let mut consumed = 0;

    loop {
        let Some((token, start, end)) = next_token(input) else {
            break;
        };

        // `start`/`end` are relative to the current remainder; shift them back.
        tokens.push((token, Span::new(consumed + start, consumed + end)));
        consumed += end;
        input = &input[end..];
    }

    tokens
}

/// Lexes one token from the beginning of `input`.
///
/// Leading whitespace and `#` comments (each running up to the next line break)
/// are skipped first. Returns `None` if nothing remains afterwards. Otherwise
/// returns `(token, start, end)`, where `start` and `end` are byte offsets into
/// the `input` that was passed in.
///
/// Problems are reported through the token itself (`Token::ErrorMsg(..)` or
/// `Token::Error`), never through the return type.
pub fn next_token(mut input: &str) -> Option<(Token, usize, usize)> {
    let original_len = input.len();

    input = input.trim_start();
    while input.starts_with('#') {
        input = input.trim_start_matches(|ch: char| ch != '\n').trim_start();
    }

    let start = original_len - input.len();
    let first = input.chars().next()?;

    let (len, token) = consume_chain! {
        input, first;

        // Two-character operators are tried before the single-character table,
        // because their first characters (`>`, `<`, `:`) are tokens themselves.
        if input.starts_with(">>") => (2, Token::LookAhead);
        if input.starts_with("<<") => (2, Token::LookBehind);
        if input.starts_with("::") => (2, Token::DoubleColon);

        if let Some(token) = lookup_single(first) => (1, token);

        // Single-quoted string: ends at the next `'`.
        if first == '\'' => match input[1..].find('\'') {
            None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
            Some(inner_len) => (inner_len + 2, Token::String),
        };

        // Double-quoted string: a backslash escapes the following character.
        if first == '"' => match find_unescaped_quote(&input[1..]) {
            None => (input.len(), Token::ErrorMsg(LexErrorMsg::UnclosedString)),
            Some(inner_len) => (inner_len + 2, Token::String),
        };

        // Code point such as `U+FF`, whitespace allowed around the `+`.
        if let Some((len, _)) = (
            'U',
            Many0(CharIs(char::is_whitespace)),
            '+',
            Many0(CharIs(char::is_whitespace)),
            Many1(CharIs(|ch| ch.is_alphanumeric() || ch == '_')),
        ).is_start(input) => {
            let digits = input[1..len]
                .trim_start_matches(|ch: char| ch == '+' || ch.is_whitespace());
            if digits.chars().all(|ch| ch.is_ascii_hexdigit()) {
                (len, Token::CodePoint)
            } else {
                (len, Token::ErrorMsg(LexErrorMsg::InvalidCodePoint))
            }
        };

        // Number: a multi-digit number starting with `0` is rejected.
        if let Some((len, _)) = Many1(CharIs(|ch| ch.is_ascii_digit())).is_start(input) =>
            if len >= 2 && input.starts_with('0') {
                (len, Token::ErrorMsg(LexErrorMsg::LeadingZero))
            } else {
                (len, Token::Number)
            };

        // Identifier or reserved word.
        if let Some((len, _)) = (
            CharIs(|ch| ch.is_alphabetic() || ch == '_'),
            Many0(CharIs(|ch| ch.is_alphanumeric() || ch == '_')),
        ).is_start(input) => match &input[..len] {
            reserved_word_pattern!() => (len, Token::ReservedName),
            _ => (len, Token::Identifier),
        };

        // `(?…` groups are checked before the plain `(` token.
        if let Some((len, err)) = parse_special_group(input) => (len, Token::ErrorMsg(err));

        if first == '(' => (1, Token::OpenParen);

        if let Some((len, err)) = parse_backslash(input) => (len, Token::ErrorMsg(err));
    };

    Some((token, start, start + len))
}

/// Returns the byte index of the first `"` in `input` that is not escaped.
///
/// A backslash escapes the single character that follows it, whatever that
/// character is. Returns `None` if no unescaped quote exists, which includes
/// the case where `input` ends directly after a backslash.
fn find_unescaped_quote(input: &str) -> Option<usize> {
    let mut chars = input.char_indices();

    while let Some((idx, ch)) = chars.next() {
        match ch {
            '"' => return Some(idx),
            '\\' => {
                // Skip the escaped character; a trailing backslash means no match.
                chars.next()?;
            }
            _ => {}
        }
    }

    None
}

/// Tries to match a backslash escape at the start of `input`.
///
/// On a match, returns the matched length together with the [`LexErrorMsg`]
/// attached (via `.ctx(..)`) to the alternative that matched; `next_token`
/// wraps that message in `Token::ErrorMsg`. Returns `None` if the pattern does
/// not match at the start of `input`.
///
/// NOTE(review): the precise matching rules (e.g. whether the alternatives in a
/// slice are tried in order with the first match winning) are defined by
/// `micro_regex`, which is not visible here — confirm there.
fn parse_backslash(input: &str) -> Option<(usize, LexErrorMsg)> {
    // A single ASCII hex digit.
    let hex = CharIs(|c| c.is_ascii_hexdigit());

    // Name-like run of ASCII alphanumerics, `-`, `+` and `_`
    // (presumably one or more, going by the name `Many1`).
    let ident = Many1(CharIs(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '+' | '_')));

    // What may follow `\k` / `\g`: `<name>`, `{name}`, `'name'`, or an
    // optionally signed digit.
    let after_gk: &[&dyn MicroRegex<Context = _>] = &[
        &('<', ident, '>'),
        &('{', ident, '}'),
        &('\'', ident, '\''),
        &(&["-", "+", ""][..], CharIs(|c| c.is_ascii_digit())),
    ];

    // What may follow `\p` / `\P`: one ASCII alphanumeric, `{name}`, or `{^name}`.
    let after_p: &[&dyn MicroRegex<Context = _>] =
        &[&CharIs(|c| c.is_ascii_alphanumeric()), &('{', ident, '}'), &("{^", ident, '}')];

    // The forms recognized after the backslash, each tagged with the message to
    // report; the last entry accepts any single character.
    let after_backslash: [&dyn MicroRegex<Context = _>; 6] = [
        // `\u{…}` / `\x{…}` with hex digits inside the braces
        &(&["u{", "x{"][..], Many1(hex), '}').ctx(LexErrorMsg::BackslashUnicode),
        // `\u` followed by four hex digits
        &('u', hex, hex, hex, hex).ctx(LexErrorMsg::BackslashU4),
        // `\x` followed by two hex digits
        &('x', hex, hex).ctx(LexErrorMsg::BackslashX2),
        // `\k…` / `\g…`
        &(&['k', 'g'][..], after_gk).ctx(LexErrorMsg::BackslashGK),
        // `\p…` / `\P…`
        &(&['p', 'P'][..], after_p).ctx(LexErrorMsg::BackslashProperty),
        // any other character after the backslash
        &CharIs(|_| true).ctx(LexErrorMsg::Backslash),
    ];

    // Keep only the length and the attached message from the match result.
    Capture(('\\', &after_backslash[..])).is_start(input).map(|(len, (_, err))| (len, err))
}

/// Tries to match a group opener beginning with `(?` at the start of `input`.
///
/// On a match, returns the matched length together with the [`LexErrorMsg`]
/// attached (via `.ctx(..)`) to the alternative that matched; `next_token`
/// wraps that message in `Token::ErrorMsg`. Returns `None` if `input` does not
/// match, e.g. when it does not start with `(?`.
///
/// NOTE(review): the precise matching rules (e.g. whether alternatives are
/// tried in order with the first match winning) are defined by `micro_regex`,
/// which is not visible here — confirm there.
fn parse_special_group(input: &str) -> Option<(usize, LexErrorMsg)> {
    // Name-like run of ASCII alphanumerics, `-` and `+`.
    // NOTE(review): unlike the `ident` in `parse_backslash`, `_` is not
    // accepted here — confirm whether that difference is intentional.
    let ident = Many1(CharIs(|c| c.is_ascii_alphanumeric() || c == '-' || c == '+'));

    // The forms recognized after `(?`, each tagged with the message to report.
    let after_open: &[&dyn MicroRegex<Context = _>] = &[
        &':'.ctx(LexErrorMsg::GroupNonCapturing),
        &'='.ctx(LexErrorMsg::GroupLookahead),
        &'!'.ctx(LexErrorMsg::GroupLookaheadNeg),
        &'>'.ctx(LexErrorMsg::GroupAtomic),
        &'('.ctx(LexErrorMsg::GroupConditional),
        &'|'.ctx(LexErrorMsg::GroupBranchReset),
        &"<=".ctx(LexErrorMsg::GroupLookbehind),
        &"<!".ctx(LexErrorMsg::GroupLookbehindNeg),
        // `(?P<name>` / `(?<name>`
        &(&["P<", "<"][..], ident, '>').ctx(LexErrorMsg::GroupNamedCapture),
        // `(?'name'`
        &('\'', ident, '\'').ctx(LexErrorMsg::GroupNamedCapture),
        // `(?P=name)`
        &("P=", ident, ')').ctx(LexErrorMsg::GroupPcreBackreference),
        // `(?P>` / `(?&`
        &(&["P>", "&"][..]).ctx(LexErrorMsg::GroupSubroutineCall),
        // `(?#` up to and including the next `)`
        &('#', Many0(CharIs(|c| c != ')')), ')').ctx(LexErrorMsg::GroupComment),
        // Empty pattern as the final entry: catch-all for anything else after `(?`.
        &"".ctx(LexErrorMsg::GroupOther),
    ];

    // Keep only the length and the attached message from the match result.
    Capture(("(?", after_open)).is_start(input).map(|(len, (_, err))| (len, err))
}