tom 0.0.1

Yet another format-preserving TOML parser/manipulator.
Documentation
use m_lexer::{Lexer, LexerBuilder, TokenKind};

use {symbol, Symbol, TextRange, TextUnit};

#[derive(Copy, Clone, Debug)]
pub(crate) struct Token {
    pub symbol: Symbol,
    pub range: TextRange,
}

impl Token {
    pub fn is_significant(self) -> bool {
        match self.symbol {
            symbol::WHITESPACE | symbol::COMMENT => false,
            _ => true,
        }
    }
}

#[derive(Debug)]
pub(crate) struct Tokens {
    pub raw_tokens: Vec<Token>,
    pub significant: Vec<usize>,
}

pub(crate) fn tokenize(input: &str) -> Tokens {
    let mut raw_tokens = Vec::new();
    let mut offset = TextUnit::from(0);
    for t in LEXER.tokenize(input) {
        let len = TextUnit::from(t.len as u32);
        raw_tokens.push(Token {
            symbol: Symbol::new(t.kind.0),
            range: TextRange::offset_len(offset, len),
        });
        offset += len;
    }
    let significant = raw_tokens
        .iter()
        .enumerate()
        .filter(|(_idx, tok)| tok.is_significant())
        .map(|(idx, _tok)| idx)
        .collect();
    Tokens {
        raw_tokens,
        significant,
    }
}

lazy_static! {
    static ref LEXER: Lexer = lexer();
}

fn lexer() -> Lexer {
    fn t(s: Symbol) -> TokenKind {
        TokenKind(s.0.get() as u16)
    }

    LexerBuilder::new()
        .error_token(t(symbol::ERROR))
        .tokens(&[
            (t(symbol::EQ), r"="),
            (t(symbol::DOT), r"\."),
            (t(symbol::COMMA), r","),
            (t(symbol::L_BRACK), r"\["),
            (t(symbol::R_BRACK), r"\]"),
            (t(symbol::L_CURLY), r"\{"),
            (t(symbol::R_CURLY), r"\}"),
            (t(symbol::WHITESPACE), r"\s+"),
            (t(symbol::COMMENT), r"#.*"),
            (t(symbol::BARE_KEY_OR_NUMBER), r"[0-9]+"),
            (
                t(symbol::BARE_KEY_OR_DATE),
                r"[0-9]{4}-[0-9]{2}-[0-9]{2}[Zz]?",
            ),
            (
                t(symbol::BASIC_STRING),
                r#"(?x)
                    " ([^\r\n"\\] | \\.)* "
                "#,
            ),
            (
                t(symbol::MULTILINE_BASIC_STRING),
                r#"(?x)
                    """ ([^"] | \\. | "[^"] | ""[^"])* """
                "#,
            ),
            (
                t(symbol::LITERAL_STRING),
                r#"(?x)
                    ' ([^\r\n'] | \\.)* '
                "#,
            ),
            (
                t(symbol::MULTILINE_LITERAL_STRING),
                r#"(?x)
                    ''' ([^'] | \\. | '[^'] | ''[^'])* '''
                "#,
            ),
            (t(symbol::BOOL), r"(false|true)"),
            (
                t(symbol::NUMBER),
                r"(?x)
                    [-+]?
                    (0|[1-9](_?[0-9])*) # no leading zeros
                    (\.[0-9](_?[0-9])*)?
                    ([eE][-+]?[1-9](_?[0-9])*)?
                ",
            ),
            (
                t(symbol::DATE_TIME),
                r"(?x)
                    ( [0-9]{4}-[0-9]{2}-[0-9]{2} ([Tt]([0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?))?
                    | ([0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+)?))
                    ([Zz]|[+-][0-9]{2}:[0-9]{2})?
                ",
            ),
            (t(symbol::BARE_KEY), r"[0-9_\-a-zA-Z]+"),
        ])
        .build()
}