// kladd 0.2.0
//
// A weirdly personal markup language
// Documentation
use crate::char_cursor::CharCursor;

/// The category of a lexed [`Token`].
///
/// Most variants correspond to a single punctuation character; the mapping
/// from characters to kinds lives in `CharCursor::advance_token`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// A `+` for which the cursor's `as_str()` starts with `++`, together
    /// with any further `+` characters consumed by `eat_while`.
    MetadataMarker,
    /// `!`
    Bang,
    /// `@`
    At,
    /// `{`
    OpenCurly,
    /// `}`
    CloseCurly,
    /// `[`
    OpenBrace,
    /// `]`
    CloseBrace,
    /// `=`
    Equals,
    /// `/`
    ForwardSlash,
    /// `\`
    BackwardSlash,
    /// `*`
    Star,
    /// `_`
    Underscore,
    /// `-`
    Dash,
    /// `~`
    Tilde,
    /// `%`
    Percent,
    /// `#` (note: a lone hash, despite the variant name).
    Hashbang,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `"`
    DoubleQuote,
    /// `'`
    SingleQuote,
    /// Any character not covered by another variant, extended with the
    /// alphanumeric characters, spaces and tabs that follow it.
    Text,
    /// A `\n`, extended with the newlines that directly follow it.
    Newline,
    /// A space or tab, extended with the spaces and tabs that directly
    /// follow it.
    Whitespace,
    /// End of input; emitted with an empty lexeme.
    Eof,
}

/// A single lexed token: its kind plus the text it was produced from.
///
/// The lexeme is borrowed for `'a`, so a token is a cheap `Copy` value and
/// cannot outlive the string it borrows from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Token<'a> {
    /// What category of token this is.
    pub kind: TokenKind,
    /// The text associated with the token (empty for `TokenKind::Eof`).
    pub lexeme: &'a str,
}

impl<'a> Token<'a> {
    pub fn new(kind: TokenKind, lexeme: &'a str) -> Self {
        Token { kind, lexeme }
    }
}

/// Lexes `input` into a flat list of tokens.
///
/// The returned vector always ends with exactly one [`TokenKind::Eof`] token;
/// lexing stops as soon as that token has been produced.
pub fn tokenize<'a>(input: &'a str) -> Vec<Token<'a>> {
    let mut cursor = CharCursor::new(input);
    let mut tokens = Vec::with_capacity(input.len());

    loop {
        // Mark where the upcoming token begins before consuming anything.
        cursor.start = cursor.curr;

        let token = cursor.advance_token();
        let reached_end = matches!(token.kind, TokenKind::Eof);
        tokens.push(token);

        if reached_end {
            break;
        }
    }

    tokens
}

/// Returns `true` when `c` holds a space or a tab.
///
/// `None` and every other character (including `'\n'`) yield `false`.
pub fn is_horizontal_whitespace(c: Option<char>) -> bool {
    c.is_some_and(|ch| ch == ' ' || ch == '\t')
}

/// Returns `true` only when `c` holds a line feed (`'\n'`).
fn is_newline(c: Option<char>) -> bool {
    matches!(c, Some('\n'))
}

impl<'a> CharCursor<'a> {
    /// Consumes one token from the cursor and returns it.
    ///
    /// When `advance` yields no character, a [`TokenKind::Eof`] token with an
    /// empty lexeme is returned. Otherwise the first character selects the
    /// token kind, and the lexeme is whatever `lexeme()` reports once the
    /// kind-specific helper (if any) has finished consuming input.
    pub fn advance_token(&mut self) -> Token<'a> {
        let first = self.advance();

        let kind = match first {
            None => return Token::new(TokenKind::Eof, ""),

            // Kinds that may span more than one character. These guarded
            // arms must stay ahead of the catch-all below.
            Some('\n') => self.newline(),
            _ if is_horizontal_whitespace(first) => self.whitespace(),
            Some('+') if self.as_str().starts_with("++") => self.metadata(),

            // Paired delimiters.
            Some('{') => TokenKind::OpenCurly,
            Some('}') => TokenKind::CloseCurly,
            Some('[') => TokenKind::OpenBrace,
            Some(']') => TokenKind::CloseBrace,

            // Quotes.
            Some('"') => TokenKind::DoubleQuote,
            Some('\'') => TokenKind::SingleQuote,

            // Remaining single-character punctuation.
            Some('!') => TokenKind::Bang,
            Some('@') => TokenKind::At,
            Some('#') => TokenKind::Hashbang,
            Some('%') => TokenKind::Percent,
            Some('*') => TokenKind::Star,
            Some(',') => TokenKind::Comma,
            Some('-') => TokenKind::Dash,
            Some('.') => TokenKind::Dot,
            Some('/') => TokenKind::ForwardSlash,
            Some('=') => TokenKind::Equals,
            Some('\\') => TokenKind::BackwardSlash,
            Some('_') => TokenKind::Underscore,
            Some('~') => TokenKind::Tilde,

            // Everything else (including a `+` that failed the guard above)
            // starts a text run.
            Some(_) => self.text(),
        };

        Token::new(kind, self.lexeme())
    }

    /// Extends the current token over the spaces and tabs that follow.
    fn whitespace(&mut self) -> TokenKind {
        self.eat_while(is_horizontal_whitespace);
        TokenKind::Whitespace
    }

    /// Extends the current token over the `'\n'` characters that follow.
    fn newline(&mut self) -> TokenKind {
        self.eat_while(is_newline);
        TokenKind::Newline
    }

    /// Extends the current token over the `'+'` characters that follow.
    ///
    /// Expects the character just consumed to be a `'+'`; this is checked in
    /// debug builds only.
    fn metadata(&mut self) -> TokenKind {
        debug_assert!(self.prev() == Some('+'));
        self.eat_while(|c| matches!(c, Some('+')));
        TokenKind::MetadataMarker
    }

    /// Extends the current token while the next character is alphanumeric,
    /// a space, or a tab.
    ///
    /// The character that started the text run has already been consumed by
    /// the caller and is not re-checked here.
    fn text(&mut self) -> TokenKind {
        while let Some(next) = self.peek() {
            let continues_text = next.is_alphanumeric() || is_horizontal_whitespace(Some(next));
            if !continues_text {
                break;
            }
            self.advance();
        }

        TokenKind::Text
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        char_cursor::CharCursor,
        lexer::{Token, TokenKind},
    };

    /// A run of words separated by spaces lexes as one `Text` token whose
    /// lexeme is the entire input.
    #[test]
    fn lex_quoted_text() {
        // Each case pairs an input with the first token expected from it.
        let cases = [(
            "this is some text",
            Token::new(TokenKind::Text, "this is some text"),
        )];

        for (input, expected) in cases {
            let mut cursor = CharCursor::new(input);
            let actual = cursor.advance_token();
            assert_eq!(expected, actual);
        }
    }
}