// kladd 0.1.0
//
// A weirdly personal markup language
// Documentation
use crate::char_cursor::CharCursor;

/// The category of a lexed [`Token`].
///
/// Most variants correspond to exactly one input character; the remaining
/// ones (`MetadataMarker`, `Text`, `Newline`, `Whitespace`) cover runs of
/// characters, and `Eof` marks exhausted input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// A run of `+` characters. The lexer only starts one when a `+` is
    /// followed by `++`, and then absorbs every consecutive `+`.
    MetadataMarker,
    /// A `!` character.
    Bang,
    /// An `@` character.
    At,
    /// A `{` character.
    OpenCurly,
    /// A `}` character.
    CloseCurly,
    /// A `[` character.
    OpenBrace,
    /// A `]` character.
    CloseBrace,
    /// An `=` character.
    Equals,
    /// A `/` character.
    Slash,
    /// A `*` character.
    Star,
    /// A `_` character.
    Underscore,
    /// A `-` character.
    Dash,
    /// A `~` character.
    Tilde,
    /// A `%` character.
    Percent,
    /// A `,` character.
    Comma,
    /// A `"` character.
    DoubleQuote,
    /// A `'` character. The variant name is misspelled ("Qoute"); it is kept
    /// as is because renaming it would break code that refers to it.
    SingleQoute,
    /// Fallback kind: any character not covered by another variant, plus the
    /// alphanumeric characters, spaces and tabs that follow it.
    Text,
    /// One or more consecutive `\n` characters.
    Newline,
    /// One or more consecutive spaces or tabs.
    Whitespace,
    /// End of input. Returned by `advance_token` when no characters remain;
    /// `tokenize` stops at it instead of yielding it.
    Eof,
}

/// A single lexed token: its category plus the text it was built from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// What kind of token this is.
    pub kind: TokenKind,
    /// The text attached to the token. The lexer fills this from the
    /// cursor's `lexeme()`, or with an empty string for `Eof`.
    pub lexeme: String,
}

impl Token {
    pub fn new(kind: TokenKind, lexeme: String) -> Self {
        Token { kind, lexeme }
    }
}

/// Lazily lexes `input` into a stream of [`Token`]s.
///
/// Before every token the cursor's `start` is moved up to its current
/// position, so each token's lexeme begins where the previous token ended.
/// Iteration finishes as soon as the cursor reports [`TokenKind::Eof`]; the
/// `Eof` token itself is never yielded.
pub fn tokenize(input: &str) -> impl Iterator<Item = Token> {
    let mut cursor = CharCursor::new(input);
    std::iter::from_fn(move || {
        cursor.start = cursor.curr;
        let token = cursor.advance_token();
        // `Eof` ends the stream; every other token is handed to the caller.
        (token.kind != TokenKind::Eof).then_some(token)
    })
}

/// Returns `true` when `c` holds a space or a tab.
///
/// `None` and every other character (including `\n`) yield `false`.
pub fn is_horizontal_whitespace(c: Option<char>) -> bool {
    c.is_some_and(|ch| ch == ' ' || ch == '\t')
}

/// Returns `true` only when `c` holds a line feed (`\n`).
///
/// `None` and every other character, `\r` included, yield `false`.
fn is_newline(c: Option<char>) -> bool {
    c == Some('\n')
}

impl CharCursor<'_> {
    /// Consumes the next token from the cursor and returns it.
    ///
    /// Returns an `Eof` token with an empty lexeme when no characters remain.
    /// Otherwise the first character decides the kind, checked in this order:
    ///
    /// 1. `\n` starts a `Newline` run.
    /// 2. A space or tab starts a `Whitespace` run.
    /// 3. `+` starts a `MetadataMarker` when `as_str()` begins with `++`
    ///    after the `+` has been consumed (presumably the unconsumed
    ///    remainder, i.e. at least three `+` in a row; confirm against
    ///    `CharCursor::as_str`).
    /// 4. A character listed in `punctuation_kind` becomes its own token.
    /// 5. Anything else, including a `+` that failed step 3, starts `Text`.
    ///
    /// The lexeme is taken from `self.lexeme()`. This method does not modify
    /// `self.start`; `tokenize` sets it before each call.
    pub fn advance_token(&mut self) -> Token {
        let kind = match self.advance() {
            None => return Token::new(TokenKind::Eof, String::new()),
            Some('\n') => self.newline(),
            Some(c) if is_horizontal_whitespace(Some(c)) => self.whitespace(),
            Some('+') if self.as_str().starts_with("++") => self.metadata(),
            Some(c) => Self::punctuation_kind(c).unwrap_or_else(|| self.text()),
        };

        Token::new(kind, self.lexeme().to_string())
    }

    /// Maps a character that forms a token on its own to that token's kind,
    /// or returns `None` when the character has no dedicated kind.
    fn punctuation_kind(c: char) -> Option<TokenKind> {
        let kind = match c {
            '!' => TokenKind::Bang,
            ',' => TokenKind::Comma,
            '@' => TokenKind::At,
            '{' => TokenKind::OpenCurly,
            '}' => TokenKind::CloseCurly,
            '[' => TokenKind::OpenBrace,
            ']' => TokenKind::CloseBrace,
            '=' => TokenKind::Equals,
            '/' => TokenKind::Slash,
            '*' => TokenKind::Star,
            '_' => TokenKind::Underscore,
            '-' => TokenKind::Dash,
            '~' => TokenKind::Tilde,
            '%' => TokenKind::Percent,
            '"' => TokenKind::DoubleQuote,
            '\'' => TokenKind::SingleQoute,
            _ => return None,
        };
        Some(kind)
    }

    /// Consumes the rest of a run of spaces and tabs.
    fn whitespace(&mut self) -> TokenKind {
        self.eat_while(is_horizontal_whitespace);
        TokenKind::Whitespace
    }

    /// Consumes the rest of a run of `\n` characters, so consecutive line
    /// breaks end up in a single token.
    fn newline(&mut self) -> TokenKind {
        self.eat_while(is_newline);
        TokenKind::Newline
    }

    /// Consumes the rest of a run of `+` characters.
    ///
    /// Expects the caller to have just consumed a `+`; this is checked in
    /// debug builds only.
    fn metadata(&mut self) -> TokenKind {
        debug_assert!(self.prev == Some('+'));
        self.eat_while(|c| c == Some('+'));
        TokenKind::MetadataMarker
    }

    /// Consumes the remainder of a text run.
    ///
    /// The first character has already been consumed by `advance_token`,
    /// whatever it was. From there the run continues while the next
    /// character is alphanumeric, a space or a tab, so interior and trailing
    /// spaces are part of the text.
    fn text(&mut self) -> TokenKind {
        while let Some(next) = self.peek() {
            let continues_text = next.is_alphanumeric() || is_horizontal_whitespace(Some(next));
            if !continues_text {
                break;
            }
            self.advance();
        }

        TokenKind::Text
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        char_cursor::CharCursor,
        lexer::{Token, TokenKind},
    };

    /// A run of words separated by single spaces lexes as one `Text` token.
    ///
    /// Despite the test's name, the input contains no quote characters.
    #[test]
    fn lex_quoted_text() {
        // Each case: (input, expected kind, expected lexeme).
        let cases = [("this is some text", TokenKind::Text, "this is some text")];

        for (input, kind, lexeme) in cases {
            let expected = Token::new(kind, lexeme.to_string());
            let actual = CharCursor::new(input).advance_token();
            assert_eq!(expected, actual);
        }
    }
}