lex-core 0.8.2 - Docs.rs

//! Token definitions for the lex format
//!
//!     This module defines all the core tokens that can be produced by the lex lexer.
//!     The tokens are defined using the logos derive macro for efficient tokenization.
//!
//!     These are character/word level tokens produced directly by the logos lexer. They represent
//!     the fundamental building blocks of lex source code: text, whitespace, markers, punctuation,
//!     and indentation.
//!
//!     The logos lexer produces these tokens declaratively with no custom logic. This is a pure
//!     tokenization step that converts source text into a stream of typed tokens with byte ranges.
//!
//!     For the complete grammar specification, see specs/v1/grammar-core.lex.
//!
//!     Note: These core tokens are transformed into semantic tokens (like Indent/Dedent) in later
//!     stages of the lexing pipeline. See the [token](super) module for the complete token system.
use logos::Logos;
use std::fmt;

/// All possible tokens in the lex format
///
/// Variants annotated with `#[token]` / `#[regex]` are matched by the logos-generated
/// lexer. `Indent` and `Dedent` carry no lexer attribute; per the comment on them they
/// are generated by a later transformation rather than matched from source text.
#[derive(Logos, Debug, PartialEq, Eq, Hash, Clone, serde::Serialize, serde::Deserialize)]
pub enum Token {
    // Special markers
    #[token("::")]
    LexMarker,

    // Indentation (simplified - one token per 4 spaces or tab)
    // `priority = 3` is the largest explicit priority declared in this enum.
    #[regex(r" {4}|\t", priority = 3)] // Either 4 spaces OR 1 tab - highest priority
    Indentation,

    // Semantic indentation tokens (generated by transformation)
    // These store the original tokens they were created from, each paired with a
    // `Range<usize>` (presumably its byte range in the source - confirm against the
    // transformation that builds them).
    Indent(Vec<(Token, std::ops::Range<usize>)>),
    Dedent(Vec<(Token, std::ops::Range<usize>)>),

    // A newline. The pattern below matches exactly one `\n`, and the callback stores
    // the matched slice as `Some(String)`.
    // NOTE(review): the "blank line" name (a whitespace-only line followed by a
    // newline) is not what this pattern matches on its own; presumably later pipeline
    // stages give the variant that meaning - confirm.
    #[regex(r"\n", |lex| Some(lex.slice().to_owned()))]
    BlankLine(Option<String>),

    // Whitespace (excluding newlines and indentation)
    // The payload is the length of the matched slice, i.e. the number of spaces (1-3).
    #[regex(r" {1,3}", |lex| Some(lex.slice().len()), priority = 1)]
    // 1-3 spaces only, lower priority than indentation
    Whitespace(usize),

    // Sequence markers
    // Note: `is_sequence_marker` accepts Dash, Period, OpenParen and CloseParen only;
    // Colon is listed in this group but is not covered by that predicate.
    #[token("-")]
    Dash,
    #[token(".")]
    Period,
    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,
    #[token(":")]
    Colon,

    // End Punctuation
    // One variant per punctuation character; each matches exactly the literal shown.
    #[token("!")]
    ExclamationMark,
    #[token("?")]
    QuestionMark,
    #[token(";")]
    Semicolon,
    #[token("¡")]
    InvertedExclamationMark,
    #[token("¿")]
    InvertedQuestionMark,
    #[token("…")]
    Ellipsis,
    #[token("。")]
    IdeographicFullStop,
    #[token("！")]
    FullwidthExclamationMark,
    #[token("？")]
    FullwidthQuestionMark,
    #[token("⁉")]
    ExclamationQuestionMark,
    #[token("⁈")]
    QuestionExclamationMark,
    #[token("؟")]
    ArabicQuestionMark,
    #[token("۔")]
    ArabicFullStop,
    // NOTE(review): the literal below appears to be U+060D (Arabic date separator)
    // rather than a triple-dot mark - confirm the character matches the variant name.
    #[token("؍")]
    ArabicTripleDot,
    #[token("،")]
    ArabicComma,
    #[token("।")]
    Danda,
    #[token("॥")]
    DoubleDanda,
    #[token("৷")]
    BengaliCurrencyNumeratorFour,
    #[token("።")]
    EthiopianFullStop,
    #[token("։")]
    ArmenianFullStop,
    #[token("།")]
    TibetanShad,
    #[token("๏")]
    ThaiFongman,
    #[token("၊")]
    MyanmarComma,
    #[token("။")]
    MyanmarFullStop,

    // Parameter markers (for annotations)
    // Note: `Comma` is also accepted by `is_end_punctuation`.
    #[token(",")]
    Comma,
    #[token("\"")]
    Quote,
    #[token("=")]
    Equals,

    // Numbers (for ordered lists and session titles)
    // One or more ASCII digits; the payload is the matched digits as an owned string
    // (no numeric parsing happens here).
    #[regex(r"[0-9]+", |lex| lex.slice().to_owned(), priority = 2)]
    Number(String),

    // Text content (catch-all for non-special characters, excluding numbers and special chars)
    // The regex explicitly excludes all special characters that have dedicated tokens.
    // The payload is the matched run of characters as an owned string.
    // Character categories in the exclusion set:
    //   \s\n\t          - whitespace
    //   \-\.\(\):       - structural punctuation (sequence markers)
    //   0-9             - numbers
    //   ,="             - parameter markers (annotations)
    //   !?;             - basic latin punctuation
    //   ¡¿…⁉⁈           - extended latin punctuation
    //   。！？           - CJK punctuation
    //   ؟۔؍،            - arabic punctuation
    //   ।॥৷             - indic punctuation
    //   ።։།๏၊။          - other scripts (ethiopian, armenian, tibetan, thai, myanmar)
    // Keep this set in sync with the `#[token]` literals above when adding a variant.
    #[regex(r#"[^\s\n\t\-\.\(\):0-9,="!?;¡¿…。！？⁉⁈؟۔؍،।॥৷።։།๏၊။]+"#, |lex| lex.slice().to_owned())]
    Text(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Token::LexMarker => "lex-marker",
            Token::Indentation => "indentation",
            Token::Indent(_) => "indent",
            Token::Dedent(_) => "dedent",
            Token::BlankLine(_) => "blank-line",
            Token::Whitespace(_) => "whitespace",
            Token::Dash => "dash",
            Token::Period => "period",
            Token::OpenParen => "open-paren",
            Token::CloseParen => "close-paren",
            Token::Colon => "colon",
            Token::ExclamationMark => "exclamation-mark",
            Token::QuestionMark => "question-mark",
            Token::Semicolon => "semicolon",
            Token::InvertedExclamationMark => "inverted-exclamation-mark",
            Token::InvertedQuestionMark => "inverted-question-mark",
            Token::Ellipsis => "ellipsis",
            Token::IdeographicFullStop => "ideographic-full-stop",
            Token::FullwidthExclamationMark => "fullwidth-exclamation-mark",
            Token::FullwidthQuestionMark => "fullwidth-question-mark",
            Token::ExclamationQuestionMark => "exclamation-question-mark",
            Token::QuestionExclamationMark => "question-exclamation-mark",
            Token::ArabicQuestionMark => "arabic-question-mark",
            Token::ArabicFullStop => "arabic-full-stop",
            Token::ArabicTripleDot => "arabic-triple-dot",
            Token::ArabicComma => "arabic-comma",
            Token::Danda => "danda",
            Token::DoubleDanda => "double-danda",
            Token::BengaliCurrencyNumeratorFour => "bengali-currency-numerator-four",
            Token::EthiopianFullStop => "ethiopian-full-stop",
            Token::ArmenianFullStop => "armenian-full-stop",
            Token::TibetanShad => "tibetan-shad",
            Token::ThaiFongman => "thai-fongman",
            Token::MyanmarComma => "myanmar-comma",
            Token::MyanmarFullStop => "myanmar-full-stop",
            Token::Comma => "comma",
            Token::Quote => "quote",
            Token::Equals => "equals",
            Token::Number(s) => return write!(f, "<number:{s}>"),
            Token::Text(s) => return write!(f, "<text:{s}>"),
        };
        write!(f, "<{name}>")
    }
}

impl Token {
    /// Get an uppercase identifier describing this token variant. Used by CLI simple output.
    pub fn simple_name(&self) -> &'static str {
        match self {
            Token::LexMarker => "LEX_MARKER",
            Token::Indentation => "INDENTATION",
            Token::Indent(_) => "INDENT",
            Token::Dedent(_) => "DEDENT",
            Token::BlankLine(_) => "BLANK_LINE",
            Token::Whitespace(_) => "WHITESPACE",
            Token::Dash => "DASH",
            Token::Period => "PERIOD",
            Token::OpenParen => "OPEN_PAREN",
            Token::CloseParen => "CLOSE_PAREN",
            Token::Colon => "COLON",
            Token::ExclamationMark => "EXCLAMATION_MARK",
            Token::QuestionMark => "QUESTION_MARK",
            Token::Semicolon => "SEMICOLON",
            Token::InvertedExclamationMark => "INVERTED_EXCLAMATION_MARK",
            Token::InvertedQuestionMark => "INVERTED_QUESTION_MARK",
            Token::Ellipsis => "ELLIPSIS",
            Token::IdeographicFullStop => "IDEOGRAPHIC_FULL_STOP",
            Token::FullwidthExclamationMark => "FULLWIDTH_EXCLAMATION_MARK",
            Token::FullwidthQuestionMark => "FULLWIDTH_QUESTION_MARK",
            Token::ExclamationQuestionMark => "EXCLAMATION_QUESTION_MARK",
            Token::QuestionExclamationMark => "QUESTION_EXCLAMATION_MARK",
            Token::ArabicQuestionMark => "ARABIC_QUESTION_MARK",
            Token::ArabicFullStop => "ARABIC_FULL_STOP",
            Token::ArabicTripleDot => "ARABIC_TRIPLE_DOT",
            Token::ArabicComma => "ARABIC_COMMA",
            Token::Danda => "DANDA",
            Token::DoubleDanda => "DOUBLE_DANDA",
            Token::BengaliCurrencyNumeratorFour => "BENGALI_CURRENCY_NUMERATOR_FOUR",
            Token::EthiopianFullStop => "ETHIOPIAN_FULL_STOP",
            Token::ArmenianFullStop => "ARMENIAN_FULL_STOP",
            Token::TibetanShad => "TIBETAN_SHAD",
            Token::ThaiFongman => "THAI_FONGMAN",
            Token::MyanmarComma => "MYANMAR_COMMA",
            Token::MyanmarFullStop => "MYANMAR_FULL_STOP",
            Token::Comma => "COMMA",
            Token::Quote => "QUOTE",
            Token::Equals => "EQUALS",
            Token::Number(_) => "NUMBER",
            Token::Text(_) => "TEXT",
        }
    }

    /// Check if this token represents indentation
    pub fn is_indent(&self) -> bool {
        matches!(self, Token::Indentation)
    }

    /// Check if this token represents semantic indentation level
    pub fn is_indent_level(&self) -> bool {
        matches!(self, Token::Indent(_))
    }

    /// Check if this token represents semantic dedentation level
    pub fn is_dedent_level(&self) -> bool {
        matches!(self, Token::Dedent(_))
    }

    /// Check if this token is whitespace (including indentation)
    pub fn is_whitespace(&self) -> bool {
        matches!(
            self,
            Token::Indentation
                | Token::Indent(_)
                | Token::Dedent(_)
                | Token::BlankLine(_)
                | Token::Whitespace(_)
        )
    }

    /// Check if this token is a sequence marker
    pub fn is_sequence_marker(&self) -> bool {
        matches!(
            self,
            Token::Dash | Token::Period | Token::OpenParen | Token::CloseParen
        )
    }

    /// Check if this token is a number
    pub fn is_number(&self) -> bool {
        matches!(self, Token::Number(_))
    }

    /// Check if this token is text content
    pub fn is_text(&self) -> bool {
        matches!(self, Token::Text(_))
    }

    pub fn is_end_punctuation(&self) -> bool {
        matches!(
            self,
            Token::Period
                | Token::ExclamationMark
                | Token::QuestionMark
                | Token::Semicolon
                | Token::Comma
                | Token::InvertedExclamationMark
                | Token::InvertedQuestionMark
                | Token::Ellipsis
                | Token::IdeographicFullStop
                | Token::FullwidthExclamationMark
                | Token::FullwidthQuestionMark
                | Token::ExclamationQuestionMark
                | Token::QuestionExclamationMark
                | Token::ArabicQuestionMark
                | Token::ArabicFullStop
                | Token::ArabicTripleDot
                | Token::ArabicComma
                | Token::Danda
                | Token::DoubleDanda
                | Token::BengaliCurrencyNumeratorFour
                | Token::EthiopianFullStop
                | Token::ArmenianFullStop
                | Token::TibetanShad
                | Token::ThaiFongman
                | Token::MyanmarComma
                | Token::MyanmarFullStop
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::lexing::tokenize;

    /// Runs `tokenize` on `source` and keeps only the token of each pair, discarding
    /// the second element (presumably the byte range).
    fn kinds(source: &str) -> Vec<Token> {
        tokenize(source)
            .into_iter()
            .map(|(token, _)| token)
            .collect()
    }

    /// Shorthand for a `Text` token holding `content`.
    fn text(content: &str) -> Token {
        Token::Text(String::from(content))
    }

    /// Shorthand for a `Number` token holding `digits`.
    fn number(digits: &str) -> Token {
        Token::Number(String::from(digits))
    }

    #[test]
    fn test_lex_marker() {
        assert_eq!(kinds("::"), vec![Token::LexMarker]);
    }

    #[test]
    fn test_indentation_tokens() {
        // Four spaces form one indentation token.
        assert_eq!(kinds("    "), vec![Token::Indentation]);

        // A single tab forms one indentation token as well.
        assert_eq!(kinds("\t"), vec![Token::Indentation]);

        // Eight spaces split into two indentation tokens.
        assert_eq!(
            kinds("        "),
            vec![Token::Indentation, Token::Indentation]
        );
    }

    #[test]
    fn test_sequence_markers() {
        let expected = vec![
            Token::Dash,
            Token::Whitespace(1),
            Token::Period,
            Token::Whitespace(1),
            Token::OpenParen,
            Token::Whitespace(1),
            Token::CloseParen,
            Token::Whitespace(1),
            Token::Colon,
        ];
        assert_eq!(kinds("- . ( ) :"), expected);
    }

    #[test]
    fn test_text_tokens() {
        let expected = vec![text("hello"), Token::Whitespace(1), text("world")];
        assert_eq!(kinds("hello world"), expected);
    }

    #[test]
    fn test_mixed_content() {
        let expected = vec![
            // First line: "1. Hello world"
            number("1"),
            Token::Period,
            Token::Whitespace(1),
            text("Hello"),
            Token::Whitespace(1),
            text("world"),
            Token::BlankLine(Some(String::from("\n"))),
            // Second line: "    - Item 1"
            Token::Indentation,
            Token::Dash,
            Token::Whitespace(1),
            text("Item"),
            Token::Whitespace(1),
            number("1"),
        ];
        assert_eq!(kinds("1. Hello world\n    - Item 1"), expected);
    }

    #[test]
    fn test_number_tokens() {
        let expected = vec![number("123"), Token::Whitespace(1), number("456")];
        assert_eq!(kinds("123 456"), expected);
    }

    #[test]
    fn test_token_predicates() {
        let empty_text = Token::Text(String::new());
        let empty_number = Token::Number(String::new());
        let indent = Token::Indent(vec![]);
        let dedent = Token::Dedent(vec![]);

        // Indentation predicates.
        assert!(Token::Indentation.is_indent());
        assert!(indent.is_indent_level());
        assert!(dedent.is_dedent_level());
        assert!(!empty_text.is_indent());

        // Whitespace predicate.
        assert!(Token::Indentation.is_whitespace());
        assert!(indent.is_whitespace());
        assert!(dedent.is_whitespace());
        assert!(Token::BlankLine(Some(String::new())).is_whitespace());
        assert!(Token::Whitespace(1).is_whitespace());
        assert!(!empty_text.is_whitespace());

        // Sequence-marker predicate.
        assert!(Token::Dash.is_sequence_marker());
        assert!(Token::Period.is_sequence_marker());
        assert!(!empty_text.is_sequence_marker());
        assert!(!empty_number.is_sequence_marker());

        // Text predicate.
        assert!(empty_text.is_text());
        assert!(!Token::Dash.is_text());
        assert!(!empty_number.is_text());

        // Number predicate.
        assert!(empty_number.is_number());
        assert!(!empty_text.is_number());
        assert!(!Token::Dash.is_number());
    }
}