klex 0.1.2

A simple lexer (tokenizer) generator for Rust
Documentation
// Test Unicode character ranges.
//
// Each rule between the `%%` markers maps a `+`-repeated (one or more) character
// class to a token name:
//   - `\u{...}` ranges: three emoji ranges, Hiragana, Katakana, Kanji, Hangul,
//     Cyrillic and Arabic.
//   - `\x..` ranges: `\x41-\x5A` is ASCII `A`-`Z`, `\x61-\x7A` is ASCII `a`-`z`.
//   - `[0-9]` for decimal digits and `[ \t\n\r]` for whitespace.
// None of the character ranges overlap one another.
//
// NOTE(review): the whitespace rule targets `_` instead of a named token —
// presumably a skip/discard marker; confirm against the klex documentation.
%%
[\u{1F600}-\u{1F64F}]+ -> EmojiEmoticons
[\u{1F300}-\u{1F5FF}]+ -> EmojiMiscSymbols
[\u{1F680}-\u{1F6FF}]+ -> EmojiTransport
[\u{3040}-\u{309F}]+ -> Hiragana
[\u{30A0}-\u{30FF}]+ -> Katakana
[\u{4E00}-\u{9FFF}]+ -> Kanji
[\u{AC00}-\u{D7AF}]+ -> Hangul
[\u{0400}-\u{04FF}]+ -> Cyrillic
[\u{0600}-\u{06FF}]+ -> Arabic
[\x41-\x5A]+ -> UppercaseHex
[\x61-\x7A]+ -> LowercaseHex
[0-9]+ -> Number
[ \t\n\r]+ -> _
%%

#[cfg(test)]
mod tests {
    // Tests for the Unicode / hex-escape character-range rules declared in the
    // rule section of this file.
    //
    // All non-ASCII literals below must be stored as UTF-8. They were previously
    // mis-decoded through a single-byte code page, which turned e.g. U+1F600 into
    // a run of unrelated Greek/Latin characters that no rule range covers.
    use super::*;

    /// Lexes `input` and asserts that the resulting token stream matches
    /// `expected` exactly: same number of tokens, and for each position the
    /// same `kind` and `text`, in order.
    fn assert_tokens(input: &str, expected: &[(TokenKind, &str)]) {
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        // Check the length first so a mismatch is reported as a count problem
        // instead of being hidden by `zip` stopping at the shorter sequence.
        assert_eq!(
            tokens.len(),
            expected.len(),
            "unexpected token count for input {:?}",
            input
        );
        for (token, (kind, text)) in tokens.iter().zip(expected) {
            assert_eq!(token.kind, *kind);
            assert_eq!(token.text, *text);
        }
    }

    /// U+1F600..U+1F602 all fall inside the `EmojiEmoticons` range.
    #[test]
    fn test_emoji_emoticons() {
        assert_tokens("😀😁😂", &[(TokenKind::EmojiEmoticons, "😀😁😂")]);
    }

    /// U+1F680..U+1F682 all fall inside the `EmojiTransport` range.
    #[test]
    fn test_emoji_transport() {
        assert_tokens("🚀🚁🚂", &[(TokenKind::EmojiTransport, "🚀🚁🚂")]);
    }

    #[test]
    fn test_hiragana() {
        assert_tokens("ひらがな", &[(TokenKind::Hiragana, "ひらがな")]);
    }

    #[test]
    fn test_katakana() {
        assert_tokens("カタカナ", &[(TokenKind::Katakana, "カタカナ")]);
    }

    #[test]
    fn test_kanji() {
        assert_tokens("漢字", &[(TokenKind::Kanji, "漢字")]);
    }

    /// Adjacent runs from different scripts must be split at the script boundary.
    #[test]
    fn test_mixed_japanese() {
        assert_tokens(
            "ひらがなカタカナ漢字",
            &[
                (TokenKind::Hiragana, "ひらがな"),
                (TokenKind::Katakana, "カタカナ"),
                (TokenKind::Kanji, "漢字"),
            ],
        );
    }

    #[test]
    fn test_cyrillic() {
        assert_tokens("Привет", &[(TokenKind::Cyrillic, "Привет")]);
    }

    #[test]
    fn test_hangul() {
        assert_tokens("한글", &[(TokenKind::Hangul, "한글")]);
    }

    #[test]
    fn test_arabic() {
        assert_tokens("العربية", &[(TokenKind::Arabic, "العربية")]);
    }

    /// Multi-byte and ASCII runs interleaved in one input.
    #[test]
    fn test_mixed_with_numbers() {
        assert_tokens(
            "😀123カタカナ456",
            &[
                (TokenKind::EmojiEmoticons, "😀"),
                (TokenKind::Number, "123"),
                (TokenKind::Katakana, "カタカナ"),
                (TokenKind::Number, "456"),
            ],
        );
    }

    #[test]
    fn test_emoji_with_whitespace() {
        let input = "😀 😁 😂";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        // This expects the two whitespace runs to be present in the token
        // stream: 3 emoji tokens + 2 whitespace tokens.
        // NOTE(review): the whitespace rule maps to `_`; if klex drops such
        // matches from `tokenize()` output the expected count would be 3 —
        // confirm against the generator's behaviour.
        assert_eq!(tokens.len(), 5);
    }

    /// `\x41-\x5A` covers ASCII `A`-`Z`.
    #[test]
    fn test_hex_uppercase() {
        assert_tokens("ABCXYZ", &[(TokenKind::UppercaseHex, "ABCXYZ")]);
    }

    /// `\x61-\x7A` covers ASCII `a`-`z`.
    #[test]
    fn test_hex_lowercase() {
        assert_tokens("abcxyz", &[(TokenKind::LowercaseHex, "abcxyz")]);
    }

    #[test]
    fn test_hex_mixed() {
        assert_tokens(
            "ABC123abc",
            &[
                (TokenKind::UppercaseHex, "ABC"),
                (TokenKind::Number, "123"),
                (TokenKind::LowercaseHex, "abc"),
            ],
        );
    }
}