// Test Unicode character ranges
//
// Each rule below pairs a character-class pattern (one or more characters
// from a single range) with a name; the tests at the bottom of this file
// refer to those names as TokenKind variants.
//
// Ranges written with \u{...} are Unicode code point ranges:
//   1F600-1F64F  emoticons
//   1F300-1F5FF  miscellaneous symbols and pictographs
//   1F680-1F6FF  transport and map symbols
//   3040-309F    hiragana
//   30A0-30FF    katakana
//   4E00-9FFF    CJK unified ideographs (named Kanji here)
//   AC00-D7AF    Hangul syllables
//   0400-04FF    Cyrillic
//   0600-06FF    Arabic
//
// Ranges written with \x.. are byte-style hex escapes: \x41-\x5A is ASCII
// 'A'-'Z' and \x61-\x7A is ASCII 'a'-'z'. The "Hex" suffix in UppercaseHex
// and LowercaseHex therefore describes the escape notation used in the
// pattern, not hexadecimal digits (the tests feed "ABCXYZ" / "abcxyz").
//
// NOTE(review): the whitespace rule targets the name "_" instead of a token
// name. This presumably tells the generator to discard the match, but the
// whitespace test below asserts a token count that includes whitespace;
// confirm the intended semantics against the generator's documentation.
%%
[\u{1F600}-\u{1F64F}]+ -> EmojiEmoticons
[\u{1F300}-\u{1F5FF}]+ -> EmojiMiscSymbols
[\u{1F680}-\u{1F6FF}]+ -> EmojiTransport
[\u{3040}-\u{309F}]+ -> Hiragana
[\u{30A0}-\u{30FF}]+ -> Katakana
[\u{4E00}-\u{9FFF}]+ -> Kanji
[\u{AC00}-\u{D7AF}]+ -> Hangul
[\u{0400}-\u{04FF}]+ -> Cyrillic
[\u{0600}-\u{06FF}]+ -> Arabic
[\x41-\x5A]+ -> UppercaseHex
[\x61-\x7A]+ -> LowercaseHex
[0-9]+ -> Number
[ \t\n\r]+ -> _
%%
#[cfg(test)]
mod tests {
    //! Tests for the Unicode range rules declared in the rules section above.
    //!
    //! All non-ASCII inputs are written with `\u{...}` escapes rather than
    //! literal characters. An earlier revision of this file stored the
    //! characters literally and they were corrupted by a wrong-encoding
    //! round trip (UTF-8 read as a single-byte Greek code page), which turned
    //! every input into text that matches none of the rules. Escapes keep the
    //! source pure ASCII so that cannot happen again.

    use super::*;

    /// A run of code points from U+1F600..=U+1F64F lexes as one `EmojiEmoticons` token.
    #[test]
    fn test_emoji_emoticons() {
        // Three emoticons: grinning face, beaming face, face with tears of joy.
        let input = "\u{1F600}\u{1F601}\u{1F602}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::EmojiEmoticons);
        assert_eq!(tokens[0].text, "\u{1F600}\u{1F601}\u{1F602}");
    }

    /// A run of code points from U+1F680..=U+1F6FF lexes as one `EmojiTransport` token.
    #[test]
    fn test_emoji_transport() {
        // Rocket, helicopter, steam locomotive.
        let input = "\u{1F680}\u{1F681}\u{1F682}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::EmojiTransport);
        assert_eq!(tokens[0].text, "\u{1F680}\u{1F681}\u{1F682}");
    }

    /// Hiragana text lexes as a single `Hiragana` token.
    #[test]
    fn test_hiragana() {
        // The word "hiragana" written in hiragana (hi-ra-ga-na).
        let input = "\u{3072}\u{3089}\u{304C}\u{306A}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Hiragana);
        assert_eq!(tokens[0].text, "\u{3072}\u{3089}\u{304C}\u{306A}");
    }

    /// Katakana text lexes as a single `Katakana` token.
    #[test]
    fn test_katakana() {
        // The word "katakana" written in katakana (ka-ta-ka-na).
        let input = "\u{30AB}\u{30BF}\u{30AB}\u{30CA}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Katakana);
        assert_eq!(tokens[0].text, "\u{30AB}\u{30BF}\u{30AB}\u{30CA}");
    }

    /// CJK ideographs lex as a single `Kanji` token.
    #[test]
    fn test_kanji() {
        // The word "kanji" (two ideographs).
        let input = "\u{6F22}\u{5B57}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Kanji);
        assert_eq!(tokens[0].text, "\u{6F22}\u{5B57}");
    }

    /// Adjacent runs from different Japanese scripts split at the script boundary.
    #[test]
    fn test_mixed_japanese() {
        // hiragana "hiragana" + katakana "katakana" + kanji "kanji", no separators.
        let input = "\u{3072}\u{3089}\u{304C}\u{306A}\u{30AB}\u{30BF}\u{30AB}\u{30CA}\u{6F22}\u{5B57}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].kind, TokenKind::Hiragana);
        assert_eq!(tokens[0].text, "\u{3072}\u{3089}\u{304C}\u{306A}");
        assert_eq!(tokens[1].kind, TokenKind::Katakana);
        assert_eq!(tokens[1].text, "\u{30AB}\u{30BF}\u{30AB}\u{30CA}");
        assert_eq!(tokens[2].kind, TokenKind::Kanji);
        assert_eq!(tokens[2].text, "\u{6F22}\u{5B57}");
    }

    /// Cyrillic text lexes as a single `Cyrillic` token.
    #[test]
    fn test_cyrillic() {
        // Russian "Privet" (hello).
        let input = "\u{041F}\u{0440}\u{0438}\u{0432}\u{0435}\u{0442}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Cyrillic);
        assert_eq!(tokens[0].text, "\u{041F}\u{0440}\u{0438}\u{0432}\u{0435}\u{0442}");
    }

    /// Hangul syllables lex as a single `Hangul` token.
    #[test]
    fn test_hangul() {
        // Korean "hangeul" (two syllable blocks).
        let input = "\u{D55C}\u{AE00}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Hangul);
        assert_eq!(tokens[0].text, "\u{D55C}\u{AE00}");
    }

    /// Arabic text lexes as a single `Arabic` token.
    #[test]
    fn test_arabic() {
        // Arabic "al-arabiyya" (the Arabic language).
        let input = "\u{0627}\u{0644}\u{0639}\u{0631}\u{0628}\u{064A}\u{0629}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::Arabic);
        assert_eq!(
            tokens[0].text,
            "\u{0627}\u{0644}\u{0639}\u{0631}\u{0628}\u{064A}\u{0629}"
        );
    }

    /// Emoji, digits and katakana interleaved without separators split into four tokens.
    #[test]
    fn test_mixed_with_numbers() {
        // One emoticon, "123", katakana "katakana", "456".
        let input = "\u{1F600}123\u{30AB}\u{30BF}\u{30AB}\u{30CA}456";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].kind, TokenKind::EmojiEmoticons);
        assert_eq!(tokens[0].text, "\u{1F600}");
        assert_eq!(tokens[1].kind, TokenKind::Number);
        assert_eq!(tokens[1].text, "123");
        assert_eq!(tokens[2].kind, TokenKind::Katakana);
        assert_eq!(tokens[2].text, "\u{30AB}\u{30BF}\u{30AB}\u{30CA}");
        assert_eq!(tokens[3].kind, TokenKind::Number);
        assert_eq!(tokens[3].text, "456");
    }

    /// Three emoticons separated by single spaces.
    #[test]
    fn test_emoji_with_whitespace() {
        let input = "\u{1F600} \u{1F601} \u{1F602}";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        // NOTE(review): this expects 3 emoji tokens + 2 whitespace tokens, i.e.
        // it assumes `tokenize()` still returns matches of the whitespace rule
        // even though that rule targets `_`. If the generator drops `_` matches
        // the correct count is 3. The original comment here claimed both
        // things at once; the asserted value is kept unchanged pending
        // confirmation.
        assert_eq!(tokens.len(), 5);
    }

    /// ASCII uppercase letters (rule written with `\x41-\x5A`) lex as `UppercaseHex`.
    #[test]
    fn test_hex_uppercase() {
        let input = "ABCXYZ";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::UppercaseHex);
        assert_eq!(tokens[0].text, "ABCXYZ");
    }

    /// ASCII lowercase letters (rule written with `\x61-\x7A`) lex as `LowercaseHex`.
    #[test]
    fn test_hex_lowercase() {
        let input = "abcxyz";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].kind, TokenKind::LowercaseHex);
        assert_eq!(tokens[0].text, "abcxyz");
    }

    /// Uppercase, digit and lowercase runs split at each class boundary.
    #[test]
    fn test_hex_mixed() {
        let input = "ABC123abc";
        let mut lexer = Lexer::from_str(input);
        let tokens = lexer.tokenize();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].kind, TokenKind::UppercaseHex);
        assert_eq!(tokens[0].text, "ABC");
        assert_eq!(tokens[1].kind, TokenKind::Number);
        assert_eq!(tokens[1].text, "123");
        assert_eq!(tokens[2].kind, TokenKind::LowercaseHex);
        assert_eq!(tokens[2].text, "abc");
    }
}