use unicode_segmentation::UnicodeSegmentation;
use crate::analysis::token::{Token, TokenStream, TokenType};
use crate::analysis::tokenizer::Tokenizer;
use crate::error::Result;
/// Tokenizer that splits text on Unicode word boundaries (via the
/// `unicode-segmentation` crate's `split_word_bounds`) and tags each
/// word-like segment with a script-based [`TokenType`].
///
/// Stateless unit struct, so `Clone`/`Default` are free.
#[derive(Clone, Debug, Default)]
pub struct UnicodeWordTokenizer;
impl UnicodeWordTokenizer {
pub fn new() -> Self {
UnicodeWordTokenizer
}
fn detect_token_type(word: &str) -> TokenType {
if word.is_empty() {
return TokenType::Other;
}
if word.chars().all(|c| c.is_numeric()) {
return TokenType::Num;
}
if word.chars().all(|c| matches!(c, '\u{3040}'..='\u{309F}')) {
return TokenType::Hiragana;
}
if word.chars().all(|c| matches!(c, '\u{30A0}'..='\u{30FF}')) {
return TokenType::Katakana;
}
if word
.chars()
.any(|c| matches!(c, '\u{AC00}'..='\u{D7AF}' | '\u{1100}'..='\u{11FF}'))
{
return TokenType::Hangul;
}
if word.chars().any(|c| {
matches!(c,
'\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{20000}'..='\u{2A6DF}' | '\u{2A700}'..='\u{2B73F}' | '\u{2B740}'..='\u{2B81F}' | '\u{2B820}'..='\u{2CEAF}' )
}) {
return TokenType::Cjk;
}
if word
.chars()
.all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
{
return TokenType::Alphanum;
}
if word.chars().all(|c| c.is_ascii_punctuation()) {
return TokenType::Punctuation;
}
TokenType::Other
}
}
impl Tokenizer for UnicodeWordTokenizer {
    /// Splits `text` on Unicode word boundaries and emits one token per
    /// segment that contains at least one alphanumeric character;
    /// whitespace- and punctuation-only segments are dropped.
    ///
    /// Offsets are byte offsets into `text` (accumulated from `word.len()`).
    /// NOTE(review): `position` is the index of the segment among *all*
    /// word-boundary segments, so token positions are non-contiguous when
    /// punctuation/whitespace segments are skipped — confirm this is the
    /// intended position semantics.
    fn tokenize(&self, text: &str) -> Result<TokenStream> {
        let mut tokens = Vec::new();
        let mut offset = 0;
        for (segment_index, segment) in text.split_word_bounds().enumerate() {
            let start = offset;
            offset += segment.len();
            // Keep only word-like segments.
            if !segment.chars().any(char::is_alphanumeric) {
                continue;
            }
            let token = Token::with_offsets(segment, segment_index, start, start + segment.len())
                .with_token_type(Self::detect_token_type(segment));
            tokens.push(token);
        }
        Ok(Box::new(tokens.into_iter()))
    }

    /// Registry name of this tokenizer.
    fn name(&self) -> &'static str {
        "unicode_word"
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Punctuation and whitespace segments are dropped; only the two
    /// word segments survive.
    #[test]
    fn test_unicode_word_tokenizer() {
        let collected: Vec<Token> = UnicodeWordTokenizer::new()
            .tokenize("hello, world!")
            .unwrap()
            .collect();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0].text, "hello");
        assert_eq!(collected[1].text, "world");
    }

    /// The tokenizer reports its registry name.
    #[test]
    fn test_tokenizer_name() {
        let tok = UnicodeWordTokenizer::new();
        assert_eq!(tok.name(), "unicode_word");
    }
}