use crate::analysis::token::{Token, TokenStream, TokenType};
use crate::analysis::tokenizer::Tokenizer;
use crate::error::Result;
use crate::util::simd;
/// Tokenizer that splits input text on runs of whitespace.
///
/// Stateless unit struct; `Default` and `new()` are equivalent.
#[derive(Clone, Debug, Default)]
pub struct WhitespaceTokenizer;

impl WhitespaceTokenizer {
    /// Creates a new whitespace tokenizer.
    pub fn new() -> Self {
        Self
    }
}
impl Tokenizer for WhitespaceTokenizer {
    /// Splits `text` on whitespace, choosing the SIMD fast path for ASCII
    /// inputs long enough (>= 32 bytes) to amortize the vectorized scan.
    fn tokenize(&self, text: &str) -> Result<TokenStream> {
        // Both predicates are cheap and side-effect free, so their order
        // does not affect behavior.
        let simd_eligible = text.len() >= 32 && text.is_ascii();
        if simd_eligible {
            self.tokenize_simd(text)
        } else {
            self.tokenize_fallback(text)
        }
    }

    /// Registry name under which this tokenizer is identified.
    fn name(&self) -> &'static str {
        "whitespace"
    }
}
impl WhitespaceTokenizer {
    /// Classifies a whitespace-delimited word into a coarse script /
    /// character-class category.
    ///
    /// Checks run in order and the first match wins: all-ASCII-digits,
    /// any CJK ideograph, all katakana, all hiragana, any hangul, all
    /// ASCII alphanumerics (plus `-`/`_`), all ASCII punctuation, then
    /// `Other`. Note the asymmetry: CJK/Hangul trigger on *any* matching
    /// char, while the other classes require *every* char to match.
    fn detect_token_type(word: &str) -> TokenType {
        if word.is_empty() {
            return TokenType::Other;
        }
        if word.chars().all(|c| c.is_ascii_digit()) {
            return TokenType::Num;
        }
        // CJK Unified Ideographs plus Extensions A-E.
        if word.chars().any(|c| {
            matches!(c,
                '\u{4E00}'..='\u{9FFF}' | '\u{3400}'..='\u{4DBF}' | '\u{20000}'..='\u{2A6DF}' | '\u{2A700}'..='\u{2B73F}' | '\u{2B740}'..='\u{2B81F}' | '\u{2B820}'..='\u{2CEAF}' )
        }) {
            return TokenType::Cjk;
        }
        if word.chars().all(|c| matches!(c, '\u{30A0}'..='\u{30FF}')) {
            return TokenType::Katakana;
        }
        if word.chars().all(|c| matches!(c, '\u{3040}'..='\u{309F}')) {
            return TokenType::Hiragana;
        }
        // Hangul Syllables or Hangul Jamo.
        if word
            .chars()
            .any(|c| matches!(c, '\u{AC00}'..='\u{D7AF}' | '\u{1100}'..='\u{11FF}'))
        {
            return TokenType::Hangul;
        }
        if word
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
        {
            return TokenType::Alphanum;
        }
        if word.chars().all(|c| c.is_ascii_punctuation()) {
            return TokenType::Punctuation;
        }
        TokenType::Other
    }

    /// ASCII fast path: locates whitespace with a SIMD scan and slices
    /// tokens out of `text` by byte offset.
    ///
    /// Only called for pure-ASCII input (see `tokenize`), so byte offsets
    /// are always valid `char` boundaries and slicing cannot panic.
    fn tokenize_simd(&self, text: &str) -> Result<TokenStream> {
        let bytes = text.as_bytes();
        let mut tokens = Vec::new();
        let mut position = 0;
        let mut start = 0;
        // Skip leading whitespace so the loop always starts on a word byte.
        while start < bytes.len() && bytes[start].is_ascii_whitespace() {
            start += 1;
        }
        while start < bytes.len() {
            // End of the current word: next whitespace byte, or end of input.
            let word_end = match simd::ascii::find_whitespace_simd(&bytes[start..]) {
                Some(offset) => start + offset,
                None => bytes.len(),
            };
            if word_end > start {
                let word = &text[start..word_end];
                let token_type = Self::detect_token_type(word);
                let token = Token::with_offsets(word, position, start, word_end)
                    .with_token_type(token_type);
                tokens.push(token);
                position += 1;
            }
            // Skip the whitespace run separating this word from the next.
            start = word_end;
            while start < bytes.len() && bytes[start].is_ascii_whitespace() {
                start += 1;
            }
        }
        Ok(Box::new(tokens.into_iter()))
    }

    /// Scalar fallback for non-ASCII or short inputs, built on
    /// `str::split_whitespace`.
    fn tokenize_fallback(&self, text: &str) -> Result<TokenStream> {
        let tokens: Vec<Token> = text
            .split_whitespace()
            .enumerate()
            .map(|(position, word)| {
                // `split_whitespace` yields subslices of `text`, so the exact
                // byte offset of *this* occurrence is recoverable from pointer
                // arithmetic. The previous `text.find(word)` returned the
                // offset of the FIRST occurrence, which produced wrong offsets
                // for repeated words (e.g. the second "a" in "a b a").
                let start_offset = word.as_ptr() as usize - text.as_ptr() as usize;
                let end_offset = start_offset + word.len();
                let token_type = Self::detect_token_type(word);
                Token::with_offsets(word, position, start_offset, end_offset)
                    .with_token_type(token_type)
            })
            .collect();
        Ok(Box::new(tokens.into_iter()))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Mixed space/tab input should yield exactly the three words, in order.
    #[test]
    fn test_whitespace_tokenizer() {
        let tokens: Vec<Token> = WhitespaceTokenizer::new()
            .tokenize("hello world\ttest")
            .unwrap()
            .collect();
        assert_eq!(tokens.len(), 3);
        for (token, expected) in tokens.iter().zip(["hello", "world", "test"]) {
            assert_eq!(token.text, expected);
        }
    }

    /// The tokenizer registers under the name "whitespace".
    #[test]
    fn test_tokenizer_name() {
        let tokenizer = WhitespaceTokenizer::new();
        assert_eq!(tokenizer.name(), "whitespace");
    }
}