use crate::analysis::token::{Token, TokenStream};
use crate::analysis::tokenizer::Tokenizer;
use crate::error::{LaurusError, Result};
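
/// Tokenizer that splits text into overlapping character n-grams.
///
/// Each input position yields one token per gram size in
/// `min_gram..=max_gram`; token offsets are byte offsets into the
/// original UTF-8 text, so multi-byte characters are handled correctly.
///
/// A sketch of typical usage (marked `ignore` because the public path to
/// these types depends on how the crate re-exports them):
///
/// ```ignore
/// let tokenizer = NgramTokenizer::bigram();
/// let tokens: Vec<Token> = tokenizer.tokenize("hello")?.collect();
/// // Yields "he", "el", "ll", "lo".
/// ```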
#[derive(Clone, Debug)]
pub struct NgramTokenizer {
    /// Smallest gram size emitted, in characters (must be at least 1).
    min_gram: usize,
    /// Largest gram size emitted, in characters.
    max_gram: usize,
}

impl NgramTokenizer {
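    /// Creates a tokenizer emitting grams of every length in
    /// `min_gram..=max_gram`, measured in characters.
    ///
    /// # Errors
    ///
    /// Returns an analysis error if `min_gram` is 0 or if `max_gram`
    /// is smaller than `min_gram`.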
    pub fn new(min_gram: usize, max_gram: usize) -> Result<Self> {
        if min_gram == 0 {
            return Err(LaurusError::analysis(
                "min_gram must be at least 1".to_string(),
            ));
        }
        if max_gram < min_gram {
            return Err(LaurusError::analysis(format!(
                "max_gram ({}) must be >= min_gram ({})",
                max_gram, min_gram
            )));
        }
        Ok(Self { min_gram, max_gram })
    }
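
    /// Convenience constructor for a fixed gram size of 2.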
    pub fn bigram() -> Self {
        Self {
            min_gram: 2,
            max_gram: 2,
        }
    }
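
    /// Convenience constructor for a fixed gram size of 3.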
    pub fn trigram() -> Self {
        Self {
            min_gram: 3,
            max_gram: 3,
        }
    }
}

impl Tokenizer for NgramTokenizer {
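    // Emits n-grams ordered by start position, then by gram size; e.g.
    // "abc" with (2, 3) yields "ab", "abc", "bc".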
    fn tokenize(&self, text: &str) -> Result<TokenStream> {
        let chars: Vec<char> = text.chars().collect();
        // Precompute the byte offset of every character boundary once, so
        // each token's offsets are O(1) lookups instead of a fresh prefix
        // sum per n-gram.
        let mut byte_offsets = Vec::with_capacity(chars.len() + 1);
        let mut offset = 0;
        byte_offsets.push(offset);
        for c in &chars {
            offset += c.len_utf8();
            byte_offsets.push(offset);
        }
        let mut tokens = Vec::new();
        let mut token_position = 0;
        for start in 0..chars.len() {
            for gram_size in self.min_gram..=self.max_gram {
                let end = start + gram_size;
                if end > chars.len() {
                    // Longer grams at this start would also overrun; stop early.
                    break;
                }
                let ngram: String = chars[start..end].iter().collect();
                tokens.push(Token::with_offsets(
                    &ngram,
                    token_position,
                    byte_offsets[start],
                    byte_offsets[end],
                ));
                token_position += 1;
            }
        }
        Ok(Box::new(tokens.into_iter()))
    }

    fn name(&self) -> &'static str {
        "ngram"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ngram_creation() {
        let tokenizer = NgramTokenizer::new(2, 3);
        assert!(tokenizer.is_ok());
        let tokenizer = NgramTokenizer::new(0, 2);
        assert!(tokenizer.is_err());
        let tokenizer = NgramTokenizer::new(3, 2);
        assert!(tokenizer.is_err());
    }

    #[test]
    fn test_bigram() {
        let tokenizer = NgramTokenizer::bigram();
        let tokens: Vec<Token> = tokenizer.tokenize("hello").unwrap().collect();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "he");
        assert_eq!(tokens[1].text, "el");
        assert_eq!(tokens[2].text, "ll");
        assert_eq!(tokens[3].text, "lo");
    }

    #[test]
    fn test_trigram() {
        let tokenizer = NgramTokenizer::trigram();
        let tokens: Vec<Token> = tokenizer.tokenize("hello").unwrap().collect();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].text, "hel");
        assert_eq!(tokens[1].text, "ell");
        assert_eq!(tokens[2].text, "llo");
    }

    #[test]
    fn test_variable_ngram() {
        let tokenizer = NgramTokenizer::new(2, 3).unwrap();
        let tokens: Vec<Token> = tokenizer.tokenize("abc").unwrap().collect();
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].text, "ab");
        assert_eq!(tokens[1].text, "abc");
        assert_eq!(tokens[2].text, "bc");
    }

    #[test]
    fn test_unicode_support() {
        let tokenizer = NgramTokenizer::bigram();

        // Each of these characters is 3 bytes in UTF-8, so the byte
        // offsets advance in steps of 3.
        let tokens: Vec<Token> = tokenizer.tokenize("日本語").unwrap().collect();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "日本");
        assert_eq!(tokens[1].text, "本語");
        assert_eq!(tokens[0].start_offset, 0);
        assert_eq!(tokens[0].end_offset, 6);
        assert_eq!(tokens[1].start_offset, 3);
        assert_eq!(tokens[1].end_offset, 9);

        let tokens: Vec<Token> = tokenizer.tokenize("ゴジラ").unwrap().collect();
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "ゴジ");
        assert_eq!(tokens[1].text, "ジラ");
        assert_eq!(tokens[0].start_offset, 0);
        assert_eq!(tokens[0].end_offset, 6);
        assert_eq!(tokens[1].start_offset, 3);
        assert_eq!(tokens[1].end_offset, 9);

        // NFKD decomposes ゴ and ジ into base katakana plus the combining
        // voiced sound mark (U+3099), turning 3 chars into 5, so bigrams
        // now straddle the combining mark.
        use unicode_normalization::UnicodeNormalization;
        let nfkd_text = "ゴジラ".nfkd().collect::<String>();
        let tokens: Vec<Token> = tokenizer.tokenize(&nfkd_text).unwrap().collect();
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "コ\u{3099}");
        assert_eq!(tokens[1].text, "\u{3099}シ");
        assert_eq!(tokens[2].text, "シ\u{3099}");
        assert_eq!(tokens[3].text, "\u{3099}ラ");
    }

    #[test]
    fn test_short_text() {
        // Input shorter than min_gram yields no tokens.
        let tokenizer = NgramTokenizer::new(3, 5).unwrap();
        let tokens: Vec<Token> = tokenizer.tokenize("ab").unwrap().collect();
        assert_eq!(tokens.len(), 0);
    }

    #[test]
    fn test_exact_length() {
        let tokenizer = NgramTokenizer::new(3, 3).unwrap();
        let tokens: Vec<Token> = tokenizer.tokenize("abc").unwrap().collect();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "abc");
    }

    #[test]
    fn test_tokenizer_name() {
        let tokenizer = NgramTokenizer::bigram();
        assert_eq!(tokenizer.name(), "ngram");
    }
}