use std::mem;
use ld_lucivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
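
/// Width of the emitted character n-grams (trigrams).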
const NGRAM_SIZE: usize = 3;
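
/// A `TokenFilter` that re-emits every token from the wrapped tokenizer as
/// overlapping character n-grams of width `NGRAM_SIZE`; tokens shorter than
/// `NGRAM_SIZE` pass through unchanged.
///
/// A minimal usage sketch, mirroring the analyzer built in the tests below
/// (the `ld_lucivy` builder API is assumed to match that usage):
///
/// ```ignore
/// let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
///     .filter(LowerCaser)
///     .filter(NgramFilter)
///     .build();
/// // "hello" -> ["hel", "ell", "llo"]
/// ```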
#[derive(Clone)]
pub struct NgramFilter;

impl TokenFilter for NgramFilter {
    type Tokenizer<T: Tokenizer> = NgramFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> NgramFilterWrapper<T> {
        NgramFilterWrapper { inner: tokenizer }
    }
}
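
/// `Tokenizer` returned by `NgramFilter::transform`; wraps the inner
/// tokenizer and n-grams its output.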
#[derive(Clone)]
pub struct NgramFilterWrapper<T> {
    inner: T,
}

impl<T: Tokenizer> Tokenizer for NgramFilterWrapper<T> {
    type TokenStream<'a> = NgramFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        NgramFilterStream {
            tail: self.inner.token_stream(text),
            buffer: Vec::new(),
            buffer_idx: 0,
            token: Token {
                offset_from: 0,
                offset_to: 0,
                position: 0,
                text: String::new(),
                position_length: 1,
            },
        }
    }
}
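
/// Token stream that expands each token from `tail` into character n-grams,
/// buffering the n-grams of the current token and emitting them one per
/// `advance` call.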
pub struct NgramFilterStream<T> {
    /// The wrapped source token stream.
    tail: T,
    /// N-grams of the current source token.
    buffer: Vec<String>,
    /// Index of the next n-gram in `buffer` to emit.
    buffer_idx: usize,
    /// The token exposed to callers.
    token: Token,
}

impl<T: TokenStream> TokenStream for NgramFilterStream<T> {
    fn advance(&mut self) -> bool {
        // Drain any n-grams still buffered from the previous source token.
        if self.buffer_idx < self.buffer.len() {
            self.token.text.clear();
            self.token.text.push_str(&self.buffer[self.buffer_idx]);
            self.buffer_idx += 1;
            return true;
        }
        // Buffer exhausted: pull the next token from the wrapped stream.
        if !self.tail.advance() {
            return false;
        }
        // Copy the source token's metadata and steal its text so the
        // n-grams can be built without cloning the whole string.
        let (offset_from, offset_to, position, position_length, text) = {
            let src = self.tail.token_mut();
            (
                src.offset_from,
                src.offset_to,
                src.position,
                src.position_length,
                mem::take(&mut src.text),
            )
        };
        // Every n-gram of a token shares that token's offsets and position.
        self.token.offset_from = offset_from;
        self.token.offset_to = offset_to;
        self.token.position = position;
        self.token.position_length = position_length;
        // Rebuild the buffer: short tokens pass through unchanged, longer
        // ones are split into overlapping windows of NGRAM_SIZE chars.
        self.buffer.clear();
        let chars: Vec<char> = text.chars().collect();
        if chars.len() < NGRAM_SIZE {
            self.buffer.push(text);
        } else {
            for w in chars.windows(NGRAM_SIZE) {
                self.buffer.push(w.iter().collect());
            }
        }
        // Emit the first n-gram now; the rest are served from the buffer.
        self.buffer_idx = 1;
        self.token.text.clear();
        self.token.text.push_str(&self.buffer[0]);
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

#[cfg(test)]
mod tests {
    use ld_lucivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};

    use super::NgramFilter;

    /// Run `text` through SimpleTokenizer -> LowerCaser -> NgramFilter and
    /// collect the emitted token texts.
    fn tokenize(text: &str) -> Vec<String> {
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(NgramFilter)
            .build();
        let mut stream = analyzer.token_stream(text);
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        tokens
    }

    #[test]
    fn test_trigrams_basic() {
        assert_eq!(tokenize("hello"), vec!["hel", "ell", "llo"]);
    }

    #[test]
    fn test_short_token_passthrough() {
        assert_eq!(tokenize("ab"), vec!["ab"]);
        assert_eq!(tokenize("a"), vec!["a"]);
    }

    #[test]
    fn test_exact_trigram_length() {
        assert_eq!(tokenize("abc"), vec!["abc"]);
    }

    #[test]
    fn test_multi_token() {
        assert_eq!(
            tokenize("hello world"),
            vec!["hel", "ell", "llo", "wor", "orl", "rld"]
        );
    }

    #[test]
    fn test_lowercase() {
        assert_eq!(tokenize("Hello"), vec!["hel", "ell", "llo"]);
    }

    #[test]
    fn test_programming() {
        assert_eq!(
            tokenize("programming"),
            vec!["pro", "rog", "ogr", "gra", "ram", "amm", "mmi", "min", "ing"]
        );
    }

    #[test]
    fn test_mixed_lengths() {
        assert_eq!(tokenize("I am OK"), vec!["i", "am", "ok"]);
    }

    #[test]
    fn test_unicode() {
        // "é" is a single `char` here, so "café" yields exactly two trigrams.
        assert_eq!(tokenize("café"), vec!["caf", "afé"]);
    }

    #[test]
    fn test_positions_preserved() {
        // Every n-gram of a token keeps that token's position.
        let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(NgramFilter)
            .build();
        let mut stream = analyzer.token_stream("hello world");
        let mut positions = Vec::new();
        while stream.advance() {
            positions.push((stream.token().text.clone(), stream.token().position));
        }
        assert_eq!(
            positions,
            vec![
                ("hel".into(), 0),
                ("ell".into(), 0),
                ("llo".into(), 0),
                ("wor".into(), 1),
                ("orl".into(), 1),
                ("rld".into(), 1),
            ]
        );
    }
}