use std::collections::HashSet;
use crate::analysis::token::Token;
/// A transformation applied in place to a list of tokens.
///
/// Implementations in this file rewrite token text, remove tokens, or add new
/// tokens to the vector. The `Send + Sync` supertraits require every filter to
/// be shareable across threads.
pub trait TokenFilter: Send + Sync {
/// Applies this filter to `tokens`, mutating the vector in place.
fn apply(&self, tokens: &mut Vec<Token>);
}
/// Token filter that converts the text of every token to lowercase.
pub struct LowercaseFilter;

impl TokenFilter for LowercaseFilter {
    /// Lowercases each token's text with `str::to_lowercase`.
    ///
    /// Only the `text` field is touched; the number and order of tokens are
    /// unchanged.
    fn apply(&self, tokens: &mut Vec<Token>) {
        tokens.iter_mut().for_each(|token| {
            let lowercase = token.text.to_lowercase();
            // Leave the existing string alone when it was already lowercase.
            if token.text != lowercase {
                token.text = lowercase;
            }
        });
    }
}
/// Token filter that drops every token whose text is in a stop-word set.
///
/// The lookup is an exact string match, so it is case-sensitive.
pub struct StopFilter {
    stop_words: HashSet<String>,
}

impl StopFilter {
    /// Builds a filter from any iterable of string-like stop words.
    pub fn new(words: impl IntoIterator<Item = impl Into<String>>) -> Self {
        let mut stop_words = HashSet::new();
        for word in words {
            stop_words.insert(word.into());
        }
        Self { stop_words }
    }

    /// Builds a filter from the built-in `ENGLISH_STOP_WORDS` list.
    pub fn english() -> Self {
        Self::new(ENGLISH_STOP_WORDS.iter().map(|word| word.to_string()))
    }
}

impl TokenFilter for StopFilter {
    /// Removes stop words in place; surviving tokens keep their fields as-is.
    fn apply(&self, tokens: &mut Vec<Token>) {
        tokens.retain(|token| {
            let is_stop_word = self.stop_words.contains(token.text.as_str());
            !is_stop_word
        });
    }
}
/// Stop words installed by `StopFilter::english`.
///
/// Every entry is lowercase and `StopFilter` compares token text exactly, so a
/// token such as `"The"` is not matched unless it has been lowercased first.
const ENGLISH_STOP_WORDS: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with",
];
/// Token filter that replaces each token's text with its stem, as computed by
/// a `rust_stemmers` stemmer.
pub struct StemmerFilter {
    algorithm: rust_stemmers::Algorithm,
}

impl StemmerFilter {
    /// Creates a stemming filter for the given `rust_stemmers` algorithm.
    pub fn new(algorithm: rust_stemmers::Algorithm) -> Self {
        StemmerFilter { algorithm }
    }

    /// Convenience constructor for `rust_stemmers::Algorithm::English`.
    pub fn english() -> Self {
        let algorithm = rust_stemmers::Algorithm::English;
        Self::new(algorithm)
    }
}

impl TokenFilter for StemmerFilter {
    /// Stems every token's text in place; other token fields are untouched.
    fn apply(&self, tokens: &mut Vec<Token>) {
        // One stemmer instance is built per call and reused for all tokens.
        let stemmer = rust_stemmers::Stemmer::create(self.algorithm);
        for token in tokens.iter_mut() {
            // Compute the replacement in its own scope so the borrow of
            // `token.text` held by the stem ends before the assignment.
            let replacement = {
                let stem = stemmer.stem(&token.text);
                if *stem == *token.text {
                    None
                } else {
                    Some(stem.into_owned())
                }
            };
            if let Some(stemmed) = replacement {
                token.text = stemmed;
            }
        }
    }
}
pub use rust_stemmers::Algorithm as StemmerAlgorithm;
/// Token filter that folds accented / fullwidth Latin characters to ASCII.
pub struct AsciiFoldingFilter {
    /// When `true`, the unfolded token is kept and the folded form is added as
    /// an extra token; when `false`, the token text is overwritten.
    pub preserve_original: bool,
}

impl AsciiFoldingFilter {
    /// Creates a folding filter with the given `preserve_original` setting.
    pub fn new(preserve_original: bool) -> Self {
        AsciiFoldingFilter { preserve_original }
    }
}

impl TokenFilter for AsciiFoldingFilter {
    /// Folds token text via `ascii_fold`.
    ///
    /// With `preserve_original`, every token whose folded text differs gets a
    /// copy (same offsets and position, folded text) appended after all the
    /// existing tokens. Otherwise the text is replaced in place.
    fn apply(&self, tokens: &mut Vec<Token>) {
        if !self.preserve_original {
            for token in tokens.iter_mut() {
                let folded = ascii_fold(&token.text);
                if token.text != folded {
                    token.text = folded;
                }
            }
            return;
        }

        // Collect the folded variants first; they are appended at the end of
        // the vector, not next to their originals.
        let folded_copies: Vec<Token> = tokens
            .iter()
            .filter_map(|token| {
                let folded = ascii_fold(&token.text);
                if folded == token.text {
                    None
                } else {
                    Some(Token {
                        text: folded,
                        offset_from: token.offset_from,
                        offset_to: token.offset_to,
                        position: token.position,
                    })
                }
            })
            .collect();
        tokens.extend(folded_copies);
    }
}
/// Returns a copy of `s` in which known accented Latin characters, ligatures
/// and fullwidth Latin letters are replaced by their ASCII equivalents.
///
/// ASCII characters and characters without a known folding are copied through
/// unchanged. Unmapped characters are pushed directly into the output, so no
/// memory is leaked for them.
fn ascii_fold(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii() {
            result.push(ch);
        } else if let Some(replacement) = fold_char(ch) {
            result.push_str(replacement);
        } else if let Some(ascii) = fold_fullwidth(ch) {
            result.push(ascii);
        } else {
            // No folding known: keep the character as it is.
            result.push(ch);
        }
    }
    result
}

/// Looks up the ASCII replacement for a Latin-1 Supplement / Latin Extended
/// character, or returns `None` when the table has no entry for `ch`.
fn fold_char(ch: char) -> Option<&'static str> {
    let folded = match ch {
        // Latin-1 Supplement.
        '\u{00C0}'..='\u{00C5}' => "A",
        '\u{00C6}' => "AE",
        '\u{00C7}' => "C",
        '\u{00C8}'..='\u{00CB}' => "E",
        '\u{00CC}'..='\u{00CF}' => "I",
        '\u{00D0}' => "D",
        '\u{00D1}' => "N",
        '\u{00D2}'..='\u{00D6}' => "O",
        '\u{00D8}' => "O",
        '\u{00D9}'..='\u{00DC}' => "U",
        '\u{00DD}' => "Y",
        '\u{00DE}' => "TH",
        '\u{00DF}' => "ss",
        '\u{00E0}'..='\u{00E5}' => "a",
        '\u{00E6}' => "ae",
        '\u{00E7}' => "c",
        '\u{00E8}'..='\u{00EB}' => "e",
        '\u{00EC}'..='\u{00EF}' => "i",
        '\u{00F0}' => "d",
        '\u{00F1}' => "n",
        '\u{00F2}'..='\u{00F6}' => "o",
        '\u{00F8}' => "o",
        '\u{00F9}'..='\u{00FC}' => "u",
        '\u{00FD}' | '\u{00FF}' => "y",
        '\u{00FE}' => "th",
        // Latin Extended-A.
        '\u{0100}' | '\u{0102}' | '\u{0104}' => "A",
        '\u{0101}' | '\u{0103}' | '\u{0105}' => "a",
        '\u{0106}' | '\u{0108}' | '\u{010A}' | '\u{010C}' => "C",
        '\u{0107}' | '\u{0109}' | '\u{010B}' | '\u{010D}' => "c",
        '\u{010E}' | '\u{0110}' => "D",
        '\u{010F}' | '\u{0111}' => "d",
        '\u{0112}' | '\u{0114}' | '\u{0116}' | '\u{0118}' | '\u{011A}' => "E",
        '\u{0113}' | '\u{0115}' | '\u{0117}' | '\u{0119}' | '\u{011B}' => "e",
        '\u{011C}' | '\u{011E}' | '\u{0120}' | '\u{0122}' => "G",
        '\u{011D}' | '\u{011F}' | '\u{0121}' | '\u{0123}' => "g",
        '\u{0124}' | '\u{0126}' => "H",
        '\u{0125}' | '\u{0127}' => "h",
        '\u{0128}' | '\u{012A}' | '\u{012C}' | '\u{012E}' | '\u{0130}' => "I",
        '\u{0129}' | '\u{012B}' | '\u{012D}' | '\u{012F}' | '\u{0131}' => "i",
        '\u{0132}' => "IJ",
        '\u{0133}' => "ij",
        '\u{0134}' => "J",
        '\u{0135}' => "j",
        '\u{0136}' => "K",
        '\u{0137}' | '\u{0138}' => "k",
        '\u{0139}' | '\u{013B}' | '\u{013D}' | '\u{013F}' | '\u{0141}' => "L",
        '\u{013A}' | '\u{013C}' | '\u{013E}' | '\u{0140}' | '\u{0142}' => "l",
        '\u{0143}' | '\u{0145}' | '\u{0147}' | '\u{014A}' => "N",
        '\u{0144}' | '\u{0146}' | '\u{0148}' | '\u{0149}' | '\u{014B}' => "n",
        '\u{014C}' | '\u{014E}' | '\u{0150}' => "O",
        '\u{014D}' | '\u{014F}' | '\u{0151}' => "o",
        '\u{0152}' => "OE",
        '\u{0153}' => "oe",
        '\u{0154}' | '\u{0156}' | '\u{0158}' => "R",
        '\u{0155}' | '\u{0157}' | '\u{0159}' => "r",
        '\u{015A}' | '\u{015C}' | '\u{015E}' | '\u{0160}' => "S",
        '\u{015B}' | '\u{015D}' | '\u{015F}' | '\u{0161}' => "s",
        '\u{0162}' | '\u{0164}' | '\u{0166}' => "T",
        '\u{0163}' | '\u{0165}' | '\u{0167}' => "t",
        '\u{0168}' | '\u{016A}' | '\u{016C}' | '\u{016E}' | '\u{0170}' | '\u{0172}' => "U",
        '\u{0169}' | '\u{016B}' | '\u{016D}' | '\u{016F}' | '\u{0171}' | '\u{0173}' => "u",
        '\u{0174}' => "W",
        '\u{0175}' => "w",
        '\u{0176}' => "Y",
        '\u{0177}' => "y",
        '\u{0178}' => "Y",
        '\u{0179}' | '\u{017B}' | '\u{017D}' => "Z",
        '\u{017A}' | '\u{017C}' | '\u{017E}' => "z",
        // Comma-below letters: U+0218/U+0219 are S/s, U+021A/U+021B are T/t.
        '\u{0218}' => "S",
        '\u{0219}' => "s",
        '\u{021A}' => "T",
        '\u{021B}' => "t",
        // Latin Extended-B entries.
        '\u{01A0}' | '\u{01A2}' => "O",
        '\u{01A1}' | '\u{01A3}' => "o",
        '\u{01AF}' => "U",
        '\u{01B0}' => "u",
        _ => return None,
    };
    Some(folded)
}

/// Maps a fullwidth Latin letter (U+FF21..=U+FF3A, U+FF41..=U+FF5A) to the
/// matching ASCII letter; returns `None` for any other character.
fn fold_fullwidth(ch: char) -> Option<char> {
    // Fullwidth forms sit at a fixed distance from their ASCII counterparts:
    // U+FF21 - 0xFEE0 == 'A' and U+FF41 - 0xFEE0 == 'a'.
    const FULLWIDTH_OFFSET: u32 = 0xFEE0;
    match ch {
        '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => {
            char::from_u32(ch as u32 - FULLWIDTH_OFFSET)
        }
        _ => None,
    }
}
/// Token filter that replaces each token with all of its character n-grams.
pub struct NGramTokenFilter {
    /// Smallest gram length, in characters. Values below 1 are treated as 1.
    pub min_gram: usize,
    /// Largest gram length, in characters (inclusive).
    pub max_gram: usize,
}

impl NGramTokenFilter {
    /// Creates a filter emitting grams of `min_gram..=max_gram` characters.
    pub fn new(min_gram: usize, max_gram: usize) -> Self {
        Self { min_gram, max_gram }
    }
}

impl TokenFilter for NGramTokenFilter {
    /// Replaces `tokens` with the n-grams of every input token.
    ///
    /// For each token, grams are emitted by increasing length and, within one
    /// length, from left to right. Every gram inherits the offsets and
    /// position of the token it was cut from. Tokens shorter than the minimum
    /// gram length produce nothing, and `max_gram < min_gram` yields an empty
    /// output.
    fn apply(&self, tokens: &mut Vec<Token>) {
        // A zero-length gram is an empty string; asking for one used to index
        // past the end of the character table and panic.
        let min_gram = self.min_gram.max(1);
        let original = std::mem::take(tokens);

        for token in &original {
            // Byte offset of every char boundary, including the end of the
            // string, so that `boundaries[i]..boundaries[i + n]` is always a
            // valid n-character slice.
            let boundaries: Vec<usize> = token
                .text
                .char_indices()
                .map(|(offset, _)| offset)
                .chain(std::iter::once(token.text.len()))
                .collect();
            let char_count = boundaries.len() - 1;

            for n in min_gram..=self.max_gram.min(char_count) {
                for window in boundaries.windows(n + 1) {
                    let (start, end) = (window[0], window[n]);
                    tokens.push(Token {
                        text: token.text[start..end].to_string(),
                        offset_from: token.offset_from,
                        offset_to: token.offset_to,
                        position: token.position,
                    });
                }
            }
        }
    }
}
/// Token filter that replaces each token with its leading-edge n-grams
/// (prefixes).
pub struct EdgeNGramTokenFilter {
    /// Shortest prefix length, in characters. Values below 1 are treated as 1.
    pub min_gram: usize,
    /// Longest prefix length, in characters (inclusive).
    pub max_gram: usize,
    /// When `true`, the full token is also emitted if no prefix equals it.
    pub preserve_original: bool,
}

impl EdgeNGramTokenFilter {
    /// Creates a filter emitting prefixes of `min_gram..=max_gram` characters.
    pub fn new(min_gram: usize, max_gram: usize, preserve_original: bool) -> Self {
        Self {
            min_gram,
            max_gram,
            preserve_original,
        }
    }
}

impl TokenFilter for EdgeNGramTokenFilter {
    /// Replaces `tokens` with the prefixes of every input token.
    ///
    /// Prefixes are emitted shortest first and inherit the offsets and
    /// position of their source token. With `preserve_original`, the source
    /// token is appended after its prefixes unless one of them already is the
    /// whole token.
    fn apply(&self, tokens: &mut Vec<Token>) {
        // A zero-length prefix would be an empty-string token, so never
        // produce one even if `min_gram` is 0.
        let min_gram = self.min_gram.max(1);
        let original = std::mem::take(tokens);

        for token in original {
            // boundaries[n] is the byte offset just past the n-th character.
            let boundaries: Vec<usize> = token
                .text
                .char_indices()
                .map(|(offset, _)| offset)
                .chain(std::iter::once(token.text.len()))
                .collect();
            let char_count = boundaries.len() - 1;
            let longest = self.max_gram.min(char_count);

            for n in min_gram..=longest {
                tokens.push(Token {
                    text: token.text[..boundaries[n]].to_string(),
                    offset_from: token.offset_from,
                    offset_to: token.offset_to,
                    position: token.position,
                });
            }

            // The full token was produced as a prefix exactly when its length
            // falls inside the requested gram range.
            let emitted_original = min_gram <= char_count && char_count <= self.max_gram;
            if self.preserve_original && !emitted_original {
                tokens.push(token);
            }
        }
    }
}
/// Token filter that adds synonyms for tokens according to a rule table.
pub struct SynonymFilter {
    /// Lowercased term -> synonyms to add for it, in first-seen order and
    /// without duplicates.
    synonym_map: std::collections::HashMap<String, Vec<String>>,
}

impl SynonymFilter {
    /// Builds the synonym table from textual rules.
    ///
    /// Each rule is trimmed; empty rules and rules starting with `#` are
    /// skipped. All terms are trimmed and lowercased, and empty terms are
    /// dropped. Two rule shapes are understood:
    ///
    /// * `a, b => x, y` — every left-hand term maps to every right-hand term
    ///   (`expand` is ignored for this shape).
    /// * `a, b, c` — with `expand == true` every term maps to all the others;
    ///   with `expand == false` every term after the first maps to the first.
    ///
    /// A term is never recorded as its own synonym, and a synonym is recorded
    /// at most once per term even if several rules mention it.
    pub fn new(rules: &[String], expand: bool) -> Self {
        /// Splits a comma-separated list into trimmed, lowercased, non-empty terms.
        fn parse_terms(list: &str) -> Vec<String> {
            list.split(',')
                .map(|term| term.trim().to_lowercase())
                .filter(|term| !term.is_empty())
                .collect()
        }

        /// Records `synonym` for `term`, skipping self-references and repeats.
        fn link(
            map: &mut std::collections::HashMap<String, Vec<String>>,
            term: &str,
            synonym: &str,
        ) {
            if term == synonym {
                return;
            }
            let synonyms = map.entry(term.to_owned()).or_default();
            if !synonyms.iter().any(|known| known == synonym) {
                synonyms.push(synonym.to_owned());
            }
        }

        let mut synonym_map: std::collections::HashMap<String, Vec<String>> =
            std::collections::HashMap::new();

        for rule in rules {
            let rule = rule.trim();
            if rule.is_empty() || rule.starts_with('#') {
                continue;
            }

            if let Some((left, right)) = rule.split_once("=>") {
                // Explicit mapping: left-hand terms -> right-hand terms.
                let targets = parse_terms(right);
                for source in parse_terms(left) {
                    for target in &targets {
                        link(&mut synonym_map, &source, target);
                    }
                }
                continue;
            }

            let terms = parse_terms(rule);
            if expand {
                // Equivalence set: every term maps to every other term.
                for term in &terms {
                    for other in &terms {
                        link(&mut synonym_map, term, other);
                    }
                }
            } else if let Some((canonical, aliases)) = terms.split_first() {
                // Contraction: later terms map to the first (canonical) one.
                for alias in aliases {
                    link(&mut synonym_map, alias, canonical);
                }
            }
        }

        Self { synonym_map }
    }
}
impl TokenFilter for SynonymFilter {
    /// Adds one token per configured synonym of every input token.
    ///
    /// The lookup uses the token text exactly as it is. Each synonym token
    /// copies the offsets and position of the token that triggered it, and all
    /// synonym tokens are appended after the existing tokens.
    fn apply(&self, tokens: &mut Vec<Token>) {
        let expansions: Vec<Token> = tokens
            .iter()
            .filter_map(|token| {
                self.synonym_map
                    .get(&token.text)
                    .map(|synonyms| (token, synonyms))
            })
            .flat_map(|(token, synonyms)| {
                synonyms.iter().map(move |synonym| Token {
                    text: synonym.clone(),
                    offset_from: token.offset_from,
                    offset_to: token.offset_to,
                    position: token.position,
                })
            })
            .collect();
        tokens.extend(expansions);
    }
}
/// Token filter that joins runs of adjacent tokens into shingles (word
/// n-grams).
pub struct ShingleFilter {
    /// Smallest number of tokens per shingle. Values below 1 are treated as 1.
    pub min_size: usize,
    /// Largest number of tokens per shingle (inclusive).
    pub max_size: usize,
    /// When `true`, every input token is also emitted on its own.
    pub output_unigrams: bool,
    /// String placed between the token texts of a shingle.
    pub separator: String,
    /// Stored by the constructor; not read by `apply` in this file.
    pub filler_token: String,
}

impl ShingleFilter {
    /// Creates a shingle filter with a single-space separator and `"_"` as
    /// filler token.
    pub fn new(min_size: usize, max_size: usize, output_unigrams: bool) -> Self {
        Self {
            min_size,
            max_size,
            output_unigrams,
            separator: " ".to_string(),
            filler_token: "_".to_string(),
        }
    }
}

impl TokenFilter for ShingleFilter {
    /// Replaces `tokens` with shingles built from consecutive input tokens.
    ///
    /// For every start index the optional unigram comes first, followed by the
    /// shingles starting there in order of increasing size. A shingle spans
    /// from the first token's `offset_from` to the last token's `offset_to`
    /// and takes the first token's position.
    fn apply(&self, tokens: &mut Vec<Token>) {
        // A shingle of zero tokens has no first or last element; requesting
        // one used to panic on `unwrap()`.
        let min_size = self.min_size.max(1);
        // Take the input instead of cloning it; the vector is rebuilt below.
        let original = std::mem::take(tokens);

        for start in 0..original.len() {
            let first = &original[start];
            if self.output_unigrams {
                tokens.push(first.clone());
            }

            // Never reach past the end of the input.
            let longest = self.max_size.min(original.len() - start);
            for size in min_size..=longest {
                let window = &original[start..start + size];
                let last = &window[size - 1];
                let text = window
                    .iter()
                    .map(|t| t.text.as_str())
                    .collect::<Vec<_>>()
                    .join(self.separator.as_str());
                tokens.push(Token {
                    text,
                    offset_from: first.offset_from,
                    offset_to: last.offset_to,
                    position: first.position,
                });
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds one token per word: offsets `0..word.len()`, position = index.
    fn make_tokens(words: &[&str]) -> Vec<Token> {
        let mut tokens = Vec::with_capacity(words.len());
        for (index, word) in words.iter().enumerate() {
            tokens.push(Token::new(*word, 0, word.len(), index as u32));
        }
        tokens
    }

    /// Collects the text of every token, in order.
    fn texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    // ---- LowercaseFilter ----

    #[test]
    fn lowercase_basic() {
        let mut tokens = make_tokens(&["Hello", "WORLD", "TeSt"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..3], ["hello", "world", "test"]);
    }

    #[test]
    fn lowercase_already_lower() {
        let mut tokens = make_tokens(&["hello", "world"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["hello", "world"]);
    }

    #[test]
    fn lowercase_unicode() {
        let mut tokens = make_tokens(&["CAFÉ", "Ñoño"]);
        LowercaseFilter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["café", "ñoño"]);
    }

    #[test]
    fn lowercase_preserves_positions() {
        let mut tokens = make_tokens(&["A", "B", "C"]);
        LowercaseFilter.apply(&mut tokens);
        let positions: Vec<_> = tokens.iter().map(|token| token.position).collect();
        assert_eq!(&positions[..3], [0, 1, 2]);
    }

    #[test]
    fn lowercase_empty() {
        let mut tokens: Vec<Token> = vec![];
        LowercaseFilter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    // ---- StopFilter ----

    #[test]
    fn stop_removes_stop_words() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        let filter = StopFilter::english();
        filter.apply(&mut tokens);
        assert_eq!(texts(&tokens), vec!["quick", "brown", "fox"]);
    }

    #[test]
    fn stop_preserves_positions() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        let filter = StopFilter::english();
        filter.apply(&mut tokens);
        // "the" was at position 0; survivors keep their original positions.
        let positions: Vec<_> = tokens.iter().map(|token| token.position).collect();
        assert_eq!(&positions[..2], [1, 2]);
    }

    #[test]
    fn stop_all_removed() {
        let mut tokens = make_tokens(&["the", "a", "is", "it"]);
        let filter = StopFilter::english();
        filter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn stop_none_removed() {
        let mut tokens = make_tokens(&["quick", "brown", "fox"]);
        let filter = StopFilter::english();
        filter.apply(&mut tokens);
        assert_eq!(tokens.len(), 3);
    }

    #[test]
    fn stop_custom_words() {
        let mut tokens = make_tokens(&["hello", "world", "goodbye"]);
        StopFilter::new(["hello", "goodbye"]).apply(&mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "world");
    }

    #[test]
    fn stop_case_sensitive() {
        // "The" is not in the (lowercase) stop list, so nothing is removed.
        let mut tokens = make_tokens(&["The", "quick"]);
        let filter = StopFilter::english();
        filter.apply(&mut tokens);
        assert_eq!(tokens.len(), 2);
    }

    // ---- StemmerFilter ----

    #[test]
    fn stemmer_english_basic() {
        let mut tokens = make_tokens(&["running", "cats", "easily"]);
        let filter = StemmerFilter::english();
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..3], ["run", "cat", "easili"]);
    }

    #[test]
    fn stemmer_already_stemmed() {
        let mut tokens = make_tokens(&["run", "cat"]);
        let filter = StemmerFilter::english();
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["run", "cat"]);
    }

    #[test]
    fn stemmer_preserves_positions() {
        let mut tokens = make_tokens(&["running", "jumping"]);
        let filter = StemmerFilter::english();
        filter.apply(&mut tokens);
        let positions: Vec<_> = tokens.iter().map(|token| token.position).collect();
        assert_eq!(&positions[..2], [0, 1]);
    }

    #[test]
    fn stemmer_empty() {
        let mut tokens: Vec<Token> = vec![];
        let filter = StemmerFilter::english();
        filter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    // ---- AsciiFoldingFilter ----

    #[test]
    fn asciifolding_basic() {
        let mut tokens = make_tokens(&["café", "résumé", "naïve"]);
        let filter = AsciiFoldingFilter::new(false);
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..3], ["cafe", "resume", "naive"]);
    }

    #[test]
    fn asciifolding_no_change() {
        let mut tokens = make_tokens(&["hello", "world"]);
        let filter = AsciiFoldingFilter::new(false);
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["hello", "world"]);
    }

    #[test]
    fn asciifolding_preserve_original() {
        let mut tokens = make_tokens(&["café"]);
        let filter = AsciiFoldingFilter::new(true);
        filter.apply(&mut tokens);
        let seen = texts(&tokens);
        assert!(seen.contains(&"café"));
        assert!(seen.contains(&"cafe"));
    }

    #[test]
    fn asciifolding_german() {
        let mut tokens = make_tokens(&["über", "straße"]);
        let filter = AsciiFoldingFilter::new(false);
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["uber", "strasse"]);
    }

    #[test]
    fn asciifolding_ligatures() {
        let mut tokens = make_tokens(&["Æneid", "œuvre"]);
        let filter = AsciiFoldingFilter::new(false);
        filter.apply(&mut tokens);
        assert_eq!(&texts(&tokens)[..2], ["AEneid", "oeuvre"]);
    }

    // ---- NGramTokenFilter / EdgeNGramTokenFilter ----

    #[test]
    fn ngram_filter_basic() {
        let mut tokens = make_tokens(&["quick"]);
        let filter = NGramTokenFilter::new(2, 3);
        filter.apply(&mut tokens);
        assert_eq!(
            texts(&tokens),
            vec!["qu", "ui", "ic", "ck", "qui", "uic", "ick"]
        );
    }

    #[test]
    fn ngram_filter_empty() {
        let mut tokens: Vec<Token> = vec![];
        let filter = NGramTokenFilter::new(2, 3);
        filter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn edge_ngram_filter_basic() {
        let mut tokens = make_tokens(&["quick"]);
        let filter = EdgeNGramTokenFilter::new(2, 4, false);
        filter.apply(&mut tokens);
        assert_eq!(texts(&tokens), vec!["qu", "qui", "quic"]);
    }

    #[test]
    fn edge_ngram_filter_preserve_original() {
        let mut tokens = make_tokens(&["quick"]);
        let filter = EdgeNGramTokenFilter::new(2, 3, true);
        filter.apply(&mut tokens);
        // No prefix equals the whole word, so the original is appended.
        assert_eq!(texts(&tokens), vec!["qu", "qui", "quick"]);
    }

    // ---- SynonymFilter ----

    #[test]
    fn synonym_equivalent() {
        let rules = ["quick, fast, speedy".to_string()];
        let filter = SynonymFilter::new(&rules, true);
        let mut tokens = make_tokens(&["quick"]);
        filter.apply(&mut tokens);
        let seen = texts(&tokens);
        assert!(seen.contains(&"quick"));
        assert!(seen.contains(&"fast"));
        assert!(seen.contains(&"speedy"));
    }

    #[test]
    fn synonym_explicit() {
        let rules = ["big => large".to_string()];
        let filter = SynonymFilter::new(&rules, true);
        let mut tokens = make_tokens(&["big"]);
        filter.apply(&mut tokens);
        let seen = texts(&tokens);
        assert!(seen.contains(&"big"));
        assert!(seen.contains(&"large"));
    }

    #[test]
    fn synonym_no_match() {
        let rules = ["quick, fast".to_string()];
        let filter = SynonymFilter::new(&rules, true);
        let mut tokens = make_tokens(&["slow"]);
        filter.apply(&mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "slow");
    }

    #[test]
    fn synonym_expand_false() {
        let rules = ["quick, fast, speedy".to_string()];
        let filter = SynonymFilter::new(&rules, false);
        let mut tokens = make_tokens(&["fast"]);
        filter.apply(&mut tokens);
        let seen = texts(&tokens);
        assert!(seen.contains(&"fast"));
        assert!(seen.contains(&"quick"));
    }

    #[test]
    fn synonym_same_position() {
        let rules = ["quick, fast".to_string()];
        let filter = SynonymFilter::new(&rules, true);
        let mut tokens = make_tokens(&["quick"]);
        filter.apply(&mut tokens);
        assert!(!tokens.iter().any(|token| token.position != 0));
    }

    // ---- ShingleFilter ----

    #[test]
    fn shingle_basic() {
        let mut tokens = make_tokens(&["the", "quick", "brown", "fox"]);
        let filter = ShingleFilter::new(2, 2, false);
        filter.apply(&mut tokens);
        assert_eq!(
            texts(&tokens),
            vec!["the quick", "quick brown", "brown fox"]
        );
    }

    #[test]
    fn shingle_with_unigrams() {
        let mut tokens = make_tokens(&["the", "quick", "brown"]);
        let filter = ShingleFilter::new(2, 2, true);
        filter.apply(&mut tokens);
        assert_eq!(
            texts(&tokens),
            vec!["the", "the quick", "quick", "quick brown", "brown"]
        );
    }

    #[test]
    fn shingle_trigrams() {
        let mut tokens = make_tokens(&["a", "b", "c", "d"]);
        let filter = ShingleFilter::new(3, 3, false);
        filter.apply(&mut tokens);
        assert_eq!(texts(&tokens), vec!["a b c", "b c d"]);
    }

    #[test]
    fn shingle_empty() {
        let mut tokens: Vec<Token> = vec![];
        let filter = ShingleFilter::new(2, 2, false);
        filter.apply(&mut tokens);
        assert!(tokens.is_empty());
    }
}