use rust_stemmers::{Algorithm, Stemmer};
use serde::{Deserialize, Serialize};
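
/// Strategies for splitting text into tokens.
///
/// Serde uses the internal `kind` tag, so e.g. `Ngram { token_length: 3 }`
/// round-trips as `{"kind":"ngram","tokenLength":3}` (the variant-level
/// `rename_all = "camelCase"` applies to `Ngram`'s fields only).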
#[derive(Deserialize, Serialize, Debug, Eq, PartialEq)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum Tokenizer {
    EdgeNgram {
        min_gram: usize,
        max_gram: usize,
    },
    #[serde(rename_all = "camelCase")]
    Ngram {
        token_length: usize,
    },
    Standard,
}

impl Tokenizer {
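    /// Tokenizes `text` with this strategy, returning tokens in input order.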
    pub fn process(&self, text: String) -> Vec<String> {
        match self {
            Tokenizer::Ngram { token_length } => process_ngram(text, *token_length),
            Tokenizer::Standard => process_standard(text),
            Tokenizer::EdgeNgram { min_gram, max_gram } => {
                process_edge_ngram_alphabetic(text, *min_gram, *max_gram)
            }
        }
    }
}

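/// Emits every contiguous `token_length`-char window over the input.
/// Inputs shorter than `token_length` produce no tokens; a `token_length`
/// of 0 degenerately yields one empty token per window position.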
fn process_ngram(text: String, token_length: usize) -> Vec<String> {
    let chars = text.chars().collect::<Vec<_>>();
    if chars.len() < token_length {
        return vec![];
    }
    let mut grams: Vec<String> = vec![];
    for i in 0..=(chars.len() - token_length) {
        grams.push(chars[i..i + token_length].iter().collect());
    }
    grams
}

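/// Splits on a fixed set of delimiters: space, comma, semicolon, colon,
/// and exclamation mark.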
fn process_standard(text: String) -> Vec<String> {
    text.split(&[' ', ',', ';', ':', '!'])
        // Consecutive delimiters would otherwise yield empty slices; drop them.
        .filter(|x| !x.is_empty())
        .map(|x| x.into())
        .collect()
}

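/// Shared edge-n-gram walk: for each run of chars accepted by `filter`,
/// emits every prefix whose length lies in `min_gram..=max_gram`. A rejected
/// char terminates the current run and starts a new word.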
fn process_edge_ngram_with_filter(
    text: String,
    min_gram: usize,
    max_gram: usize,
    filter: impl Fn(char) -> bool,
) -> Vec<String> {
    let mut grams: Vec<String> = vec![];
    let chars: Vec<char> = text.chars().collect();
    // Index of the first char of the word currently being scanned.
    let mut current_word_start = 0;
    for (i, &current_char) in chars.iter().enumerate() {
        let current_gram_len = i - current_word_start + 1;
        if !filter(current_char) {
            // A rejected char ends the current word; the next word starts after it.
            current_word_start = i + 1;
        } else if min_gram <= current_gram_len && current_gram_len <= max_gram {
            grams.push(chars[current_word_start..=i].iter().collect());
        }
    }
    grams
}

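/// Edge n-grams over the raw text: every char passes the filter, so the
/// entire input is treated as one word and nothing resets the prefix window.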
pub fn process_edge_ngram_raw(text: String, min_gram: usize, max_gram: usize) -> Vec<String> {
    process_edge_ngram_with_filter(text, min_gram, max_gram, |_c| true)
}

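/// Edge n-grams over alphabetic runs only: any non-alphabetic char (digits,
/// punctuation, whitespace, emoji) acts as a word boundary and is dropped.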
fn process_edge_ngram_alphabetic(text: String, min_gram: usize, max_gram: usize) -> Vec<String> {
    process_edge_ngram_with_filter(text, min_gram, max_gram, |c| c.is_alphabetic())
}

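/// Per-token transformations applied to a tokenizer's output.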
#[derive(Clone, Deserialize, Serialize, Debug, Eq, PartialEq)]
#[serde(tag = "kind", rename_all = "snake_case")]
pub enum TokenFilter {
    Upcase,
    Downcase,
    Stemmer,
    Stop,
}

impl TokenFilter {
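    /// Applies the filter to a single token, returning `None` when the token
    /// should be dropped (only `Stop` ever drops a token).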
    pub fn process_single(&self, text: String) -> Option<String> {
        match self {
            TokenFilter::Upcase => Some(text.to_uppercase()),
            TokenFilter::Downcase => Some(text.to_lowercase()),
            TokenFilter::Stemmer => Some(stem_text(text)),
            TokenFilter::Stop => filter_if_stop_word(text),
        }
    }

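    /// Applies the filter across a token stream, discarding rejected tokens.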
    pub fn process(&self, text: Vec<String>) -> Vec<String> {
        text.into_iter()
            .filter_map(|text| self.process_single(text))
            .collect()
    }
}

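/// Common English stop words; this 33-word set matches Lucene's default
/// English stop list.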
const STOPWORDS_LIST: [&str; 33] = [
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
];

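/// Returns `None` if `plaintext` is a stop word (matched case-insensitively),
/// otherwise passes the token through with its original casing intact.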
pub fn filter_if_stop_word(plaintext: String) -> Option<String> {
    if STOPWORDS_LIST.contains(&plaintext.to_lowercase().as_str()) {
        None
    } else {
        Some(plaintext)
    }
}

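/// Lowercases `plaintext` and reduces it to its English (Snowball) stem,
/// e.g. "greetings" -> "greet".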
pub fn stem_text(plaintext: String) -> String {
    Stemmer::create(Algorithm::English)
        .stem(&plaintext.to_lowercase())
        .to_string()
}

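/// Strips at most one leading and one trailing occurrence of each char in
/// `chars_to_filter` (single pass, in the order given); inner chars are kept.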
pub fn char_filter_prefix_and_suffix(plaintext: &str, chars_to_filter: &[char]) -> String {
    let mut result = String::from(plaintext);
    for ch in chars_to_filter {
        if let Some(stripped) = result.strip_suffix(*ch) {
            result = stripped.to_string();
        }
        if let Some(stripped) = result.strip_prefix(*ch) {
            result = stripped.to_string();
        }
    }
    result
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_standard() {
        let output = Tokenizer::Standard.process("Hello from Ada Lovelace".into());
        assert_eq!(output, vec!["Hello", "from", "Ada", "Lovelace"]);
    }

    #[test]
    fn test_ngram() {
        let output = Tokenizer::Ngram { token_length: 3 }.process("Lovelace".into());
        assert_eq!(output, vec!["Lov", "ove", "vel", "ela", "lac", "ace"]);
    }

    #[test]
    fn test_ngram_equal_length() {
        let output = Tokenizer::Ngram { token_length: 4 }.process("Love".into());
        assert_eq!(output, vec!["Love"]);
    }

    #[test]
    fn test_ngram_shorter_length() {
        let output = Tokenizer::Ngram { token_length: 4 }.process("Lov".into());
        assert_eq!(output, Vec::<String>::new());
    }

    #[test]
    fn test_ngram_zero_length() {
        // Degenerate case: a zero-length window matches at every position,
        // including both ends, so an 8-char input yields 9 empty tokens.
        let output = Tokenizer::Ngram { token_length: 0 }.process("Lovelace".into());
        assert_eq!(output, vec!["", "", "", "", "", "", "", "", ""]);
    }

    #[test]
    fn test_edge_ngram_empty_input() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 2,
            max_gram: 10,
        };
        let output = tokenizer.process("".to_string());
        assert_eq!(output, Vec::<String>::new());
    }

    #[test]
    fn test_edge_ngram_single_word() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 1,
            max_gram: 10,
        };
        let output = tokenizer.process("Thomas".to_string());
        assert_eq!(output, vec!["T", "Th", "Tho", "Thom", "Thoma", "Thomas"]);
    }

    #[test]
    fn test_edge_ngram_multiple_words() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 1,
            max_gram: 10,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(
            output,
            vec!["H", "He", "Hea", "Heat", "Heath", "J", "Jo", "Jon", "Jone", "Jones"]
        );
    }

    #[test]
    fn test_edge_ngram_raw_min_gram_2() {
        // The raw variant never resets at non-alphabetic chars, so '@' and '.'
        // are carried into the grams; max_gram caps the longest gram at 10 chars.
        let output = process_edge_ngram_raw("Heath@Jones.com".to_string(), 2, 10);
        assert_eq!(
            output,
            vec![
                "He",
                "Hea",
                "Heat",
                "Heath",
                "Heath@",
                "Heath@J",
                "Heath@Jo",
                "Heath@Jon",
                "Heath@Jone"
            ]
        );
    }

    #[test]
    fn test_edge_ngram_min_gram_2() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 2,
            max_gram: 10,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(
            output,
            vec!["He", "Hea", "Heat", "Heath", "Jo", "Jon", "Jone", "Jones"]
        );
    }

    #[test]
    fn test_edge_ngram_max_gram_lt_word_len() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 1,
            max_gram: 2,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(output, vec!["H", "He", "J", "Jo"]);
    }

    #[test]
    fn test_edge_ngram_max_eq_min() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 3,
            max_gram: 3,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(output, vec!["Hea", "Jon"]);
    }

    #[test]
    fn test_edge_ngram_max_lt_min() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 4,
            max_gram: 3,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(output, Vec::<String>::new());
    }

    #[test]
    fn test_edge_ngram_min_0() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 0,
            max_gram: 1,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(output, vec!["H", "J"]);
    }

    #[test]
    fn test_edge_ngram_min_and_max_0() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 0,
            max_gram: 0,
        };
        let output = tokenizer.process("Heath Jones".to_string());
        assert_eq!(output, Vec::<String>::new());
    }

    #[test]
    fn test_edge_ngram_words_of_various_lengths() {
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 2,
            max_gram: 4,
        };
        let output = tokenizer.process("a bb ccc dddd eeeee".to_string());
        assert_eq!(
            output,
            vec!["bb", "cc", "ccc", "dd", "ddd", "dddd", "ee", "eee", "eeee"]
        );
    }

    #[test]
    fn test_edge_ngram_non_alpha_chars() {
        // Digits, punctuation, whitespace, and emoji all act as word boundaries.
        let tokenizer = Tokenizer::EdgeNgram {
            min_gram: 1,
            max_gram: 2,
        };
        let output = tokenizer.process("123!?hi 🤨ño\\".to_string());
        assert_eq!(output, vec!["h", "hi", "ñ", "ño"]);
    }

    #[test]
    fn test_downcase() {
        let output = TokenFilter::Downcase.process(vec!["HeLLOWorlD".into()]);
        assert_eq!(output, vec!["helloworld"]);
    }

    #[test]
    fn test_upcase() {
        let output = TokenFilter::Upcase.process(vec!["HeLLOWorlD".into()]);
        assert_eq!(output, vec!["HELLOWORLD"]);
    }

    #[test]
    fn test_char_filter_removes_prefix_and_suffix() {
        let plaintext_mixed_ops = "_testing%";
        let plaintext_underscore_op = "_testing_";
        let plaintext_percentage_ops = "%testing%";
        let chars = ['%', '_'];
        let mixed_op_output = char_filter_prefix_and_suffix(plaintext_mixed_ops, &chars);
        let underscore_op_output = char_filter_prefix_and_suffix(plaintext_underscore_op, &chars);
        let percentage_op_output = char_filter_prefix_and_suffix(plaintext_percentage_ops, &chars);
        assert_eq!(mixed_op_output, "testing");
        assert_eq!(underscore_op_output, "testing");
        assert_eq!(percentage_op_output, "testing");
    }

    #[test]
    fn test_stop_word_case_insensitive_filter() {
        let output = TokenFilter::Stop.process(vec![
            "This".into(),
            "is".into(),
            "a".into(),
            "test".into(),
            "of".into(),
            "Stop-Words".into(),
        ]);
        assert_eq!(output, vec!["test", "Stop-Words"]);
    }

    #[test]
    fn test_oops_all_stop_words() {
        let output = TokenFilter::Stop.process(vec!["this".into(), "is".into(), "There".into()]);
        assert_eq!(output, Vec::<String>::new());
    }

    #[test]
    fn test_stemmer_basic() {
        let tokens = Tokenizer::Standard
            .process("These greetings are delivered directly from Ada Lovelace".into());
        let output = TokenFilter::Stemmer.process(tokens);
        assert_eq!(
            output,
            vec!["these", "greet", "are", "deliv", "direct", "from", "ada", "lovelac"]
        );
    }
}