use regex::Regex;
use std::sync::LazyLock;
/// Separator between two sentences: one or more of `.`, `!` or `?` followed by
/// at least one whitespace character.
///
/// Terminal punctuation that is *not* followed by whitespace (for example at
/// the very end of the input) does not match. The regex is compiled lazily on
/// first access.
static SENTENCE_BOUNDARY: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"[.!?]+\s+";
    Regex::new(pattern).expect("Invalid sentence boundary regex")
});
/// Separator between two words: a run of one or more whitespace characters
/// and/or Unicode punctuation characters (`\p{P}`).
///
/// The regex is compiled lazily on first access.
static WORD_BOUNDARY: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r"[\s\p{P}]+";
    Regex::new(pattern).expect("Invalid word boundary regex")
});
/// Configuration for splitting text into sentences and words.
///
/// Build one with [`Tokenizer::new`] (or [`Default::default`]) and adjust it
/// with the `with_*` builder methods.
#[derive(Clone, Debug)]
pub struct Tokenizer {
    /// Sentences shorter than this are dropped by `sentences`.
    min_sentence_length: usize,
    /// Words shorter than this are dropped by `words`, `tokenize` and
    /// `tokenize_with_spans`.
    min_word_length: usize,
    /// Whether owned output (`sentences`, `words`, `tokenize`) is lowercased.
    lowercase: bool,
}

impl Default for Tokenizer {
    /// Returns the same configuration as `Tokenizer::new()`.
    ///
    /// A derived `Default` would zero every field (minimum lengths of 0 and
    /// lowercasing disabled), silently disagreeing with `new()`. The values
    /// are spelled out here and must be kept in sync with `Tokenizer::new`.
    fn default() -> Self {
        Self {
            min_sentence_length: 10,
            min_word_length: 1,
            lowercase: true,
        }
    }
}
impl Tokenizer {
    /// Creates a tokenizer with a minimum sentence length of 10, a minimum
    /// word length of 1, and lowercasing enabled.
    #[must_use]
    pub fn new() -> Self {
        Self {
            min_sentence_length: 10,
            min_word_length: 1,
            lowercase: true,
        }
    }

    /// Sets the minimum length (in characters) a sentence must have to be
    /// returned by [`Tokenizer::sentences`].
    #[must_use]
    pub fn with_min_sentence_length(mut self, length: usize) -> Self {
        self.min_sentence_length = length;
        self
    }

    /// Sets the minimum length (in characters) a word must have to be returned
    /// by [`Tokenizer::words`], [`Tokenizer::tokenize`] and
    /// [`Tokenizer::tokenize_with_spans`].
    #[must_use]
    pub fn with_min_word_length(mut self, length: usize) -> Self {
        self.min_word_length = length;
        self
    }

    /// Enables or disables lowercasing of the owned strings produced by
    /// [`Tokenizer::sentences`], [`Tokenizer::words`] and
    /// [`Tokenizer::tokenize`].
    #[must_use]
    pub fn with_lowercase(mut self, lowercase: bool) -> Self {
        self.lowercase = lowercase;
        self
    }

    /// Returns `true` when `s` contains at least `min` characters.
    ///
    /// Length is measured in `char`s rather than bytes so that multi-byte
    /// characters (e.g. "é", "中") are not over-counted. Iteration stops after
    /// `min` characters, so long inputs are not scanned in full.
    fn has_min_chars(s: &str, min: usize) -> bool {
        s.chars().take(min).count() == min
    }

    /// Splits `text` into sentences.
    ///
    /// The text is split on `SENTENCE_BOUNDARY`; each fragment is trimmed,
    /// empty fragments and fragments shorter than the configured minimum
    /// sentence length are dropped, and the survivors are lowercased when
    /// lowercasing is enabled.
    ///
    /// Because the boundary requires trailing whitespace, a final sentence
    /// whose punctuation ends the text keeps that punctuation.
    pub fn sentences<'a>(&self, text: &'a str) -> impl Iterator<Item = String> + 'a {
        let min_len = self.min_sentence_length;
        let lowercase = self.lowercase;
        SENTENCE_BOUNDARY
            .split(text)
            // Trim before measuring so surrounding whitespace does not count
            // toward the minimum length.
            .map(str::trim)
            .filter(move |s| !s.is_empty() && Self::has_min_chars(s, min_len))
            .map(move |s| {
                if lowercase {
                    s.to_lowercase()
                } else {
                    s.to_string()
                }
            })
    }

    /// Splits `text` into words.
    ///
    /// The text is split on `WORD_BOUNDARY`; empty fragments (produced by
    /// leading or trailing separators) and fragments shorter than the
    /// configured minimum word length are dropped, and the survivors are
    /// lowercased when lowercasing is enabled.
    pub fn words<'a>(&self, text: &'a str) -> impl Iterator<Item = String> + 'a {
        let min_len = self.min_word_length;
        let lowercase = self.lowercase;
        WORD_BOUNDARY
            .split(text)
            // The explicit emptiness check keeps a minimum of 0 from yielding
            // empty tokens, matching `tokenize_with_spans`.
            .filter(move |w| !w.is_empty() && Self::has_min_chars(w, min_len))
            .map(move |w| {
                if lowercase {
                    w.to_lowercase()
                } else {
                    w.to_string()
                }
            })
    }

    /// Collects the output of [`Tokenizer::words`] into a `Vec`.
    pub fn tokenize(&self, text: &str) -> Vec<String> {
        self.words(text).collect()
    }

    /// Splits `text` into words and reports where each one was found.
    ///
    /// Each element is `(token, start, end)` where `start..end` is the byte
    /// range of `token` within `text`. Tokens are borrowed slices of `text`,
    /// so the lowercase setting is not applied here. Tokens shorter than the
    /// configured minimum word length (in characters) are dropped.
    pub fn tokenize_with_spans<'a>(&self, text: &'a str) -> Vec<(&'a str, usize, usize)> {
        let min_len = self.min_word_length;

        // A zero-width sentinel separator at the end of the text lets the
        // trailing token go through the same code path as every other token.
        let separators = WORD_BOUNDARY
            .find_iter(text)
            .map(|m| (m.start(), m.end()))
            .chain(std::iter::once((text.len(), text.len())));

        let mut tokens = Vec::new();
        let mut cursor = 0;
        for (sep_start, sep_end) in separators {
            // `cursor == sep_start` means two separators were adjacent or the
            // text started with one; there is no token in between.
            if cursor < sep_start {
                let token = &text[cursor..sep_start];
                if Self::has_min_chars(token, min_len) {
                    tokens.push((token, cursor, sep_start));
                }
            }
            cursor = sep_end;
        }
        tokens
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Sentences are split on terminal punctuation followed by whitespace,
    /// trimmed and lowercased; the last one keeps its trailing `?`.
    #[test]
    fn test_sentence_tokenization() {
        let input = "Hello world. This is a test! How are you?";
        let result: Vec<String> = Tokenizer::new()
            .with_min_sentence_length(5)
            .sentences(input)
            .collect();
        assert_eq!(
            result,
            vec!["hello world", "this is a test", "how are you?"]
        );
    }

    /// Words are split on whitespace and punctuation and lowercased by default.
    #[test]
    fn test_word_tokenization() {
        let input = "Hello, world! This is a test.";
        let result: Vec<String> = Tokenizer::new().words(input).collect();
        let expected = vec!["hello", "world", "this", "is", "a", "test"];
        assert_eq!(result, expected);
    }

    /// Disabling lowercasing preserves the original casing of each word.
    #[test]
    fn test_no_lowercase() {
        let case_preserving = Tokenizer::new().with_lowercase(false);
        let result: Vec<String> = case_preserving.words("Hello World").collect();
        assert_eq!(result, vec!["Hello", "World"]);
    }
}