use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::string::String;
// Private submodules; the tokenizer types they define are re-exported below.
mod grapheme;
mod shingle;
mod word;
// The `cjk` submodule is compiled only when the `cjk` Cargo feature is enabled.
// When the `docsrs` cfg is set, the item is additionally annotated with
// `doc(cfg(feature = "cjk"))`.
#[cfg(feature = "cjk")]
#[cfg_attr(docsrs, doc(cfg(feature = "cjk")))]
mod cjk;
// Re-exports so callers can name the tokenizers from this module directly.
pub use grapheme::GraphemeTokenizer;
pub use shingle::ShingleTokenizer;
pub use word::WordTokenizer;
// Feature-gated re-exports, mirroring the gating on the `cjk` module itself.
#[cfg(feature = "cjk")]
#[cfg_attr(docsrs, doc(cfg(feature = "cjk")))]
pub use cjk::{CjkSegmenter, CjkTokenizer};
/// A boxed iterator of tokens that yields either borrowed slices or owned strings.
///
/// Both variants hold a `Send` trait object bounded by the lifetime `'a`.
pub enum TokenStream<'a> {
    /// Tokens are yielded as `&'a str` slices.
    Borrowed(Box<dyn Iterator<Item = &'a str> + Send + 'a>),
    /// Tokens are yielded as owned `String` values.
    Owned(Box<dyn Iterator<Item = String> + Send + 'a>),
}

impl<'a> TokenStream<'a> {
    /// Consumes the stream and returns an iterator that always yields `String`s.
    ///
    /// An `Owned` stream is handed back as-is; a `Borrowed` stream is wrapped
    /// so that each slice is copied into a fresh `String` as it is yielded.
    pub fn into_string_iter(self) -> Box<dyn Iterator<Item = String> + Send + 'a> {
        let slices = match self {
            // Already the right item type: return the inner box without re-wrapping.
            Self::Owned(strings) => return strings,
            Self::Borrowed(slices) => slices,
        };
        Box::new(slices.map(String::from))
    }
}
pub trait Tokenizer: Send + Sync {
fn tokens<'a>(&'a self, input: &'a str) -> TokenStream<'a>;
fn name(&self) -> Cow<'static, str>;
fn for_each_token(&self, input: &str, f: &mut dyn FnMut(&str)) {
for tok in self.tokens(input).into_string_iter() {
f(&tok);
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    /// Drains `stream` into a vector of owned strings for easy comparison.
    fn drain(stream: TokenStream<'_>) -> Vec<String> {
        stream.into_string_iter().collect::<Vec<_>>()
    }

    /// Tokens from the word tokenizer compare equal once converted to `String`s.
    #[test]
    fn word_borrowed_then_owned_yields_same_strings() {
        let tokenizer = WordTokenizer;
        let produced = drain(tokenizer.tokens("hello world!"));
        assert_eq!(produced, ["hello", "world"]);
    }

    /// Pins the exact name strings reported by the tokenizers.
    #[test]
    fn names_are_stable() {
        assert_eq!(WordTokenizer.name(), "word-uax29");
        assert_eq!(GraphemeTokenizer.name(), "grapheme-uax29");

        let shingled = ShingleTokenizer {
            k: 3,
            inner: WordTokenizer,
        };
        assert_eq!(shingled.name(), "shingle-k=3/word-uax29");
    }
}