/// Splits text into borrowed token slices.
pub trait Tokenizer {
    /// Returns the tokens of `text` as `&str` slices that borrow from `text`.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;

    /// Number of tokens `tokenize` would produce for `text`.
    ///
    /// The default delegates to [`Tokenizer::tokenize`] and so allocates an
    /// intermediate `Vec`; implementors may override it to count directly.
    fn token_count(&self, text: &str) -> usize {
        let tokens = self.tokenize(text);
        tokens.len()
    }
}
/// Tokenizer that splits on Unicode whitespace via `str::split_whitespace`
/// (see the `Tokenizer` impl below).
///
/// Zero-sized unit struct: construction and copying are free.
#[derive(Clone, Copy, Debug, Default)]
pub struct WhitespaceTokenizer;
/// Whitespace-splitting behavior, backed entirely by `str::split_whitespace`.
impl Tokenizer for WhitespaceTokenizer {
    /// Collects every whitespace-delimited piece of `text` as a borrowed slice.
    /// Leading/trailing whitespace and runs of whitespace yield no empty tokens,
    /// per `split_whitespace` semantics.
    #[inline]
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        let pieces = text.split_whitespace();
        pieces.collect::<Vec<&'a str>>()
    }

    /// Overrides the trait default so counting never allocates a `Vec`.
    #[inline]
    fn token_count(&self, text: &str) -> usize {
        let pieces = text.split_whitespace();
        pieces.count()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_splits_on_spaces() {
        let tokens = WhitespaceTokenizer.tokenize("hello world foo");
        assert_eq!(tokens, ["hello", "world", "foo"]);
    }

    #[test]
    fn tokenize_splits_on_tabs_and_newlines() {
        let tokens = WhitespaceTokenizer.tokenize("a\tb\nc");
        assert_eq!(tokens, ["a", "b", "c"]);
    }

    #[test]
    fn tokenize_collapses_runs_of_whitespace() {
        // Input deliberately contains a multi-character run of mixed whitespace
        // so this test actually exercises run-collapsing, as its name claims.
        let tokens = WhitespaceTokenizer.tokenize(" foo \t  bar ");
        assert_eq!(tokens, ["foo", "bar"]);
    }

    #[test]
    fn tokenize_empty_string_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize("").is_empty());
    }

    #[test]
    fn tokenize_whitespace_only_returns_empty() {
        assert!(WhitespaceTokenizer.tokenize(" \t\n ").is_empty());
    }

    #[test]
    fn tokenize_single_token_no_whitespace() {
        let tokens = WhitespaceTokenizer.tokenize("solo");
        assert_eq!(tokens, ["solo"]);
    }

    #[test]
    fn tokenize_returns_slices_into_original() {
        let text = String::from("alpha beta gamma");
        let text_start = text.as_ptr() as usize;
        let text_end = text_start + text.len();
        for token in &WhitespaceTokenizer.tokenize(&text) {
            let token_start = token.as_ptr() as usize;
            // Check the token's full extent, not just its start, lies inside
            // the original buffer — proves zero-copy borrowing.
            assert!(token_start >= text_start);
            assert!(token_start + token.len() <= text_end);
        }
    }

    #[test]
    fn tokenize_unicode_whitespace_splits_correctly() {
        // U+3000 (ideographic space) is Unicode whitespace.
        let tokens = WhitespaceTokenizer.tokenize("東京\u{3000}大阪");
        assert_eq!(tokens, ["東京", "大阪"]);
    }

    #[test]
    fn token_count_matches_tokenize_len() {
        let text = "one two three four";
        assert_eq!(
            WhitespaceTokenizer.token_count(text),
            WhitespaceTokenizer.tokenize(text).len()
        );
    }

    #[test]
    fn token_count_empty_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count(""), 0);
    }

    #[test]
    fn token_count_whitespace_only_is_zero() {
        assert_eq!(WhitespaceTokenizer.token_count(" \t\n "), 0);
    }

    #[test]
    fn token_count_single_word() {
        assert_eq!(WhitespaceTokenizer.token_count("word"), 1);
    }

    #[test]
    fn default_token_count_delegates_to_tokenize() {
        // A minimal tokenizer that does NOT override token_count, so the
        // trait's default (tokenize + len) is what gets exercised.
        struct PipeTokenizer;
        impl Tokenizer for PipeTokenizer {
            fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
                text.split('|').filter(|s| !s.is_empty()).collect()
            }
        }
        assert_eq!(PipeTokenizer.token_count("a|b|c"), 3);
        assert_eq!(PipeTokenizer.token_count(""), 0);
    }

    #[test]
    fn whitespace_tokenizer_is_clone_copy_and_debug() {
        let t = WhitespaceTokenizer;
        // Actually invoke Clone, rather than binding a copy named `cloned`.
        let cloned = t.clone();
        let copied = t; // Copy: `t` remains usable afterwards.
        assert_eq!(format!("{:?}", cloned), "WhitespaceTokenizer");
        assert_eq!(t.token_count("a b"), 2);
        let _ = copied;
    }

    #[test]
    fn whitespace_tokenizer_default_is_usable() {
        // Construct via Default, which is what this test's name promises.
        let t = WhitespaceTokenizer::default();
        assert_eq!(t.token_count("x y"), 2);
    }
}