#![forbid(unsafe_code)]
#![doc = include_str!("../README.md")]
use std::collections::HashSet;
/// A single normalized word.
///
/// Values are built through [`Word::new`], which stores the output of [`normalize_word`] and
/// rejects input that normalizes to an empty string.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Word {
    /// Normalized text of the word.
    text: String,
}
impl Word {
    /// Builds a word from raw text.
    ///
    /// The input is passed through [`normalize_word`]. Returns `None` when nothing is left after
    /// normalization.
    pub fn new(input: &str) -> Option<Self> {
        let text = normalize_word(input);
        if text.is_empty() {
            None
        } else {
            Some(Self { text })
        }
    }

    /// Borrows the normalized text.
    pub fn as_str(&self) -> &str {
        self.text.as_str()
    }

    /// Consumes the word and returns its normalized text.
    pub fn into_string(self) -> String {
        let Self { text } = self;
        text
    }
}
/// Word counts for a piece of text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordStats {
    /// Number of word occurrences, duplicates included.
    pub total: usize,
    /// Number of distinct words.
    pub unique: usize,
}
impl WordStats {
pub fn from_text(input: &str) -> Self {
let all_words = words(input);
let total = all_words.len();
let unique = unique_words(input).len();
Self { total, unique }
}
}
/// Counts the words in `input`, duplicates included.
pub fn word_count(input: &str) -> usize {
    let found = words(input);
    found.len()
}
/// Returns each distinct word of `input` once, in order of first appearance.
pub fn unique_words(input: &str) -> Vec<Word> {
    let mut seen = HashSet::new();
    words(input)
        .into_iter()
        // `HashSet::insert` returns `true` only the first time a given text is added, so later
        // repeats of the same word are filtered out.
        .filter(|word| seen.insert(word.text.clone()))
        .collect()
}
/// Normalizes a single word for case-insensitive comparison.
///
/// Alphanumeric characters are kept and lowercased. An apostrophe is kept only when it sits
/// directly between two alphanumeric characters; everything else (whitespace, punctuation,
/// leading or trailing apostrophes) is dropped.
///
/// Both the ASCII apostrophe (`'`) and the typographic apostrophe (`’`) are accepted, and both
/// are emitted as the ASCII apostrophe so that `don't` and `don’t` normalize to the same string.
///
/// Returns an empty string when `input` contains no alphanumeric characters.
pub fn normalize_word(input: &str) -> String {
    let trimmed = input.trim();
    let mut output = String::with_capacity(trimmed.len());
    let mut previous: Option<char> = None;
    let mut remaining = trimmed.chars().peekable();
    while let Some(character) = remaining.next() {
        let next = remaining.peek().copied();
        if character.is_alphanumeric() {
            output.extend(character.to_lowercase());
        } else if is_apostrophe(previous, character, next) {
            // Canonicalize every accepted apostrophe variant to one glyph; otherwise the same
            // word typed with a straight and a curly apostrophe would compare unequal.
            output.push('\'');
        }
        previous = Some(character);
    }
    output
}
/// Reports whether `target`, once normalized, equals any word of `input`.
///
/// Always `false` when `target` normalizes to an empty string.
pub fn contains_word(input: &str, target: &str) -> bool {
    let needle = normalize_word(target);
    if needle.is_empty() {
        return false;
    }
    words(input)
        .into_iter()
        .any(|candidate| candidate.text == needle)
}
pub fn starts_with_word(input: &str, target: &str) -> bool {
let normalized_target = normalize_word(target);
!normalized_target.is_empty()
&& words(input)
.first()
.is_some_and(|word| word.as_str() == normalized_target)
}
pub fn ends_with_word(input: &str, target: &str) -> bool {
let normalized_target = normalize_word(target);
!normalized_target.is_empty()
&& words(input)
.last()
.is_some_and(|word| word.as_str() == normalized_target)
}
/// Extracts the words of `input` in order of appearance.
///
/// Each byte range reported by `word_ranges` is sliced out of `input` and turned into a
/// [`Word`]; slices that `Word::new` rejects are skipped.
pub fn words(input: &str) -> Vec<Word> {
    let mut found = Vec::new();
    for (start, end) in word_ranges(input) {
        if let Some(word) = Word::new(&input[start..end]) {
            found.push(word);
        }
    }
    found
}
/// Finds the byte ranges `(start, end)` of the words in `input`, with `end` exclusive.
///
/// A word is a maximal run of characters that are alphanumeric or that `is_apostrophe` accepts
/// given their immediate neighbours.
fn word_ranges(input: &str) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();
    // Byte offset where the word currently being scanned began, if any.
    let mut open: Option<usize> = None;
    let mut previous: Option<char> = None;
    let mut cursor = input.char_indices().peekable();
    while let Some((offset, current)) = cursor.next() {
        let next = cursor.peek().map(|&(_, following)| following);
        let in_word = current.is_alphanumeric() || is_apostrophe(previous, current, next);
        match (in_word, open) {
            (true, None) => open = Some(offset),
            (false, Some(begin)) => {
                spans.push((begin, offset));
                open = None;
            }
            _ => {}
        }
        previous = Some(current);
    }
    // A word that runs to the end of the input is closed at the input's byte length.
    if let Some(begin) = open {
        spans.push((begin, input.len()));
    }
    spans
}
/// Reports whether `current` is an apostrophe inside a word.
///
/// True only when `current` is `'` or `’` and both `previous` and `next` are present and
/// alphanumeric.
fn is_apostrophe(previous: Option<char>, current: char, next: Option<char>) -> bool {
    if current != '\'' && current != '’' {
        return false;
    }
    match (previous, next) {
        (Some(before), Some(after)) => before.is_alphanumeric() && after.is_alphanumeric(),
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::{
        Word, WordStats, contains_word, ends_with_word, normalize_word, starts_with_word,
        unique_words, word_count, words,
    };

    /// Borrows the normalized text of every word, in order.
    fn texts(list: &[Word]) -> Vec<&str> {
        list.iter().map(Word::as_str).collect()
    }

    #[test]
    fn handles_empty_and_whitespace_only_input() {
        assert_eq!(word_count(""), 0);
        assert_eq!(words(" \n"), Vec::<Word>::new());
        assert!(normalize_word(" \n").is_empty());
    }

    #[test]
    fn normalizes_ascii_words_and_punctuation() {
        let cases = [("Hello!", "hello"), ("don't", "don't")];
        for (raw, expected) in cases {
            assert_eq!(normalize_word(raw), expected);
        }
        assert_eq!(word_count("Hello, hello world"), 3);
    }

    #[test]
    fn preserves_first_seen_unique_words() {
        let unique = unique_words("Hello, hello world world");
        assert_eq!(texts(&unique), ["hello", "world"]);
    }

    #[test]
    fn checks_word_boundaries() {
        // Whole-word matches succeed regardless of case and surrounding punctuation.
        assert!(contains_word("Hello, world", "world"));
        assert!(starts_with_word("Hello world", "hello"));
        assert!(ends_with_word("Hello world!", "world"));
        // A substring of a longer word is not a match.
        assert!(!contains_word("cartwheel", "art"));
    }

    #[test]
    fn handles_multiline_and_unicode_input() {
        let extracted = words("Straße\ncafé");
        assert_eq!(texts(&extracted), ["straße", "café"]);

        let WordStats { total, unique } = WordStats::from_text("Straße\ncafé café");
        assert_eq!((total, unique), (3, 2));
    }
}