// use_word/lib.rs
1#![forbid(unsafe_code)]
2#![doc = include_str!("../README.md")]
3
4use std::collections::HashSet;
5
/// A normalized word token.
///
/// The wrapped text is lowercase with non-word punctuation removed;
/// construct one via [`Word::new`], which rejects empty results.
#[derive(Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct Word {
    text: String,
}
11
12impl Word {
13    /// Creates a normalized word when at least one word character is present.
14    pub fn new(input: &str) -> Option<Self> {
15        let text = normalize_word(input);
16        (!text.is_empty()).then_some(Self { text })
17    }
18
19    /// Returns the normalized word text.
20    pub fn as_str(&self) -> &str {
21        &self.text
22    }
23
24    /// Consumes the word and returns the owned string.
25    pub fn into_string(self) -> String {
26        self.text
27    }
28}
29
/// Aggregate counts derived from text.
///
/// Produced by [`WordStats::from_text`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct WordStats {
    /// Total number of normalized words, duplicates included.
    pub total: usize,
    /// Total number of distinct normalized words.
    pub unique: usize,
}
38
39impl WordStats {
40    /// Builds stats from the input text.
41    pub fn from_text(input: &str) -> Self {
42        let all_words = words(input);
43        let total = all_words.len();
44        let unique = unique_words(input).len();
45        Self { total, unique }
46    }
47}
48
49/// Counts normalized words in the input.
50pub fn word_count(input: &str) -> usize {
51    words(input).len()
52}
53
54/// Returns distinct normalized words in first-seen order.
55pub fn unique_words(input: &str) -> Vec<Word> {
56    let mut seen = HashSet::new();
57    let mut unique = Vec::new();
58
59    for word in words(input) {
60        if seen.insert(word.text.clone()) {
61            unique.push(word);
62        }
63    }
64
65    unique
66}
67
68/// Normalizes a word-like value by lowercasing letters and dropping non-word punctuation.
69pub fn normalize_word(input: &str) -> String {
70    let characters: Vec<char> = input.trim().chars().collect();
71    let mut output = String::new();
72
73    for (index, character) in characters.iter().copied().enumerate() {
74        let previous = index
75            .checked_sub(1)
76            .and_then(|value| characters.get(value))
77            .copied();
78        let next = characters.get(index + 1).copied();
79
80        if character.is_alphanumeric() || is_apostrophe(previous, character, next) {
81            output.extend(character.to_lowercase());
82        }
83    }
84
85    output
86}
87
88/// Returns `true` when the input contains the target as a full normalized word.
89pub fn contains_word(input: &str, target: &str) -> bool {
90    let normalized_target = normalize_word(target);
91    !normalized_target.is_empty()
92        && words(input)
93            .iter()
94            .any(|word| word.as_str() == normalized_target)
95}
96
97/// Returns `true` when the first normalized word matches the target.
98pub fn starts_with_word(input: &str, target: &str) -> bool {
99    let normalized_target = normalize_word(target);
100    !normalized_target.is_empty()
101        && words(input)
102            .first()
103            .is_some_and(|word| word.as_str() == normalized_target)
104}
105
106/// Returns `true` when the last normalized word matches the target.
107pub fn ends_with_word(input: &str, target: &str) -> bool {
108    let normalized_target = normalize_word(target);
109    !normalized_target.is_empty()
110        && words(input)
111            .last()
112            .is_some_and(|word| word.as_str() == normalized_target)
113}
114
115/// Extracts normalized words from the input.
116pub fn words(input: &str) -> Vec<Word> {
117    word_ranges(input)
118        .into_iter()
119        .filter_map(|(start, end)| Word::new(&input[start..end]))
120        .collect()
121}
122
/// Computes `(start, end)` byte ranges of word spans within `input`.
///
/// A span is a run of alphanumeric characters, where `'` or `’` also
/// counts when flanked by alphanumerics on both sides (e.g. "don't").
fn word_ranges(input: &str) -> Vec<(usize, usize)> {
    let indexed: Vec<(usize, char)> = input.char_indices().collect();
    let mut ranges = Vec::new();
    let mut current_start: Option<usize> = None;

    for (position, &(offset, ch)) in indexed.iter().enumerate() {
        let flanked_apostrophe = matches!(ch, '\'' | '’')
            && position > 0
            && indexed[position - 1].1.is_alphanumeric()
            && indexed
                .get(position + 1)
                .map_or(false, |&(_, c)| c.is_alphanumeric());

        if ch.is_alphanumeric() || flanked_apostrophe {
            // Record the byte offset where the current word began.
            current_start.get_or_insert(offset);
        } else if let Some(begin) = current_start.take() {
            // Non-word char closes the open span at this byte offset.
            ranges.push((begin, offset));
        }
    }

    // A word running to the end of the input closes at the byte length.
    if let Some(begin) = current_start {
        ranges.push((begin, input.len()));
    }

    ranges
}
149
/// Reports whether `current` is an apostrophe joining two alphanumeric
/// characters (straight `'` or typographic `’`).
fn is_apostrophe(previous: Option<char>, current: char, next: Option<char>) -> bool {
    if current != '\'' && current != '’' {
        return false;
    }
    let alnum_before = previous.map_or(false, char::is_alphanumeric);
    let alnum_after = next.map_or(false, char::is_alphanumeric);
    alnum_before && alnum_after
}
155
#[cfg(test)]
mod tests {
    use super::{
        Word, WordStats, contains_word, ends_with_word, normalize_word, starts_with_word,
        unique_words, word_count, words,
    };

    #[test]
    fn empty_and_whitespace_inputs_yield_no_words() {
        assert_eq!(word_count(""), 0);
        assert!(words("   \n").is_empty());
        assert_eq!(normalize_word("   \n"), "");
    }

    #[test]
    fn ascii_words_are_lowercased_and_stripped_of_punctuation() {
        assert_eq!(normalize_word("Hello!"), "hello");
        assert_eq!(normalize_word("don't"), "don't");
        assert_eq!(word_count("Hello, hello world"), 3);
    }

    #[test]
    fn unique_words_keep_first_seen_order() {
        let distinct = unique_words("Hello, hello world world");
        let texts: Vec<&str> = distinct.iter().map(Word::as_str).collect();
        assert_eq!(texts, vec!["hello", "world"]);
    }

    #[test]
    fn boundary_checks_match_whole_words_only() {
        assert!(contains_word("Hello, world", "world"));
        assert!(!contains_word("cartwheel", "art"));
        assert!(starts_with_word("Hello world", "hello"));
        assert!(ends_with_word("Hello world!", "world"));
    }

    #[test]
    fn unicode_and_multiline_inputs_are_supported() {
        let tokens = words("Straße\ncafé");
        let texts: Vec<&str> = tokens.iter().map(Word::as_str).collect();
        assert_eq!(texts, vec!["straße", "café"]);

        let stats = WordStats::from_text("Straße\ncafé café");
        assert_eq!(stats.total, 3);
        assert_eq!(stats.unique, 2);
    }
}
202}