writing_analysis/
utils.rs1use unicode_segmentation::UnicodeSegmentation;
2
/// Aggregate counts describing a piece of text, as filled in by [`compute_statistics`].
///
/// `Default` yields the all-zero statistics.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct TextStatistics {
    /// Number of sentences returned by [`split_sentences`].
    pub sentence_count: usize,
    /// Number of words returned by [`split_words`].
    pub word_count: usize,
    /// Sum of [`count_syllables`] over every word.
    pub syllable_count: usize,
    /// Number of alphabetic characters, as counted by [`count_characters`].
    pub character_count: usize,
    /// Number of words with three or more syllables.
    pub polysyllable_count: usize,
}
12
/// Period-terminated abbreviations whose trailing `.` must not be read as the end of a
/// sentence. Consulted by `is_abbreviation`, which compares them ignoring ASCII case.
static ABBREVIATIONS: &[&str] = &[
    // Personal titles.
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.",
    // Name suffixes and "St.".
    "Sr.", "Jr.", "St.",
    // Company designators.
    "Inc.", "Ltd.", "Corp.",
    // Connectives and Latin shorthands.
    "vs.", "etc.", "e.g.", "i.e.",
    // Reference and organisational shorthands.
    "Vol.", "Dept.", "Est.", "Govt.", "No.",
];
17
18pub fn split_sentences(text: &str) -> Vec<&str> {
20 find_sentence_spans(text)
21}
22
23fn find_sentence_spans(text: &str) -> Vec<&str> {
25 let text = text.trim();
26 if text.is_empty() {
27 return Vec::new();
28 }
29
30 let mut sentences = Vec::new();
31 let mut start = 0;
32
33 let chars: Vec<(usize, char)> = text.char_indices().collect();
35 let len = chars.len();
36
37 let mut ci = 0;
38 while ci < len {
39 let (_byte_pos, ch) = chars[ci];
40 if ch == '.' || ch == '!' || ch == '?' {
41 if ch == '.' {
43 while ci + 1 < len && chars[ci + 1].1 == '.' {
44 ci += 1;
45 }
46 }
47 let end_byte = chars[ci].0;
48
49 if ch == '.' && is_abbreviation(text, end_byte) {
51 ci += 1;
52 continue;
53 }
54
55 if ch == '.' && ci > 0 && ci + 1 < len
57 && chars[ci - 1].1.is_ascii_digit()
58 && chars[ci + 1].1.is_ascii_digit()
59 {
60 ci += 1;
61 continue;
62 }
63
64 let after_byte = end_byte + ch.len_utf8();
66
67 if after_byte >= text.len() {
68 let sentence = text[start..after_byte].trim();
70 if !sentence.is_empty() {
71 sentences.push(sentence);
72 }
73 start = after_byte;
74 } else {
75 let mut j = ci + 1;
77 while j < len && chars[j].1.is_ascii_whitespace() {
78 j += 1;
79 }
80 if j < len && chars[j].1.is_ascii_uppercase() && j > ci + 1 {
81 let sentence = text[start..after_byte].trim();
83 if !sentence.is_empty() {
84 sentences.push(sentence);
85 }
86 start = chars[j].0;
87 }
88 }
89 }
90 ci += 1;
91 }
92
93 let remaining = text[start..].trim();
95 if !remaining.is_empty() {
96 sentences.push(remaining);
97 }
98
99 sentences
100}
101
102fn is_abbreviation(text: &str, dot_pos: usize) -> bool {
104 for abbr in ABBREVIATIONS {
105 let abbr_len = abbr.len();
106 if dot_pos + 1 >= abbr_len {
107 let candidate_start = dot_pos + 1 - abbr_len;
108 if !text.is_char_boundary(candidate_start) {
110 continue;
111 }
112 let candidate = &text[candidate_start..=dot_pos];
113 if candidate.eq_ignore_ascii_case(abbr) {
114 return true;
115 }
116 }
117 }
118 false
119}
120
121pub fn split_words(text: &str) -> Vec<&str> {
123 text.unicode_words().collect()
124}
125
/// Estimates the number of syllables in `word` with a vowel-group heuristic.
///
/// The word is lowercased and reduced to its alphabetic characters. Each maximal run
/// of the vowels `a e i o u y` counts as one syllable. A final `e` preceded by a
/// consonant is treated as silent and removes one syllable, except for a
/// consonant + `le` ending (as in "table"), which keeps it.
///
/// The result is never less than 1; input without any alphabetic character yields 1.
pub fn count_syllables(word: &str) -> usize {
    const VOWELS: &str = "aeiouy";
    let is_vowel = |c: char| VOWELS.contains(c);

    let letters: Vec<char> = word
        .to_lowercase()
        .chars()
        .filter(|c| c.is_alphabetic())
        .collect();
    if letters.is_empty() {
        return 1;
    }

    // A vowel group starts at every vowel that is not directly preceded by a vowel.
    let vowel_groups = letters
        .iter()
        .enumerate()
        .filter(|&(i, &c)| is_vowel(c) && (i == 0 || !is_vowel(letters[i - 1])))
        .count();

    // Only words of three or more letters can match the slice pattern.
    let has_silent_e = match letters.as_slice() {
        [.., third_last, second_last, 'e'] => {
            let consonant_le = *second_last == 'l' && !is_vowel(*third_last);
            !is_vowel(*second_last) && !consonant_le
        }
        _ => false,
    };

    vowel_groups
        .saturating_sub(usize::from(has_silent_e))
        .max(1)
}
172
/// Counts the alphabetic characters in `text`; digits, punctuation and whitespace are
/// not counted.
pub fn count_characters(text: &str) -> usize {
    // Each match of a `char` predicate is exactly one character.
    text.matches(char::is_alphabetic).count()
}
177
178pub fn compute_statistics(text: &str) -> TextStatistics {
180 let sentences = split_sentences(text);
181 let words = split_words(text);
182 let syllable_count: usize = words.iter().map(|w| count_syllables(w)).sum();
183 let character_count = count_characters(text);
184 let polysyllable_count = words.iter().filter(|w| count_syllables(w) >= 3).count();
185
186 TextStatistics {
187 sentence_count: sentences.len(),
188 word_count: words.len(),
189 syllable_count,
190 character_count,
191 polysyllable_count,
192 }
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    // The sentence tests compare the returned slices themselves, not just their
    // number, so a splitter that cuts at the wrong place cannot pass by accident.

    #[test]
    fn split_sentences_basic() {
        assert_eq!(
            split_sentences("Hello world. How are you? I am fine!"),
            vec!["Hello world.", "How are you?", "I am fine!"]
        );
    }

    #[test]
    fn split_sentences_abbreviation() {
        assert_eq!(
            split_sentences("Dr. Smith went to Washington. He arrived on time."),
            vec!["Dr. Smith went to Washington.", "He arrived on time."]
        );
    }

    #[test]
    fn split_sentences_decimal() {
        assert_eq!(
            split_sentences("He scored 3.5 points. That was great."),
            vec!["He scored 3.5 points.", "That was great."]
        );
    }

    #[test]
    fn split_sentences_empty() {
        assert!(split_sentences("").is_empty());
        assert!(split_sentences(" \n\t ").is_empty());
    }

    #[test]
    fn split_sentences_single() {
        assert_eq!(split_sentences("Just one sentence."), vec!["Just one sentence."]);
    }

    #[test]
    fn split_sentences_no_final_punctuation() {
        assert_eq!(split_sentences("Hello world"), vec!["Hello world"]);
    }

    #[test]
    fn split_sentences_ellipsis() {
        assert_eq!(
            split_sentences("Wait... What happened?"),
            vec!["Wait...", "What happened?"]
        );
    }

    #[test]
    fn count_syllables_monosyllabic() {
        for word in ["the", "cat", "fire"] {
            assert_eq!(count_syllables(word), 1, "word: {:?}", word);
        }
    }

    #[test]
    fn count_syllables_multisyllabic() {
        for (word, expected) in [("hello", 2), ("beautiful", 3), ("understanding", 4)] {
            assert_eq!(count_syllables(word), expected, "word: {:?}", word);
        }
    }

    #[test]
    fn count_syllables_table() {
        assert_eq!(count_syllables("table"), 2);
    }

    #[test]
    fn count_syllables_never_zero() {
        assert_eq!(count_syllables(""), 1);
        assert_eq!(count_syllables("123"), 1);
    }

    #[test]
    fn split_words_basic() {
        assert_eq!(split_words("Hello world"), vec!["Hello", "world"]);
    }

    #[test]
    fn split_words_with_punctuation() {
        assert_eq!(split_words("Hello, world!"), vec!["Hello", "world"]);
    }

    #[test]
    fn count_characters_letters_only() {
        assert_eq!(count_characters("Hello, world! 123"), 10);
        assert_eq!(count_characters(""), 0);
    }

    #[test]
    fn compute_statistics_basic() {
        let stats = compute_statistics("The cat sat on the mat. The dog ran fast.");
        assert_eq!(
            stats,
            TextStatistics {
                sentence_count: 2,
                word_count: 10,
                syllable_count: 10,
                character_count: 30,
                polysyllable_count: 0,
            }
        );
    }
}