// writing_analysis/utils.rs
use unicode_segmentation::UnicodeSegmentation;
2
/// Raw counts describing a piece of text, as produced by `compute_statistics`.
#[derive(Debug, Clone, PartialEq)]
pub struct TextStatistics {
    /// Number of sentences returned by `split_sentences`.
    pub sentence_count: usize,

    /// Number of words returned by `split_words`.
    pub word_count: usize,

    /// Sum of `count_syllables` over every word.
    pub syllable_count: usize,

    /// Number of alphabetic characters, as reported by `count_characters`.
    pub character_count: usize,

    /// Number of words for which `count_syllables` reports three or more syllables.
    pub polysyllable_count: usize,
}
12
/// Abbreviations whose trailing period must not be read as the end of a sentence.
///
/// Every entry includes its final `.`; `is_abbreviation` compares the entries
/// against the text without regard to ASCII case.
static ABBREVIATIONS: &[&str] = &[
    // Personal titles and name suffixes.
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "St.",
    // Company designations.
    "Inc.", "Ltd.", "Corp.",
    // Common Latin and connective abbreviations.
    "vs.", "etc.", "e.g.", "i.e.",
    // Publication, organisation and numbering abbreviations.
    "Vol.", "Dept.", "Est.", "Govt.", "No.",
];
17
18pub fn split_sentences(text: &str) -> Vec<&str> {
20 find_sentence_spans(text)
21}
22
23fn find_sentence_spans(text: &str) -> Vec<&str> {
25 let text = text.trim();
26 if text.is_empty() {
27 return Vec::new();
28 }
29
30 let mut sentences = Vec::new();
31 let mut start = 0;
32 let bytes = text.as_bytes();
33 let len = bytes.len();
34
35 let mut i = 0;
36 while i < len {
37 let b = bytes[i];
38 if b == b'.' || b == b'!' || b == b'?' {
39 if b == b'.' {
41 while i + 1 < len && bytes[i + 1] == b'.' {
42 i += 1;
43 }
44 }
45
46 if b == b'.' && is_abbreviation(text, i) {
48 i += 1;
49 continue;
50 }
51
52 if b == b'.' && i > 0 && i + 1 < len && bytes[i - 1].is_ascii_digit() && bytes[i + 1].is_ascii_digit() {
54 i += 1;
55 continue;
56 }
57
58 let after = i + 1;
60 if after >= len {
61 let sentence = text[start..=i].trim();
63 if !sentence.is_empty() {
64 sentences.push(sentence);
65 }
66 start = after;
67 } else {
68 let mut j = after;
70 while j < len && bytes[j].is_ascii_whitespace() {
71 j += 1;
72 }
73 if j < len && bytes[j].is_ascii_uppercase() && j > after {
74 let sentence = text[start..=i].trim();
76 if !sentence.is_empty() {
77 sentences.push(sentence);
78 }
79 start = j;
80 }
81 }
82 }
83 i += 1;
84 }
85
86 let remaining = text[start..].trim();
88 if !remaining.is_empty() {
89 sentences.push(remaining);
90 }
91
92 sentences
93}
94
95fn is_abbreviation(text: &str, dot_pos: usize) -> bool {
97 for abbr in ABBREVIATIONS {
98 let abbr_len = abbr.len();
99 if dot_pos + 1 >= abbr_len {
100 let candidate_start = dot_pos + 1 - abbr_len;
101 let candidate = &text[candidate_start..=dot_pos];
102 if candidate.eq_ignore_ascii_case(abbr) {
103 return true;
104 }
105 }
106 }
107 false
108}
109
110pub fn split_words(text: &str) -> Vec<&str> {
112 text.unicode_words().collect()
113}
114
/// Estimates the number of syllables in `word`.
///
/// The heuristic works on the lowercased alphabetic characters of `word`:
///
/// 1. Every maximal run of the vowels `a e i o u y` counts as one syllable.
/// 2. For words of at least three letters, a final `e` that follows a
///    consonant is treated as silent and removes one syllable ("fire"),
///    unless the word ends in consonant + `le` ("table"), where the ending
///    is kept as a syllable.
///
/// The result is never less than 1; a word without any alphabetic character
/// also yields 1.
pub fn count_syllables(word: &str) -> usize {
    const VOWELS: &str = "aeiouy";
    let is_vowel = |c: char| VOWELS.contains(c);

    let letters: Vec<char> = word
        .to_lowercase()
        .chars()
        .filter(|c| c.is_alphabetic())
        .collect();
    if letters.is_empty() {
        return 1;
    }

    // A syllable starts wherever a vowel is not preceded by another vowel.
    let vowel_groups = letters
        .iter()
        .enumerate()
        .filter(|&(idx, &c)| is_vowel(c) && (idx == 0 || !is_vowel(letters[idx - 1])))
        .count();

    // Silent trailing "e": consonant + "e", except for consonant + "le".
    let has_silent_e = match letters.as_slice() {
        [.., third_last, second_last, 'e'] => {
            !is_vowel(*second_last) && (*second_last != 'l' || is_vowel(*third_last))
        }
        _ => false,
    };

    let syllables = if has_silent_e {
        vowel_groups.saturating_sub(1)
    } else {
        vowel_groups
    };
    syllables.max(1)
}
161
/// Counts the alphabetic characters in `text`.
///
/// Digits, punctuation and whitespace are not counted; alphabetic characters
/// outside ASCII are.
pub fn count_characters(text: &str) -> usize {
    text.chars()
        .map(|ch| usize::from(ch.is_alphabetic()))
        .sum()
}
166
167pub fn compute_statistics(text: &str) -> TextStatistics {
169 let sentences = split_sentences(text);
170 let words = split_words(text);
171 let syllable_count: usize = words.iter().map(|w| count_syllables(w)).sum();
172 let character_count = count_characters(text);
173 let polysyllable_count = words.iter().filter(|w| count_syllables(w) >= 3).count();
174
175 TextStatistics {
176 sentence_count: sentences.len(),
177 word_count: words.len(),
178 syllable_count,
179 character_count,
180 polysyllable_count,
181 }
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `text` is split into exactly `expected` sentences.
    fn assert_sentence_count(text: &str, expected: usize) {
        assert_eq!(split_sentences(text).len(), expected, "input: {:?}", text);
    }

    /// Asserts the syllable estimate for every `(word, expected)` pair.
    fn assert_syllables(cases: &[(&str, usize)]) {
        for &(word, expected) in cases {
            assert_eq!(count_syllables(word), expected, "word: {:?}", word);
        }
    }

    // --- split_sentences ---------------------------------------------------

    #[test]
    fn split_sentences_basic() {
        assert_sentence_count("Hello world. How are you? I am fine!", 3);
    }

    #[test]
    fn split_sentences_abbreviation() {
        // The period after "Dr" must not end the first sentence.
        assert_sentence_count("Dr. Smith went to Washington. He arrived on time.", 2);
    }

    #[test]
    fn split_sentences_decimal() {
        // The decimal point in "3.5" must not end the first sentence.
        assert_sentence_count("He scored 3.5 points. That was great.", 2);
    }

    #[test]
    fn split_sentences_empty() {
        assert_sentence_count("", 0);
    }

    #[test]
    fn split_sentences_single() {
        assert_sentence_count("Just one sentence.", 1);
    }

    #[test]
    fn split_sentences_no_final_punctuation() {
        assert_sentence_count("Hello world", 1);
    }

    #[test]
    fn split_sentences_ellipsis() {
        assert_sentence_count("Wait... What happened?", 2);
    }

    // --- count_syllables ---------------------------------------------------

    #[test]
    fn count_syllables_monosyllabic() {
        assert_syllables(&[("the", 1), ("cat", 1), ("fire", 1)]);
    }

    #[test]
    fn count_syllables_multisyllabic() {
        assert_syllables(&[("hello", 2), ("beautiful", 3), ("understanding", 4)]);
    }

    #[test]
    fn count_syllables_table() {
        // Consonant + "le" keeps its final syllable.
        assert_syllables(&[("table", 2)]);
    }

    // --- split_words -------------------------------------------------------

    #[test]
    fn split_words_basic() {
        assert_eq!(split_words("Hello world"), vec!["Hello", "world"]);
    }

    #[test]
    fn split_words_with_punctuation() {
        assert_eq!(split_words("Hello, world!"), vec!["Hello", "world"]);
    }

    // --- count_characters --------------------------------------------------

    #[test]
    fn count_characters_letters_only() {
        assert_eq!(count_characters("Hello, world! 123"), 10);
    }

    // --- compute_statistics ------------------------------------------------

    #[test]
    fn compute_statistics_basic() {
        let stats = compute_statistics("The cat sat on the mat. The dog ran fast.");
        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 10);
    }
}