Skip to main content

writing_analysis/
utils.rs

1use unicode_segmentation::UnicodeSegmentation;
2
/// Raw text statistics used by readability formulas.
///
/// Every field is a plain counter, so the type derives `Default` (all
/// counters zero) and `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct TextStatistics {
    /// Number of sentences found in the text.
    pub sentence_count: usize,
    /// Number of words found in the text.
    pub word_count: usize,
    /// Sum of the per-word syllable estimates.
    pub syllable_count: usize,
    /// Number of alphabetic characters (digits, punctuation and whitespace
    /// are not counted).
    pub character_count: usize,
    /// Number of words whose syllable estimate is three or more.
    pub polysyllable_count: usize,
}
12
/// Period-terminated abbreviations whose trailing `.` should not be read as
/// the end of a sentence.
///
/// Consulted by `is_abbreviation`, which compares ASCII-case-insensitively,
/// so the capitalisation used here is purely cosmetic.
static ABBREVIATIONS: &[&str] = &[
    // Forms of address and name suffixes.
    "Mr.",
    "Mrs.",
    "Ms.",
    "Dr.",
    "Prof.",
    "Sr.",
    "Jr.",
    "St.",
    // Company suffixes.
    "Inc.",
    "Ltd.",
    "Corp.",
    // Connectives.
    "vs.",
    "etc.",
    "e.g.",
    "i.e.",
    // Miscellaneous.
    "Vol.",
    "Dept.",
    "Est.",
    "Govt.",
    "No.",
];
17
/// Split text into sentences using punctuation heuristics.
///
/// This is the public entry point; all of the work is delegated to
/// `find_sentence_spans` and its result is returned unchanged.
///
/// The returned slices borrow from `text`; no sentence text is copied.
pub fn split_sentences(text: &str) -> Vec<&str> {
    find_sentence_spans(text)
}
22
23/// Find sentence spans directly in the source text.
24fn find_sentence_spans(text: &str) -> Vec<&str> {
25    let text = text.trim();
26    if text.is_empty() {
27        return Vec::new();
28    }
29
30    let mut sentences = Vec::new();
31    let mut start = 0;
32
33    // Iterate by char_indices to correctly handle multi-byte UTF-8
34    let chars: Vec<(usize, char)> = text.char_indices().collect();
35    let len = chars.len();
36
37    let mut ci = 0;
38    while ci < len {
39        let (_byte_pos, ch) = chars[ci];
40        if ch == '.' || ch == '!' || ch == '?' {
41            // Check for ellipsis: consume all consecutive dots
42            if ch == '.' {
43                while ci + 1 < len && chars[ci + 1].1 == '.' {
44                    ci += 1;
45                }
46            }
47            let end_byte = chars[ci].0;
48
49            // Check if this is an abbreviation
50            if ch == '.' && is_abbreviation(text, end_byte) {
51                ci += 1;
52                continue;
53            }
54
55            // Check if this is a decimal number (digit.digit)
56            if ch == '.' && ci > 0 && ci + 1 < len
57                && chars[ci - 1].1.is_ascii_digit()
58                && chars[ci + 1].1.is_ascii_digit()
59            {
60                ci += 1;
61                continue;
62            }
63
64            // Byte position after the punctuation character
65            let after_byte = end_byte + ch.len_utf8();
66
67            if after_byte >= text.len() {
68                // End of text — this is a sentence boundary
69                let sentence = text[start..after_byte].trim();
70                if !sentence.is_empty() {
71                    sentences.push(sentence);
72                }
73                start = after_byte;
74            } else {
75                // Check for whitespace then uppercase
76                let mut j = ci + 1;
77                while j < len && chars[j].1.is_ascii_whitespace() {
78                    j += 1;
79                }
80                if j < len && chars[j].1.is_ascii_uppercase() && j > ci + 1 {
81                    // Sentence boundary
82                    let sentence = text[start..after_byte].trim();
83                    if !sentence.is_empty() {
84                        sentences.push(sentence);
85                    }
86                    start = chars[j].0;
87                }
88            }
89        }
90        ci += 1;
91    }
92
93    // Remaining text
94    let remaining = text[start..].trim();
95    if !remaining.is_empty() {
96        sentences.push(remaining);
97    }
98
99    sentences
100}
101
102/// Check if the period at position `dot_pos` (byte offset) is part of an abbreviation.
103fn is_abbreviation(text: &str, dot_pos: usize) -> bool {
104    for abbr in ABBREVIATIONS {
105        let abbr_len = abbr.len();
106        if dot_pos + 1 >= abbr_len {
107            let candidate_start = dot_pos + 1 - abbr_len;
108            // Ensure candidate_start is on a char boundary
109            if !text.is_char_boundary(candidate_start) {
110                continue;
111            }
112            let candidate = &text[candidate_start..=dot_pos];
113            if candidate.eq_ignore_ascii_case(abbr) {
114                return true;
115            }
116        }
117    }
118    false
119}
120
121/// Split text into words using Unicode word boundaries.
122pub fn split_words(text: &str) -> Vec<&str> {
123    text.unicode_words().collect()
124}
125
/// Estimate the number of syllables in an English word.
///
/// The word is lowercased and stripped of non-alphabetic characters, then
/// each maximal run of vowels (`a e i o u y`) counts as one syllable. A
/// trailing `e` that follows a consonant is treated as silent and removes
/// one syllable, unless the word ends in consonant + `le` (as in "table"),
/// where the ending is kept as its own syllable.
///
/// The result is never less than 1, including for input with no letters.
pub fn count_syllables(word: &str) -> usize {
    const VOWELS: &str = "aeiouy";
    let is_vowel = |c: char| VOWELS.contains(c);

    let letters: Vec<char> = word
        .to_lowercase()
        .chars()
        .filter(|c| c.is_alphabetic())
        .collect();

    // A syllable starts at every vowel that is not preceded by another vowel.
    let mut syllables = letters
        .iter()
        .enumerate()
        .filter(|&(i, &c)| is_vowel(c) && (i == 0 || !is_vowel(letters[i - 1])))
        .count();

    // Silent-e adjustment; only applies to words of three or more letters.
    if let [.., third_last, second_last, 'e'] = letters.as_slice() {
        let after_consonant = !is_vowel(*second_last);
        // consonant + "le" (e.g. "ta-ble") keeps its final syllable.
        let consonant_le = *second_last == 'l' && !is_vowel(*third_last);
        if after_consonant && !consonant_le {
            syllables = syllables.saturating_sub(1);
        }
    }

    // Every word counts as at least one syllable; this also covers input
    // that contains no letters at all.
    syllables.max(1)
}
172
/// Count the letters in `text`.
///
/// Only characters for which `char::is_alphabetic` holds are counted;
/// digits, punctuation and whitespace are ignored.
pub fn count_characters(text: &str) -> usize {
    // Each match is exactly one alphabetic character.
    text.matches(char::is_alphabetic).count()
}
177
178/// Compute aggregate text statistics.
179pub fn compute_statistics(text: &str) -> TextStatistics {
180    let sentences = split_sentences(text);
181    let words = split_words(text);
182    let syllable_count: usize = words.iter().map(|w| count_syllables(w)).sum();
183    let character_count = count_characters(text);
184    let polysyllable_count = words.iter().filter(|w| count_syllables(w) >= 3).count();
185
186    TextStatistics {
187        sentence_count: sentences.len(),
188        word_count: words.len(),
189        syllable_count,
190        character_count,
191        polysyllable_count,
192    }
193}
194
/// Unit tests for the text utilities.
///
/// Sentence tests compare the exact slices returned, not just how many
/// there are, so that a splitter producing wrong spans cannot pass.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn split_sentences_basic() {
        assert_eq!(
            split_sentences("Hello world. How are you? I am fine!"),
            vec!["Hello world.", "How are you?", "I am fine!"]
        );
    }

    #[test]
    fn split_sentences_abbreviation() {
        assert_eq!(
            split_sentences("Dr. Smith went to Washington. He arrived on time."),
            vec!["Dr. Smith went to Washington.", "He arrived on time."]
        );
    }

    #[test]
    fn split_sentences_decimal() {
        assert_eq!(
            split_sentences("He scored 3.5 points. That was great."),
            vec!["He scored 3.5 points.", "That was great."]
        );
    }

    #[test]
    fn split_sentences_empty() {
        assert!(split_sentences("").is_empty());
        assert!(split_sentences("   \n\t").is_empty());
    }

    #[test]
    fn split_sentences_single() {
        assert_eq!(
            split_sentences("Just one sentence."),
            vec!["Just one sentence."]
        );
    }

    #[test]
    fn split_sentences_no_final_punctuation() {
        assert_eq!(split_sentences("Hello world"), vec!["Hello world"]);
    }

    #[test]
    fn split_sentences_ellipsis() {
        assert_eq!(
            split_sentences("Wait... What happened?"),
            vec!["Wait...", "What happened?"]
        );
    }

    #[test]
    fn split_sentences_trims_surrounding_whitespace() {
        assert_eq!(
            split_sentences("  Hello world.   Goodbye now.  "),
            vec!["Hello world.", "Goodbye now."]
        );
    }

    #[test]
    fn count_syllables_monosyllabic() {
        assert_eq!(count_syllables("the"), 1);
        assert_eq!(count_syllables("cat"), 1);
        assert_eq!(count_syllables("fire"), 1);
    }

    #[test]
    fn count_syllables_multisyllabic() {
        assert_eq!(count_syllables("hello"), 2);
        assert_eq!(count_syllables("beautiful"), 3);
        assert_eq!(count_syllables("understanding"), 4);
    }

    #[test]
    fn count_syllables_table() {
        assert_eq!(count_syllables("table"), 2);
    }

    #[test]
    fn count_syllables_never_below_one() {
        assert_eq!(count_syllables(""), 1);
        assert_eq!(count_syllables("123"), 1);
    }

    #[test]
    fn split_words_basic() {
        let words = split_words("Hello world");
        assert_eq!(words, vec!["Hello", "world"]);
    }

    #[test]
    fn split_words_with_punctuation() {
        let words = split_words("Hello, world!");
        assert_eq!(words, vec!["Hello", "world"]);
    }

    #[test]
    fn count_characters_letters_only() {
        assert_eq!(count_characters("Hello, world! 123"), 10);
        assert_eq!(count_characters(""), 0);
    }

    #[test]
    fn compute_statistics_basic() {
        let stats = compute_statistics("The cat sat on the mat. The dog ran fast.");
        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 10);
        // Ten one-syllable words, thirty letters, nothing polysyllabic.
        assert_eq!(stats.syllable_count, 10);
        assert_eq!(stats.character_count, 30);
        assert_eq!(stats.polysyllable_count, 0);
    }
}
283}