Skip to main content

writing_analysis/
utils.rs

1use unicode_segmentation::UnicodeSegmentation;
2
/// Raw text statistics used by readability formulas.
///
/// All fields are plain counts; the `Default` value is the all-zero statistics
/// of an empty text.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct TextStatistics {
    /// Number of sentences found in the text.
    pub sentence_count: usize,
    /// Number of words found in the text.
    pub word_count: usize,
    /// Total number of syllables summed over all words.
    pub syllable_count: usize,
    /// Number of alphabetic characters (letters) in the text.
    pub character_count: usize,
    /// Number of words with three or more syllables.
    pub polysyllable_count: usize,
}
12
/// Abbreviations whose trailing period must not be treated as a sentence end.
///
/// Every entry ends with the period it protects. Entries are matched ASCII
/// case-insensitively by `is_abbreviation`.
static ABBREVIATIONS: &[&str] = &[
    "Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Sr.", "Jr.", "St.", "Inc.", "Ltd.", "Corp.", "vs.",
    "etc.", "e.g.", "i.e.", "Vol.", "Dept.", "Est.", "Govt.", "No.",
];

/// Split text into sentences using punctuation heuristics.
///
/// A sentence ends at `.`, `!` or `?` (a run of dots counts as one ellipsis)
/// when the punctuation is the last character of the text, or when it is
/// followed by whitespace and then an uppercase letter. Periods that belong to
/// a known abbreviation or sit between two digits (`3.5`) never end a sentence.
///
/// The returned slices borrow from `text`, are trimmed, and are never empty.
/// Text after the last detected boundary is returned as a final sentence.
pub fn split_sentences(text: &str) -> Vec<&str> {
    find_sentence_spans(text)
}

/// Find sentence spans directly in the source text.
///
/// Scans the UTF-8 bytes of the trimmed text. Every index used for slicing is
/// either 0, the position just after an ASCII punctuation byte, or the position
/// after a run of ASCII whitespace, so all slices fall on character boundaries.
fn find_sentence_spans(text: &str) -> Vec<&str> {
    let text = text.trim();
    if text.is_empty() {
        return Vec::new();
    }

    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut sentences = Vec::new();
    let mut start = 0;
    let mut i = 0;

    while i < len {
        let b = bytes[i];
        if !matches!(b, b'.' | b'!' | b'?') {
            i += 1;
            continue;
        }

        if b == b'.' {
            // Ellipsis: advance to the last of a run of consecutive dots.
            while i + 1 < len && bytes[i + 1] == b'.' {
                i += 1;
            }

            // A period between two digits is a decimal point, not a terminator.
            let is_decimal_point = i > 0
                && i + 1 < len
                && bytes[i - 1].is_ascii_digit()
                && bytes[i + 1].is_ascii_digit();

            if is_decimal_point || is_abbreviation(text, i) {
                i += 1;
                continue;
            }
        }

        // Look ahead past any whitespace following the punctuation.
        let after = i + 1;
        let mut next = after;
        while next < len && bytes[next].is_ascii_whitespace() {
            next += 1;
        }

        // Decode a full character so non-ASCII capitals (e.g. 'É') also start a sentence.
        let next_is_uppercase = matches!(text[next..].chars().next(), Some(c) if c.is_uppercase());

        if after >= len || (next > after && next_is_uppercase) {
            let sentence = text[start..after].trim();
            if !sentence.is_empty() {
                sentences.push(sentence);
            }
            start = next;
        }
        i += 1;
    }

    // Whatever follows the last boundary (e.g. text without final punctuation).
    let remaining = text[start..].trim();
    if !remaining.is_empty() {
        sentences.push(remaining);
    }

    sentences
}

/// Check if the period at byte position `dot_pos` is part of an abbreviation.
///
/// Returns `true` when the text ending at `dot_pos` (inclusive) ends with one of
/// `ABBREVIATIONS`, compared ASCII case-insensitively, and the abbreviation is
/// not the tail of a longer word (so "best." does not match "St.").
///
/// The comparison is done on bytes, so it never panics on multi-byte
/// characters. An out-of-range `dot_pos` yields `false`.
fn is_abbreviation(text: &str, dot_pos: usize) -> bool {
    let bytes = text.as_bytes();
    if dot_pos >= bytes.len() {
        return false;
    }
    let head = &bytes[..=dot_pos];

    ABBREVIATIONS.iter().any(|abbr| {
        let abbr = abbr.as_bytes();
        if head.len() < abbr.len() {
            return false;
        }
        let candidate_start = head.len() - abbr.len();
        if !head[candidate_start..].eq_ignore_ascii_case(abbr) {
            return false;
        }
        // The candidate matched an all-ASCII abbreviation, so it starts with an
        // ASCII byte and `candidate_start` is a valid character boundary.
        text[..candidate_start]
            .chars()
            .next_back()
            .map_or(true, |prev| !prev.is_alphanumeric())
    })
}
109
110/// Split text into words using Unicode word boundaries.
111pub fn split_words(text: &str) -> Vec<&str> {
112    text.unicode_words().collect()
113}
114
/// Count syllables in an English word using vowel-group heuristics.
///
/// The word is lowercased and reduced to its alphabetic characters. Each run of
/// consecutive vowels (`a e i o u y`) counts as one syllable. For words of at
/// least three letters, a trailing `e` that follows a consonant is treated as
/// silent and removes one syllable, unless the word ends in consonant + `le`
/// (as in "table"), where the syllable is kept.
///
/// Always returns at least 1, including for input with no letters at all.
pub fn count_syllables(word: &str) -> usize {
    const VOWELS: &str = "aeiouy";
    let is_vowel = |c: char| VOWELS.contains(c);

    let letters: Vec<char> = word
        .to_lowercase()
        .chars()
        .filter(|c| c.is_alphabetic())
        .collect();

    // One syllable per group of adjacent vowels.
    let mut count: usize = 0;
    let mut prev_vowel = false;
    for &ch in &letters {
        let vowel = is_vowel(ch);
        if vowel && !prev_vowel {
            count += 1;
        }
        prev_vowel = vowel;
    }

    // Silent-e adjustment; the slice pattern only matches words of 3+ letters.
    if let [.., third_last, second_last, 'e'] = letters.as_slice() {
        let after_consonant = !is_vowel(*second_last);
        // consonant + "le" keeps its syllable (e.g. "ta-ble").
        let consonant_le = *second_last == 'l' && !is_vowel(*third_last);
        if after_consonant && !consonant_le {
            count = count.saturating_sub(1);
        }
    }

    count.max(1)
}
161
/// Count only alphabetic characters (letters).
///
/// Digits, whitespace and punctuation are ignored; any character for which
/// `char::is_alphabetic` is true counts as one letter.
pub fn count_characters(text: &str) -> usize {
    let mut letters = 0;
    for ch in text.chars() {
        if ch.is_alphabetic() {
            letters += 1;
        }
    }
    letters
}
166
167/// Compute aggregate text statistics.
168pub fn compute_statistics(text: &str) -> TextStatistics {
169    let sentences = split_sentences(text);
170    let words = split_words(text);
171    let syllable_count: usize = words.iter().map(|w| count_syllables(w)).sum();
172    let character_count = count_characters(text);
173    let polysyllable_count = words.iter().filter(|w| count_syllables(w) >= 3).count();
174
175    TextStatistics {
176        sentence_count: sentences.len(),
177        word_count: words.len(),
178        syllable_count,
179        character_count,
180        polysyllable_count,
181    }
182}
183
// Unit tests for the text utilities. Sentence tests assert the exact slices
// returned, not just how many there are, so a wrong split cannot pass by
// coincidence.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn split_sentences_basic() {
        let sentences = split_sentences("Hello world. How are you? I am fine!");
        assert_eq!(sentences, vec!["Hello world.", "How are you?", "I am fine!"]);
    }

    #[test]
    fn split_sentences_abbreviation() {
        let sentences = split_sentences("Dr. Smith went to Washington. He arrived on time.");
        assert_eq!(
            sentences,
            vec!["Dr. Smith went to Washington.", "He arrived on time."]
        );
    }

    #[test]
    fn split_sentences_decimal() {
        let sentences = split_sentences("He scored 3.5 points. That was great.");
        assert_eq!(sentences, vec!["He scored 3.5 points.", "That was great."]);
    }

    #[test]
    fn split_sentences_empty() {
        let sentences = split_sentences("");
        assert!(sentences.is_empty());
    }

    #[test]
    fn split_sentences_single() {
        let sentences = split_sentences("Just one sentence.");
        assert_eq!(sentences, vec!["Just one sentence."]);
    }

    #[test]
    fn split_sentences_no_final_punctuation() {
        let sentences = split_sentences("Hello world");
        assert_eq!(sentences, vec!["Hello world"]);
    }

    #[test]
    fn split_sentences_ellipsis() {
        let sentences = split_sentences("Wait... What happened?");
        assert_eq!(sentences, vec!["Wait...", "What happened?"]);
    }

    #[test]
    fn count_syllables_monosyllabic() {
        assert_eq!(count_syllables("the"), 1);
        assert_eq!(count_syllables("cat"), 1);
        assert_eq!(count_syllables("fire"), 1);
    }

    #[test]
    fn count_syllables_multisyllabic() {
        assert_eq!(count_syllables("hello"), 2);
        assert_eq!(count_syllables("beautiful"), 3);
        assert_eq!(count_syllables("understanding"), 4);
    }

    #[test]
    fn count_syllables_table() {
        assert_eq!(count_syllables("table"), 2);
    }

    #[test]
    fn split_words_basic() {
        let words = split_words("Hello world");
        assert_eq!(words, vec!["Hello", "world"]);
    }

    #[test]
    fn split_words_with_punctuation() {
        let words = split_words("Hello, world!");
        assert_eq!(words, vec!["Hello", "world"]);
    }

    #[test]
    fn count_characters_letters_only() {
        assert_eq!(count_characters("Hello, world! 123"), 10);
    }

    #[test]
    fn compute_statistics_basic() {
        let stats = compute_statistics("The cat sat on the mat. The dog ran fast.");
        assert_eq!(stats.sentence_count, 2);
        assert_eq!(stats.word_count, 10);
        // Every word here is a single syllable, and there are 30 letters.
        assert_eq!(stats.syllable_count, 10);
        assert_eq!(stats.character_count, 30);
        assert_eq!(stats.polysyllable_count, 0);
    }
}