scirs2_text/
text_statistics.rs1use crate::tokenize::{SentenceTokenizer, Tokenizer, WordTokenizer};
4use crate::{Result, TextError};
5
6#[derive(Debug, Clone)]
8pub struct TextStatistics {
9 wordtokenizer: WordTokenizer,
11 sentencetokenizer: SentenceTokenizer,
13}
14
15impl Default for TextStatistics {
16 fn default() -> Self {
17 Self::new()
18 }
19}
20
21impl TextStatistics {
22 pub fn new() -> Self {
24 Self {
25 wordtokenizer: WordTokenizer::new(true), sentencetokenizer: SentenceTokenizer::new(),
27 }
28 }
29
30 pub fn with_tokenizers(
32 wordtokenizer: WordTokenizer,
33 sentencetokenizer: SentenceTokenizer,
34 ) -> Self {
35 Self {
36 wordtokenizer,
37 sentencetokenizer,
38 }
39 }
40
41 pub fn word_count(&self, text: &str) -> Result<usize> {
43 Ok(self.wordtokenizer.tokenize(text)?.len())
44 }
45
46 pub fn sentence_count(&self, text: &str) -> Result<usize> {
48 Ok(self.sentencetokenizer.tokenize(text)?.len())
49 }
50
51 fn count_syllables(&self, word: &str) -> usize {
53 if word.is_empty() {
54 return 0;
55 }
56
57 let word = word.trim().to_lowercase();
58
59 if word.len() <= 3 {
61 return 1;
62 }
63
64 let word = if word.ends_with("es") || word.ends_with("ed") {
66 &word[..word.len() - 2]
67 } else if word.ends_with('e') && word.len() > 2 {
68 &word[..word.len() - 1]
69 } else {
70 &word
71 };
72
73 let vowels = ['a', 'e', 'i', 'o', 'u', 'y'];
74 let mut syllable_count = 0;
75 let mut prev_is_vowel = false;
76
77 for ch in word.chars() {
78 let is_vowel = vowels.contains(&ch);
79
80 if is_vowel && !prev_is_vowel {
81 syllable_count += 1;
82 }
83
84 prev_is_vowel = is_vowel;
85 }
86
87 syllable_count.max(1)
89 }
90
91 pub fn syllable_count(&self, text: &str) -> Result<usize> {
93 let words = self.wordtokenizer.tokenize(text)?;
94 Ok(words.iter().map(|w| self.count_syllables(w)).sum())
95 }
96
97 pub fn complex_word_count(&self, text: &str) -> Result<usize> {
99 let words = self.wordtokenizer.tokenize(text)?;
100 Ok(words
101 .iter()
102 .filter(|w| self.count_syllables(w) >= 3)
103 .count())
104 }
105
106 pub fn avg_sentence_length(&self, text: &str) -> Result<f64> {
108 let word_count = self.word_count(text)?;
109 let sentence_count = self.sentence_count(text)?;
110
111 if sentence_count == 0 {
112 return Err(TextError::InvalidInput("Text has no sentences".to_string()));
113 }
114
115 Ok(word_count as f64 / sentence_count as f64)
116 }
117
118 pub fn avg_word_length(&self, text: &str) -> Result<f64> {
120 let words = self.wordtokenizer.tokenize(text)?;
121
122 if words.is_empty() {
123 return Err(TextError::InvalidInput("Text has no words".to_string()));
124 }
125
126 let char_count: usize = words.iter().map(|w| w.chars().count()).sum();
127 Ok(char_count as f64 / words.len() as f64)
128 }
129
130 pub fn avg_syllables_per_word(&self, text: &str) -> Result<f64> {
132 let words = self.wordtokenizer.tokenize(text)?;
133
134 if words.is_empty() {
135 return Err(TextError::InvalidInput("Text has no words".to_string()));
136 }
137
138 let syllable_count: usize = words.iter().map(|w| self.count_syllables(w)).sum();
139 Ok(syllable_count as f64 / words.len() as f64)
140 }
141
142 pub fn flesch_reading_ease(&self, text: &str) -> Result<f64> {
153 let avg_sentence_length = self.avg_sentence_length(text)?;
154 let avg_syllables_per_word = self.avg_syllables_per_word(text)?;
155
156 let score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word);
157
158 Ok(score.clamp(0.0, 100.0))
160 }
161
162 pub fn flesch_kincaid_grade_level(&self, text: &str) -> Result<f64> {
166 let avg_sentence_length = self.avg_sentence_length(text)?;
167 let avg_syllables_per_word = self.avg_syllables_per_word(text)?;
168
169 let grade = 0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59;
170
171 Ok(grade.max(0.0))
173 }
174
175 pub fn gunning_fog(&self, text: &str) -> Result<f64> {
180 let avg_sentence_length = self.avg_sentence_length(text)?;
181 let complex_words = self.complex_word_count(text)? as f64;
182 let words = self.word_count(text)? as f64;
183
184 if words == 0.0 {
185 return Err(TextError::InvalidInput("Text has no words".to_string()));
186 }
187
188 let percentage_complex_words = (complex_words / words) * 100.0;
189 let fog = 0.4 * (avg_sentence_length + percentage_complex_words / 100.0);
190
191 Ok(fog)
192 }
193
194 pub fn smog_index(&self, text: &str) -> Result<f64> {
199 let sentences = self.sentence_count(text)?;
200 let complex_words = self.complex_word_count(text)? as f64;
201
202 if sentences < 30 {
203 return Err(TextError::InvalidInput(
204 "SMOG formula is designed for 30+ sentences, results may be inaccurate".to_string(),
205 ));
206 }
207
208 let smog = 1.043 * (complex_words * (30.0 / sentences as f64)).sqrt() + 3.1291;
209 Ok(smog)
210 }
211
212 pub fn automated_readability_index(&self, text: &str) -> Result<f64> {
216 let character_count = text.chars().filter(|c| !c.is_whitespace()).count() as f64;
217 let word_count = self.word_count(text)? as f64;
218 let sentence_count = self.sentence_count(text)? as f64;
219
220 if word_count == 0.0 || sentence_count == 0.0 {
221 return Err(TextError::InvalidInput(
222 "Text is too short for analysis".to_string(),
223 ));
224 }
225
226 let ari =
227 4.71 * (character_count / word_count) + 0.5 * (word_count / sentence_count) - 21.43;
228
229 Ok(ari.max(0.0))
231 }
232
233 pub fn coleman_liau_index(&self, text: &str) -> Result<f64> {
237 let character_count = text.chars().filter(|c| !c.is_whitespace()).count() as f64;
238 let word_count = self.word_count(text)? as f64;
239 let sentence_count = self.sentence_count(text)? as f64;
240
241 if word_count == 0.0 {
242 return Err(TextError::InvalidInput("Text has no words".to_string()));
243 }
244
245 let l = (character_count / word_count) * 100.0; let s = (sentence_count / word_count) * 100.0; let coleman_liau = 0.0588 * l - 0.296 * s - 15.8;
249
250 Ok(coleman_liau.max(0.0))
252 }
253
254 pub fn dale_chall_readability(&self, text: &str) -> Result<f64> {
259 let words = self.wordtokenizer.tokenize(text)?;
262 let word_count = words.len() as f64;
263 let sentence_count = self.sentence_count(text)? as f64;
264
265 if word_count == 0.0 || sentence_count == 0.0 {
266 return Err(TextError::InvalidInput(
267 "Text is too short for analysis".to_string(),
268 ));
269 }
270
271 let difficult_word_count = self.complex_word_count(text)? as f64;
273 let percent_difficult_words = (difficult_word_count / word_count) * 100.0;
274
275 let raw_score = 0.1579 * percent_difficult_words + 0.0496 * (word_count / sentence_count);
276
277 let score = if percent_difficult_words > 5.0 {
279 raw_score + 3.6365
280 } else {
281 raw_score
282 };
283
284 Ok(score)
285 }
286
287 pub fn lexical_diversity(&self, text: &str) -> Result<f64> {
289 let words = self.wordtokenizer.tokenize(text)?;
290
291 if words.is_empty() {
292 return Err(TextError::InvalidInput("Text has no words".to_string()));
293 }
294
295 let total_words = words.len() as f64;
296 let unique_words = words.iter().collect::<std::collections::HashSet<_>>().len() as f64;
297
298 Ok(unique_words / total_words)
299 }
300
301 pub fn type_token_ratio(&self, text: &str) -> Result<f64> {
303 self.lexical_diversity(text)
304 }
305
306 pub fn get_all_metrics(&self, text: &str) -> Result<ReadabilityMetrics> {
308 Ok(ReadabilityMetrics {
309 flesch_reading_ease: self.flesch_reading_ease(text)?,
310 flesch_kincaid_grade_level: self.flesch_kincaid_grade_level(text)?,
311 gunning_fog: self.gunning_fog(text)?,
312 automated_readability_index: self.automated_readability_index(text)?,
313 coleman_liau_index: self.coleman_liau_index(text)?,
314 lexical_diversity: self.lexical_diversity(text)?,
315 smog_index: self.smog_index(text).ok(), dale_chall_readability: self.dale_chall_readability(text)?,
317 text_statistics: TextMetrics {
318 word_count: self.word_count(text)?,
319 sentence_count: self.sentence_count(text)?,
320 syllable_count: self.syllable_count(text)?,
321 complex_word_count: self.complex_word_count(text)?,
322 avg_sentence_length: self.avg_sentence_length(text)?,
323 avg_word_length: self.avg_word_length(text)?,
324 avg_syllables_per_word: self.avg_syllables_per_word(text)?,
325 },
326 })
327 }
328}
329
330#[derive(Debug, Clone)]
332pub struct ReadabilityMetrics {
333 pub flesch_reading_ease: f64,
335 pub flesch_kincaid_grade_level: f64,
337 pub gunning_fog: f64,
339 pub smog_index: Option<f64>,
341 pub automated_readability_index: f64,
343 pub coleman_liau_index: f64,
345 pub dale_chall_readability: f64,
347 pub lexical_diversity: f64,
349 pub text_statistics: TextMetrics,
351}
352
/// Raw counts and per-unit averages measured on one text.
#[derive(Clone, Debug)]
pub struct TextMetrics {
    /// Number of word tokens.
    pub word_count: usize,
    /// Number of sentences.
    pub sentence_count: usize,
    /// Estimated total number of syllables.
    pub syllable_count: usize,
    /// Number of words counted as complex.
    pub complex_word_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f64,
    /// Average number of characters per word.
    pub avg_word_length: f64,
    /// Average estimated number of syllables per word.
    pub avg_syllables_per_word: f64,
}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Three short sentences made of short, common words.
    const SIMPLE_TEXT: &str = "This is a simple test. It has short sentences. Words are small.";
    /// Two long sentences packed with polysyllabic vocabulary.
    const COMPLEX_TEXT: &str = "The systematic study of scientific methodology encompasses various philosophical and interdisciplinary perspectives. Researchers diligently analyze epistemological foundations of empirical investigation while considering phenomenological implications.";

    /// Builds the analyzer under test with its default tokenizers.
    fn analyzer() -> TextStatistics {
        TextStatistics::new()
    }

    #[test]
    fn test_basic_counts() {
        let ts = analyzer();

        // Simple fixture.
        assert_eq!(ts.word_count(SIMPLE_TEXT).unwrap(), 12);
        assert_eq!(ts.sentence_count(SIMPLE_TEXT).unwrap(), 3);
        assert!(ts.syllable_count(SIMPLE_TEXT).unwrap() >= 12);

        // Complex fixture.
        assert_eq!(ts.word_count(COMPLEX_TEXT).unwrap(), 24);
        assert_eq!(ts.sentence_count(COMPLEX_TEXT).unwrap(), 2);
        assert!(ts.complex_word_count(COMPLEX_TEXT).unwrap() >= 8);
    }

    #[test]
    fn test_averages() {
        let ts = analyzer();

        let simple_sentence_len = ts.avg_sentence_length(SIMPLE_TEXT).unwrap();
        assert!(simple_sentence_len > 3.8);
        assert!(simple_sentence_len < 4.2);

        let complex_sentence_len = ts.avg_sentence_length(COMPLEX_TEXT).unwrap();
        assert!(complex_sentence_len > 10.0);
        assert!(complex_sentence_len < 13.0);

        let simple_word_len = ts.avg_word_length(SIMPLE_TEXT).unwrap();
        assert!(simple_word_len > 2.0);
        assert!(simple_word_len < 5.0);

        let complex_word_len = ts.avg_word_length(COMPLEX_TEXT).unwrap();
        assert!(complex_word_len > 7.0);
    }

    #[test]
    fn test_readability_metrics() {
        let ts = analyzer();

        // Reading ease must drop as the text gets harder.
        let ease_simple = ts.flesch_reading_ease(SIMPLE_TEXT).unwrap();
        let ease_complex = ts.flesch_reading_ease(COMPLEX_TEXT).unwrap();
        assert!(ease_simple > ease_complex);

        // Grade-level style scores must rise instead.
        let grade_simple = ts.flesch_kincaid_grade_level(SIMPLE_TEXT).unwrap();
        let grade_complex = ts.flesch_kincaid_grade_level(COMPLEX_TEXT).unwrap();
        assert!(grade_simple < grade_complex);

        let fog_simple = ts.gunning_fog(SIMPLE_TEXT).unwrap();
        let fog_complex = ts.gunning_fog(COMPLEX_TEXT).unwrap();
        assert!(fog_simple < fog_complex);
    }

    #[test]
    fn test_lexical_diversity() {
        let ts = analyzer();

        let diversity_simple = ts.lexical_diversity(SIMPLE_TEXT).unwrap();
        let diversity_complex = ts.lexical_diversity(COMPLEX_TEXT).unwrap();

        assert!(diversity_simple > 0.0);
        assert!(diversity_complex > 0.0);

        // The type-token ratio is an alias and must agree exactly.
        let ttr_simple = ts.type_token_ratio(SIMPLE_TEXT).unwrap();
        assert_eq!(ttr_simple, diversity_simple);
    }

    #[test]
    fn test_get_all_metrics() {
        let ts = analyzer();

        let report = ts.get_all_metrics(COMPLEX_TEXT).unwrap();

        assert!(report.flesch_reading_ease < 50.0);
        assert!(report.flesch_kincaid_grade_level > 12.0);
        assert!(report.gunning_fog > 5.0);
        assert_eq!(report.text_statistics.word_count, 24);
        assert_eq!(report.text_statistics.sentence_count, 2);
    }

    #[test]
    fn test_smog_error() {
        let ts = analyzer();

        // Called directly, SMOG rejects a three-sentence text.
        assert!(ts.smog_index(SIMPLE_TEXT).is_err());

        // In the aggregate report that failure becomes `None`.
        let report = ts.get_all_metrics(SIMPLE_TEXT).unwrap();
        assert!(report.smog_index.is_none());
    }

    #[test]
    fn test_emptytext() {
        let ts = analyzer();

        // Plain counts are zero for empty input.
        assert_eq!(ts.word_count("").unwrap(), 0);
        assert_eq!(ts.sentence_count("").unwrap(), 0);
        assert_eq!(ts.syllable_count("").unwrap(), 0);

        // Anything that needs a ratio must fail instead.
        assert!(ts.avg_sentence_length("").is_err());
        assert!(ts.avg_word_length("").is_err());
        assert!(ts.lexical_diversity("").is_err());
        assert!(ts.flesch_reading_ease("").is_err());
    }
}