use crate::tokenize::{SentenceTokenizer, Tokenizer, WordTokenizer};
use crate::{Result, TextError};
/// Calculator for text statistics and readability scores.
///
/// Every metric is derived from the output of the two tokenizers stored here.
#[derive(Debug, Clone)]
pub struct TextStatistics {
    /// Tokenizer used to split text into words.
    wordtokenizer: WordTokenizer,
    /// Tokenizer used to split text into sentences.
    sentencetokenizer: SentenceTokenizer,
}
impl Default for TextStatistics {
fn default() -> Self {
Self::new()
}
}
impl TextStatistics {
    /// Creates a calculator backed by `WordTokenizer::new(true)` and
    /// `SentenceTokenizer::new()`.
    ///
    /// NOTE(review): the meaning of the `true` flag is defined by
    /// `WordTokenizer` (presumably lowercasing) — confirm against that type.
    pub fn new() -> Self {
        Self {
            wordtokenizer: WordTokenizer::new(true),
            sentencetokenizer: SentenceTokenizer::new(),
        }
    }

    /// Creates a calculator that uses the supplied tokenizers.
    pub fn with_tokenizers(
        wordtokenizer: WordTokenizer,
        sentencetokenizer: SentenceTokenizer,
    ) -> Self {
        Self {
            wordtokenizer,
            sentencetokenizer,
        }
    }

    /// Returns the number of tokens the word tokenizer produces for `text`.
    ///
    /// # Errors
    /// Propagates any error returned by the word tokenizer.
    pub fn word_count(&self, text: &str) -> Result<usize> {
        Ok(self.wordtokenizer.tokenize(text)?.len())
    }

    /// Returns the number of sentences the sentence tokenizer produces for `text`.
    ///
    /// # Errors
    /// Propagates any error returned by the sentence tokenizer.
    pub fn sentence_count(&self, text: &str) -> Result<usize> {
        Ok(self.sentencetokenizer.tokenize(text)?.len())
    }

    /// Estimates the number of syllables in a single word.
    ///
    /// Heuristic, applied to the trimmed, lowercased word:
    /// * an empty (or whitespace-only) word has 0 syllables;
    /// * a word of at most 3 bytes has 1 syllable;
    /// * otherwise a trailing `"es"`, `"ed"` or `"e"` is dropped and every run
    ///   of consecutive vowels (`a e i o u y`) counts as one syllable, with a
    ///   minimum of 1.
    fn count_syllables(&self, word: &str) -> usize {
        const VOWELS: [char; 6] = ['a', 'e', 'i', 'o', 'u', 'y'];

        // Normalize before the emptiness check so that whitespace-only input
        // is treated as "no word" instead of falling into the short-word rule.
        let word = word.trim().to_lowercase();
        if word.is_empty() {
            return 0;
        }
        if word.len() <= 3 {
            return 1;
        }

        // Drop a (usually silent) ending; the two-letter endings take
        // precedence over the bare trailing 'e'.
        let stem = word
            .strip_suffix("es")
            .or_else(|| word.strip_suffix("ed"))
            .or_else(|| word.strip_suffix('e'))
            .unwrap_or(word.as_str());

        // Count vowel groups: a syllable starts at each vowel that follows a
        // non-vowel (or the start of the stem).
        let mut syllables: usize = 0;
        let mut prev_is_vowel = false;
        for ch in stem.chars() {
            let is_vowel = VOWELS.contains(&ch);
            if is_vowel && !prev_is_vowel {
                syllables += 1;
            }
            prev_is_vowel = is_vowel;
        }
        syllables.max(1)
    }

    /// Returns the estimated total number of syllables over all words in `text`.
    ///
    /// # Errors
    /// Propagates any error returned by the word tokenizer.
    pub fn syllable_count(&self, text: &str) -> Result<usize> {
        let words = self.wordtokenizer.tokenize(text)?;
        Ok(words.iter().map(|w| self.count_syllables(w)).sum())
    }

    /// Returns the number of words estimated to have three or more syllables.
    ///
    /// # Errors
    /// Propagates any error returned by the word tokenizer.
    pub fn complex_word_count(&self, text: &str) -> Result<usize> {
        let words = self.wordtokenizer.tokenize(text)?;
        Ok(words
            .iter()
            .filter(|w| self.count_syllables(w) >= 3)
            .count())
    }

    /// Returns the average number of words per sentence.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no sentences, and
    /// propagates tokenizer errors.
    pub fn avg_sentence_length(&self, text: &str) -> Result<f64> {
        let word_count = self.word_count(text)?;
        let sentence_count = self.sentence_count(text)?;
        if sentence_count == 0 {
            return Err(TextError::InvalidInput("Text has no sentences".to_string()));
        }
        Ok(word_count as f64 / sentence_count as f64)
    }

    /// Returns the average number of `char`s per word token.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words, and
    /// propagates tokenizer errors.
    pub fn avg_word_length(&self, text: &str) -> Result<f64> {
        let words = self.wordtokenizer.tokenize(text)?;
        if words.is_empty() {
            return Err(TextError::InvalidInput("Text has no words".to_string()));
        }
        let char_count: usize = words.iter().map(|w| w.chars().count()).sum();
        Ok(char_count as f64 / words.len() as f64)
    }

    /// Returns the average estimated syllable count per word.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words, and
    /// propagates tokenizer errors.
    pub fn avg_syllables_per_word(&self, text: &str) -> Result<f64> {
        let words = self.wordtokenizer.tokenize(text)?;
        if words.is_empty() {
            return Err(TextError::InvalidInput("Text has no words".to_string()));
        }
        let syllable_count: usize = words.iter().map(|w| self.count_syllables(w)).sum();
        Ok(syllable_count as f64 / words.len() as f64)
    }

    /// Flesch Reading Ease:
    /// `206.835 - 1.015 * (words / sentence) - 84.6 * (syllables / word)`,
    /// clamped to `0.0..=100.0`.
    ///
    /// # Errors
    /// Fails when the text has no sentences or no words, or when tokenization fails.
    pub fn flesch_reading_ease(&self, text: &str) -> Result<f64> {
        let avg_sentence_length = self.avg_sentence_length(text)?;
        let avg_syllables_per_word = self.avg_syllables_per_word(text)?;
        let score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word);
        Ok(score.clamp(0.0, 100.0))
    }

    /// Flesch-Kincaid Grade Level:
    /// `0.39 * (words / sentence) + 11.8 * (syllables / word) - 15.59`,
    /// floored at `0.0`.
    ///
    /// # Errors
    /// Fails when the text has no sentences or no words, or when tokenization fails.
    pub fn flesch_kincaid_grade_level(&self, text: &str) -> Result<f64> {
        let avg_sentence_length = self.avg_sentence_length(text)?;
        let avg_syllables_per_word = self.avg_syllables_per_word(text)?;
        let grade = 0.39 * avg_sentence_length + 11.8 * avg_syllables_per_word - 15.59;
        Ok(grade.max(0.0))
    }

    /// Gunning Fog index:
    /// `0.4 * ((words / sentence) + 100 * (complex words / words))`.
    ///
    /// A word is "complex" when [`Self::complex_word_count`] counts it, i.e.
    /// when it is estimated to have three or more syllables.
    ///
    /// # Errors
    /// Fails when the text has no sentences or no words, or when tokenization fails.
    pub fn gunning_fog(&self, text: &str) -> Result<f64> {
        let avg_sentence_length = self.avg_sentence_length(text)?;
        let complex_words = self.complex_word_count(text)? as f64;
        let words = self.word_count(text)? as f64;
        if words == 0.0 {
            return Err(TextError::InvalidInput("Text has no words".to_string()));
        }
        // The formula adds the complex-word *percentage* (0..=100) to the
        // average sentence length; it must not be scaled back to a fraction.
        let percentage_complex_words = (complex_words / words) * 100.0;
        Ok(0.4 * (avg_sentence_length + percentage_complex_words))
    }

    /// SMOG index: `1.043 * sqrt(complex words * 30 / sentences) + 3.1291`.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has fewer than 30
    /// sentences, and propagates tokenizer errors.
    pub fn smog_index(&self, text: &str) -> Result<f64> {
        let sentences = self.sentence_count(text)?;
        let complex_words = self.complex_word_count(text)? as f64;
        if sentences < 30 {
            return Err(TextError::InvalidInput(
                "SMOG formula is designed for 30+ sentences, results may be inaccurate".to_string(),
            ));
        }
        let smog = 1.043 * (complex_words * (30.0 / sentences as f64)).sqrt() + 3.1291;
        Ok(smog)
    }

    /// Automated Readability Index:
    /// `4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43`,
    /// floored at `0.0`.
    ///
    /// The character count is every non-whitespace `char` of the raw text,
    /// punctuation included.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words or no
    /// sentences, and propagates tokenizer errors.
    pub fn automated_readability_index(&self, text: &str) -> Result<f64> {
        let character_count = text.chars().filter(|c| !c.is_whitespace()).count() as f64;
        let word_count = self.word_count(text)? as f64;
        let sentence_count = self.sentence_count(text)? as f64;
        if word_count == 0.0 || sentence_count == 0.0 {
            return Err(TextError::InvalidInput(
                "Text is too short for analysis".to_string(),
            ));
        }
        let ari =
            4.71 * (character_count / word_count) + 0.5 * (word_count / sentence_count) - 21.43;
        Ok(ari.max(0.0))
    }

    /// Coleman-Liau index: `0.0588 * L - 0.296 * S - 15.8`, floored at `0.0`,
    /// where `L` is characters per 100 words and `S` is sentences per 100 words.
    ///
    /// The character count is every non-whitespace `char` of the raw text,
    /// punctuation included.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words, and
    /// propagates tokenizer errors.
    pub fn coleman_liau_index(&self, text: &str) -> Result<f64> {
        let character_count = text.chars().filter(|c| !c.is_whitespace()).count() as f64;
        let word_count = self.word_count(text)? as f64;
        let sentence_count = self.sentence_count(text)? as f64;
        if word_count == 0.0 {
            return Err(TextError::InvalidInput("Text has no words".to_string()));
        }
        let l = (character_count / word_count) * 100.0;
        let s = (sentence_count / word_count) * 100.0;
        let coleman_liau = 0.0588 * l - 0.296 * s - 15.8;
        Ok(coleman_liau.max(0.0))
    }

    /// Dale-Chall style score:
    /// `0.1579 * (% difficult words) + 0.0496 * (words / sentences)`, plus
    /// `3.6365` when more than 5% of the words are difficult.
    ///
    /// "Difficult" words are taken from [`Self::complex_word_count`]
    /// (three or more estimated syllables).
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words or no
    /// sentences, and propagates tokenizer errors.
    pub fn dale_chall_readability(&self, text: &str) -> Result<f64> {
        let word_count = self.word_count(text)? as f64;
        let sentence_count = self.sentence_count(text)? as f64;
        if word_count == 0.0 || sentence_count == 0.0 {
            return Err(TextError::InvalidInput(
                "Text is too short for analysis".to_string(),
            ));
        }
        let difficult_word_count = self.complex_word_count(text)? as f64;
        let percent_difficult_words = (difficult_word_count / word_count) * 100.0;
        let raw_score = 0.1579 * percent_difficult_words + 0.0496 * (word_count / sentence_count);
        let score = if percent_difficult_words > 5.0 {
            raw_score + 3.6365
        } else {
            raw_score
        };
        Ok(score)
    }

    /// Returns the ratio of distinct word tokens to total word tokens.
    ///
    /// # Errors
    /// Returns `TextError::InvalidInput` when the text has no words, and
    /// propagates tokenizer errors.
    pub fn lexical_diversity(&self, text: &str) -> Result<f64> {
        let words = self.wordtokenizer.tokenize(text)?;
        if words.is_empty() {
            return Err(TextError::InvalidInput("Text has no words".to_string()));
        }
        let total_words = words.len() as f64;
        let unique_words = words.iter().collect::<std::collections::HashSet<_>>().len() as f64;
        Ok(unique_words / total_words)
    }

    /// Alias for [`Self::lexical_diversity`].
    pub fn type_token_ratio(&self, text: &str) -> Result<f64> {
        self.lexical_diversity(text)
    }

    /// Computes every metric for `text` and bundles the results.
    ///
    /// `smog_index` is `None` whenever [`Self::smog_index`] returns an error
    /// (for example, fewer than 30 sentences); every other failure aborts the
    /// whole computation.
    ///
    /// # Errors
    /// Returns the first error produced by any metric other than SMOG.
    pub fn get_all_metrics(&self, text: &str) -> Result<ReadabilityMetrics> {
        Ok(ReadabilityMetrics {
            flesch_reading_ease: self.flesch_reading_ease(text)?,
            flesch_kincaid_grade_level: self.flesch_kincaid_grade_level(text)?,
            gunning_fog: self.gunning_fog(text)?,
            automated_readability_index: self.automated_readability_index(text)?,
            coleman_liau_index: self.coleman_liau_index(text)?,
            lexical_diversity: self.lexical_diversity(text)?,
            // SMOG is optional: its error is deliberately mapped to `None`.
            smog_index: self.smog_index(text).ok(),
            dale_chall_readability: self.dale_chall_readability(text)?,
            text_statistics: TextMetrics {
                word_count: self.word_count(text)?,
                sentence_count: self.sentence_count(text)?,
                syllable_count: self.syllable_count(text)?,
                complex_word_count: self.complex_word_count(text)?,
                avg_sentence_length: self.avg_sentence_length(text)?,
                avg_word_length: self.avg_word_length(text)?,
                avg_syllables_per_word: self.avg_syllables_per_word(text)?,
            },
        })
    }
}
/// Bundle of readability scores together with the underlying text statistics.
#[derive(Debug, Clone)]
pub struct ReadabilityMetrics {
    /// Flesch Reading Ease score.
    pub flesch_reading_ease: f64,
    /// Flesch-Kincaid Grade Level.
    pub flesch_kincaid_grade_level: f64,
    /// Gunning Fog index.
    pub gunning_fog: f64,
    /// SMOG index, or `None` when it could not be computed.
    pub smog_index: Option<f64>,
    /// Automated Readability Index.
    pub automated_readability_index: f64,
    /// Coleman-Liau index.
    pub coleman_liau_index: f64,
    /// Dale-Chall readability score.
    pub dale_chall_readability: f64,
    /// Ratio of distinct words to total words.
    pub lexical_diversity: f64,
    /// Raw counts and averages the scores are derived from.
    pub text_statistics: TextMetrics,
}
/// Raw counts and averages describing a piece of text.
#[derive(Debug, Clone)]
pub struct TextMetrics {
    /// Number of words.
    pub word_count: usize,
    /// Number of sentences.
    pub sentence_count: usize,
    /// Total number of syllables.
    pub syllable_count: usize,
    /// Number of complex words.
    pub complex_word_count: usize,
    /// Average number of words per sentence.
    pub avg_sentence_length: f64,
    /// Average number of characters per word.
    pub avg_word_length: f64,
    /// Average number of syllables per word.
    pub avg_syllables_per_word: f64,
}
#[cfg(test)]
mod tests {
    use super::*;

    // Three short sentences, twelve words in total.
    const SIMPLE_TEXT: &str = "This is a simple test. It has short sentences. Words are small.";
    // Two long sentences packed with polysyllabic vocabulary.
    const COMPLEX_TEXT: &str = "The systematic study of scientific methodology encompasses various philosophical and interdisciplinary perspectives. Researchers diligently analyze epistemological foundations of empirical investigation while considering phenomenological implications.";

    /// Word, sentence, syllable and complex-word counts on both fixtures.
    #[test]
    fn test_basic_counts() {
        let analyzer = TextStatistics::new();

        let simple_words = analyzer.word_count(SIMPLE_TEXT).expect("Operation failed");
        assert_eq!(simple_words, 12);
        let simple_sentences = analyzer
            .sentence_count(SIMPLE_TEXT)
            .expect("Operation failed");
        assert_eq!(simple_sentences, 3);
        let simple_syllables = analyzer
            .syllable_count(SIMPLE_TEXT)
            .expect("Operation failed");
        assert!(simple_syllables >= 12);

        let complex_words = analyzer.word_count(COMPLEX_TEXT).expect("Operation failed");
        assert_eq!(complex_words, 24);
        let complex_sentences = analyzer
            .sentence_count(COMPLEX_TEXT)
            .expect("Operation failed");
        assert_eq!(complex_sentences, 2);
        let polysyllabic = analyzer
            .complex_word_count(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(polysyllabic >= 8);
    }

    /// Average sentence and word lengths fall in the expected bands.
    #[test]
    fn test_averages() {
        let analyzer = TextStatistics::new();

        let simple_sentence_len = analyzer
            .avg_sentence_length(SIMPLE_TEXT)
            .expect("Operation failed");
        assert!(simple_sentence_len > 3.8);
        assert!(simple_sentence_len < 4.2);

        let complex_sentence_len = analyzer
            .avg_sentence_length(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(complex_sentence_len > 10.0);
        assert!(complex_sentence_len < 13.0);

        let simple_word_len = analyzer
            .avg_word_length(SIMPLE_TEXT)
            .expect("Operation failed");
        assert!(simple_word_len > 2.0);
        assert!(simple_word_len < 5.0);

        let complex_word_len = analyzer
            .avg_word_length(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(complex_word_len > 7.0);
    }

    /// The complex fixture must score as harder to read than the simple one.
    #[test]
    fn test_readability_metrics() {
        let analyzer = TextStatistics::new();

        let ease_simple = analyzer
            .flesch_reading_ease(SIMPLE_TEXT)
            .expect("Operation failed");
        let ease_complex = analyzer
            .flesch_reading_ease(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(ease_simple > ease_complex);

        let grade_simple = analyzer
            .flesch_kincaid_grade_level(SIMPLE_TEXT)
            .expect("Operation failed");
        let grade_complex = analyzer
            .flesch_kincaid_grade_level(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(grade_simple < grade_complex);

        let fog_simple = analyzer.gunning_fog(SIMPLE_TEXT).expect("Operation failed");
        let fog_complex = analyzer
            .gunning_fog(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(fog_simple < fog_complex);
    }

    /// Lexical diversity is positive and equals the type-token ratio.
    #[test]
    fn test_lexical_diversity() {
        let analyzer = TextStatistics::new();

        let diversity_simple = analyzer
            .lexical_diversity(SIMPLE_TEXT)
            .expect("Operation failed");
        let diversity_complex = analyzer
            .lexical_diversity(COMPLEX_TEXT)
            .expect("Operation failed");
        assert!(diversity_simple > 0.0);
        assert!(diversity_complex > 0.0);

        let ratio = analyzer
            .type_token_ratio(SIMPLE_TEXT)
            .expect("Operation failed");
        assert_eq!(ratio, diversity_simple);
    }

    /// The aggregated report carries plausible values for the complex fixture.
    #[test]
    fn test_get_all_metrics() {
        let analyzer = TextStatistics::new();
        let report = analyzer
            .get_all_metrics(COMPLEX_TEXT)
            .expect("Operation failed");

        assert!(report.flesch_reading_ease < 50.0);
        assert!(report.flesch_kincaid_grade_level > 12.0);
        assert!(report.gunning_fog > 5.0);
        assert_eq!(report.text_statistics.word_count, 24);
        assert_eq!(report.text_statistics.sentence_count, 2);
    }

    /// SMOG fails on short text, and the aggregated report records that as `None`.
    #[test]
    fn test_smog_error() {
        let analyzer = TextStatistics::new();
        assert!(analyzer.smog_index(SIMPLE_TEXT).is_err());

        let report = analyzer
            .get_all_metrics(SIMPLE_TEXT)
            .expect("Operation failed");
        assert!(report.smog_index.is_none());
    }

    /// Counts are zero and ratio-based metrics fail on empty input.
    #[test]
    fn test_emptytext() {
        let analyzer = TextStatistics::new();

        assert_eq!(analyzer.word_count("").expect("Operation failed"), 0);
        assert_eq!(analyzer.sentence_count("").expect("Operation failed"), 0);
        assert_eq!(analyzer.syllable_count("").expect("Operation failed"), 0);

        assert!(analyzer.avg_sentence_length("").is_err());
        assert!(analyzer.avg_word_length("").is_err());
        assert!(analyzer.lexical_diversity("").is_err());
        assert!(analyzer.flesch_reading_ease("").is_err());
    }
}