use serde::{Deserialize, Serialize};
use crate::prose::ProseRange;
/// Summary statistics for a piece of prose.
///
/// Values are normally produced by `ProseInsights::analyze` or
/// `ProseInsights::analyze_ranges`; `Default` yields all-zero counts and a
/// reading level of `0.0`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ProseInsights {
    /// Number of whitespace-separated tokens containing at least one
    /// alphanumeric character.
    pub word_count: usize,
    /// Estimated number of sentences.
    pub sentence_count: usize,
    /// Number of alphanumeric characters, plus apostrophes, hyphens and
    /// en/em dashes; whitespace and other punctuation are not counted.
    pub character_count: usize,
    /// Automated Readability Index computed from the three counts above, or
    /// `0.0` when there are no words or no sentences.
    pub reading_level: f32,
}
/// Reports whether a closing quote or parenthesis starts at byte `i` and is
/// itself followed by ASCII whitespace or the end of the input.
///
/// Recognised closers are the ASCII bytes `"`, `'` and `)`, plus the
/// three-byte sequence passed as `right_dquote`.
///
/// * `bytes` - the text being scanned.
/// * `i` - index of the candidate closer; must be a valid index into `bytes`.
/// * `len` - length used for end-of-input checks (callers pass `bytes.len()`).
/// * `right_dquote` - byte sequence of the multi-byte closing quote to accept.
fn is_quote_or_paren(bytes: &[u8], i: usize, len: usize, right_dquote: [u8; 3]) -> bool {
    // True when nothing, or ASCII whitespace, sits at position `after`.
    let ends_here = |after: usize| after >= len || bytes[after].is_ascii_whitespace();

    match bytes[i] {
        // Single-byte closers: look at the byte right after them.
        b'"' | b'\'' | b')' => ends_here(i + 1),
        // Multi-byte closing quote: all three bytes must be present and match.
        _ if i + 2 < len && bytes[i..i + 3] == right_dquote => ends_here(i + 3),
        _ => false,
    }
}
/// Estimates the number of sentences in `text`.
///
/// The text is scanned byte-wise for runs of `.`, `!` or `?`; a whole run
/// (for example `...` or `?!`) is treated as one terminator. A run ends a
/// sentence when it is
///
/// * at the very end of the text, or
/// * followed by ASCII whitespace, unless it is a lone `.` directly after a
///   single ASCII uppercase letter that starts the text or follows
///   whitespace (a name initial such as `J. Smith`), or
/// * followed by a closing `"`, `'`, `)` or U+201D that is itself followed by
///   whitespace or the end of the text.
///
/// If no terminator is found but the text contains at least one alphanumeric
/// character (Unicode-aware, matching how words are counted), the text counts
/// as a single sentence. Empty or punctuation-only text yields `0`.
fn count_sentences(text: &str) -> usize {
    // UTF-8 encoding of U+201D RIGHT DOUBLE QUOTATION MARK.
    const RIGHT_DQUOTE: [u8; 3] = [0xE2, 0x80, 0x9D];

    // Scanning bytes is safe here: the ASCII bytes we look for never occur
    // inside a multi-byte UTF-8 sequence.
    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut count: usize = 0;
    let mut i = 0;

    while i < len {
        if !matches!(bytes[i], b'.' | b'!' | b'?') {
            i += 1;
            continue;
        }

        // Collapse a run of terminators so `i` ends on its last byte.
        while i + 1 < len && matches!(bytes[i + 1], b'.' | b'!' | b'?') {
            i += 1;
        }

        let next = i + 1;
        if next >= len {
            count += 1;
        } else if bytes[next].is_ascii_whitespace() {
            // Only a period can mark a name initial; "Who am I? ..." must
            // still end a sentence at the question mark.
            let is_initial = bytes[i] == b'.'
                && i >= 1
                && bytes[i - 1].is_ascii_uppercase()
                && (i < 2 || bytes[i - 2].is_ascii_whitespace());
            if !is_initial {
                count += 1;
            }
        } else if is_quote_or_paren(bytes, next, len, RIGHT_DQUOTE) {
            count += 1;
        }

        i += 1;
    }

    // Unterminated text still forms one sentence. Use the Unicode-aware
    // check so non-ASCII prose is not reported as having words but no
    // sentences.
    if count == 0 && text.chars().any(char::is_alphanumeric) {
        count = 1;
    }

    count
}
/// Counts the words in `text`.
///
/// A word is a whitespace-separated token that contains at least one
/// alphanumeric character, so stand-alone punctuation such as `-` or `***`
/// is not counted.
fn count_words(text: &str) -> usize {
    let mut total = 0;
    for token in text.split_whitespace() {
        let has_alphanumeric = token.chars().any(|ch| ch.is_alphanumeric());
        if has_alphanumeric {
            total += 1;
        }
    }
    total
}
/// Counts the characters of `text` that contribute to the readability score.
///
/// Alphanumeric characters are counted, together with the in-word marks `'`,
/// U+2019 (right single quote), `-`, U+2013 (en dash) and U+2014 (em dash).
/// Whitespace and all other punctuation or markup characters are skipped.
fn count_characters(text: &str) -> usize {
    let mut total = 0;
    for ch in text.chars() {
        let counted = match ch {
            '\'' | '\u{2019}' | '-' | '\u{2013}' | '\u{2014}' => true,
            other => other.is_alphanumeric(),
        };
        if counted {
            total += 1;
        }
    }
    total
}
/// Computes the Automated Readability Index from raw counts:
///
/// `4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43`
///
/// Returns `0.0` when `word_count` or `sentence_count` is zero, since the
/// ratios are undefined in that case.
// The counts are deliberately converted to `f32`; precision is only lost for
// counts above 2^24.
#[allow(clippy::cast_precision_loss)]
fn compute_ari(character_count: usize, word_count: usize, sentence_count: usize) -> f32 {
    if word_count == 0 || sentence_count == 0 {
        return 0.0;
    }

    let chars_per_word = character_count as f32 / word_count as f32;
    let words_per_sentence = word_count as f32 / sentence_count as f32;

    // 0.5 * words_per_sentence - 21.43, then 4.71 * chars_per_word added on top.
    let sentence_term = 0.5f32.mul_add(words_per_sentence, -21.43);
    4.71f32.mul_add(chars_per_word, sentence_term)
}
impl ProseInsights {
    /// Computes insights for `text` taken as a whole.
    #[must_use]
    pub fn analyze(text: &str) -> Self {
        Self::from_counts(
            count_characters(text),
            count_words(text),
            count_sentences(text),
        )
    }

    /// Computes insights over the prose selected by `ranges` within `full_text`.
    ///
    /// Each range's text is obtained through `ProseRange::extract_text` and
    /// counted on its own; the per-range counts are summed and a single
    /// reading level is derived from the totals. An empty `ranges` slice
    /// yields all-zero counts and a reading level of `0.0`.
    #[must_use]
    pub fn analyze_ranges(full_text: &str, ranges: &[ProseRange]) -> Self {
        let (characters, words, sentences) = ranges.iter().fold(
            (0usize, 0usize, 0usize),
            |(characters, words, sentences), range| {
                let prose = range.extract_text(full_text);
                (
                    characters + count_characters(&prose),
                    words + count_words(&prose),
                    sentences + count_sentences(&prose),
                )
            },
        );
        Self::from_counts(characters, words, sentences)
    }

    /// Builds the result from raw counts, deriving the reading level from them.
    fn from_counts(character_count: usize, word_count: usize, sentence_count: usize) -> Self {
        Self {
            word_count,
            sentence_count,
            character_count,
            reading_level: compute_ari(character_count, word_count, sentence_count),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces all-zero counts and a zero reading level.
    #[test]
    fn empty_text() {
        let insights = ProseInsights::analyze("");
        assert_eq!(insights.word_count, 0);
        assert_eq!(insights.sentence_count, 0);
        assert_eq!(insights.character_count, 0);
        assert_eq!(insights.reading_level, 0.0);
    }

    /// The trailing period is not part of the character count.
    #[test]
    fn single_sentence() {
        let insights = ProseInsights::analyze("Hello world.");
        assert_eq!(insights.word_count, 2);
        assert_eq!(insights.sentence_count, 1);
        assert_eq!(insights.character_count, 10);
    }

    #[test]
    fn multiple_sentences() {
        let insights = ProseInsights::analyze("First sentence. Second sentence. Third one!");
        assert_eq!(insights.word_count, 6);
        assert_eq!(insights.sentence_count, 3);
    }

    #[test]
    fn reading_level_simple_text() {
        let text = "The cat sat. The dog ran. A bird flew.";
        let insights = ProseInsights::analyze(text);
        assert!(insights.reading_level < 10.0);
    }

    #[test]
    fn reading_level_complex_text() {
        let text = "Notwithstanding the aforementioned constitutional provisions, \
                    the jurisprudential interpretation necessitates comprehensive \
                    deliberation regarding substantive procedural requirements.";
        let insights = ProseInsights::analyze(text);
        assert!(insights.reading_level > 10.0);
    }

    #[test]
    fn character_count_excludes_whitespace() {
        let insights = ProseInsights::analyze("a b c");
        assert_eq!(insights.character_count, 3);
    }

    #[test]
    fn question_marks_count_as_sentences() {
        let insights = ProseInsights::analyze("Is this a question? Yes it is.");
        assert_eq!(insights.sentence_count, 2);
    }

    /// A run of periods is a single terminator, not three.
    #[test]
    fn ellipses_not_counted_as_multiple_sentences() {
        let insights = ProseInsights::analyze("Wait for it... and there it is.");
        assert_eq!(insights.sentence_count, 2);
    }

    /// The period after a name initial does not end a sentence.
    #[test]
    fn initials_not_split() {
        let insights = ProseInsights::analyze("I met J. Smith at the office.");
        assert_eq!(insights.sentence_count, 1);
    }

    #[test]
    fn markup_characters_excluded_from_count() {
        let insights = ProseInsights::analyze("Hello [world](http://example.com).");
        assert!(insights.character_count < 30);
    }

    #[test]
    fn text_without_terminal_punctuation_counts_as_one_sentence() {
        let insights = ProseInsights::analyze("A sentence without ending punctuation");
        assert_eq!(insights.sentence_count, 1);
    }

    /// Analysing a range must match analysing the same slice directly.
    #[test]
    fn analyze_ranges_uses_prose_only() {
        let doc = "# Heading\n\nThe cat sat on the mat. The dog ran home.\n\n```code block```\n";
        // The paragraph starts after "# Heading\n\n" (11 bytes) and is 41 bytes
        // long. Guard the offsets so they cannot silently drift from the text.
        let prose = &doc[11..52];
        assert_eq!(prose, "The cat sat on the mat. The dog ran home.");

        let ranges = vec![ProseRange {
            start_byte: 11,
            end_byte: 52,
            exclusions: vec![],
        }];
        let from_ranges = ProseInsights::analyze_ranges(doc, &ranges);
        let prose_only = ProseInsights::analyze(prose);
        assert_eq!(from_ranges.word_count, prose_only.word_count);
        assert_eq!(from_ranges.sentence_count, prose_only.sentence_count);
        assert_eq!(from_ranges.character_count, prose_only.character_count);
    }
}