use unicode_segmentation::UnicodeSegmentation;
/// Qualitative rating of how much sentence lengths vary across a text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RhythmVerdict {
    /// Fewer than three sentences were available, so no rating is given.
    TooShort,
    /// Coefficient of variation below 0.25.
    Monotone,
    /// Coefficient of variation in `[0.25, 0.45)`.
    Steady,
    /// Coefficient of variation in `[0.45, 0.80)`.
    Varied,
    /// Everything else (0.80 and above, or a value that compares below nothing).
    Choppy,
}

impl RhythmVerdict {
    /// Classifies a coefficient of variation `cv` measured over `n_sentences`.
    ///
    /// The sentence count is checked first: with fewer than three sentences
    /// the result is always [`RhythmVerdict::TooShort`], whatever `cv` is.
    pub fn from(cv: f64, n_sentences: usize) -> Self {
        match (n_sentences, cv) {
            (count, _) if count < 3 => Self::TooShort,
            (_, spread) if spread < 0.25 => Self::Monotone,
            (_, spread) if spread < 0.45 => Self::Steady,
            (_, spread) if spread < 0.80 => Self::Varied,
            _ => Self::Choppy,
        }
    }

    /// Short upper-case tag for the verdict.
    pub fn label(self) -> &'static str {
        self.label_and_note().0
    }

    /// One-line explanation of what the verdict means for the prose.
    pub fn note(self) -> &'static str {
        self.label_and_note().1
    }

    /// Single lookup table backing both `label` and `note`, so the two
    /// texts for a variant always sit side by side.
    fn label_and_note(self) -> (&'static str, &'static str) {
        match self {
            Self::TooShort => (
                "TOO SHORT TO JUDGE",
                "need at least 3 sentences to judge rhythm",
            ),
            Self::Monotone => (
                "MONOTONE",
                "sentences too uniform — drones · break it with a short one",
            ),
            Self::Steady => (
                "STEADY",
                "modest variation · workable but can sing louder",
            ),
            Self::Varied => ("VARIED", "strong variation · good prose rhythm"),
            Self::Choppy => (
                "CHOPPY",
                "extreme variation — fragments + long sentences mixed",
            ),
        }
    }
}
/// One counted sentence, as stored in the sample lists of `RhythmStats`.
#[derive(Debug, Clone)]
pub struct SentenceSample {
/// 1-based number of the input line on which the sentence starts.
pub line_no: usize,
/// Number of words in the sentence (`analyse` counts them with `unicode_words`).
pub word_count: usize,
/// Leading excerpt of the sentence; `analyse` keeps at most the first 64 characters.
pub preview: String,
}
/// Sentence-length statistics for a whole text, as produced by `analyse`.
#[derive(Debug, Clone)]
pub struct RhythmStats {
    /// Word count of every counted sentence, in document order.
    pub lengths: Vec<usize>,
    /// Arithmetic mean of `lengths`.
    pub mean: f64,
    /// Population standard deviation of `lengths` (variance divided by `n`).
    pub stdev: f64,
    /// Coefficient of variation: `stdev / mean`, or `0.0` when the mean is not positive.
    pub cv: f64,
    /// Smallest word count seen.
    pub min: usize,
    /// Largest word count seen.
    pub max: usize,
    /// Rating derived from `cv` and the number of sentences.
    pub verdict: RhythmVerdict,
    /// Every counted sentence, in document order.
    pub samples: Vec<SentenceSample>,
    /// Up to three sentences with the fewest words, shortest first.
    pub shortest: Vec<SentenceSample>,
    /// Up to three sentences with the most words, longest first.
    pub longest: Vec<SentenceSample>,
}

impl RhythmStats {
    /// Builds the "nothing to measure" result: no sentences, all numbers
    /// zero, and a [`RhythmVerdict::TooShort`] verdict.
    pub fn empty() -> Self {
        let (mean, stdev, cv) = (0.0, 0.0, 0.0);
        let (min, max) = (0, 0);
        Self {
            verdict: RhythmVerdict::TooShort,
            mean,
            stdev,
            cv,
            min,
            max,
            lengths: vec![],
            samples: vec![],
            shortest: vec![],
            longest: vec![],
        }
    }
}
/// Measures the sentence-length rhythm of a document supplied as lines.
///
/// The lines are joined with `'\n'`, split into sentences, and each sentence
/// containing at least one Unicode word becomes a [`SentenceSample`] tagged
/// with the 1-based line on which it starts. From the word counts the
/// function derives the mean, the population standard deviation, their ratio
/// (`cv`), the extremes, a [`RhythmVerdict`], and the three shortest and
/// three longest sentences (ties keep document order).
///
/// Returns [`RhythmStats::empty`] when `lines` is empty or no sentence
/// contains a word.
pub fn analyse(lines: &[String]) -> RhythmStats {
    // Maximum number of characters copied into a sample preview.
    const PREVIEW_CHARS: usize = 64;
    // How many sentences are reported at each extreme.
    const OUTLIER_COUNT: usize = 3;

    if lines.is_empty() {
        return RhythmStats::empty();
    }

    // Join the lines while recording the char offset at which each row
    // begins. A running counter is used instead of re-counting the whole
    // buffer per line, which would make this step quadratic in text size.
    let joined_bytes = lines.iter().map(String::len).sum::<usize>() + lines.len() - 1;
    let mut text = String::with_capacity(joined_bytes);
    let mut row_starts: Vec<usize> = Vec::with_capacity(lines.len());
    let mut char_offset: usize = 0;
    for (i, line) in lines.iter().enumerate() {
        if i > 0 {
            text.push('\n');
            char_offset += 1;
        }
        row_starts.push(char_offset);
        text.push_str(line);
        char_offset += line.chars().count();
    }

    let sentences = split_sentences(&text);
    let mut samples: Vec<SentenceSample> = Vec::with_capacity(sentences.len());
    for sent in &sentences {
        let word_count = sent.text.unicode_words().count();
        // Punctuation-only fragments carry no rhythm information.
        if word_count == 0 {
            continue;
        }
        samples.push(SentenceSample {
            line_no: row_for_char_index(&row_starts, sent.char_start) + 1,
            word_count,
            preview: sent.text.chars().take(PREVIEW_CHARS).collect(),
        });
    }
    if samples.is_empty() {
        return RhythmStats::empty();
    }

    // `samples` is non-empty from here on, so `n >= 1` and the divisions are safe.
    let lengths: Vec<usize> = samples.iter().map(|s| s.word_count).collect();
    let n = lengths.len();
    let mean = lengths.iter().sum::<usize>() as f64 / n as f64;
    let variance = lengths
        .iter()
        .map(|&len| {
            let d = len as f64 - mean;
            d * d
        })
        .sum::<f64>()
        / n as f64;
    let stdev = variance.sqrt();
    let cv = if mean > 0.0 { stdev / mean } else { 0.0 };
    let min = lengths.iter().copied().min().unwrap_or(0);
    let max = lengths.iter().copied().max().unwrap_or(0);
    let verdict = RhythmVerdict::from(cv, n);

    // Rank references rather than cloned samples so only the reported
    // outliers are copied. Both sorts are stable, so sentences of equal
    // length stay in document order in either list.
    let take_outliers = |ranked: &[&SentenceSample]| -> Vec<SentenceSample> {
        ranked
            .iter()
            .take(OUTLIER_COUNT)
            .map(|&s| s.clone())
            .collect()
    };
    let mut ranked: Vec<&SentenceSample> = samples.iter().collect();
    ranked.sort_by_key(|s| s.word_count);
    let shortest = take_outliers(&ranked);
    ranked.sort_by(|a, b| b.word_count.cmp(&a.word_count));
    let longest = take_outliers(&ranked);

    RhythmStats {
        lengths,
        mean,
        stdev,
        cv,
        min,
        max,
        verdict,
        samples,
        shortest,
        longest,
    }
}
/// A sentence as cut out by `split_sentences`, before any word counting.
#[derive(Debug, Clone)]
struct RawSentence {
/// Offset of the sentence's first character in the scanned text, counted in `char`s (not bytes).
char_start: usize,
/// The sentence text with surrounding whitespace trimmed.
text: String,
}
/// Cuts `text` into trimmed sentences and records where each one begins.
///
/// A sentence ends at a punctuation run that starts with `.`, `!` or `?` and
/// may continue with more terminators, closing quotes or closing brackets,
/// provided that:
///
/// * the run is followed by whitespace or the end of the text,
/// * the run is not a pure ellipsis (two or more dots and nothing else), and
/// * the text gathered so far does not end in a known abbreviation.
///
/// Whatever is left after the last boundary is emitted as a final sentence.
/// `char_start` offsets count `char`s, not bytes.
fn split_sentences(text: &str) -> Vec<RawSentence> {
    let is_terminator = |c: char| matches!(c, '.' | '!' | '?');
    let is_closer = |c: char| matches!(c, '"' | '\'' | '”' | '’' | ')' | ']');

    let chars: Vec<char> = text.chars().collect();
    let total = chars.len();
    let mut found: Vec<RawSentence> = Vec::new();
    let mut pending = String::new();
    let mut sentence_start: usize = 0;
    let mut pos: usize = 0;

    while pos < total {
        let ch = chars[pos];
        pos += 1;

        // Whitespace between sentences is dropped; the next sentence can
        // start no earlier than the character after it.
        if pending.is_empty() && ch.is_whitespace() {
            sentence_start = pos;
            continue;
        }

        pending.push(ch);
        if !is_terminator(ch) {
            continue;
        }

        // Absorb the rest of the punctuation run (e.g. `?!`, `."`, `.)`).
        let run_start = pos - 1;
        while pos < total && (is_terminator(chars[pos]) || is_closer(chars[pos])) {
            pending.push(chars[pos]);
            pos += 1;
        }
        let run = &chars[run_start..pos];

        let at_gap = chars.get(pos).map_or(true, |next| next.is_whitespace());
        let pure_dots = run.len() >= 2 && run.iter().all(|&c| c == '.');
        if !at_gap || pure_dots || ends_with_abbreviation(pending.trim()) {
            // Not a boundary: keep accumulating into the same sentence.
            continue;
        }

        let sentence = pending.trim();
        if !sentence.is_empty() {
            found.push(RawSentence {
                char_start: sentence_start,
                text: sentence.to_string(),
            });
        }
        pending.clear();
        sentence_start = pos;
    }

    // Text after the final boundary still counts as a sentence.
    let leftover = pending.trim();
    if !leftover.is_empty() {
        found.push(RawSentence {
            char_start: sentence_start,
            text: leftover.to_string(),
        });
    }
    found
}
/// Period-terminated tokens that do not by themselves end a sentence.
///
/// Entries are compared ASCII-case-insensitively against the last
/// whitespace-delimited token of the text scanned so far.
// NOTE(review): because matching ignores case, ordinary words such as "no."
// or "co." at the end of a sentence are also treated as abbreviations —
// confirm this trade-off is intended.
const ABBREVIATIONS: &[&str] = &[
    "Mr.", "Mrs.", "Ms.", "Dr.", "Sr.", "Jr.", "St.", "Mt.", "Fr.", "Rev.",
    "Prof.", "Gen.", "Col.", "Maj.", "Lt.", "Capt.", "Sgt.", "Hon.", "Pres.",
    "Sen.", "Rep.", "Gov.",
    "Ph.D.", "M.D.", "B.A.", "M.A.", "B.S.", "M.S.", "Esq.",
    "e.g.", "i.e.", "etc.", "vs.", "viz.", "cf.", "et.", "al.",
    "Inc.", "Ltd.", "Co.", "Corp.",
    "No.", "Vol.", "p.", "pp.", "fig.", "ed.", "eds.",
];

/// Reports whether the last whitespace-separated token of `s` is one of
/// [`ABBREVIATIONS`].
///
/// Leading non-alphanumeric characters (opening quotes, brackets, dashes)
/// are ignored, so `(e.g.` and `"Dr.` are recognised as well as `e.g.` and
/// `Dr.`. Returns `false` for empty or whitespace-only input.
fn ends_with_abbreviation(s: &str) -> bool {
    // `next_back` reads only the final token instead of walking the whole string.
    s.split_whitespace().next_back().map_or(false, |token| {
        let token = token.trim_start_matches(|c: char| !c.is_alphanumeric());
        !token.is_empty()
            && ABBREVIATIONS
                .iter()
                .any(|abbr| token.eq_ignore_ascii_case(abbr))
    })
}
/// Maps a character offset to the 0-based row that contains it.
///
/// `row_starts[i]` is the character offset at which row `i` begins and the
/// slice must be in ascending order (which is how `analyse` builds it). The
/// result is the last row whose start is `<= char_idx`; it is `0` when the
/// slice is empty or `char_idx` lies before the first start.
fn row_for_char_index(row_starts: &[usize], char_idx: usize) -> usize {
    // Binary search for the first row starting after `char_idx`; the row
    // just before it is the one containing the offset. This is looked up
    // once per sentence, so O(log rows) beats a linear scan on long texts.
    row_starts
        .partition_point(|&start| start <= char_idx)
        .saturating_sub(1)
}
// Unit tests for sentence splitting, the statistics, and verdict thresholds.
#[cfg(test)]
mod tests {
use super::*;
// Splits a literal on '\n' into the `Vec<String>` shape that `analyse` takes.
fn lines(text: &str) -> Vec<String> {
text.split('\n').map(|s| s.to_string()).collect()
}
// No input lines at all yields the empty stats with a TooShort verdict.
#[test]
fn empty_input_yields_empty_stats() {
let r = analyse(&[]);
assert_eq!(r.lengths.len(), 0);
assert_eq!(r.verdict, RhythmVerdict::TooShort);
}
// A single sentence is counted but is too little to judge.
#[test]
fn single_sentence_too_short() {
let r = analyse(&lines("The cat sat on the mat."));
assert_eq!(r.lengths, vec![6]);
assert_eq!(r.verdict, RhythmVerdict::TooShort);
}
// Three equal-length sentences have zero deviation and rate as Monotone.
#[test]
fn three_uniform_sentences_drones() {
let r = analyse(&lines(
"I ate the red apple. I drank the cold milk. I read the old book.",
));
assert_eq!(r.lengths, vec![5, 5, 5]);
assert!((r.stdev - 0.0).abs() < 1e-9);
assert_eq!(r.verdict, RhythmVerdict::Monotone);
}
// Mixing very short and long sentences must land in Varied or Choppy.
#[test]
fn varied_sentences_get_varied_verdict() {
let r = analyse(&lines(
"Bob ran fast. The morning fog clung to the cobblestones as he turned the corner. \
He coughed. She had not slept in seventy hours and her hands shook against the rusted iron rail. \
Then silence.",
));
assert!(r.lengths.len() >= 5);
assert!(matches!(
r.verdict,
RhythmVerdict::Varied | RhythmVerdict::Choppy
));
}
// Titles such as "Dr." and "Mrs." must not be taken for sentence ends.
#[test]
fn abbreviations_dont_split() {
let r = analyse(&lines(
"Dr. Smith arrived at noon. Mrs. Hale was already there. \
They drank tea quietly.",
));
assert_eq!(r.lengths.len(), 3);
}
// A mid-sentence ellipsis does not split; the following full stop does.
#[test]
fn ellipsis_and_terminator_combo() {
let r = analyse(&lines("She thought... it was over. He didn't agree."));
assert_eq!(r.lengths.len(), 2);
}
// A closing quote right after the terminator stays with its sentence.
#[test]
fn closing_quotes_consumed_with_terminator() {
let r = analyse(&lines(
"He said \"Hello.\" Then she nodded. Then she walked away.",
));
assert_eq!(r.lengths.len(), 3);
}
// One representative cv value per verdict band, plus the sentence-count guard.
#[test]
fn rhythm_verdict_thresholds() {
assert_eq!(RhythmVerdict::from(0.5, 2), RhythmVerdict::TooShort);
assert_eq!(RhythmVerdict::from(0.10, 5), RhythmVerdict::Monotone);
assert_eq!(RhythmVerdict::from(0.30, 5), RhythmVerdict::Steady);
assert_eq!(RhythmVerdict::from(0.55, 5), RhythmVerdict::Varied);
assert_eq!(RhythmVerdict::from(1.10, 5), RhythmVerdict::Choppy);
}
// `line_no` is 1-based and refers to the line on which a sentence starts.
#[test]
fn line_no_tracks_starting_row() {
let r = analyse(&lines(
"First sentence here.\nSecond starts on row two.\nThird is row three.",
));
assert_eq!(r.samples.len(), 3);
assert_eq!(r.samples[0].line_no, 1);
assert_eq!(r.samples[1].line_no, 2);
assert_eq!(r.samples[2].line_no, 3);
}
// `shortest` and `longest` lead with the extreme sentences.
#[test]
fn outliers_picked_correctly() {
let r = analyse(&lines(
"Tiny. The medium one has six words here. \
This is by far the very longest sentence in the whole paragraph with many many words.",
));
assert_eq!(r.shortest[0].word_count, 1);
assert!(r.longest[0].word_count >= 16);
}
}