use std::collections::HashSet;
/// Returns the `p`-th percentile of `sorted`, linearly interpolating between
/// the two neighbouring ranks.
///
/// `sorted` is indexed by rank, so it must already be in ascending order for
/// the result to be a meaningful percentile. `p` is expressed on a 0–100
/// scale; values outside that range are clamped to the nearest end point
/// instead of indexing past the slice or extrapolating below the minimum.
///
/// An empty slice yields `0.0`, and a single-element slice yields that
/// element. A NaN `p` propagates and yields NaN.
pub(crate) fn percentile(sorted: &[usize], p: f64) -> f32 {
    match sorted {
        [] => 0.0,
        [only] => *only as f32,
        _ => {
            let last = sorted.len() - 1;
            // Clamping keeps `rank` inside `[0, last]`, so both neighbour
            // indices below are always in bounds.
            let rank = (p.clamp(0.0, 100.0) / 100.0) * last as f64;
            let lo = rank.floor() as usize;
            let hi = rank.ceil() as usize;
            let frac = rank - lo as f64;
            let below = sorted[lo] as f64;
            let above = sorted[hi] as f64;
            (below + (above - below) * frac) as f32
        }
    }
}
/// Returns `(mean, standard deviation)` of `xs`.
///
/// The deviation is the population form (variance divided by `n`, not
/// `n - 1`). An empty slice yields `(0.0, 0.0)`.
pub(crate) fn mean_std(xs: &[usize]) -> (f64, f64) {
    if xs.is_empty() {
        return (0.0, 0.0);
    }
    let n = xs.len() as f64;
    // Accumulate in u128 so the integer total cannot overflow `usize`
    // (which would panic in debug builds and wrap in release builds) while
    // still being summed exactly before the single conversion to f64.
    let total: u128 = xs.iter().map(|&x| x as u128).sum();
    let mean = total as f64 / n;
    let var = xs.iter().map(|&x| (x as f64 - mean).powi(2)).sum::<f64>() / n;
    (mean, var.sqrt())
}
/// Returns the coefficient of variation of `lengths`: the standard deviation
/// divided by the mean, both taken from `mean_std`.
///
/// A mean of exactly zero (which includes the empty slice) yields `0.0`
/// rather than dividing by zero.
pub(crate) fn coefficient_of_variation(lengths: &[usize]) -> f32 {
    match mean_std(lengths) {
        (mu, _) if mu == 0.0 => 0.0,
        (mu, sigma) => (sigma / mu) as f32,
    }
}
/// Returns `(std - mean) / (std + mean)` for `lengths`, using the values
/// reported by `mean_std`.
///
/// When `std + mean` is exactly zero (which includes the empty slice) the
/// result is `0.0` rather than a division by zero. Perfectly uniform,
/// non-zero input has a zero deviation and therefore scores `-1.0`.
pub(crate) fn burstiness(lengths: &[usize]) -> f32 {
    let (mu, sigma) = mean_std(lengths);
    let total = sigma + mu;
    if total != 0.0 {
        return ((sigma - mu) / total) as f32;
    }
    0.0
}
/// Returns the average, over every contiguous window of `window` tokens, of
/// the fraction of distinct tokens in that window (a moving-average
/// type–token ratio).
///
/// `window` is clamped to `1..=tokens.len()`, so a window of `0` behaves
/// like `1` and an oversized window covers the whole slice once. An empty
/// `tokens` slice yields `0.0`.
///
/// Distinct counts are maintained incrementally as the window slides, so
/// the cost is linear in `tokens.len()` rather than proportional to
/// `tokens.len() * window`.
pub(crate) fn mattr(tokens: &[&str], window: usize) -> f32 {
    let n = tokens.len();
    if n == 0 {
        return 0.0;
    }
    let w = window.clamp(1, n);
    if n == w {
        // Single window spanning everything: plain type–token ratio.
        let uniq = tokens.iter().collect::<HashSet<_>>().len();
        return uniq as f32 / n as f32;
    }

    // Occurrence count of each token currently inside the window; the number
    // of keys is the number of distinct tokens in the window.
    let mut counts: std::collections::HashMap<&str, usize> =
        std::collections::HashMap::with_capacity(w);
    for &tok in &tokens[..w] {
        *counts.entry(tok).or_insert(0) += 1;
    }
    let mut sum = counts.len() as f64 / w as f64;

    // Slide one position at a time: `tokens[i]` leaves while `tokens[i + w]`
    // enters.
    for (&leaving, &entering) in tokens.iter().zip(&tokens[w..]) {
        *counts.entry(entering).or_insert(0) += 1;
        let emptied = match counts.get_mut(leaving) {
            Some(count) => {
                *count -= 1;
                *count == 0
            }
            None => false,
        };
        if emptied {
            counts.remove(leaving);
        }
        sum += counts.len() as f64 / w as f64;
    }

    let windows = n - w + 1;
    (sum / windows as f64) as f32
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::prose::ProseLanguage::En;

    /// End points are returned exactly; interior percentiles are interpolated.
    #[test]
    fn percentiles_interpolate() {
        let mut samples = [1usize, 2, 3, 4, 5];
        samples.sort_unstable();
        let data = &samples[..];
        assert_eq!(percentile(data, 50.0), 3.0);
        assert_eq!(percentile(data, 0.0), 1.0);
        assert_eq!(percentile(data, 100.0), 5.0);
        let quarter = percentile(data, 25.0);
        assert!((quarter - 2.0).abs() < 0.001);
    }

    /// Uniform input has zero variation and burstiness of -1; for varied
    /// input, burstiness equals `(cv - 1) / (cv + 1)`.
    #[test]
    fn cv_and_burstiness_relationship() {
        let flat = [5usize; 4];
        assert_eq!(coefficient_of_variation(&flat), 0.0);
        assert!((burstiness(&flat) + 1.0).abs() < 1e-6);

        let mixed = [2usize, 8, 3, 20, 5];
        let cv = f64::from(coefficient_of_variation(&mixed));
        let observed = f64::from(burstiness(&mixed));
        let predicted = (cv - 1.0) / (cv + 1.0);
        assert!((predicted - observed).abs() < 1e-5);
    }

    /// All-distinct tokens score 1, a repeated token scores `1 / window`,
    /// and empty input scores 0.
    #[test]
    fn mattr_bounds() {
        let distinct = ["a", "b", "c", "d"];
        assert!((mattr(&distinct, 2) - 1.0).abs() < 1e-6);

        let repeated = ["x"; 4];
        assert!((mattr(&repeated, 2) - 0.5).abs() < 1e-6);

        assert_eq!(mattr(&[], 100), 0.0);
    }

    /// Smoke test of the profile computation over a three-sentence sample.
    #[test]
    fn tier1_integration() {
        use crate::prose::profile::compute_profile;
        use crate::prose::VoiceScope;

        let sample = "The cat sat. The dog ran fast across the wide green field today. Up.";
        let profile = compute_profile(sample, VoiceScope::Book, &En, false, 100);
        assert_eq!(profile.sentence_count, 3);
        assert_eq!(profile.word_count, 14);
        assert!(profile.cv > 0.0);
        assert!(profile.p50 >= profile.p10);
        assert!(profile.mattr > 0.0 && profile.mattr <= 1.0);
    }
}