use rust_stemmers::Stemmer;
use unicode_segmentation::UnicodeSegmentation;
use crate::config::{built_in_stop_words, parse_stemmer_language};
#[derive(Debug, Clone)]
pub struct EchoConfig {
pub window: usize,
pub min_repeats: usize,
pub max_global: usize,
pub min_word_len: usize,
}
impl Default for EchoConfig {
fn default() -> Self {
Self {
window: 5,
min_repeats: 3,
max_global: 40,
min_word_len: 4,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EchoFinding {
pub stem: String,
pub surface_forms: Vec<String>,
pub para_start: usize,
pub para_end: usize,
pub count: usize,
}
#[derive(Clone)]
struct Occurrence {
para: usize, surface: String,
}
pub fn detect_echoes(
paragraphs: &[String],
language: &str,
stop_override: Option<&[String]>,
cfg: &EchoConfig,
) -> Vec<EchoFinding> {
let stemmer = parse_stemmer_language(language).map(Stemmer::create);
let stem_of = |w: &str| -> String { crate::text::normalize_stem(w, &stemmer) };
let stop_source: Vec<String> = match stop_override {
Some(list) if !list.is_empty() => list.to_vec(),
_ => built_in_stop_words(language)
.iter()
.map(|s| (*s).to_string())
.collect(),
};
let stop_set: std::collections::HashSet<String> = stop_source
.iter()
.map(|w| stem_of(w))
.filter(|s| !s.is_empty())
.collect();
let mut occ: std::collections::HashMap<String, Vec<Occurrence>> =
std::collections::HashMap::new();
for (p_idx, para) in paragraphs.iter().enumerate() {
for word in para.unicode_words() {
if word.chars().count() < cfg.min_word_len {
continue;
}
if !word.chars().any(|c| c.is_alphabetic()) {
continue;
}
let stem = stem_of(word);
if stem.is_empty() || stop_set.contains(&stem) {
continue;
}
occ.entry(stem).or_default().push(Occurrence {
para: p_idx,
surface: word.to_lowercase(),
});
}
}
let mut findings: Vec<EchoFinding> = Vec::new();
for (stem, occs) in occ {
let total = occs.len();
if total < cfg.min_repeats || total > cfg.max_global {
continue;
}
if let Some(f) = densest_window(&stem, &occs, cfg) {
findings.push(f);
}
}
findings.sort_by(|a, b| {
a.para_start
.cmp(&b.para_start)
.then_with(|| a.stem.cmp(&b.stem))
});
findings
}
fn densest_window(
stem: &str,
occs: &[Occurrence],
cfg: &EchoConfig,
) -> Option<EchoFinding> {
let mut best: Option<(usize, usize, usize)> = None; let mut j = 0;
for i in 0..occs.len() {
if j < i {
j = i;
}
while j + 1 < occs.len()
&& occs[j + 1].para.saturating_sub(occs[i].para) < cfg.window
{
j += 1;
}
let count = j - i + 1;
if count >= cfg.min_repeats {
match best {
Some((bc, _, _)) if count <= bc => {}
_ => best = Some((count, i, j)),
}
}
}
let (count, i, j) = best?;
let mut surface_forms: Vec<String> = Vec::new();
for o in &occs[i..=j] {
if !surface_forms.contains(&o.surface) {
surface_forms.push(o.surface.clone());
}
}
Some(EchoFinding {
stem: stem.to_string(),
surface_forms,
para_start: occs[i].para + 1,
para_end: occs[j].para + 1,
count,
})
}
#[cfg(test)]
mod tests {
use super::*;
fn cfg() -> EchoConfig {
EchoConfig::default()
}
fn paras(v: &[&str]) -> Vec<String> {
v.iter().map(|s| s.to_string()).collect()
}
#[test]
fn flags_inflected_echo_in_english() {
let p = paras(&[
"She walked to the window slowly.",
"He was walking across the room.",
"They walked out together at last.",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert_eq!(f.len(), 1, "expected one echo, got {f:?}");
assert_eq!(f[0].stem, "walk");
assert_eq!(f[0].count, 3);
assert_eq!(f[0].para_start, 1);
assert_eq!(f[0].para_end, 3);
assert!(f[0].surface_forms.contains(&"walked".to_string()));
assert!(f[0].surface_forms.contains(&"walking".to_string()));
}
#[test]
fn flags_inflected_echo_in_russian() {
let p = paras(&[
"На горизонте показался корабль.",
"Капитан корабля молчал долго.",
"На корабле горели три фонаря.",
]);
let f = detect_echoes(&p, "russian", None, &cfg());
assert_eq!(f.len(), 1, "expected one Russian echo, got {f:?}");
assert_eq!(f[0].count, 3);
assert_eq!(f[0].surface_forms.len(), 3);
}
#[test]
fn unsupported_language_uses_exact_forms() {
let p = paras(&[
"the artefact glimmered faintly here",
"the artefact rested on the table",
"the artefact vanished by morning",
]);
let f = detect_echoes(&p, "japanese", None, &cfg());
assert_eq!(f.len(), 1);
assert_eq!(f[0].stem, "artefact");
assert_eq!(f[0].surface_forms, vec!["artefact".to_string()]);
}
#[test]
fn stop_words_not_flagged() {
let p = paras(&[
"the cat sat on the mat",
"the dog ran by the fence",
"the bird flew over the hill",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert!(
!f.iter().any(|e| e.stem == "the"),
"stop-word 'the' must not be flagged",
);
}
#[test]
fn common_vocabulary_above_ceiling_skipped() {
let p = paras(&[
"star star",
"star star",
"star",
]);
let mut c = cfg();
c.max_global = 4;
let f = detect_echoes(&p, "english", None, &c);
assert!(f.is_empty(), "above-ceiling word should skip, got {f:?}");
}
#[test]
fn occurrences_beyond_window_not_flagged() {
let mut v = vec![
"the lantern flickered once".to_string(),
];
for i in 0..7 {
v.push(format!("filler paragraph number {i} here"));
}
v.push("the lantern flickered twice".to_string());
v.push("the lantern flickered thrice".to_string());
let f = detect_echoes(&v, "english", None, &cfg());
assert!(
!f.iter().any(|e| e.stem == "lantern"),
"spread-out occurrences must not flag, got {f:?}",
);
}
#[test]
fn tight_cluster_within_window_flagged() {
let p = paras(&[
"The harbor was a harbor unlike any harbor he knew.",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert_eq!(f.len(), 1);
assert_eq!(f[0].stem, "harbor");
assert_eq!(f[0].count, 3);
assert_eq!(f[0].para_start, 1);
assert_eq!(f[0].para_end, 1);
}
#[test]
fn below_min_repeats_not_flagged() {
let p = paras(&[
"the beacon shone bright",
"the beacon shone again",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert!(!f.iter().any(|e| e.stem == "beacon"));
}
#[test]
fn short_words_skipped() {
let p = paras(&[
"she ran ran ran",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert!(f.is_empty());
}
#[test]
fn stop_override_replaces_builtin() {
let p = paras(&[
"The harbor was a harbor unlike any harbor.",
]);
let stop = vec!["harbor".to_string()];
let f = detect_echoes(&p, "english", Some(&stop), &cfg());
assert!(f.is_empty(), "overridden stop-word should suppress");
}
#[test]
fn multiple_echoes_sorted_by_paragraph() {
let p = paras(&[
"glimmered glimmered glimmered here",
"nothing of note in this filler",
"shimmered shimmered shimmered there",
]);
let f = detect_echoes(&p, "english", None, &cfg());
assert_eq!(f.len(), 2);
assert_eq!(f[0].stem, "glimmer");
assert_eq!(f[0].para_start, 1);
assert_eq!(f[1].stem, "shimmer");
assert_eq!(f[1].para_start, 3);
}
#[test]
fn empty_input_no_findings() {
assert!(detect_echoes(&[], "english", None, &cfg()).is_empty());
assert!(
detect_echoes(¶s(&["", " "]), "english", None, &cfg())
.is_empty()
);
}
}