use std::collections::HashSet;
use rust_stemmers::Stemmer;
use unicode_segmentation::UnicodeSegmentation;
use crate::config::{
built_in_cognition_verbs, built_in_emotion_adjectives,
built_in_filter_words, built_in_linking_verbs, built_in_manner_adverbs,
built_in_stop_words, parse_stemmer_language, FilterWordsConfig,
RepeatedPhrasesConfig, ShowDontTellConfig,
};
/// Category of style warning carried by a [`StyleHit`].
///
/// Each variant is produced by exactly one detector in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StyleWarningKind {
/// A single word matched the filter-word target set
/// (see `FilterWordsDetector::detect`).
FilterWord,
/// An n-gram of non-stop-word stems occurred at least the configured
/// number of times (see `RepeatedPhraseDetector::new`).
RepeatedPhrase,
/// A manner adverb, a cognition verb, or a linking verb directly followed
/// by an emotion adjective (see `ShowDontTellDetector::detect`).
ShowDontTell,
}
/// A flagged span on one line of text.
///
/// The detectors in this file fill both columns with character offsets
/// (not byte offsets); `col_end` is the offset just past the last flagged
/// character.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct StyleHit {
/// Character offset at which the span begins.
pub col_start: usize,
/// Character offset just past the end of the span. `RepeatedPhraseDetector`
/// stores `usize::MAX / 2` here for every row of a multi-row phrase except
/// the last one.
pub col_end: usize,
/// Which kind of warning this span represents.
pub kind: StyleWarningKind,
}
/// Flags individual words of a line that belong to a set of "filter words".
pub struct FilterWordsDetector {
/// Normalised target words: lower-cased and, when `stemmer` is present,
/// stemmed.
targets: HashSet<String>,
/// Stemmer applied to both the targets and the scanned words. `None` when
/// stemming is switched off in the config or `parse_stemmer_language`
/// yields nothing for the language.
stemmer: Option<Stemmer>,
}
impl FilterWordsDetector {
pub fn new(cfg: &FilterWordsConfig, language: &str) -> Self {
let configured: &Vec<String> = match language.to_lowercase().as_str() {
"russian" => &cfg.russian,
"french" => &cfg.french,
"german" => &cfg.german,
"spanish" => &cfg.spanish,
_ => &cfg.english,
};
let stemmer = if cfg.use_stemming {
parse_stemmer_language(language).map(Stemmer::create)
} else {
None
};
let normalise = |w: &str| -> String {
let lc = w.trim().to_lowercase();
match &stemmer {
Some(s) => s.stem(&lc).into_owned(),
None => lc,
}
};
let mut targets: HashSet<String> = HashSet::new();
if configured.is_empty() {
for w in built_in_filter_words(language) {
let key = normalise(w);
if !key.is_empty() {
targets.insert(key);
}
}
} else {
for w in configured {
let key = normalise(w);
if !key.is_empty() {
targets.insert(key);
}
}
}
for w in &cfg.extra_words {
let key = normalise(w);
if !key.is_empty() {
targets.insert(key);
}
}
Self { targets, stemmer }
}
pub fn is_empty(&self) -> bool {
self.targets.is_empty()
}
pub fn detect(&self, line: &str) -> Vec<StyleHit> {
if self.targets.is_empty() || line.is_empty() {
return Vec::new();
}
let mut byte_to_char: Vec<usize> = Vec::with_capacity(line.len() + 1);
let mut char_count = 0usize;
for (b, _) in line.char_indices() {
while byte_to_char.len() < b {
byte_to_char.push(char_count);
}
byte_to_char.push(char_count);
char_count += 1;
}
while byte_to_char.len() <= line.len() {
byte_to_char.push(char_count);
}
let mut out = Vec::new();
for (byte_start, word) in line.unicode_word_indices() {
let lc = word.to_lowercase();
let key = match &self.stemmer {
Some(s) => s.stem(&lc).into_owned(),
None => lc,
};
if !self.targets.contains(&key) {
continue;
}
let byte_end = byte_start + word.len();
let col_start = byte_to_char[byte_start];
let col_end = byte_to_char.get(byte_end).copied().unwrap_or(char_count);
out.push(StyleHit {
col_start,
col_end,
kind: StyleWarningKind::FilterWord,
});
}
out
}
}
/// Holds the repeated-phrase hits computed up front for a whole set of lines.
pub struct RepeatedPhraseDetector {
/// Hits grouped by row: `per_row[r]` belongs to the `r`-th input line and is
/// sorted by `col_start`.
per_row: Vec<Vec<StyleHit>>,
}
/// One non-stop-word occurrence collected while scanning the input lines.
#[derive(Debug, Clone)]
struct WordToken {
/// Index of the line the word was found on.
row: usize,
/// Character offset of the word's first character within its line.
col_start: usize,
/// Character offset just past the word's last character.
col_end: usize,
/// Lower-cased and, when stemming is active, stemmed form of the word;
/// this is what n-gram keys are built from.
stem: String,
}
impl RepeatedPhraseDetector {
pub fn new(
cfg: &RepeatedPhrasesConfig,
language: &str,
lines: &[String],
) -> Self {
let mut per_row: Vec<Vec<StyleHit>> =
(0..lines.len()).map(|_| Vec::new()).collect();
let n = cfg.n.max(2) as usize;
let threshold = cfg.threshold.max(2) as usize;
let stemmer = if cfg.use_stemming {
parse_stemmer_language(language).map(rust_stemmers::Stemmer::create)
} else {
None
};
let stop_configured = match language.to_lowercase().as_str() {
"russian" => &cfg.russian_stop_words,
"french" => &cfg.french_stop_words,
"german" => &cfg.german_stop_words,
"spanish" => &cfg.spanish_stop_words,
_ => &cfg.english_stop_words,
};
let normalise_stop = |w: &str| -> String {
let lc = w.trim().to_lowercase();
match &stemmer {
Some(s) => s.stem(&lc).into_owned(),
None => lc,
}
};
let stops: std::collections::HashSet<String> = if stop_configured.is_empty() {
built_in_stop_words(language)
.iter()
.map(|s| normalise_stop(s))
.collect()
} else {
stop_configured.iter().map(|s| normalise_stop(s)).collect()
};
let mut tokens: Vec<WordToken> = Vec::new();
for (row, line) in lines.iter().enumerate() {
let mut byte_to_char: Vec<usize> = Vec::with_capacity(line.len() + 1);
let mut char_count = 0usize;
for (b, _) in line.char_indices() {
while byte_to_char.len() < b {
byte_to_char.push(char_count);
}
byte_to_char.push(char_count);
char_count += 1;
}
while byte_to_char.len() <= line.len() {
byte_to_char.push(char_count);
}
for (byte_start, word) in line.unicode_word_indices() {
let lc = word.to_lowercase();
let stem = match &stemmer {
Some(s) => s.stem(&lc).into_owned(),
None => lc,
};
if stops.contains(&stem) {
continue;
}
let byte_end = byte_start + word.len();
let col_start = byte_to_char[byte_start];
let col_end =
byte_to_char.get(byte_end).copied().unwrap_or(char_count);
tokens.push(WordToken {
row,
col_start,
col_end,
stem,
});
}
}
if tokens.len() < n {
return Self { per_row };
}
use std::collections::HashMap;
let mut groups: HashMap<String, Vec<usize>> = HashMap::new();
for i in 0..=(tokens.len() - n) {
let key = tokens[i..i + n]
.iter()
.map(|t| t.stem.as_str())
.collect::<Vec<_>>()
.join("|");
groups.entry(key).or_default().push(i);
}
for indices in groups.values() {
if indices.len() < threshold {
continue;
}
for &start in indices {
let span = &tokens[start..start + n];
let start_row = span.first().unwrap().row;
let end_row = span.last().unwrap().row;
let start_col = span.first().unwrap().col_start;
let end_col = span.last().unwrap().col_end;
if start_row == end_row {
per_row[start_row].push(StyleHit {
col_start: start_col,
col_end: end_col,
kind: StyleWarningKind::RepeatedPhrase,
});
} else {
for r in start_row..=end_row {
let cs = if r == start_row { start_col } else { 0 };
let ce = if r == end_row {
end_col
} else {
usize::MAX / 2
};
per_row[r].push(StyleHit {
col_start: cs,
col_end: ce,
kind: StyleWarningKind::RepeatedPhrase,
});
}
}
}
}
for row_hits in &mut per_row {
row_hits.sort_by_key(|h| h.col_start);
}
Self { per_row }
}
pub fn hits_for_row(&self, row: usize) -> &[StyleHit] {
self.per_row.get(row).map(Vec::as_slice).unwrap_or(&[])
}
#[allow(dead_code)]
pub fn total_hits(&self) -> usize {
self.per_row.iter().map(Vec::len).sum()
}
pub fn is_empty(&self) -> bool {
self.per_row.iter().all(|r| r.is_empty())
}
}
/// Flags "telling" constructions on a line: manner adverbs, cognition verbs,
/// and a linking verb directly followed by an emotion adjective.
///
/// All four sets hold normalised words (lower-cased and, when `stemmer` is
/// present, stemmed).
pub struct ShowDontTellDetector {
/// Verbs that produce a hit only when the next word is an emotion adjective.
linking_verbs: HashSet<String>,
/// Adjectives that complete a linking-verb hit.
emotion_adjectives: HashSet<String>,
/// Adverbs flagged on their own.
manner_adverbs: HashSet<String>,
/// Verbs flagged on their own.
cognition_verbs: HashSet<String>,
/// Stemmer applied to both the word sets and the scanned words; `None` when
/// stemming is switched off in the config or `parse_stemmer_language`
/// yields nothing for the language.
stemmer: Option<Stemmer>,
}
impl ShowDontTellDetector {
    /// Builds a detector for `language` from `cfg`.
    ///
    /// For each of the four word categories the per-language list in `cfg`
    /// is used; `language` is matched case-insensitively against
    /// `"russian"`, `"french"`, `"german"` and `"spanish"`, with anything
    /// else selecting the English lists. An empty list is replaced by the
    /// corresponding `built_in_*` list for `language`. Entries are trimmed,
    /// lower-cased and, when `cfg.use_stemming` is set and a stemmer exists
    /// for `language`, stemmed; entries that end up empty are dropped.
    pub fn new(cfg: &ShowDontTellConfig, language: &str) -> Self {
        let stemmer = if cfg.use_stemming {
            parse_stemmer_language(language).map(Stemmer::create)
        } else {
            None
        };

        // Pick all four configured lists with a single language match.
        let lang_key = language.to_lowercase();
        let (linking, emotion, manner, cognition) = match lang_key.as_str() {
            "russian" => (
                &cfg.russian_linking_verbs,
                &cfg.russian_emotion_adjectives,
                &cfg.russian_manner_adverbs,
                &cfg.russian_cognition_verbs,
            ),
            "french" => (
                &cfg.french_linking_verbs,
                &cfg.french_emotion_adjectives,
                &cfg.french_manner_adverbs,
                &cfg.french_cognition_verbs,
            ),
            "german" => (
                &cfg.german_linking_verbs,
                &cfg.german_emotion_adjectives,
                &cfg.german_manner_adverbs,
                &cfg.german_cognition_verbs,
            ),
            "spanish" => (
                &cfg.spanish_linking_verbs,
                &cfg.spanish_emotion_adjectives,
                &cfg.spanish_manner_adverbs,
                &cfg.spanish_cognition_verbs,
            ),
            _ => (
                &cfg.english_linking_verbs,
                &cfg.english_emotion_adjectives,
                &cfg.english_manner_adverbs,
                &cfg.english_cognition_verbs,
            ),
        };

        let to_key = |raw: &str| -> String {
            let lowered = raw.trim().to_lowercase();
            match stemmer.as_ref() {
                Some(st) => st.stem(&lowered).into_owned(),
                None => lowered,
            }
        };
        // Configured words win; the built-in list is only a fallback for an
        // empty configuration.
        let word_set = |configured: &Vec<String>,
                        fallback: &[&str]|
         -> HashSet<String> {
            if configured.is_empty() {
                fallback
                    .iter()
                    .map(|w| to_key(w))
                    .filter(|key| !key.is_empty())
                    .collect()
            } else {
                configured
                    .iter()
                    .map(|w| to_key(w))
                    .filter(|key| !key.is_empty())
                    .collect()
            }
        };

        let linking_verbs = word_set(linking, built_in_linking_verbs(language));
        let emotion_adjectives =
            word_set(emotion, built_in_emotion_adjectives(language));
        let manner_adverbs = word_set(manner, built_in_manner_adverbs(language));
        let cognition_verbs =
            word_set(cognition, built_in_cognition_verbs(language));

        Self {
            linking_verbs,
            emotion_adjectives,
            manner_adverbs,
            cognition_verbs,
            stemmer,
        }
    }

    /// Returns `true` when all four word sets are empty, i.e. the detector
    /// can never produce a hit.
    pub fn is_empty(&self) -> bool {
        [
            &self.linking_verbs,
            &self.emotion_adjectives,
            &self.manner_adverbs,
            &self.cognition_verbs,
        ]
        .iter()
        .all(|set| set.is_empty())
    }

    /// Returns the `ShowDontTell` hits found on `line`.
    ///
    /// A word in the manner-adverb or cognition-verb set is flagged by
    /// itself. Otherwise, a word in the linking-verb set is flagged together
    /// with the next word when that next word is in the emotion-adjective
    /// set; the hit then runs from the verb's start to the adjective's end.
    /// "Next word" means the next segment from `unicode_word_indices`, so
    /// whatever lies between the two words is not inspected. Columns are
    /// character offsets with an exclusive end, and hits are sorted by
    /// `col_start`.
    pub fn detect(&self, line: &str) -> Vec<StyleHit> {
        if self.is_empty() || line.is_empty() {
            return Vec::new();
        }

        struct Word {
            col_start: usize,
            col_end: usize,
            stem: String,
        }

        // Words arrive in ascending byte order, so the character column is
        // advanced by counting only the characters since the previous word.
        let mut words: Vec<Word> = Vec::new();
        let mut at_byte = 0usize;
        let mut at_col = 0usize;
        for (offset, text) in line.unicode_word_indices() {
            at_col += line[at_byte..offset].chars().count();
            at_byte = offset;

            let lowered = text.to_lowercase();
            let stem = match self.stemmer.as_ref() {
                Some(st) => st.stem(&lowered).into_owned(),
                None => lowered,
            };
            words.push(Word {
                col_start: at_col,
                col_end: at_col + text.chars().count(),
                stem,
            });
        }

        let flag = |col_start: usize, col_end: usize| StyleHit {
            col_start,
            col_end,
            kind: StyleWarningKind::ShowDontTell,
        };

        let mut hits: Vec<StyleHit> = Vec::new();
        for (idx, word) in words.iter().enumerate() {
            let stands_alone = self.manner_adverbs.contains(&word.stem)
                || self.cognition_verbs.contains(&word.stem);
            if stands_alone {
                hits.push(flag(word.col_start, word.col_end));
            } else if self.linking_verbs.contains(&word.stem) {
                let followed_by_emotion = words
                    .get(idx + 1)
                    .filter(|next| self.emotion_adjectives.contains(&next.stem));
                if let Some(next) = followed_by_emotion {
                    hits.push(flag(word.col_start, next.col_end));
                }
            }
        }
        hits.sort_by_key(|hit| hit.col_start);
        hits
    }
}
// Unit tests for the three style detectors defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// ---- FilterWordsDetector ----
// Default filter-word configuration used by most tests below.
fn cfg_default() -> FilterWordsConfig {
FilterWordsConfig::default()
}
// Projects hits to `(col_start, col_end)` pairs for compact assertions.
fn cols_of(hits: &[StyleHit]) -> Vec<(usize, usize)> {
hits.iter().map(|h| (h.col_start, h.col_end)).collect()
}
// "just" is the only hit and occupies columns 2..6.
#[test]
fn english_filter_word_basic() {
let d = FilterWordsDetector::new(&cfg_default(), "english");
let hits = d.detect("I just wanted to see");
assert_eq!(cols_of(&hits), vec![(2, 6)]);
}
// Matching ignores letter case: both "Just" and "JUST" are flagged.
#[test]
fn english_case_insensitive() {
let d = FilterWordsDetector::new(&cfg_default(), "english");
let hits = d.detect("Just wait. JUST a moment.");
assert_eq!(cols_of(&hits), vec![(0, 4), (11, 15)]);
}
// The Russian sentence is expected to contain exactly two filter words.
#[test]
fn russian_basic() {
let d = FilterWordsDetector::new(&cfg_default(), "russian");
let hits = d.detect("Он был очень устал и просто хотел спать.");
assert_eq!(hits.len(), 2);
}
// Each inflected form on its own must produce at least one hit.
#[test]
fn russian_stemming_catches_inflections() {
let d = FilterWordsDetector::new(&cfg_default(), "russian");
for form in &["казалось", "казался", "казалась", "казались"] {
let hits = d.detect(form);
assert!(
!hits.is_empty(),
"Russian stemmer failed to match `{form}`: cfg had `казаться`"
);
}
}
// Same check for English inflections of "seem".
#[test]
fn english_stemming_catches_inflections() {
let d = FilterWordsDetector::new(&cfg_default(), "english");
for form in &["seem", "seemed", "seems", "seeming"] {
let hits = d.detect(form);
assert!(
!hits.is_empty(),
"English stemmer failed to match `{form}`"
);
}
}
// With stemming off only the exact form "seem" matches; "seemed" does not.
#[test]
fn use_stemming_off_disables_inflection_matching() {
let mut cfg = cfg_default();
cfg.use_stemming = false;
let d = FilterWordsDetector::new(&cfg, "english");
assert!(!d.detect("seem").is_empty());
assert!(d.detect("seemed").is_empty());
}
// A non-empty per-language list replaces the built-in list entirely.
#[test]
fn user_override_replaces_default_for_that_language() {
let mut cfg = cfg_default();
cfg.english = vec!["foo".into(), "bar".into()];
let d = FilterWordsDetector::new(&cfg, "english");
assert!(d.detect("just a test").is_empty());
assert_eq!(d.detect("foo bar baz").len(), 2);
}
// `extra_words` are added to the defaults rather than replacing them.
#[test]
fn extra_words_add_on_top_of_default() {
let mut cfg = cfg_default();
cfg.extra_words = vec!["foo".into()];
let d = FilterWordsDetector::new(&cfg, "english");
assert_eq!(d.detect("just foo here").len(), 2);
}
// An unrecognised language name still flags the English word "just".
#[test]
fn unknown_language_falls_back_to_english() {
let d = FilterWordsDetector::new(&cfg_default(), "klingon");
assert_eq!(d.detect("just a test").len(), 1);
}
// The five-letter Cyrillic word must span columns 0..5, i.e. columns count
// characters rather than UTF-8 bytes.
#[test]
fn cyrillic_columns_are_char_indexed_not_byte() {
let d = FilterWordsDetector::new(&cfg_default(), "russian");
let hits = d.detect("очень устал");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].col_start, 0);
assert_eq!(hits[0].col_end, 5);
}
// "justice" (columns 0..7) must not be flagged just because it starts with
// "just".
#[test]
fn no_partial_word_match() {
let d = FilterWordsDetector::new(&cfg_default(), "english");
let hits = d.detect("justice is essential");
assert!(
!hits.iter().any(|h| h.col_start == 0 && h.col_end == 7),
"false positive on `justice`: {hits:?}"
);
}
// A trailing full stop is not part of the flagged span.
#[test]
fn punctuation_doesnt_break_match() {
let d = FilterWordsDetector::new(&cfg_default(), "english");
let hits = d.detect("And just.");
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].col_start, 4);
assert_eq!(hits[0].col_end, 8);
}
// ---- RepeatedPhraseDetector ----
// Default repeated-phrase configuration; tests override `n`/`threshold`.
fn rp_default() -> RepeatedPhrasesConfig {
RepeatedPhrasesConfig::default()
}
// The same bigram on three lines reaches a threshold of 3.
#[test]
fn rp_three_repeats_get_flagged() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 3;
let lines = vec![
"she lifted her shoulders slightly".to_string(),
"he paused; she lifted her shoulders again".to_string(),
"later she lifted her shoulders once more".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert!(
d.total_hits() >= 3,
"expected at least 3 hits for lift|shoulder × 3, got {} ({:?})",
d.total_hits(),
d.per_row,
);
}
// Two occurrences stay below a threshold of 3, so nothing is flagged.
#[test]
fn rp_two_repeats_below_threshold() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 3;
let lines = vec![
"she lifted her shoulders".to_string(),
"she lifted her shoulders".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert_eq!(d.total_hits(), 0, "2 < threshold; should not flag");
}
// Lowering the threshold to 2 flags the same two occurrences.
#[test]
fn rp_lower_threshold_flags_two() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 2;
let lines = vec![
"she lifted her shoulders".to_string(),
"she lifted her shoulders".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert!(d.total_hits() >= 2);
}
// "lifted" and "lifting" must count as the same phrase element.
#[test]
fn rp_stemming_aligns_inflections() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 2;
let lines = vec![
"she lifted her shoulders".to_string(),
"he was lifting her shoulders again".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert!(
d.total_hits() >= 2,
"stems should align lifted/lifting; got hits {:?}",
d.per_row,
);
}
// Three differently inflected Russian lines must yield at least 3 hits.
#[test]
fn rp_russian_inflections() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 3;
let lines = vec![
"Он поднял плечи".to_string(),
"Она подняла плечи".to_string(),
"Они подняли плечи".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "russian", &lines);
assert!(
d.total_hits() >= 3,
"expected ≥ 3 hits for поднимать-плечи × 3, got {} ({:?})",
d.total_hits(),
d.per_row,
);
}
// Trigrams over a duplicated line must still produce at least 2 hits.
#[test]
fn rp_stop_words_excluded_from_ngrams() {
let mut cfg = rp_default();
cfg.n = 3;
cfg.threshold = 2;
let lines = vec![
"the big dog and the small cat".to_string(),
"the big dog and the small cat".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert!(
d.total_hits() >= 2,
"expected at least 2 hits across the dup line"
);
}
// NOTE(review): the test name says "no hits", but the assertion requires the
// detector to be NON-empty. `RepeatedPhraseDetector::new` as written never
// reads `cfg.enabled`, so this passes. Confirm whether gating on `enabled`
// is meant to be the caller's job, and rename the test if so.
#[test]
fn rp_disabled_yields_no_hits() {
let mut cfg = rp_default();
cfg.enabled = false;
let lines: Vec<String> = vec!["nothing to see here".into(); 5];
let d = RepeatedPhraseDetector::new(&cfg, "english", &lines);
assert!(!d.is_empty());
}
// An empty slice of lines must not panic and yields an empty detector.
#[test]
fn rp_empty_input_no_panic() {
let d = RepeatedPhraseDetector::new(&rp_default(), "english", &[]);
assert!(d.is_empty());
}
// The first hit on row 0 is expected to start at column 0.
#[test]
fn rp_columns_char_indexed() {
let mut cfg = rp_default();
cfg.n = 2;
cfg.threshold = 3;
let lines = vec![
"очень просто слово".to_string(),
"очень просто слово".to_string(),
"очень просто слово".to_string(),
];
let d = RepeatedPhraseDetector::new(&cfg, "russian", &lines);
let row0 = d.hits_for_row(0);
assert!(!row0.is_empty());
assert_eq!(row0[0].col_start, 0);
}
// ---- ShowDontTellDetector ----
// Default show-don't-tell configuration.
fn sdt_cfg_default() -> ShowDontTellConfig {
ShowDontTellConfig::default()
}
// "was angry" is flagged as one span covering columns 4..13.
#[test]
fn sdt_was_angry_flagged() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
assert!(!d.is_empty(), "english defaults populated");
let hits = d.detect("She was angry at the dog.");
assert!(!hits.is_empty(), "telling 'was angry' should hit");
let h = &hits[0];
assert_eq!(h.kind, StyleWarningKind::ShowDontTell);
assert_eq!(h.col_start, 4);
assert_eq!(h.col_end, 13);
}
// A linking verb followed by a non-emotion word must not be flagged.
#[test]
fn sdt_was_running_not_flagged() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let hits = d.detect("She was running through the rain.");
assert!(
hits.is_empty(),
"non-emotion 'was running' must NOT hit, got: {hits:?}"
);
}
// "seemed nervous" must produce at least one hit.
#[test]
fn sdt_seemed_nervous_flagged_via_stemming() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let hits = d.detect("He seemed nervous about the meeting.");
assert!(!hits.is_empty());
}
// A dialogue tag with "angrily" must produce a ShowDontTell hit.
// NOTE(review): `trimmed` is computed and immediately discarded; the closure
// effectively checks only the hit kind.
#[test]
fn sdt_manner_adverb_flagged() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let hits = d.detect("\"Get out,\" she said angrily.");
assert!(hits.iter().any(|h| {
let trimmed = "Get out".len();
let _ = trimmed; h.kind == StyleWarningKind::ShowDontTell
}));
}
// "realised" must produce at least one hit.
#[test]
fn sdt_cognition_verb_flagged() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let hits = d.detect("She realised the room was empty.");
assert!(!hits.is_empty(), "'realised' must trigger cognition hit");
}
// A sentence of plain actions must yield no hits.
#[test]
fn sdt_plain_action_prose_clean() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let hits = d.detect(
"He poured the coffee and watched the rain hit the shutters.",
);
assert!(
hits.is_empty(),
"action prose must stay clean, got: {hits:?}"
);
}
// With default config, an unrecognised language gives an empty detector.
#[test]
fn sdt_unsupported_language_falls_back_quiet() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "klingon");
assert!(d.is_empty());
}
// With a multi-byte character on the line, the last hit's `col_end` must not
// exceed the line's character count.
#[test]
fn sdt_unicode_columns_safe() {
let d = ShowDontTellDetector::new(&sdt_cfg_default(), "english");
let line = "Café was empty. He was sad.";
let hits = d.detect(line);
if let Some(h) = hits.last() {
let chars: Vec<char> = line.chars().collect();
assert!(h.col_end <= chars.len());
}
}
}