use std::collections::HashSet;
use std::num::NonZeroU32;
use std::sync::LazyLock;
use crate::config::Profile;
use crate::parser::phrase_search::count_word_bounded;
use crate::parser::{split_sentences, Document, Sentence};
use crate::rules::Rule;
use crate::types::{Diagnostic, Language, Location, Severity, SourceFile};
/// Built-in English connectors, all lowercase, in the order they are checked.
///
/// Entries may be single words or multi-word phrases (e.g. `"for example"`).
static EN_CONNECTORS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // Contrast / concession.
        "however",
        "nevertheless",
        "yet",
        "although",
        "but",
        // Cause.
        "because",
        "since",
        "as",
        "for",
        // Consequence.
        "therefore",
        "thus",
        "consequently",
        "hence",
        "so",
        // Sequence.
        "first",
        "then",
        "next",
        "finally",
        // Illustration.
        "for example",
        "notably",
        "in particular",
        "such as",
        // Addition.
        "moreover",
        "furthermore",
        "also",
        "additionally",
    ])
});
/// Built-in French connectors, all lowercase, in the order they are checked.
///
/// Entries may be single words or multi-word phrases (e.g. `"par exemple"`).
static FR_CONNECTORS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // Contrast / concession.
        "cependant",
        "toutefois",
        "en revanche",
        "néanmoins",
        "pourtant",
        "mais",
        // Cause.
        "parce que",
        "car",
        "puisque",
        "en effet",
        // Consequence.
        "donc",
        "ainsi",
        "par conséquent",
        "c'est pourquoi",
        // Sequence.
        "d'abord",
        "ensuite",
        "puis",
        "enfin",
        "premièrement",
        // Illustration.
        "par exemple",
        "notamment",
        "en particulier",
        // Addition.
        "de plus",
        "en outre",
        "également",
        "par ailleurs",
    ])
});
/// Tunable settings for the repetitive-connectors rule.
#[derive(Clone, Debug)]
pub struct Config {
    /// Highest number of occurrences of one connector tolerated inside a
    /// window; one more occurrence than this produces a diagnostic.
    pub max_per_window: NonZeroU32,

    /// Width of the window, counted in consecutive sentences.
    pub window_size: NonZeroU32,

    /// Additional connectors checked on top of the built-in list for the
    /// document language.
    pub custom_connectors: Vec<String>,
}
impl Config {
    /// Builds the default configuration for `profile`.
    ///
    /// The per-window maximum is 4 for `DevDoc`, 3 for `Public` and 2 for
    /// `Falc`; the window is always 5 sentences and no custom connectors are
    /// registered.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        // Every value passed below is a non-zero literal, so this cannot fail.
        let non_zero = |value: u32| NonZeroU32::new(value).expect("non-zero literal");

        let max_per_window = non_zero(match profile {
            Profile::DevDoc => 4,
            Profile::Public => 3,
            Profile::Falc => 2,
        });
        let window_size = non_zero(5);

        Self {
            max_per_window,
            window_size,
            custom_connectors: Vec::new(),
        }
    }
}
/// Rule that reports a connector repeated too often within a short run of
/// consecutive sentences.
#[derive(Clone, Debug)]
pub struct RepetitiveConnectors {
    /// Thresholds and extra connectors this rule instance applies.
    config: Config,
}
impl RepetitiveConnectors {
    /// Identifier of this rule, returned by `Rule::id` and attached to every
    /// diagnostic the rule emits.
    pub const ID: &'static str = "rhythm.repetitive-connectors";

    /// Creates the rule from an explicit configuration.
    #[must_use]
    pub const fn new(config: Config) -> Self {
        Self { config }
    }

    /// Creates the rule with the default configuration for `profile`.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        Self::new(Config::for_profile(profile))
    }

    /// Returns the connectors to look for in a document written in `language`.
    ///
    /// The result is the built-in list for the language followed by the
    /// configured custom connectors, with duplicates removed (first occurrence
    /// wins). Custom connectors are trimmed and lowercased because matching is
    /// performed against lowercased sentence text; entries that are empty
    /// after trimming are dropped.
    ///
    /// An unknown language yields an empty list, custom connectors included,
    /// which disables the rule.
    fn connectors_for(&self, language: Language) -> Vec<String> {
        let base: &[&'static str] = match language {
            Language::En => EN_CONNECTORS.as_slice(),
            Language::Fr => FR_CONNECTORS.as_slice(),
            Language::Unknown => return Vec::new(),
        };
        let custom = &self.config.custom_connectors;

        let capacity = base.len() + custom.len();
        let mut seen: HashSet<String> = HashSet::with_capacity(capacity);
        let mut merged: Vec<String> = Vec::with_capacity(capacity);

        let candidates = base
            .iter()
            .map(|connector| (*connector).to_owned())
            // Normalise user input so it compares equal to the lowercased text
            // and to the (already lowercase) built-in entries.
            .chain(custom.iter().map(|connector| connector.trim().to_lowercase()));

        for candidate in candidates {
            // An empty needle has no meaningful word-bounded match; skip it
            // rather than hand it to the matcher.
            if candidate.is_empty() {
                continue;
            }
            if seen.insert(candidate.clone()) {
                merged.push(candidate);
            }
        }
        merged
    }
}
impl Rule for RepetitiveConnectors {
    /// Returns the identifier of this rule.
    fn id(&self) -> &'static str {
        Self::ID
    }

    /// Reports connectors that occur more than `max_per_window` times within
    /// `window_size` consecutive sentences.
    ///
    /// All paragraphs are flattened into a single sentence sequence, so a
    /// window may span paragraph boundaries. For each connector, every
    /// occurrence is recorded by sentence index; whenever `max_per_window + 1`
    /// successive occurrences start and end fewer than `window_size` sentences
    /// apart, one diagnostic is emitted on the sentence holding the first of
    /// them and scanning resumes after that group.
    ///
    /// Returns an empty list when no connectors apply to `language` or the
    /// document has no sentences. Diagnostics are ordered by line, then column.
    fn check(&self, document: &Document, language: Language) -> Vec<Diagnostic> {
        let connectors = self.connectors_for(language);
        if connectors.is_empty() {
            return Vec::new();
        }

        // Each sentence keeps the section title of the paragraph it came from.
        let mut sentences: Vec<(Sentence, Option<&str>)> = Vec::new();
        for (paragraph, section_title) in document.paragraphs_with_section() {
            // NOTE(review): the trailing `1` is presumably the starting
            // column - confirm against `split_sentences`.
            for s in split_sentences(&paragraph.text, paragraph.start_line, 1) {
                sentences.push((s, section_title));
            }
        }
        if sentences.is_empty() {
            return Vec::new();
        }

        // Matching is case-insensitive: lowercase every sentence once here
        // instead of once per connector inside the loop below.
        let lowered: Vec<String> = sentences
            .iter()
            .map(|(sentence, _)| sentence.text.to_lowercase())
            .collect();

        let threshold = self.config.max_per_window.get() as usize;
        let window = self.config.window_size.get() as usize;
        let mut diagnostics = Vec::new();

        for connector in &connectors {
            // One entry per occurrence, holding the index of its sentence.
            // Indices are pushed in ascending order, so `hits` is sorted.
            let mut hits: Vec<usize> = Vec::new();
            for (idx, text) in lowered.iter().enumerate() {
                let occurrences = count_word_bounded(text, connector);
                for _ in 0..occurrences {
                    hits.push(idx);
                }
            }
            if hits.len() <= threshold {
                continue;
            }

            let mut k = 0;
            while k + threshold < hits.len() {
                // `hits[k]..=hits[k + threshold]` spans `threshold + 1`
                // occurrences, the smallest count that breaks the limit.
                let cluster_start = hits[k];
                let cluster_end = hits[k + threshold];
                // `hits` is sorted, so the subtraction cannot underflow, and
                // unlike `cluster_start + window` it cannot overflow either.
                if cluster_end - cluster_start < window {
                    let (sentence, section) = &sentences[cluster_start];
                    // The message reports the minimal offending count, even if
                    // further occurrences fall inside the same window.
                    let count = threshold + 1;
                    diagnostics.push(build_diagnostic(
                        &document.source,
                        sentence.line,
                        sentence.column,
                        &sentence.text,
                        connector,
                        u32::try_from(count).unwrap_or(u32::MAX),
                        self.config.max_per_window.get(),
                        window,
                        *section,
                    ));
                    // Skip the occurrences just reported so that one cluster
                    // does not produce overlapping diagnostics.
                    k += threshold + 1;
                } else {
                    k += 1;
                }
            }
        }

        diagnostics.sort_by_key(|d| (d.location.line, d.location.column));
        diagnostics
    }
}
/// Assembles the warning emitted for one cluster of a repeated connector.
///
/// The location starts at `line`/`column` in `source` and covers the whole of
/// `sentence_text`. `connector`, `count`, `window` and `max` are interpolated
/// into the message. When `section` is present it is attached to the
/// diagnostic through `with_section`.
// Nine positional arguments exceed clippy's default limit for this lint.
#[allow(clippy::too_many_arguments)]
fn build_diagnostic(
    source: &SourceFile,
    line: u32,
    column: u32,
    sentence_text: &str,
    connector: &str,
    count: u32,
    max: u32,
    window: usize,
    section: Option<&str>,
) -> Diagnostic {
    // The span length is counted in `char`s and saturates at `u32::MAX`.
    let char_count = sentence_text.chars().count();
    let span = u32::try_from(char_count).unwrap_or(u32::MAX);

    let diagnostic = Diagnostic::new(
        RepetitiveConnectors::ID,
        Severity::Warning,
        Location::new(source.clone(), line, column, span),
        format!(
            "Connector \"{connector}\" appears {count} times within {window} consecutive sentences \
             (max {max}). Vary the connector or restructure the passage."
        ),
    );

    if let Some(title) = section {
        diagnostic.with_section(title)
    } else {
        diagnostic
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::parse_plain;
    use crate::types::SourceFile;

    /// Parses `text` as an anonymous plain document and runs the rule with the
    /// default configuration for `profile`.
    fn lint(text: &str, profile: Profile, language: Language) -> Vec<Diagnostic> {
        let rule = RepetitiveConnectors::for_profile(profile);
        let document = parse_plain(text, SourceFile::Anonymous);
        rule.check(&document, language)
    }

    /// The rule identifier is pinned to its published spelling.
    #[test]
    fn id_is_kebab_case() {
        let id = RepetitiveConnectors::ID;
        assert_eq!(id, "rhythm.repetitive-connectors");
    }

    /// Different connectors in successive sentences are fine.
    #[test]
    fn varied_connectors_do_not_trigger() {
        let text = "First we act. Then we think. However, we also pause. Therefore we improve.";
        let diags = lint(text, Profile::Public, Language::En);
        assert!(diags.is_empty());
    }

    /// Four "then" in a row exceed the `Public` limit of three.
    #[test]
    fn repeated_then_triggers() {
        let text = "We analyzed the data. Then we built the model. Then we validated it. \
                    Then we shipped it. Then we archived it.";
        let diags = lint(text, Profile::Public, Language::En);
        assert_eq!(diags.len(), 1);
        let message = &diags[0].message;
        assert!(message.contains("\"then\""));
        assert!(message.contains("4 times"));
    }

    /// Exactly the allowed number of repetitions is not reported.
    #[test]
    fn at_threshold_does_not_trigger() {
        let text = "Then a. Then b. Then c.";
        let diags = lint(text, Profile::Public, Language::En);
        assert!(diags.is_empty());
    }

    /// Repetitions too far apart to share a window are not reported.
    #[test]
    fn spread_out_repetition_does_not_trigger() {
        let text = "Then a. b. Then b. c. Then c. d. Then d. e.";
        let diags = lint(text, Profile::Public, Language::En);
        assert!(diags.is_empty());
    }

    /// The same text passes under `Public` but fails under `Falc`.
    #[test]
    fn falc_profile_is_stricter() {
        let text = "Then a. Then b. Then c.";
        let public = lint(text, Profile::Public, Language::En);
        assert!(public.is_empty());
        let falc = lint(text, Profile::Falc, Language::En);
        assert!(!falc.is_empty());
    }

    /// Connectors made of several words are detected as a unit.
    #[test]
    fn multi_word_connector_matches() {
        let text = "In particular the first. In particular the second. In particular the \
                    third. In particular the fourth. Mid.";
        let diags = lint(text, Profile::Public, Language::En);
        let found = diags.iter().any(|d| d.message.contains("in particular"));
        assert!(found, "expected an in-particular diagnostic: {diags:?}");
    }

    /// The French connector list is used for French documents.
    #[test]
    fn french_connector_matches() {
        let text = "Puis nous avons lu. Puis nous avons écrit. Puis nous avons révisé. \
                    Puis nous avons publié. Fin.";
        let diags = lint(text, Profile::Public, Language::Fr);
        assert_eq!(diags.len(), 1);
        let message = &diags[0].message;
        assert!(message.contains("puis"));
    }

    /// Letter case in the text does not affect matching.
    #[test]
    fn case_insensitive_match() {
        let text = "Then a. THEN b. then c. Then d. Mid.";
        let diags = lint(text, Profile::Public, Language::En);
        assert_eq!(diags.len(), 1);
    }

    /// A connector embedded in a longer word ("therein") is not a match.
    #[test]
    fn word_boundary_prevents_partial_match() {
        let text = "Therein a. Therein b. Therein c. Therein d. Therein e.";
        let diags = lint(text, Profile::Public, Language::En);
        assert!(diags.is_empty());
    }

    /// Documents in an unknown language produce no diagnostics.
    #[test]
    fn unknown_language_skips_rule() {
        let text = "Then a. Then b. Then c. Then d.";
        let diags = lint(text, Profile::Public, Language::Unknown);
        assert!(diags.is_empty());
    }

    /// A single run of repetitions is reported once, not once per sentence.
    #[test]
    fn one_diagnostic_per_cluster() {
        let text = "Then a. Then b. Then c. Then d. Then e. Then f.";
        let diags = lint(text, Profile::Public, Language::En);
        assert_eq!(diags.len(), 1);
    }

    /// Profile defaults stay at their expected values.
    #[test]
    fn config_thresholds_match_rules_md() {
        let dev_doc = Config::for_profile(Profile::DevDoc);
        let public = Config::for_profile(Profile::Public);
        let falc = Config::for_profile(Profile::Falc);
        assert_eq!(dev_doc.max_per_window.get(), 4);
        assert_eq!(public.max_per_window.get(), 3);
        assert_eq!(falc.max_per_window.get(), 2);
        assert_eq!(public.window_size.get(), 5);
    }

    /// Diagnostics from this rule belong to the rhythm category.
    #[test]
    fn category_is_rhythm() {
        let text = "Then a. Then b. Then c. Then d. Mid.";
        let diags = lint(text, Profile::Public, Language::En);
        let category = diags[0].category();
        assert_eq!(category, crate::types::Category::Rhythm);
    }

    /// Snapshot of the full diagnostic output, with the file path redacted.
    #[test]
    fn snapshot_fixture() {
        let text = "We analyzed the data. Then we built the model. Then we validated it. \
                    Then we shipped it. Then we archived it.";
        let diags = lint(text, Profile::Public, Language::En);
        insta::assert_yaml_snapshot!(diags, {
            ".*.location.file" => "<input>",
        });
    }
}