use low_expectations::ExpectationSuite;
use prosesmasher_domain_types::{CheckConfig, Document, Locale};
use crate::check::Check;
/// Check that counts how many terms from the configured recommended-terms
/// pool occur among a document's words and records an "at least `min_count`"
/// expectation for that count.
///
/// The type carries no state; all inputs arrive through `Check::run`.
#[derive(Debug)]
pub struct RecommendedTermsCheck;
impl Check for RecommendedTermsCheck {
fn id(&self) -> &'static str {
"recommended-terms"
}
fn label(&self) -> &'static str {
"Recommended Terms"
}
fn supported_locales(&self) -> Option<&'static [Locale]> {
None
}
fn run(&self, doc: &Document, config: &CheckConfig, suite: &mut ExpectationSuite) {
let Some(ref pool) = config.quality.lexical.recommended_terms else {
return;
};
if pool.terms.is_empty() {
return;
}
let all_words: Vec<&str> = doc
.sections
.iter()
.flat_map(|s| &s.blocks)
.flat_map(|b| super::collect_paragraph_words(b))
.collect();
let lower_words: Vec<String> = all_words.iter().map(|w| w.to_lowercase()).collect();
let match_count = if pool.allow_inflections {
count_stem_matches(&lower_words, &pool.terms)
} else {
count_exact_matches(&lower_words, &pool.terms)
};
let observed = i64::try_from(match_count).unwrap_or(i64::MAX);
let min = i64::try_from(pool.min_count).unwrap_or(0);
let _result = suite
.expect_value_to_be_at_least("recommended-terms", observed, min)
.label("Recommended Terms")
.checking(&format!(
"at least {} of {} pool terms present",
pool.min_count,
pool.terms.len()
));
}
}
/// Counts the pool `terms` that occur verbatim among `lower_words`.
///
/// Each term is lowercased before comparison, while the words are compared as
/// given, so `lower_words` is expected to be lowercased already. A term
/// contributes at most one to the total no matter how often it occurs;
/// duplicate entries in `terms` are each counted.
fn count_exact_matches(lower_words: &[String], terms: &[String]) -> usize {
    let mut present = 0;
    for term in terms {
        let needle = term.to_lowercase();
        if lower_words.iter().any(|word| *word == needle) {
            present += 1;
        }
    }
    present
}
/// Counts the pool `terms` whose rough stem is a prefix of at least one word.
///
/// Each term is lowercased and reduced with `rough_stem`; the term counts as
/// present when any entry of `lower_words` starts with that stem. The words
/// are compared as given, so `lower_words` is expected to be lowercased
/// already. A term contributes at most one to the total.
///
/// Terms whose stem is empty (blank or whitespace-only pool entries) are never
/// counted: an empty string is a prefix of every word, so such a term would
/// otherwise register as present in any non-empty document.
fn count_stem_matches(lower_words: &[String], terms: &[String]) -> usize {
    terms
        .iter()
        .map(|term| rough_stem(&term.to_lowercase()))
        // Guard against the "empty prefix matches everything" false positive.
        .filter(|stem| !stem.is_empty())
        .filter(|stem| {
            lower_words
                .iter()
                .any(|word| word.starts_with(stem.as_str()))
        })
        .count()
}
/// Approximates the stem of `word` by stripping one common English suffix.
///
/// Surrounding whitespace is trimmed first. Suffixes are tried in a fixed
/// order, longer endings before the shorter ones they contain, and the first
/// suffix whose removal leaves at least three characters is stripped. If no
/// suffix qualifies, the trimmed word is returned unchanged.
///
/// The minimum stem length is counted in Unicode scalar values, not bytes, so
/// words with multi-byte letters are held to the same three-character floor as
/// ASCII words (e.g. "öls" is not reduced to the two-letter "öl").
fn rough_stem(word: &str) -> String {
    /// Candidate suffixes, in the order they are attempted.
    const SUFFIXES: [&str; 12] = [
        "ation", "tion", "ment", "ness", "ing", "ies", "ied", "es", "ed", "er", "ly", "s",
    ];
    /// Shortest stem (in characters) that may remain after stripping.
    const MIN_STEM_CHARS: usize = 3;

    let trimmed = word.trim();
    SUFFIXES
        .iter()
        .find_map(|suffix| {
            trimmed
                .strip_suffix(*suffix)
                .filter(|stem| stem.chars().count() >= MIN_STEM_CHARS)
        })
        .unwrap_or(trimmed)
        .to_owned()
}
// Unit tests for this module live in the file named by `#[path]` below and are
// compiled only in test builds.
#[cfg(test)]
#[path = "recommended_terms_tests.rs"]
mod tests;