use unicode_segmentation::UnicodeSegmentation;
use crate::condition::ConditionTag;
use crate::config::Profile;
use crate::language::{en, fr};
use crate::parser::{split_sentences, Document};
use crate::rules::Rule;
use crate::types::{Diagnostic, Language, Location, Severity};
/// Lint rule `structure.mixed-numeric-format`.
///
/// Flags sentences that contain both a digit numeral (for example `5`) and a
/// spelled-out numeral (for example `twelve`). The rule carries no state.
#[derive(Debug, Clone, Copy, Default)]
pub struct MixedNumericFormat;
impl MixedNumericFormat {
    /// Identifier attached to every diagnostic this rule produces.
    pub const ID: &'static str = "structure.mixed-numeric-format";

    /// Condition tags this rule is associated with.
    pub const TAGS: &'static [ConditionTag] = &[ConditionTag::Dyscalculia, ConditionTag::General];

    /// Builds the rule. There is nothing to configure.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }

    /// Builds the rule for a profile.
    ///
    /// The profile is accepted but not consulted: every profile yields the
    /// same rule value as [`MixedNumericFormat::new`].
    #[must_use]
    pub const fn for_profile(_profile: Profile) -> Self {
        Self
    }
}
impl Rule for MixedNumericFormat {
    /// Returns the rule identifier, [`MixedNumericFormat::ID`].
    fn id(&self) -> &'static str {
        Self::ID
    }

    /// Returns the condition tags, [`MixedNumericFormat::TAGS`].
    fn condition_tags(&self) -> &'static [ConditionTag] {
        Self::TAGS
    }

    /// Emits one warning per sentence that mixes a digit numeral with a
    /// spelled-out numeral.
    ///
    /// Each paragraph yielded by `document.paragraphs_with_section()` is split
    /// into sentences; a sentence is reported when `find_mixed_pair` finds both
    /// forms in it. The spelled-numeral list is selected from `language`.
    /// When the paragraph belongs to a titled section, the title is attached
    /// to the diagnostic.
    fn check(&self, document: &Document, language: Language) -> Vec<Diagnostic> {
        let spelled = spelled_list_for(language);
        let mut diagnostics = Vec::new();
        for (paragraph, section_title) in document.paragraphs_with_section() {
            // NOTE(review): the trailing `1` is presumably the starting column
            // of the paragraph text — confirm against `split_sentences`.
            let sentences = split_sentences(&paragraph.text, paragraph.start_line, 1);
            for sentence in sentences {
                let Some((digit_sample, spelled_sample)) = find_mixed_pair(&sentence.text, spelled)
                else {
                    continue;
                };
                // The reported span covers the whole sentence, measured in
                // grapheme clusters and saturated at `u32::MAX`.
                let length =
                    u32::try_from(sentence.text.graphemes(true).count()).unwrap_or(u32::MAX);
                let location = Location::new(
                    document.source.clone(),
                    sentence.line,
                    sentence.column,
                    length,
                );
                let message = format!(
                    "Sentence mixes a digit numeral (`{digit_sample}`) with a spelled-out numeral \
                     (`{spelled_sample}`). Plain-language and CDC Clear Communication Index \
                     guidance recommends using one form consistently within a sentence."
                );
                let mut diag = Diagnostic::new(Self::ID, Severity::Warning, location, message);
                if let Some(title) = section_title {
                    diag = diag.with_section(title);
                }
                diagnostics.push(diag);
            }
        }
        diagnostics
    }
}
/// Selects the spelled-out numeral list to match against for `language`.
///
/// French uses the French list; English and an undetected language both use
/// the English list.
fn spelled_list_for(language: Language) -> &'static [&'static str] {
    match language {
        Language::En | Language::Unknown => en::SPELLED_NUMERALS,
        Language::Fr => fr::SPELLED_NUMERALS,
    }
}
/// Finds the first digit numeral and the first spelled-out numeral in `text`.
///
/// The text is scanned left to right, one Unicode scalar value at a time:
///
/// * a run that starts with an ASCII digit is a digit numeral (see
///   [`number_end`] for how grouping and decimal separators are absorbed);
/// * a run of alphabetic characters is a word, tested against `spelled`;
/// * any other character (ASCII or not) separates tokens.
///
/// Scanning by character rather than by byte keeps non-ASCII punctuation and
/// spaces (em dashes, curly quotes, guillemets, no-break spaces) from being
/// glued onto the neighbouring word, which would hide a numeral such as
/// `twelve` in `reports—twelve`.
///
/// Returns `Some((digit, spelled))`, both borrowed from `text`, as soon as one
/// of each has been seen, and `None` if the text lacks either form.
fn find_mixed_pair<'a>(
    text: &'a str,
    spelled: &'static [&'static str],
) -> Option<(&'a str, &'a str)> {
    let mut digit_sample: Option<&'a str> = None;
    let mut spelled_sample: Option<&'a str> = None;
    // `pos` is always a char boundary: it only advances by whole characters.
    let mut pos = 0;
    while let Some(ch) = text[pos..].chars().next() {
        let start = pos;
        if ch.is_ascii_digit() {
            pos = number_end(text, start);
            if digit_sample.is_none() {
                digit_sample = Some(&text[start..pos]);
            }
        } else if is_word_char(ch) {
            pos = word_end(text, start);
            if spelled_sample.is_none() {
                let word = &text[start..pos];
                if matches_spelled(word, spelled) {
                    spelled_sample = Some(word);
                }
            }
        } else {
            pos += ch.len_utf8();
        }
        if let (Some(digits), Some(word)) = (digit_sample, spelled_sample) {
            return Some((digits, word));
        }
    }
    None
}

/// Returns the byte offset just past the digit numeral that starts at `start`.
///
/// `text[start..]` must begin with an ASCII digit. The numeral extends over
/// ASCII digits, and over a single separator character whenever that
/// separator is immediately followed by another ASCII digit, so `3.14`,
/// `1,000` and `1 000` are each one token while the full stop in `5.` is not
/// part of the number.
fn number_end(text: &str, start: usize) -> usize {
    let mut end = start;
    loop {
        let mut rest = text[end..].chars();
        match (rest.next(), rest.next()) {
            (Some(digit), _) if digit.is_ascii_digit() => end += 1,
            (Some(sep), Some(digit)) if is_group_separator(sep) && digit.is_ascii_digit() => {
                // Consume the separator together with the ASCII digit after it.
                end += sep.len_utf8() + 1;
            }
            _ => return end,
        }
    }
}

/// Returns the byte offset just past the word that starts at `start`.
fn word_end(text: &str, start: usize) -> usize {
    text[start..]
        .char_indices()
        .find(|&(_, ch)| !is_word_char(ch))
        .map_or(text.len(), |(offset, _)| start + offset)
}

/// Whether `ch` can be part of a word: any Unicode alphabetic character.
fn is_word_char(ch: char) -> bool {
    ch.is_alphabetic()
}

/// Whether `ch` may sit between two digits of one numeral.
///
/// Besides `.`, `,` and the ASCII space, this accepts the no-break space
/// (U+00A0) and the narrow no-break space (U+202F), which French typography
/// uses as thousands separators.
fn is_group_separator(ch: char) -> bool {
    matches!(ch, '.' | ',' | ' ' | '\u{00A0}' | '\u{202F}')
}

/// Whether `word` equals one of the `spelled` numerals, ignoring case.
///
/// ASCII words take the cheap `eq_ignore_ascii_case` path. Non-ASCII words are
/// compared against the non-ASCII entries of the list using Unicode lowercase
/// folding (without allocating), so an accented entry — if the list contains
/// one — matches in any letter case instead of being rejected outright.
fn matches_spelled(word: &str, spelled: &'static [&'static str]) -> bool {
    if word.is_ascii() {
        return spelled
            .iter()
            .any(|candidate| word.eq_ignore_ascii_case(candidate));
    }
    spelled.iter().any(|candidate| {
        !candidate.is_ascii()
            && word
                .chars()
                .flat_map(char::to_lowercase)
                .eq(candidate.chars().flat_map(char::to_lowercase))
    })
}
// Unit tests for the mixed-numeric-format rule. They run the rule end to end
// through the crate's plain-text and Markdown parsers.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::{parse_markdown, parse_plain};
use crate::types::{Category, SourceFile};
// Parses `text` as plain text from an anonymous source and runs the rule.
fn lint(text: &str, language: Language) -> Vec<Diagnostic> {
let document = parse_plain(text, SourceFile::Anonymous);
MixedNumericFormat::new().check(&document, language)
}
// Parses `text` as Markdown from an anonymous source and runs the rule.
fn lint_md(text: &str, language: Language) -> Vec<Diagnostic> {
let document = parse_markdown(text, SourceFile::Anonymous);
MixedNumericFormat::new().check(&document, language)
}
// Pins the exact rule identifier string.
#[test]
fn id_is_kebab_case() {
assert_eq!(MixedNumericFormat::ID, "structure.mixed-numeric-format");
}
// The rule must be tagged for both Dyscalculia and General.
#[test]
fn tags_carry_dyscalculia_and_general() {
assert!(MixedNumericFormat::TAGS.contains(&ConditionTag::Dyscalculia));
assert!(MixedNumericFormat::TAGS.contains(&ConditionTag::General));
}
// The emitted diagnostic must report `Category::Structure`.
#[test]
fn category_is_structure() {
let diags = lint("We received 5 reports and twelve complaints.", Language::En);
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].category(), Category::Structure);
}
// An English sentence with `5` and `twelve` yields one diagnostic whose
// message quotes both samples.
#[test]
fn mix_in_en_sentence_triggers() {
let diags = lint("We received 5 reports and twelve complaints.", Language::En);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("`5`"));
assert!(diags[0].message.contains("`twelve`"));
}
// Same as above for French; the accented word `reçu` must not disturb the scan.
#[test]
fn mix_in_fr_sentence_triggers() {
let diags = lint(
"Nous avons reçu 5 rapports et douze plaintes.",
Language::Fr,
);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("`5`"));
assert!(diags[0].message.contains("`douze`"));
}
// "One" next to a digit must not trigger. NOTE(review): presumably because
// the English numeral list omits "one"; the list is defined outside this file.
#[test]
fn en_excludes_indefinite_pronoun_one() {
assert!(lint("One of 42 readers noticed.", Language::En).is_empty());
}
// "Un"/"Une" next to a digit must not trigger. NOTE(review): presumably
// because the French numeral list omits them; the list is defined outside this file.
#[test]
fn fr_excludes_indefinite_article_un_une() {
assert!(lint("Un lecteur sur 42 a remarqué.", Language::Fr).is_empty());
assert!(lint("Une page parmi 42 a bougé.", Language::Fr).is_empty());
}
// A sentence that only uses digits is consistent and must not be flagged.
#[test]
fn digits_only_sentence_does_not_trigger() {
assert!(lint("We received 5 reports and 12 complaints.", Language::En).is_empty());
}
// A sentence that only uses spelled-out numerals must not be flagged.
#[test]
fn spelled_only_sentence_does_not_trigger() {
assert!(lint(
"We received five reports and twelve complaints.",
Language::En
)
.is_empty());
}
// `3.14` must be reported as one digit sample, not `3` followed by `14`.
#[test]
fn decimal_number_counts_as_single_digit_token() {
let diags = lint("Pi is about 3.14 across three decimals.", Language::En);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("`3.14`"));
assert!(diags[0].message.contains("`three`"));
}
// `1,000` must be reported as one digit sample.
#[test]
fn thousands_separator_comma_counts_as_single_token() {
let diags = lint(
"The archive holds 1,000 items across ten shelves.",
Language::En,
);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("`1,000`"));
}
// A space-grouped French number (`1 000`) must be reported as one digit sample.
#[test]
fn fr_thousands_narrow_space_counts_as_single_token() {
let diags = lint(
"L'archive contient 1 000 éléments sur dix étagères.",
Language::Fr,
);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("`1 000`"));
}
// The check is per sentence: a digit in one sentence and a spelled numeral in
// the next must not combine into a diagnostic.
#[test]
fn mix_across_distinct_sentences_does_not_trigger() {
assert!(lint(
"We received 5 reports. The three reviewers agreed.",
Language::En
)
.is_empty());
}
// Each mixed sentence yields its own diagnostic.
#[test]
fn two_separate_mixed_sentences_yield_two_diagnostics() {
let diags = lint(
"We saw 5 bugs and three regressions. Later 7 more and ten followups.",
Language::En,
);
assert_eq!(diags.len(), 2);
}
// The fenced block mixes `five`, `5` and `ten`; since no diagnostic is
// expected, its content must not be linted. The surrounding prose is not mixed.
#[test]
fn fenced_code_block_content_is_ignored() {
let md = "Intro paragraph.\n\n```\nfive + 5 = ten\n```\n\nPlain prose with 5 apples.\n";
assert!(lint_md(md, Language::En).is_empty());
}
// `Language::Unknown` must behave like English.
#[test]
fn unknown_language_falls_back_to_english() {
let diags = lint("We saw 5 bugs and three regressions.", Language::Unknown);
assert_eq!(diags.len(), 1);
}
// Snapshot of the full diagnostic output; the source file field is redacted
// to `<input>` so the snapshot does not depend on how the source is rendered.
#[test]
fn snapshot_fixture() {
let text = "We received 5 reports and twelve complaints. All clear.";
let diags = lint(text, Language::En);
insta::assert_yaml_snapshot!(diags, {
".*.location.file" => "<input>",
});
}
}