use std::num::NonZeroU32;
use unicode_segmentation::UnicodeSegmentation;
use crate::condition::ConditionTag;
use crate::config::Profile;
use crate::parser::{split_sentences, Document};
use crate::rules::{Rule, Status};
use crate::types::{Diagnostic, Language, Location, Severity, SourceFile};
/// Threshold settings for the number-run rule.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Config {
/// Largest number of numeric tokens a single sentence may hold without being
/// reported; a sentence is flagged only when its count is strictly greater.
pub max_numbers: NonZeroU32,
}
impl Config {
    /// Returns the threshold preset associated with `profile`.
    ///
    /// The presets are 6 numeric tokens per sentence for `DevDoc`, 4 for
    /// `Public`, and 3 for `Falc`.
    ///
    /// # Panics
    ///
    /// Never in practice: every preset is a non-zero literal, which is the
    /// invariant the `expect` below states.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        let limit: u32 = match profile {
            Profile::Falc => 3,
            Profile::Public => 4,
            Profile::DevDoc => 6,
        };
        let max_numbers = NonZeroU32::new(limit).expect("non-zero literal");
        Self { max_numbers }
    }
}
/// Lint rule that reports sentences containing more numeric tokens than the
/// configured maximum.
#[derive(Debug, Clone, Copy)]
pub struct NumberRun {
// Threshold the rule compares each sentence's numeric-token count against.
config: Config,
}
impl NumberRun {
    /// Identifier of this rule; returned by the `Rule::id` implementation and
    /// attached to every diagnostic the rule produces.
    pub const ID: &'static str = "structure.number-run";

    /// Creates the rule from an explicit [`Config`].
    #[must_use]
    pub const fn new(config: Config) -> Self {
        Self { config }
    }

    /// Creates the rule with the threshold preset for `profile`
    /// (see [`Config::for_profile`]).
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        let config = Config::for_profile(profile);
        Self::new(config)
    }
}
impl Rule for NumberRun {
    /// Returns the rule identifier, [`NumberRun::ID`].
    fn id(&self) -> &'static str {
        Self::ID
    }

    /// Emits one diagnostic per sentence whose numeric-token count exceeds the
    /// configured maximum.
    ///
    /// Every paragraph yielded by `document.paragraphs_with_section()` is split
    /// into sentences, and each sentence is scanned independently. The
    /// `_language` argument is ignored: the scan only looks at ASCII digits and
    /// the `.` / `,` separators.
    fn check(&self, document: &Document, _language: Language) -> Vec<Diagnostic> {
        let max = self.config.max_numbers.get();
        let mut diags = Vec::new();
        for (paragraph, section_title) in document.paragraphs_with_section() {
            // Borrow the paragraph text for sentence splitting.
            // NOTE(review): the trailing `1` is presumably the starting column of
            // the paragraph — confirm against `split_sentences`.
            let sentences = split_sentences(&paragraph.text, paragraph.start_line, 1);
            for sentence in sentences {
                // `None` means the sentence is at or below the threshold.
                let Some((count, first_offset)) = scan_numeric_run(&sentence.text, max) else {
                    continue;
                };
                diags.push(build_diagnostic(
                    &document.source,
                    &sentence.text,
                    sentence.line,
                    sentence.column,
                    first_offset,
                    count,
                    max,
                    section_title,
                ));
            }
        }
        diags
    }

    /// Tags this rule with the dyscalculia condition.
    fn condition_tags(&self) -> &'static [ConditionTag] {
        &[ConditionTag::Dyscalculia]
    }

    /// Reports the rule as experimental.
    fn status(&self) -> Status {
        Status::Experimental
    }
}
/// Counts the numeric tokens in `text` and reports them when they exceed `max`.
///
/// A numeric token is a run of ASCII digits that may contain at most one `.`
/// or `,`, and only when that separator is immediately followed by another
/// digit (so `3.14` is one token, while `2026-05-04` is three).
///
/// Returns `Some((count, offset))` when `count` is strictly greater than `max`,
/// where `offset` is the byte offset of the first token in `text`; returns
/// `None` otherwise. The count saturates at `u32::MAX`.
fn scan_numeric_run(text: &str, max: u32) -> Option<(u32, usize)> {
    let bytes = text.as_bytes();
    // Checked lookup: positions at or past the end simply report "not a digit".
    let digit_at = |idx: usize| matches!(bytes.get(idx), Some(byte) if byte.is_ascii_digit());

    let mut tokens: u32 = 0;
    let mut first_token_start: Option<usize> = None;
    let mut pos = 0;

    while let Some(&lead) = bytes.get(pos) {
        if !lead.is_ascii_digit() {
            // Step over the whole character so multi-byte sequences are skipped at once.
            pos += utf8_char_len(lead);
            continue;
        }

        // A token begins here; only the earliest start is remembered.
        first_token_start = first_token_start.or(Some(pos));
        tokens = tokens.saturating_add(1);
        pos += 1;

        // Consume the remainder of the token: digits plus at most one embedded separator.
        let mut separator_seen = false;
        loop {
            if digit_at(pos) {
                pos += 1;
            } else if !separator_seen
                && matches!(bytes.get(pos), Some(b'.' | b','))
                && digit_at(pos + 1)
            {
                separator_seen = true;
                // Skip the separator and the digit that follows it.
                pos += 2;
            } else {
                break;
            }
        }
    }

    first_token_start
        .filter(|_| tokens > max)
        .map(|start| (tokens, start))
}
/// Returns how many bytes to advance past a character whose first byte is
/// `leading`.
///
/// ASCII bytes (`0x00..=0x7F`) and bytes in the continuation range
/// (`0x80..=0xBF`) both yield 1, so a scanner that lands on a continuation byte
/// still makes progress one byte at a time. Two-, three- and four-byte lead
/// bytes yield 2, 3 and 4 respectively.
fn utf8_char_len(leading: u8) -> usize {
    match leading {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
/// Builds the warning for one sentence that holds too many numeric tokens.
///
/// * `source` – file the sentence came from; cloned into the location.
/// * `sentence_text`, `sentence_line`, `sentence_column` – the sentence and
///   where it starts.
/// * `first_offset` – byte offset of the first numeric token inside
///   `sentence_text`; it is used to slice the text, so it must lie on a
///   character boundary.
/// * `actual` / `max` – the observed token count and the configured limit, both
///   interpolated into the message.
/// * `section` – optional section title attached to the diagnostic.
///
/// The reported column is the sentence column plus the number of grapheme
/// clusters that precede the first numeric token; the reported length is the
/// grapheme count of the whole sentence. Both conversions saturate at
/// `u32::MAX`.
///
/// NOTE(review): the length covers the entire sentence even though the column
/// starts at the first token — confirm this is the intended span.
fn build_diagnostic(
    source: &SourceFile,
    sentence_text: &str,
    sentence_line: u32,
    sentence_column: u32,
    first_offset: usize,
    actual: u32,
    max: u32,
    section: Option<&str>,
) -> Diagnostic {
    // Grapheme-cluster count of a fragment, clamped into `u32`.
    let grapheme_len =
        |fragment: &str| u32::try_from(fragment.graphemes(true).count()).unwrap_or(u32::MAX);

    let leading = grapheme_len(&sentence_text[..first_offset]);
    let column = sentence_column.saturating_add(leading);
    let length = grapheme_len(sentence_text);
    let location = Location::new(source.clone(), sentence_line, column, length);

    let message = format!(
        "Sentence packs {actual} numeric tokens (maximum {max}). plain-language guidance \
         recommends not placing many numbers or statistics together in one sentence; \
         split the sentence or move some figures to a list or table."
    );

    let diag = Diagnostic::new(NumberRun::ID, Severity::Warning, location, message);
    if let Some(title) = section {
        diag.with_section(title)
    } else {
        diag
    }
}
// Unit tests for the number-run rule: metadata, thresholds per profile,
// tokenisation edge cases, diagnostic position, and a snapshot fixture.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::{parse_markdown, parse_plain};
use crate::types::{Category, SourceFile};
// Parses `text` as plain text from an anonymous source and runs the rule
// configured for `profile`, always with `Language::En`.
fn lint(text: &str, profile: Profile) -> Vec<Diagnostic> {
let document = parse_plain(text, SourceFile::Anonymous);
NumberRun::for_profile(profile).check(&document, Language::En)
}
// Same as `lint`, but parses `text` as Markdown.
fn lint_md(text: &str, profile: Profile) -> Vec<Diagnostic> {
let document = parse_markdown(text, SourceFile::Anonymous);
NumberRun::for_profile(profile).check(&document, Language::En)
}
// The associated constant and the trait method must report the same id.
#[test]
fn id_is_kebab_case_and_category_prefixed() {
assert_eq!(NumberRun::ID, "structure.number-run");
assert_eq!(
NumberRun::for_profile(Profile::Public).id(),
"structure.number-run"
);
}
// The rule reports itself as experimental.
#[test]
fn ships_as_experimental() {
assert_eq!(
NumberRun::for_profile(Profile::Public).status(),
Status::Experimental
);
}
// The rule is tagged with exactly the dyscalculia condition.
#[test]
fn carries_dyscalculia_condition_tag() {
let rule = NumberRun::for_profile(Profile::Public);
assert_eq!(rule.condition_tags(), &[ConditionTag::Dyscalculia]);
}
// A produced diagnostic is categorised as `Category::Structure`.
#[test]
fn category_is_structure() {
let diags = lint("Counts hit 1, 2, 3, 4, 5 across reviews.", Profile::Public);
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].category(), Category::Structure);
}
// Three numbers is below the Public limit of 4.
#[test]
fn sentence_under_threshold_does_not_trigger() {
let diags = lint("Counts hit 1, 2, and 3 across reviews.", Profile::Public);
assert!(diags.is_empty(), "got {diags:?}");
}
// Exactly four numbers equals the Public limit; the rule fires only above it.
#[test]
fn sentence_at_threshold_does_not_trigger() {
let diags = lint("Counts hit 1, 2, 3, and 4 across reviews.", Profile::Public);
assert!(diags.is_empty(), "got {diags:?}");
}
// Five numbers exceeds the Public limit; the message carries both the count
// and the maximum.
#[test]
fn sentence_over_threshold_triggers() {
let diags = lint("Counts hit 1, 2, 3, 4, 5 across reviews.", Profile::Public);
assert_eq!(diags.len(), 1);
assert_eq!(diags[0].rule_id, NumberRun::ID);
assert_eq!(diags[0].severity, Severity::Warning);
assert!(diags[0].message.contains("5 numeric tokens"));
assert!(diags[0].message.contains("maximum 4"));
}
// A decimal such as `3.14` counts as one token, so five decimals count as five.
#[test]
fn decimal_is_one_token() {
let diags = lint(
"Constants include 3.14, 2.71, 1.41, 1.61, and 0.57 across the table.",
Profile::Public,
);
assert_eq!(diags.len(), 1, "got {diags:?}");
assert!(diags[0].message.contains("5 numeric tokens"));
}
// Three decimals stay below the Public limit.
#[test]
fn three_decimals_under_threshold() {
let diags = lint(
"Constants 3.14, 2.71, and 1.41 are common.",
Profile::Public,
);
assert!(diags.is_empty(), "got {diags:?}");
}
// `-` is not a token separator, so `2026-05-04` contributes three tokens:
// with one more number the sentence sits at the limit (4), with two more it
// exceeds it (5).
#[test]
fn hyphenated_date_is_three_tokens() {
let under = lint(
"Released on 2026-05-04 with 1 patch attached.",
Profile::Public,
);
assert!(under.is_empty(), "got {under:?}");
let over = lint(
"Released on 2026-05-04 with 1 patch and 2 hotfixes.",
Profile::Public,
);
assert_eq!(over.len(), 1);
assert!(over[0].message.contains("5 numeric tokens"));
}
// Numbers inside a fenced Markdown code block must not produce diagnostics.
#[test]
fn fenced_code_block_excluded() {
let md = "Plain prose intro.\n\n\
```\n\
vals = [1, 2, 3, 4, 5, 6, 7, 8]\n\
```\n\n\
Plain prose outro.";
assert!(lint_md(md, Profile::Public).is_empty());
}
// Five numbers exceed the Public limit (4) but not the DevDoc limit (6).
#[test]
fn devdoc_profile_is_more_tolerant() {
let text = "Counts hit 1, 2, 3, 4, 5 across reviews.";
assert!(!lint(text, Profile::Public).is_empty());
assert!(lint(text, Profile::DevDoc).is_empty());
}
// Four numbers pass under Public (limit 4) but exceed the Falc limit (3).
#[test]
fn falc_profile_is_stricter() {
let text = "Counts hit 1, 2, 3, and 4 across reviews.";
assert!(lint(text, Profile::Public).is_empty());
assert!(!lint(text, Profile::Falc).is_empty());
}
// French prose is flagged the same way (the helper still passes `Language::En`).
#[test]
fn french_input_is_caught_too() {
let diags = lint(
"Les comptages atteignent 1, 2, 3, 4, 5 selon les revues.",
Profile::Public,
);
assert_eq!(diags.len(), 1, "got {diags:?}");
}
// "Counts hit " is 11 characters long, so the first digit is expected at
// column 12 of line 1.
#[test]
fn position_points_at_first_numeric_token() {
let diags = lint("Counts hit 1, 2, 3, 4, 5 across reviews.", Profile::Public);
assert_eq!(diags[0].location.line, 1);
assert_eq!(diags[0].location.column, 12);
assert!(diags[0].location.length > 0);
}
// Each offending sentence in a paragraph yields its own diagnostic.
#[test]
fn multiple_offending_sentences_each_fire() {
let text = "First batch was 1, 2, 3, 4, 5 across reviews. \
Second batch was 6, 7, 8, 9, 10 across audits.";
let diags = lint(text, Profile::Public);
assert_eq!(diags.len(), 2, "got {diags:?}");
}
// Five citation years in one sentence count as five numeric tokens.
#[test]
fn citation_salad_fires() {
let diags = lint(
"See work by Smith 2020, Jones 2021, Wei 2022, Park 2023, and Lee 2024.",
Profile::Public,
);
assert_eq!(diags.len(), 1, "got {diags:?}");
}
// A sentence without digits never triggers the rule.
#[test]
fn sentence_with_no_numbers_does_not_trigger() {
let diags = lint(
"The team eventually decided to ship the migration on schedule.",
Profile::Public,
);
assert!(diags.is_empty(), "got {diags:?}");
}
// Pins the per-profile presets: DevDoc 6, Public 4, Falc 3.
#[test]
fn config_thresholds_are_as_documented() {
assert_eq!(Config::for_profile(Profile::DevDoc).max_numbers.get(), 6);
assert_eq!(Config::for_profile(Profile::Public).max_numbers.get(), 4);
assert_eq!(Config::for_profile(Profile::Falc).max_numbers.get(), 3);
}
// Snapshot of the diagnostics for a three-paragraph Markdown fixture; the
// `location.file` field of every entry is redacted to `<input>`.
#[test]
fn snapshot_fixture() {
let text = "Mild paragraph mentions 1 and 2 figures only.\n\n\
Heavy paragraph hits 1, 2, 3, 4, 5 numbers in a row.\n\n\
Plain prose without any digits at all here.";
let document = parse_markdown(text, SourceFile::Anonymous);
let diags = NumberRun::for_profile(Profile::Public).check(&document, Language::En);
insta::assert_yaml_snapshot!(diags, {
".*.location.file" => "<input>",
});
}
}