use unicode_segmentation::UnicodeSegmentation;
use crate::condition::ConditionTag;
use crate::config::Profile;
use crate::language::{en, fr};
use crate::parser::{split_sentences, Document};
use crate::rules::{Rule, Status};
use crate::types::{Diagnostic, Language, Location, Severity, SourceFile};
/// Minimum number of digits a numeric token must contain before it can be
/// reported as a large numeral.
const MIN_DIGITS: u32 = 4;
/// Inclusive lower bound of the value range in which a separator-free
/// four-digit number is treated as a year and skipped.
const YEAR_LOW: u64 = 1000;
/// Inclusive upper bound of the year-shaped value range.
const YEAR_HIGH: u64 = 2999;
/// How many bytes of text before a number are inspected for a reference
/// keyword such as "page" or "figure".
const FIGURE_REF_LOOKBACK_BYTES: usize = 16;
/// Currency symbols; the presence of any of them anchors the whole sentence.
const CURRENCY_SYMBOLS: &[char] = &['$', '€', '£', '¥'];
/// Unit symbols that anchor a sentence.
///
/// Entries are compared case-sensitively against the whole tokens produced by
/// `tokenize_words`, so both spellings of a unit are listed where both are
/// accepted (for example "L" and "l").
const UNIT_TOKENS: &[&str] = &[
"km", "m", "cm", "mm", "µm", "nm", "ha", "m²", "m³", "km²", "km/h", "kg", "g", "mg", "µg", "t",
"L", "l", "mL", "ml", "cL", "cl", "h", "min", "s", "ms", "µs", "°C", "°F", "K", "Hz", "kHz",
"MHz", "GHz", "MB", "GB", "KB", "TB", "PB", "Mo", "Go", "Ko", "To", "mph", "kWh", "Wh", "W",
"kW", "MW", "bps", "Mbps", "Gbps",
];
/// English keywords that mark the number following them as a reference
/// (page, figure, issue, ...) rather than a quantity.
///
/// Consumed by `preceded_by_figure_ref`, which compares them with the
/// lowercased text directly in front of a number.
const FIGURE_REFS_EN: &[&str] = &[
"figure", "fig.", "fig", "page", "pages", "section", "sections", "p.", "pp.", "chapter",
"table", "tab.", "tab", "row", "column", "line", "no.", "n.", "step", "rule", "issue", "PR",
"pr", "#",
];
/// French keywords that mark the number following them as a reference
/// (page, figure, annexe, ...) rather than a quantity.
///
/// Consumed by `preceded_by_figure_ref`, which compares them with the
/// lowercased text directly in front of a number.
const FIGURE_REFS_FR: &[&str] = &[
"figure", "fig.", "fig", "page", "pages", "section", "sections", "p.", "pp.", "chapitre",
"tableau", "tableaux", "ligne", "colonne", "n°", "no.", "no", "étape", "règle", "issue", "PR",
"pr", "#", "annexe", "annexes",
];
/// English magnitude words that are reported as a candidate when they appear
/// in a sentence without an anchor.
///
/// `is_magnitude_word` ASCII-lowercases the scanned word before comparing, so
/// entries must be lowercase.
const MAGNITUDES_EN: &[&str] = &[
"million",
"millions",
"billion",
"billions",
"trillion",
"trillions",
];
/// French magnitude words that are reported as a candidate when they appear
/// in a sentence without an anchor.
///
/// `is_magnitude_word` ASCII-lowercases the scanned word before comparing, so
/// entries must be lowercase.
const MAGNITUDES_FR: &[&str] = &[
"million",
"millions",
"milliard",
"milliards",
"billion",
"billions",
"trillion",
"trillions",
];
/// Tunable threshold for the large-number rule.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Config {
/// Smallest numeric value that is reported; numerals strictly below it are
/// ignored.
pub min_value: u64,
}
impl Config {
    /// Returns the threshold configuration associated with `profile`.
    ///
    /// The stricter the profile, the lower the value at which a numeral starts
    /// being reported: `Falc` uses 1 000, `Public` 10 000 and `DevDoc` 100 000.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        Self {
            min_value: match profile {
                Profile::Falc => 1_000,
                Profile::Public => 10_000,
                Profile::DevDoc => 100_000,
            },
        }
    }
}
/// Rule that reports sentences containing a large numeral or a magnitude word
/// with no anchor (unit, percentage, currency symbol, ratio, or comparison
/// phrase) in the same sentence.
#[derive(Debug, Clone, Copy)]
pub struct LargeNumberUnanchored {
// Threshold below which numerals are not reported.
config: Config,
}
impl LargeNumberUnanchored {
#[must_use]
pub const fn new(config: Config) -> Self {
Self { config }
}
#[must_use]
pub fn for_profile(profile: Profile) -> Self {
Self::new(Config::for_profile(profile))
}
pub const ID: &'static str = "readability.large-number-unanchored";
}
impl Rule for LargeNumberUnanchored {
    /// Returns the stable rule identifier.
    fn id(&self) -> &'static str {
        Self::ID
    }

    /// Scans every sentence of `document` and emits at most one diagnostic per
    /// sentence.
    ///
    /// A sentence is skipped entirely when it contains any anchor (see
    /// `has_anchor`). Otherwise the first large numeral or magnitude word found
    /// by `first_unanchored_candidate` is reported. Languages without lookup
    /// tables produce no diagnostics.
    fn check(&self, document: &Document, language: Language) -> Vec<Diagnostic> {
        let Some(lookups) = LanguageLookups::for_language(language) else {
            return Vec::new();
        };
        let mut diags = Vec::new();
        for (paragraph, section_title) in document.paragraphs_with_section() {
            // The trailing `1` is presumably the starting column of the
            // paragraph — TODO confirm against `split_sentences`.
            let sentences = split_sentences(&paragraph.text, paragraph.start_line, 1);
            for sentence in sentences {
                // One anchor anywhere in the sentence vouches for every number in it.
                if has_anchor(&sentence.text, &lookups) {
                    continue;
                }
                let Some(candidate) =
                    first_unanchored_candidate(&sentence.text, self.config.min_value, &lookups)
                else {
                    continue;
                };
                diags.push(build_diagnostic(
                    &document.source,
                    &sentence.text,
                    sentence.line,
                    sentence.column,
                    candidate,
                    section_title,
                ));
            }
        }
        diags
    }

    /// Condition tags this rule is associated with.
    fn condition_tags(&self) -> &'static [ConditionTag] {
        &[ConditionTag::Dyscalculia, ConditionTag::General]
    }

    /// The rule currently ships as experimental.
    fn status(&self) -> Status {
        Status::Experimental
    }
}
/// Per-language lookup tables used while scanning a sentence.
struct LanguageLookups {
// Lowercase phrases whose presence anchors the sentence (matched as
// substrings of the lowercased sentence).
comparators: &'static [&'static str],
// Magnitude words ("million", ...) reported when unanchored.
magnitudes: &'static [&'static str],
// Keywords that turn the following number into a reference (page, figure, ...).
figure_refs: &'static [&'static str],
// Word or phrase that introduces a ratio when followed by a digit
// ("out of" / "sur").
ratio_word: &'static str,
}
impl LanguageLookups {
    /// Selects the lookup tables for `language`.
    ///
    /// Returns `None` for `Language::Unknown`, for which no tables exist.
    fn for_language(language: Language) -> Option<Self> {
        let lookups = match language {
            Language::Unknown => return None,
            Language::En => Self {
                comparators: en::ANCHOR_COMPARATORS_EN,
                magnitudes: MAGNITUDES_EN,
                figure_refs: FIGURE_REFS_EN,
                ratio_word: "out of",
            },
            Language::Fr => Self {
                comparators: fr::ANCHOR_COMPARATORS_FR,
                magnitudes: MAGNITUDES_FR,
                figure_refs: FIGURE_REFS_FR,
                ratio_word: "sur",
            },
        };
        Some(lookups)
    }
}
/// The first reportable token found in a sentence.
#[derive(Debug, Clone, Copy)]
struct Candidate {
// Byte offset of the token's first byte within the sentence text.
offset: usize,
// What kind of token was found.
kind: CandidateKind,
}
/// Distinguishes the two kinds of reportable token.
#[derive(Debug, Clone, Copy)]
enum CandidateKind {
// A numeral written in digits: `digits` is the number of digit characters,
// `value` the number they spell once separators are ignored (saturating).
LargeNumeral { digits: u32, value: u64 },
// A magnitude word such as "million" taken from the language's list.
Magnitude,
}
/// Tells whether `sentence` contains anything that grounds its numbers.
///
/// An anchor is any of: a percent sign, a currency symbol, one of the
/// language's comparator phrases (case-insensitive substring match), a unit
/// token, or a ratio pattern. One anchor is enough for the whole sentence.
fn has_anchor(sentence: &str, lookups: &LanguageLookups) -> bool {
    let has_symbol = sentence
        .chars()
        .any(|c| c == '%' || CURRENCY_SYMBOLS.contains(&c));
    if has_symbol {
        return true;
    }

    // Comparator phrases and the ratio word are matched case-insensitively;
    // unit tokens are case-sensitive and therefore use the original text.
    let lowered = sentence.to_lowercase();
    lookups
        .comparators
        .iter()
        .any(|&phrase| lowered.contains(phrase))
        || has_unit_token(sentence)
        || has_ratio_pattern(&lowered, lookups.ratio_word)
}
/// Tells whether any token of `sentence` is a known unit symbol.
///
/// Tokens come from `tokenize_words`, which does not split on `.` or `:`, so a
/// unit that closes a sentence or precedes a colon arrives as `"L."` or
/// `"km:"`. Such trailing punctuation is stripped before the case-sensitive
/// lookup in `UNIT_TOKENS`; otherwise "… holds 4800000000 L." would be
/// reported as unanchored.
fn has_unit_token(sentence: &str) -> bool {
    tokenize_words(sentence).any(|token| {
        let bare = token.trim_end_matches(|c: char| matches!(c, '.' | ':'));
        UNIT_TOKENS.contains(&token) || UNIT_TOKENS.contains(&bare)
    })
}
/// Tells whether the (already lowercased) sentence expresses a ratio.
///
/// Two shapes count: the language's ratio word followed by a digit
/// ("out of 10"), or a slash with a digit on both sides, spaces allowed
/// ("3/4", "3 / 4").
fn has_ratio_pattern(sentence_lower: &str, ratio_word: &str) -> bool {
    if find_word_then_digit(sentence_lower, ratio_word) {
        return true;
    }
    let raw = sentence_lower.as_bytes();
    raw.iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'/')
        .any(|(slash, _)| {
            preceding_nonspace_digit(raw, slash) && following_nonspace_digit(raw, slash)
        })
}
/// Tells whether the nearest byte before `index` that is not an ASCII space is
/// an ASCII digit.
///
/// Returns `false` when only spaces (or nothing) precede `index`.
fn preceding_nonspace_digit(bytes: &[u8], index: usize) -> bool {
    bytes[..index]
        .iter()
        .rev()
        .find(|&&byte| byte != b' ')
        .is_some_and(|byte| byte.is_ascii_digit())
}
/// Tells whether the nearest byte after `index` that is not an ASCII space is
/// an ASCII digit.
///
/// Returns `false` when only spaces (or nothing) follow `index`.
fn following_nonspace_digit(bytes: &[u8], index: usize) -> bool {
    bytes
        .iter()
        .skip(index + 1)
        .find(|&&byte| byte != b' ')
        .is_some_and(|byte| byte.is_ascii_digit())
}
/// Tells whether `word` occurs in `sentence_lower` as a ratio introducer.
///
/// An occurrence counts when it
/// * starts the sentence or follows ASCII whitespace or a comma,
/// * is followed by at least one ASCII space, and
/// * the first non-space byte after those spaces is an ASCII digit.
///
/// Occurrences are examined left to right without overlap. An empty `word`
/// never matches: searching for it would find an occurrence at every position
/// without advancing.
fn find_word_then_digit(sentence_lower: &str, word: &str) -> bool {
    if word.is_empty() {
        return false;
    }
    let bytes = sentence_lower.as_bytes();
    sentence_lower.match_indices(word).any(|(start, matched)| {
        let end = start + matched.len();
        let prev_ok = start == 0
            || bytes[start - 1].is_ascii_whitespace()
            || bytes[start - 1] == b',';
        // Exactly a plain space must separate the word from what follows.
        if !prev_ok || bytes.get(end) != Some(&b' ') {
            return false;
        }
        // `end` is in bounds here, so `end + 1` is at most `bytes.len()`.
        bytes[end + 1..]
            .iter()
            .find(|&&byte| byte != b' ')
            .is_some_and(|byte| byte.is_ascii_digit())
    })
}
/// Walks `sentence` left to right and returns the first reportable token.
///
/// * A run of digits (with optional separators) is reported as a large numeral
///   when it has at least `MIN_DIGITS` digits and a value of at least
///   `min_value`, unless it is exempt: directly followed by an ASCII letter
///   (for example an ordinal such as "12345th"), shaped like a year (four
///   digits, no separator, within `YEAR_LOW..=YEAR_HIGH`), or preceded by a
///   reference keyword.
/// * An ASCII word is reported when it is one of the language's magnitude
///   words.
///
/// Returns `None` when the sentence holds neither.
fn first_unanchored_candidate(
    sentence: &str,
    min_value: u64,
    lookups: &LanguageLookups,
) -> Option<Candidate> {
    let bytes = sentence.as_bytes();
    let mut cursor = 0;
    while let Some(&lead) = bytes.get(cursor) {
        if lead.is_ascii_alphabetic() {
            let stop = scan_word_end(bytes, cursor);
            if is_magnitude_word(&sentence[cursor..stop], lookups.magnitudes) {
                return Some(Candidate {
                    offset: cursor,
                    kind: CandidateKind::Magnitude,
                });
            }
            cursor = stop;
            continue;
        }
        if !lead.is_ascii_digit() {
            // Step over one whole character so the cursor stays usable as a
            // string index.
            cursor += utf8_char_len(lead);
            continue;
        }

        let token = scan_numeric_token(sentence, cursor);
        let glued_to_letter = bytes.get(token.end).is_some_and(u8::is_ascii_alphabetic);
        let looks_like_year = token.digits == 4
            && !token.had_separator
            && (YEAR_LOW..=YEAR_HIGH).contains(&token.value);
        // Evaluated lazily, in this order: the reference look-back only runs
        // for tokens that survived the two cheaper exemptions.
        let exempt = glued_to_letter
            || looks_like_year
            || preceded_by_figure_ref(sentence, cursor, lookups.figure_refs);
        if !exempt && token.digits >= MIN_DIGITS && token.value >= min_value {
            return Some(Candidate {
                offset: cursor,
                kind: CandidateKind::LargeNumeral {
                    digits: token.digits,
                    value: token.value,
                },
            });
        }
        cursor = token.end;
    }
    None
}
/// Result of scanning one numeral, as produced by `scan_numeric_token`.
struct NumericToken {
// Byte offset (within the sentence) just past the last digit of the token.
end: usize,
// Number of digit characters in the token; separators are not counted.
digits: u32,
// Value spelled by the digits with separators ignored; saturates at `u64::MAX`.
value: u64,
// True when at least one separator sat between two digits of the token.
had_separator: bool,
}
/// Scans the numeral that starts at byte offset `start` of `sentence`.
///
/// Digits are accumulated into a saturating value. A single separator
/// character (see `is_numeric_separator`) is absorbed only when it sits
/// between two digits; a separator followed by anything else ends the token
/// and is not part of it. `end` always points just past the last digit.
fn scan_numeric_token(sentence: &str, start: usize) -> NumericToken {
    let mut token = NumericToken {
        end: start,
        digits: 0,
        value: 0,
        had_separator: false,
    };
    let mut rest = sentence[start..].char_indices().peekable();
    let mut prev_was_digit = false;

    while let Some(&(rel, ch)) = rest.peek() {
        if ch.is_ascii_digit() {
            let digit = u64::from(ch as u8 - b'0');
            token.digits = token.digits.saturating_add(1);
            token.value = token.value.saturating_mul(10).saturating_add(digit);
            token.end = start + rel + ch.len_utf8();
            prev_was_digit = true;
            rest.next();
            continue;
        }
        if !(prev_was_digit && is_numeric_separator(ch)) {
            break;
        }
        // Tentatively consume the separator; it only counts if a digit follows.
        rest.next();
        match rest.peek() {
            Some(&(_, next)) if next.is_ascii_digit() => {
                token.had_separator = true;
                prev_was_digit = false;
            }
            _ => break,
        }
    }
    token
}
/// Tells whether `ch` may separate digit groups inside a numeral: comma,
/// full stop, space, no-break space, thin space, or narrow no-break space.
fn is_numeric_separator(ch: char) -> bool {
    const SEPARATORS: [char; 6] = [',', '.', ' ', '\u{a0}', '\u{2009}', '\u{202f}'];
    SEPARATORS.contains(&ch)
}
/// Maps a byte to the number of bytes to step over when it is taken as the
/// first byte of a UTF-8 sequence.
///
/// Everything below `0xC0` (ASCII and continuation bytes) yields 1.
fn utf8_char_len(leading: u8) -> usize {
    match leading {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
/// Returns the offset just past the run of ASCII letters and apostrophes that
/// begins at `start`.
///
/// Returns `start` unchanged when no such byte is found there, including when
/// `start` lies beyond the slice.
fn scan_word_end(bytes: &[u8], start: usize) -> usize {
    let run = bytes
        .iter()
        .skip(start)
        .take_while(|&&byte| byte.is_ascii_alphabetic() || byte == b'\'')
        .count();
    start + run
}
/// Tells whether `word`, once ASCII-lowercased, equals one of `magnitudes`.
///
/// The entries themselves are compared as given, so they are expected to be
/// lowercase.
fn is_magnitude_word(word: &str, magnitudes: &[&str]) -> bool {
    let needle = word.to_ascii_lowercase();
    magnitudes.contains(&needle.as_str())
}
/// Tells whether the number starting at byte `digit_offset` is introduced by a
/// reference keyword ("page 12", "fig. 3", "n° 7", "#1234"), in which case it
/// names something rather than measuring something.
///
/// Only the last `FIGURE_REF_LOOKBACK_BYTES` bytes before the number are
/// inspected. The comparison is case-insensitive on the sentence side and
/// ignores whitespace and the characters `.`, `:`, `°`, `#` between keyword
/// and number. A keyword must not be the tail of a longer word ("casino 5"
/// does not match "no"). A `#` directly before the number (spaces allowed)
/// always counts.
///
/// Returns `false` when `digit_offset` is not a valid character boundary of
/// `sentence`.
fn preceded_by_figure_ref(sentence: &str, digit_offset: usize, figure_refs: &[&str]) -> bool {
    /// Characters allowed between a keyword and the number it introduces.
    fn is_ref_trailer(c: char) -> bool {
        c.is_whitespace() || matches!(c, '.' | ':' | '°' | '#')
    }

    // A fixed byte look-back can land inside a multi-byte character (common in
    // French text); move forward to the next boundary instead of panicking on
    // the slice below.
    let mut lookback_start = digit_offset.saturating_sub(FIGURE_REF_LOOKBACK_BYTES);
    while lookback_start < digit_offset && !sentence.is_char_boundary(lookback_start) {
        lookback_start += 1;
    }
    let Some(prefix) = sentence.get(lookback_start..digit_offset) else {
        return false;
    };

    if prefix.trim_end().ends_with('#') {
        return true;
    }

    let lower = prefix.to_lowercase();
    let trimmed = lower.trim_end_matches(is_ref_trailer);
    figure_refs.iter().any(|keyword| {
        // The prefix has lost its trailing punctuation, so keywords written
        // with it ("no.", "p.", "n°") are compared without it too; otherwise
        // they could never match.
        let needle = keyword.trim_end_matches(is_ref_trailer);
        if needle.is_empty() {
            // "#" reduces to nothing and is handled by the check above.
            return false;
        }
        trimmed.strip_suffix(needle).is_some_and(|before| {
            // Whole-word check on characters, so an accented letter in front
            // of the keyword ("dépage") is recognised as part of the word.
            !before.chars().next_back().is_some_and(char::is_alphabetic)
        })
    })
}
/// Splits `sentence` into non-empty tokens.
///
/// Tokens are separated by whitespace and by the punctuation characters
/// `, ; ! ? ( ) [ ]`. Other punctuation, such as `.` or `:`, stays attached to
/// the token.
fn tokenize_words(sentence: &str) -> impl Iterator<Item = &str> + '_ {
    const BREAKING_PUNCTUATION: [char; 8] = [',', ';', '!', '?', '(', ')', '[', ']'];
    sentence
        .split(|c: char| BREAKING_PUNCTUATION.contains(&c) || c.is_whitespace())
        .filter(|piece| !piece.is_empty())
}
/// Builds the warning for `candidate` found in `sentence_text`.
///
/// The column is the sentence's column advanced by the number of grapheme
/// clusters that precede the candidate. The reported length is the grapheme
/// count of the whole sentence.
/// NOTE(review): the length covers the full sentence although the column
/// points at the candidate — confirm this is the intended span.
/// When `section` is given, the diagnostic is tagged with that section title.
fn build_diagnostic(
    source: &SourceFile,
    sentence_text: &str,
    sentence_line: u32,
    sentence_column: u32,
    candidate: Candidate,
    section: Option<&str>,
) -> Diagnostic {
    // Grapheme count clamped into `u32`.
    let grapheme_len =
        |text: &str| u32::try_from(text.graphemes(true).count()).unwrap_or(u32::MAX);

    let column = sentence_column.saturating_add(grapheme_len(&sentence_text[..candidate.offset]));
    let length = grapheme_len(sentence_text);
    let location = Location::new(source.clone(), sentence_line, column, length);

    let message = match candidate.kind {
        CandidateKind::Magnitude => String::from(
            "Magnitude word appears with no anchor in this sentence (no unit, percentage, ratio, \
             or comparison phrase). plain-language guidance recommends pairing magnitude words \
             with a unit or a comparison the reader can ground.",
        ),
        CandidateKind::LargeNumeral { digits, value } => format!(
            "Large numeral ({digits}-digit, value ≈ {value}) appears with no anchor in this sentence \
             (no unit, percentage, ratio, or comparison phrase). plain-language guidance recommends \
             giving large numbers a comparison or denominator the reader can ground."
        ),
    };

    let diagnostic = Diagnostic::new(
        LargeNumberUnanchored::ID,
        Severity::Warning,
        location,
        message,
    );
    if let Some(title) = section {
        diagnostic.with_section(title)
    } else {
        diagnostic
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::{parse_markdown, parse_plain};
    use crate::types::Category;

    /// Lints plain text with the English lookup tables.
    fn lint_en(text: &str, profile: Profile) -> Vec<Diagnostic> {
        let document = parse_plain(text, SourceFile::Anonymous);
        LargeNumberUnanchored::for_profile(profile).check(&document, Language::En)
    }

    /// Lints plain text with the French lookup tables.
    fn lint_fr(text: &str, profile: Profile) -> Vec<Diagnostic> {
        let document = parse_plain(text, SourceFile::Anonymous);
        LargeNumberUnanchored::for_profile(profile).check(&document, Language::Fr)
    }

    /// Lints Markdown with the English lookup tables.
    fn lint_md_en(text: &str, profile: Profile) -> Vec<Diagnostic> {
        let document = parse_markdown(text, SourceFile::Anonymous);
        LargeNumberUnanchored::for_profile(profile).check(&document, Language::En)
    }

    #[test]
    fn id_is_kebab_case_and_category_prefixed() {
        assert_eq!(
            LargeNumberUnanchored::ID,
            "readability.large-number-unanchored"
        );
    }

    #[test]
    fn ships_as_experimental() {
        assert_eq!(
            LargeNumberUnanchored::for_profile(Profile::Public).status(),
            Status::Experimental
        );
    }

    #[test]
    fn carries_dyscalculia_and_general_condition_tags() {
        let rule = LargeNumberUnanchored::for_profile(Profile::Public);
        assert_eq!(
            rule.condition_tags(),
            &[ConditionTag::Dyscalculia, ConditionTag::General]
        );
    }

    #[test]
    fn category_is_readability() {
        let diags = lint_en(
            "Le projet a couté 1234567 selon le rapport final.",
            Profile::Public,
        );
        assert_eq!(diags.len(), 1, "got {diags:?}");
        assert_eq!(diags[0].category(), Category::Readability);
    }

    #[test]
    fn unknown_language_skips_silently() {
        let document = parse_plain(
            "The budget reached 4,800,000,000 last year.",
            SourceFile::Anonymous,
        );
        let diags =
            LargeNumberUnanchored::for_profile(Profile::Public).check(&document, Language::Unknown);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn config_thresholds_are_as_documented() {
        assert_eq!(Config::for_profile(Profile::DevDoc).min_value, 100_000);
        assert_eq!(Config::for_profile(Profile::Public).min_value, 10_000);
        assert_eq!(Config::for_profile(Profile::Falc).min_value, 1_000);
    }

    #[test]
    fn large_unanchored_numeral_fires() {
        let diags = lint_en("The budget reached 4800000000 last year.", Profile::Public);
        assert_eq!(diags.len(), 1, "got {diags:?}");
        assert_eq!(diags[0].rule_id, LargeNumberUnanchored::ID);
        assert_eq!(diags[0].severity, Severity::Warning);
        assert!(diags[0].message.contains("Large numeral"));
    }

    #[test]
    fn small_numeral_under_threshold_does_not_fire() {
        let diags = lint_en("The room held 9999 attendees comfortably.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn three_digit_number_does_not_fire() {
        let diags = lint_en("The team filed 250 issues last sprint.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn en_comma_thousands_separator_collapses() {
        let diags = lint_en("The fund crossed 1,000,000 last quarter.", Profile::Public);
        assert_eq!(diags.len(), 1, "got {diags:?}");
    }

    #[test]
    fn fr_nbsp_thousands_separator_collapses() {
        let diags = lint_fr(
            "Le fonds a dépassé 1\u{a0}000\u{a0}000 le trimestre dernier.",
            Profile::Public,
        );
        assert_eq!(diags.len(), 1, "got {diags:?}");
    }

    #[test]
    fn four_digit_year_is_not_a_candidate() {
        let diags = lint_en("In 2024 the team shipped the migration.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn year_with_thousands_separator_still_fires_when_large() {
        // Under the Public threshold 1,066 is simply too small to report.
        let diags = lint_en("About 1,066 troops landed near Hastings.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");

        // The separator stops "1,066" from being read as a year, so once the
        // threshold is low enough (Falc: 1 000) the numeral is reported. The
        // surrounding words are those of `falc_profile_is_stricter`, which are
        // known not to anchor the sentence.
        let text = "The cluster ran 1,066 jobs across the weekend window.";
        assert!(lint_en(text, Profile::Public).is_empty());
        let diags = lint_en(text, Profile::Falc);
        assert_eq!(diags.len(), 1, "got {diags:?}");
    }

    #[test]
    fn ordinal_is_not_a_candidate() {
        let diags = lint_en("Look at the 12345th attendee on the list.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn page_reference_is_skipped() {
        let diags = lint_en("See page 12345 in the appendix.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn figure_reference_is_skipped() {
        let diags = lint_en("See figure 12345 in chapter 2.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn issue_hash_reference_is_skipped() {
        let diags = lint_en("Closes #12345 in the tracker.", Profile::Public);
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn percentage_anchors_the_whole_sentence() {
        let diags = lint_en(
            "The fund grew 4800000000 dollars, which is 12% of GDP.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn currency_symbol_anchors() {
        let diags = lint_en(
            "The fund crossed $4800000000 last quarter.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn unit_token_anchors() {
        let diags = lint_en(
            "The reservoir holds 4800000000 L of fresh water.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn ratio_pattern_out_of_anchors_en() {
        let diags = lint_en(
            "About 4800000000 out of 7900000000 people had access.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn ratio_pattern_sur_anchors_fr() {
        let diags = lint_fr(
            "Environ 4800000000 sur 7900000000 personnes y ont accès.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn comparator_phrase_anchors_en() {
        let diags = lint_en(
            "The fund crossed 4800000000 last quarter, roughly the size of a small economy.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn comparator_phrase_anchors_fr() {
        let diags = lint_fr(
            "Le fonds a atteint 4800000000 le trimestre dernier, soit environ le PIB d'un petit pays.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn magnitude_word_alone_fires() {
        let diags = lint_en(
            "The proposal mentioned several billion in vague spending.",
            Profile::Public,
        );
        assert_eq!(diags.len(), 1, "got {diags:?}");
        assert!(diags[0].message.contains("Magnitude word"));
    }

    #[test]
    fn fr_milliard_alone_fires() {
        let diags = lint_fr(
            "La proposition mentionne plusieurs milliards de dépenses vagues.",
            Profile::Public,
        );
        assert_eq!(diags.len(), 1, "got {diags:?}");
    }

    #[test]
    fn magnitude_word_with_unit_anchored() {
        let diags = lint_en(
            "The plan budgets several million kg of feedstock for the year.",
            Profile::Public,
        );
        assert!(diags.is_empty(), "got {diags:?}");
    }

    #[test]
    fn devdoc_profile_is_more_tolerant() {
        let text = "The cluster ran 50000 jobs across the weekend window.";
        assert!(!lint_en(text, Profile::Public).is_empty());
        assert!(lint_en(text, Profile::DevDoc).is_empty());
    }

    #[test]
    fn falc_profile_is_stricter() {
        let text = "The cluster ran 5000 jobs across the weekend window.";
        assert!(lint_en(text, Profile::Public).is_empty());
        assert!(!lint_en(text, Profile::Falc).is_empty());
    }

    #[test]
    fn fenced_code_block_excluded() {
        let md = "Plain prose intro.\n\n\
                  ```\n\
                  let big = 4800000000;\n\
                  ```\n\n\
                  Plain prose outro.";
        assert!(lint_md_en(md, Profile::Public).is_empty());
    }

    #[test]
    fn one_diagnostic_per_sentence() {
        let text = "The fund crossed 4800000000 last quarter. \
                    The reserve grew 5500000000 the next year.";
        let diags = lint_en(text, Profile::Public);
        assert_eq!(diags.len(), 2, "got {diags:?}");
    }

    #[test]
    fn position_points_at_first_candidate_token() {
        let diags = lint_en("The fund crossed 4800000000 last quarter.", Profile::Public);
        assert_eq!(diags.len(), 1);
        assert_eq!(diags[0].location.line, 1);
        assert_eq!(diags[0].location.column, 18);
    }

    #[test]
    fn snapshot_fixture() {
        let text = "Mild paragraph quotes 250 attendees only.\n\n\
                    Heavy paragraph cites 4800000000 in vague spending across regions.\n\n\
                    Plain prose without any large numbers at all here.";
        let document = parse_markdown(text, SourceFile::Anonymous);
        let diags =
            LargeNumberUnanchored::for_profile(Profile::Public).check(&document, Language::En);
        insta::assert_yaml_snapshot!(diags, {
            ".*.location.file" => "<input>",
        });
    }
}