use std::collections::HashSet;
use std::num::NonZeroU32;
use std::sync::LazyLock;
use crate::config::Profile;
use crate::parser::Document;
use crate::rules::Rule;
use crate::types::{Diagnostic, Language, Location, Severity, SourceFile};
/// Tokens accepted without a definition whenever a built-in baseline is active.
///
/// Besides everyday acronyms (`PDF`, `SMS`, ...) the set contains all-caps
/// requirement words such as `MUST` and `SHOULD` (these look like the RFC 2119
/// keywords), which the upper-case scanner would otherwise report as acronyms.
/// The set is built lazily on first access.
static COMMON_WHITELIST: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Everyday acronyms.
        "PDF",
        "SMS",
        "GPS",
        "ID",
        "OK",
        "FAQ",
        // All-caps requirement words.
        "MUST",
        "SHALL",
        "SHOULD",
        "MAY",
        "NOT",
        "REQUIRED",
        "RECOMMENDED",
        "OPTIONAL",
    ])
});
/// Technical acronyms accepted in addition to [`COMMON_WHITELIST`] when the
/// extended baseline is active.
///
/// The set is built lazily on first access.
static TECH_WHITELIST: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        "URL", "HTML", "CSS", "JSON", "XML", "HTTP", "HTTPS", "UTF", "IO", "API", "CLI", "GUI",
        "OS", "CPU", "RAM", "SSD", "USB", "IDE", "SDK", "CI", "CD",
    ])
});
/// Settings for the unexplained-abbreviation rule.
#[derive(Debug, Clone)]
pub struct Config {
/// Minimum number of alphabetic characters a token must contain before it
/// is considered at all; shorter tokens are never reported.
pub min_length: NonZeroU32,
/// Additional tokens that are never reported. Entries are compared to the
/// token by exact string equality.
pub whitelist: Vec<String>,
// Which built-in whitelist is consulted after the user-supplied one.
// Private: it is only set by `Config::for_profile`.
baseline: BaselineWhitelist,
}
/// Selects which built-in whitelist applies on top of the user-supplied one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BaselineWhitelist {
/// Only `COMMON_WHITELIST` is consulted.
Minimal,
/// Both `COMMON_WHITELIST` and `TECH_WHITELIST` are consulted.
Extended,
/// No built-in whitelist; only user-supplied entries are accepted.
None,
}
impl Config {
    /// Builds the default configuration for `profile`.
    ///
    /// * `DevDoc`: at least 3 letters, common and technical baselines.
    /// * `Public`: at least 2 letters, common baseline only.
    /// * `Falc`: at least 2 letters, no built-in baseline.
    ///
    /// The user whitelist starts out empty.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        let (letters, baseline) = match profile {
            Profile::DevDoc => (3, BaselineWhitelist::Extended),
            Profile::Public => (2, BaselineWhitelist::Minimal),
            Profile::Falc => (2, BaselineWhitelist::None),
        };
        // Every literal above is non-zero, so this cannot fail.
        let min_length = NonZeroU32::new(letters).expect("non-zero literal");
        Self {
            min_length,
            whitelist: Vec::new(),
            baseline,
        }
    }

    /// Returns `true` when `token` must not be reported.
    ///
    /// The user whitelist is checked first (exact match), then the built-in
    /// set(s) selected by the baseline.
    fn is_whitelisted(&self, token: &str) -> bool {
        let user_listed = self.whitelist.iter().any(|entry| entry == token);
        user_listed
            || match self.baseline {
                BaselineWhitelist::None => false,
                BaselineWhitelist::Minimal => COMMON_WHITELIST.contains(token),
                BaselineWhitelist::Extended => {
                    COMMON_WHITELIST.contains(token) || TECH_WHITELIST.contains(token)
                },
            }
    }

    /// Returns the configuration with `extra` appended to the user whitelist.
    ///
    /// Existing entries are kept; no de-duplication is performed.
    #[must_use]
    pub fn with_extra_whitelist(self, extra: Vec<String>) -> Self {
        let Self {
            min_length,
            mut whitelist,
            baseline,
        } = self;
        whitelist.extend(extra);
        Self {
            min_length,
            whitelist,
            baseline,
        }
    }
}
/// Lint rule that reports acronyms which are neither defined in the document
/// nor whitelisted.
#[derive(Debug, Clone)]
pub struct UnexplainedAbbreviation {
// Minimum length and whitelists applied by `check`.
config: Config,
}
impl UnexplainedAbbreviation {
    /// Stable identifier under which this rule reports its diagnostics.
    pub const ID: &'static str = "lexicon.unexplained-abbreviation";

    /// Creates the rule from an explicit configuration.
    #[must_use]
    pub const fn new(config: Config) -> Self {
        Self { config }
    }

    /// Creates the rule with the default configuration of `profile`.
    #[must_use]
    pub fn for_profile(profile: Profile) -> Self {
        let config = Config::for_profile(profile);
        Self { config }
    }
}
impl Rule for UnexplainedAbbreviation {
    /// Returns the stable rule identifier, [`UnexplainedAbbreviation::ID`].
    fn id(&self) -> &'static str {
        Self::ID
    }

    /// Reports every acronym occurrence that is neither defined nor whitelisted.
    ///
    /// Definitions are collected from the whole document first, so a
    /// definition silences occurrences that appear before it as well as after
    /// it. A token is skipped when it has fewer alphabetic characters than
    /// `min_length` (digits do not count), when it is defined, or when it is
    /// whitelisted. The returned diagnostics are sorted by line, then column.
    fn check(&self, document: &Document, _language: Language) -> Vec<Diagnostic> {
        let min = self.config.min_length.get();
        let defined = collect_defined_acronyms(document, min);
        let mut diagnostics = Vec::new();
        for (paragraph, section_title) in document.paragraphs_with_section() {
            for (byte_offset, token) in iter_acronyms(&paragraph.text) {
                let letter_count =
                    u32::try_from(token.chars().filter(|c| c.is_alphabetic()).count())
                        .unwrap_or(u32::MAX);
                if letter_count < min {
                    continue;
                }
                if defined.contains(token) {
                    continue;
                }
                if self.config.is_whitelisted(token) {
                    continue;
                }
                // `byte_offset` is relative to the paragraph; turn it into a
                // line offset within the paragraph plus a 1-based column.
                let (line_offset, column) = line_column_at(&paragraph.text, byte_offset);
                let line = paragraph.start_line.saturating_add(line_offset);
                diagnostics.push(build_diagnostic(
                    &document.source,
                    line,
                    column,
                    token,
                    section_title,
                ));
            }
        }
        diagnostics.sort_by_key(|d| (d.location.line, d.location.column));
        diagnostics
    }
}
/// Gathers every acronym that is defined somewhere in `document`.
///
/// Each paragraph is scanned independently with [`collect_defined_in_text`];
/// acronyms with fewer than `min_letters` alphabetic characters are ignored.
fn collect_defined_acronyms(document: &Document, min_letters: u32) -> HashSet<String> {
    let mut known: HashSet<String> = HashSet::new();
    for entry in document.paragraphs_with_section() {
        // The section title is irrelevant for collecting definitions.
        let (paragraph, _) = entry;
        collect_defined_in_text(paragraph.text.as_str(), min_letters, &mut known);
    }
    known
}
fn collect_defined_in_text(text: &str, min_letters: u32, out: &mut HashSet<String>) {
let acronyms: Vec<(usize, &str)> = iter_acronyms(text)
.filter(|(_, tok)| {
let letters = u32::try_from(tok.chars().filter(|c| c.is_alphabetic()).count())
.unwrap_or(u32::MAX);
letters >= min_letters
})
.collect();
let bytes = text.as_bytes();
for &(start, token) in &acronyms {
let end = start + token.len();
if let Some(paren_open) = next_non_space(bytes, end) {
if bytes.get(paren_open) == Some(&b'(') {
if let Some(paren_close) = find_matching_close(bytes, paren_open + 1) {
let inner = &text[paren_open + 1..paren_close];
if has_two_alpha_words(inner) {
out.insert(token.to_string());
continue;
}
}
}
}
if start > 0 && bytes.get(start - 1) == Some(&b'(') {
if let Some(paren_close) = find_matching_close(bytes, start) {
if paren_close == end {
let before = &text[..start.saturating_sub(1)];
if has_two_alpha_words(trim_to_definition_head(before)) {
out.insert(token.to_string());
}
}
}
}
}
}
/// Returns the index of the first byte at or after `start` that is neither a
/// space nor a tab, or `None` when there is no such byte (including when
/// `start` lies past the end of `bytes`).
fn next_non_space(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = bytes.get(start..)?;
    tail.iter()
        .position(|&byte| byte != b' ' && byte != b'\t')
        .map(|relative| start + relative)
}
/// Finds the `)` that closes a parenthesis assumed to be already open.
///
/// Scanning begins at `start`, which should point just past the opening `(`.
/// Nested pairs are balanced. The search gives up with `None` at the first
/// newline or at the end of `bytes`.
fn find_matching_close(bytes: &[u8], start: usize) -> Option<usize> {
    // One parenthesis is open before the scan begins.
    let mut open_count: i32 = 1;
    for (index, &byte) in bytes.iter().enumerate().skip(start) {
        match byte {
            b'\n' => return None,
            b'(' => open_count += 1,
            b')' => {
                open_count -= 1;
                if open_count == 0 {
                    return Some(index);
                }
            },
            _ => {},
        }
    }
    None
}
/// Returns the part of `before` that follows the last `.`, `!`, `?`, `:` or
/// newline, with surrounding whitespace removed.
///
/// When none of those characters occurs, the whole trimmed input is returned.
fn trim_to_definition_head(before: &str) -> &str {
    const BOUNDARIES: [char; 5] = ['.', '!', '?', ':', '\n'];
    let fragment = match before.rfind(BOUNDARIES) {
        // Every boundary character is one byte wide, so `+ 1` skips it.
        Some(position) => &before[position + 1..],
        None => before,
    };
    fragment.trim()
}
/// Returns `true` when `text` contains at least two whitespace-separated
/// words whose first character is alphabetic.
fn has_two_alpha_words(text: &str) -> bool {
    let mut alpha_words = text
        .split_whitespace()
        .filter(|word| word.starts_with(char::is_alphabetic));
    // A second matching word exists exactly when there are two or more.
    alpha_words.nth(1).is_some()
}
/// Iterates over the acronym candidates in `text`, yielding each one as
/// `(byte_offset, token)`.
///
/// A candidate is a maximal run of ASCII upper-case letters and digits that
///
/// * contains at least one upper-case letter,
/// * is not glued to a word character on either side (a letter or digit in
///   any script, or `_`), so fragments of `WiFi`, `myAPIcall`, `MY_VAR` or
///   `ÉTAT` are skipped, and
/// * is not directly followed by `.` plus a lower-case letter or digit, so
///   file names such as `README.md` are skipped.
///
/// Length and whitelist filtering are left to the caller.
fn iter_acronyms(text: &str) -> impl Iterator<Item = (usize, &str)> {
    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut i = 0;
    std::iter::from_fn(move || {
        while i < len {
            // Advance one whole character at a time so that `i` never lands
            // inside a multi-byte sequence.
            while i < len && !is_acronym_byte(bytes[i]) {
                i += utf8_char_len(bytes[i]);
            }
            if i >= len {
                return None;
            }
            let start = i;
            while i < len && is_acronym_byte(bytes[i]) {
                i += 1;
            }
            // `start` and `i` both sit next to ASCII bytes, so they are char
            // boundaries and the slices below cannot panic. The neighbours
            // are inspected as chars, not bytes, so that non-ASCII letters
            // (for example `É`) also count as part of the surrounding word.
            let glued_before = text[..start].chars().next_back().is_some_and(is_word_char);
            let glued_after = text[i..].chars().next().is_some_and(is_word_char);
            if glued_before || glued_after {
                continue;
            }
            // Looks like a file name with an extension.
            if i + 1 < len && bytes[i] == b'.' && is_extension_byte(bytes[i + 1]) {
                continue;
            }
            let slice = &text[start..i];
            // Pure digit runs are not acronyms.
            if slice.bytes().any(|b| b.is_ascii_uppercase()) {
                return Some((start, slice));
            }
        }
        None
    })
}

/// Returns `true` for bytes that may appear inside an acronym token: ASCII
/// upper-case letters and ASCII digits.
const fn is_acronym_byte(b: u8) -> bool {
    b.is_ascii_uppercase() || b.is_ascii_digit()
}

/// Returns `true` for ASCII identifier bytes: letters, digits and `_`.
const fn is_identifier_byte(b: u8) -> bool {
    b.is_ascii_alphanumeric() || b == b'_'
}

/// Returns `true` when `c` belongs to a word: an ASCII identifier character
/// or an alphanumeric character of any script.
fn is_word_char(c: char) -> bool {
    u8::try_from(c).is_ok_and(is_identifier_byte) || c.is_alphanumeric()
}

/// Returns `true` for bytes that may start a file extension after a `.`:
/// ASCII lower-case letters and ASCII digits.
const fn is_extension_byte(b: u8) -> bool {
    b.is_ascii_lowercase() || b.is_ascii_digit()
}

/// Returns the length in bytes of the UTF-8 sequence introduced by
/// `first_byte`.
///
/// Only meaningful for the first byte of a character; a continuation byte
/// (`0x80..=0xBF`) yields 2.
const fn utf8_char_len(first_byte: u8) -> usize {
    match first_byte {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        _ => 4,
    }
}
/// Converts a byte offset into `text` to a `(line_offset, column)` pair.
///
/// `line_offset` is the number of newlines before the offset (0 for the
/// first line). `column` is 1-based and counted in characters, not bytes,
/// from the start of that line.
///
/// The function never panics: an offset past the end is clamped to
/// `text.len()`, and an offset inside a multi-byte character is moved back to
/// the start of that character. Counts that do not fit in `u32` saturate at
/// `u32::MAX`.
fn line_column_at(text: &str, byte_offset: usize) -> (u32, u32) {
    let mut capped = byte_offset.min(text.len());
    // Slicing at a non-boundary would panic; offset 0 is always a boundary,
    // so this loop terminates.
    while !text.is_char_boundary(capped) {
        capped -= 1;
    }
    let prefix = &text[..capped];
    // A plain byte count is enough here; the lint would suggest an external crate.
    #[allow(clippy::naive_bytecount)]
    let line_offset =
        u32::try_from(prefix.bytes().filter(|&b| b == b'\n').count()).unwrap_or(u32::MAX);
    let current_line_start = prefix.rfind('\n').map_or(0, |pos| pos + 1);
    let column =
        u32::try_from(prefix[current_line_start..].chars().count() + 1).unwrap_or(u32::MAX);
    (line_offset, column)
}
/// Builds the warning reported for one unexplained occurrence of `token`.
///
/// The location points at `line`/`column` in a clone of `source` and spans
/// the token's length in characters. When `section` is present it is attached
/// to the diagnostic through `with_section`.
fn build_diagnostic(
    source: &SourceFile,
    line: u32,
    column: u32,
    token: &str,
    section: Option<&str>,
) -> Diagnostic {
    let span_chars = u32::try_from(token.chars().count()).unwrap_or(u32::MAX);
    let diagnostic = Diagnostic::new(
        UnexplainedAbbreviation::ID,
        Severity::Warning,
        Location::new(source.clone(), line, column, span_chars),
        format!("Acronym \"{token}\" is not defined on first use."),
    );
    if let Some(title) = section {
        diagnostic.with_section(title)
    } else {
        diagnostic
    }
}
// Unit tests for the unexplained-abbreviation rule. Each test parses a short
// plain-text snippet and runs the rule under one profile.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::parse_plain;
use crate::types::SourceFile;
// Parses `text` as an anonymous plain-text document and runs the rule with
// the default configuration of `profile`.
fn lint(text: &str, profile: Profile) -> Vec<Diagnostic> {
let document = parse_plain(text, SourceFile::Anonymous);
UnexplainedAbbreviation::for_profile(profile).check(&document, Language::En)
}
// Pins the public rule identifier.
#[test]
fn id_is_kebab_case() {
assert_eq!(
UnexplainedAbbreviation::ID,
"lexicon.unexplained-abbreviation"
);
}
// Ordinary prose yields no diagnostics.
#[test]
fn prose_without_acronyms_does_not_trigger() {
assert!(lint("A quiet sentence of ordinary words.", Profile::Public).is_empty());
}
// `PDF` is in the common baseline used by the Public profile.
#[test]
fn whitelisted_acronym_does_not_trigger() {
assert!(lint("Open the PDF file.", Profile::Public).is_empty());
}
// An acronym that is neither whitelisted nor defined is reported once.
#[test]
fn unknown_acronym_triggers() {
let diags = lint("Send it through the ZQX adapter.", Profile::Public);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("ZQX"));
}
// DevDoc accepts `HTTP` and `API` through the technical baseline; Falc has
// no baseline and reports both.
#[test]
fn dev_doc_allows_tech_acronyms() {
let text = "Call the HTTP API.";
assert!(lint(text, Profile::DevDoc).is_empty());
let falc = lint(text, Profile::Falc);
assert_eq!(falc.len(), 2);
}
// The Public profile only ships the common baseline, so `HTTP` is reported.
#[test]
fn public_does_not_allow_tech_acronyms_by_default() {
let diags = lint("Check the HTTP status.", Profile::Public);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("HTTP"));
}
// DevDoc requires three letters, so the two-letter `ZQ` is ignored.
#[test]
fn min_length_is_respected() {
let cfg = Config::for_profile(Profile::DevDoc);
assert_eq!(cfg.min_length.get(), 3);
let diags = lint("The ZQ panel is broken.", Profile::DevDoc);
assert!(diags.is_empty());
}
// Upper-case letters inside mixed-case words are not acronyms.
#[test]
fn mixed_case_tokens_are_ignored() {
let diags = lint("Connect via WiFi to the iPhone.", Profile::Public);
assert!(diags.is_empty());
}
// An upper-case run glued to lower-case letters on both sides is skipped.
#[test]
fn embedded_in_word_is_ignored() {
let diags = lint("The myAPIcall helper is deprecated.", Profile::DevDoc);
assert!(diags.is_empty());
}
// Digits extend the token: `IP4` is reported as a whole.
#[test]
fn trailing_digits_count_as_part_of_the_acronym() {
let diags = lint("The IP4 field is missing.", Profile::Public);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("IP4"));
}
// Every occurrence is reported, not only the first one.
#[test]
fn multiple_occurrences_all_flagged() {
let diags = lint("Use ZQX now. Also ZQX later. ZQX again.", Profile::Public);
assert_eq!(diags.len(), 3);
}
// An entry in the user whitelist suppresses the diagnostic.
#[test]
fn user_whitelist_silences_entry() {
let cfg = Config {
whitelist: vec!["ZQX".to_string()],
..Config::for_profile(Profile::Public)
};
let doc = parse_plain("Send it through the ZQX.", SourceFile::Anonymous);
let diags = UnexplainedAbbreviation::new(cfg).check(&doc, Language::En);
assert!(diags.is_empty());
}
// "several words (ACR)" counts as a definition.
#[test]
fn definition_with_expansion_first_silences_rule() {
let text = "The World Wide Web (WWW) is huge. The WWW is everywhere.";
assert!(lint(text, Profile::Public).is_empty());
}
// "ACR (several words)" counts as a definition.
#[test]
fn definition_with_acronym_first_silences_rule() {
let text = "WWW (World Wide Web) powers the internet. WWW is universal.";
assert!(lint(text, Profile::Public).is_empty());
}
// Definitions are collected before reporting, so a later definition also
// covers earlier uses.
#[test]
fn definition_silences_even_prior_occurrences() {
let text = "The WWW is everywhere. Note: WWW (World Wide Web).";
assert!(lint(text, Profile::Public).is_empty());
}
// A one-word parenthetical after the acronym is not an expansion; the test
// expects exactly two diagnostics for this text.
#[test]
fn short_parenthetical_note_is_not_a_definition() {
let text = "The ZQX (TBD). Later the ZQX acts up.";
let diags = lint(text, Profile::Public);
assert_eq!(diags.len(), 2);
}
// Only the sentence fragment directly before "(" is considered as the
// expansion; here that is the single word "Foo", so `ZQX` stays undefined.
#[test]
fn definition_does_not_carry_across_sentence_boundary_on_the_left() {
let text = "A prior sentence. Foo (ZQX). Use ZQX elsewhere.";
let diags = lint(text, Profile::Public);
assert_eq!(diags.len(), 2);
}
// `WCAG` is not part of any built-in baseline.
#[test]
fn baseline_no_longer_ships_accessibility_acronyms() {
let diags = lint("Follow WCAG guidelines.", Profile::DevDoc);
assert_eq!(diags.len(), 1);
assert!(diags[0].message.contains("WCAG"));
}
// The technical baseline still covers `HTTP`, `URL` and `API`.
#[test]
fn baseline_still_ships_web_stack() {
assert!(lint("The HTTP URL hits an API.", Profile::DevDoc).is_empty());
}
// Project-specific acronyms can be added back with `with_extra_whitelist`.
#[test]
fn with_extra_whitelist_restores_project_acronyms() {
let rule = UnexplainedAbbreviation::new(
Config::for_profile(Profile::DevDoc)
.with_extra_whitelist(vec!["WCAG".to_string(), "ARIA".to_string()]),
);
let doc = parse_plain("WCAG and ARIA apply.", SourceFile::Anonymous);
assert!(rule.check(&doc, Language::En).is_empty());
}
// Diagnostics from this rule fall in the lexicon category.
#[test]
fn category_is_lexicon() {
let diags = lint("Fix the ZQX.", Profile::Public);
assert_eq!(diags[0].category(), crate::types::Category::Lexicon);
}
// Snapshot of the full diagnostic output; the file field of each location
// is redacted to "<input>" before comparison.
#[test]
fn snapshot_fixture() {
let text = "Check ZQX today. Then HTTP the API and read the FAQ.";
let diags = lint(text, Profile::Public);
insta::assert_yaml_snapshot!(diags, {
".*.location.file" => "<input>",
});
}
}