lucid-lint 0.2.1

A cognitive accessibility linter for prose. Bilingual EN/FR. CI-native.
Documentation
//! English-specific language data.
//!
//! Used by [`crate::language::detect_language`] for language detection,
//! and by lexical rules to exclude function words from content analysis.

use std::sync::LazyLock;

use std::collections::HashSet;

/// High-frequency English function words (stop-words), all lowercased.
///
/// The list is intentionally short: it sticks to very common function words
/// so that mixed-language text is less likely to be mistaken for English.
/// It is NOT a complete stop-word list of the kind used for information
/// retrieval.
///
/// The set is built lazily on first access. Lookups are exact string
/// matches, so callers are expected to lowercase tokens before querying.
pub static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Every entry appears exactly once. Words that belong to several
    // grammatical categories are filed under the first category that
    // mentions them (e.g. "that" is both a demonstrative and a conjunction
    // and is listed with the pronouns only).
    const WORDS: &[&str] = &[
        // Articles
        "the",
        "a",
        "an",
        // Pronouns, possessives and demonstratives
        "i",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "me",
        "him",
        "her",
        "us",
        "them",
        "my",
        "your",
        "his",
        "its",
        "our",
        "their",
        "this",
        "that",
        "these",
        "those",
        // Auxiliaries and be-forms
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "having",
        "do",
        "does",
        "did",
        "done",
        "will",
        "would",
        "shall",
        "should",
        "can",
        "could",
        "may",
        "might",
        "must",
        // Prepositions
        "of",
        "in",
        "on",
        "at",
        "to",
        "from",
        "for",
        "by",
        "with",
        "about",
        "into",
        "through",
        "during",
        "before",
        "after",
        "above",
        "below",
        "between",
        "under",
        "over",
        // Conjunctions ("that" is already listed with the demonstratives)
        "and",
        "or",
        "but",
        "nor",
        "so",
        "yet",
        "if",
        "because",
        "as",
        "than",
        "while",
        "when",
        "where",
        "whether",
        "although",
        // Common adverbs and fillers
        "not",
        "no",
        "yes",
        "only",
        "just",
        "also",
        "very",
        "too",
        "here",
        "there",
        "now",
        "then",
        // Common contractions (lowercased)
        "don't",
        "doesn't",
        "didn't",
        "won't",
        "wouldn't",
        "can't",
        "couldn't",
        "shouldn't",
        "isn't",
        "aren't",
        "wasn't",
        "weren't",
        "haven't",
        "hasn't",
        "hadn't",
        "i'm",
        "you're",
        "he's",
        "she's",
        "it's",
        "we're",
        "they're",
        "i've",
        "you've",
        "we've",
        "they've",
        "i'll",
        "you'll",
        "he'll",
        "she'll",
        "we'll",
        "they'll",
    ];

    WORDS.iter().copied().collect()
});

/// Built-in list of English weasel words and phrases, all lowercased.
///
/// A weasel word softens a claim without telling the reader anything new,
/// leaving them to guess whether the qualification is significant. See
/// [`RULES.md`](../../RULES.md#weasel-words).
///
/// The list contains both single words and multi-word phrases
/// (`sort of`, `kind of`, `a bit`). It is built lazily on first access.
pub static WEASELS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // Single words
        "some",
        "many",
        "often",
        "just",
        "simply",
        "clearly",
        "obviously",
        "seemingly",
        "arguably",
        "basically",
        "essentially",
        "virtually",
        "various",
        "numerous",
        "rather",
        "quite",
        // Multi-word phrases
        "sort of",
        "kind of",
        "a bit",
    ])
});

/// Standalone English negation words, all lowercased.
///
/// Only free-standing lexical negations are listed here. Contractions that
/// end in `n't` (`don't`, `can't`, `isn't`, `won't`, `doesn't`, …) are not
/// enumerated: the `nested-negation` rule recognises them with a suffix
/// check instead.
pub static NEGATIONS: &[&str] = &[
    // Particles and adverbs
    "not",
    "no",
    "never",
    // Negative pronouns and pro-adverbs (both spellings of "no one")
    "none",
    "nothing",
    "nobody",
    "no-one",
    "noone",
    "nowhere",
    // Correlatives
    "neither",
    "nor",
    // Fused modal negation and negative preposition
    "cannot",
    "without",
];

/// English connectors that open a conditional or conditional-like clause
/// (lowercased).
///
/// The `conditional-stacking` rule counts these to measure how many
/// branching constructs pile up in one sentence, since each one adds to the
/// reader's inferential load. The list combines pure conditionals (`if`,
/// `unless`, `provided`) with temporal conjunctions whose sub-clause behaves
/// like a condition (`when`, `while`).
pub static CONDITIONALS: &[&str] = &[
    // Single-word connectors
    "if",
    "unless",
    "when",
    "whenever",
    "while",
    "until",
    "provided",
    "assuming",
    // Multi-word connectors
    "in case",
    "as long as",
    "as soon as",
    "even if",
    "only if",
];

/// Redundant English intensifiers, all lowercased.
///
/// An intensifier tries to *raise* the confidence of a statement but seldom
/// carries information: "very important" says no more than "important",
/// and a quantified claim is better still. Both plainlanguage.gov
/// (Chapter 4) and the CDC Clear Communication Index list them as
/// plain-language anti-patterns.
///
/// This list is kept disjoint from [`WEASELS`] on purpose. That list holds
/// hedges, which *lower* confidence (`rather`, `quite`, `arguably`). Each
/// word lives in exactly one of the two lists.
pub static INTENSIFIERS: &[&str] = &[
    "very",
    "really",
    "extremely",
    "absolutely",
    "totally",
    "completely",
    "utterly",
    "terribly",
    "awfully",
    "incredibly",
    "highly",
    "deeply",
    "super",
];

/// English cardinal numerals written out as words, all lowercased.
///
/// The `mixed-numeric-format` rule uses this list to spot sentences that
/// combine digits with spelled-out numerals. The CDC Clear Communication
/// Index and plainlanguage.gov both advise presenting numbers consistently.
///
/// The list begins at `two`. `one` is left out deliberately because it is
/// also an indefinite pronoun (`one of the readers`, `no one`), which would
/// make the false-positive rate prohibitive.
pub static SPELLED_NUMERALS: &[&str] = &[
    // Two through ten
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    // Eleven through nineteen
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
    // Multiples of ten
    "twenty",
    "thirty",
    "forty",
    "fifty",
    "sixty",
    "seventy",
    "eighty",
    "ninety",
    // Magnitude words
    "hundred",
    "thousand",
    "million",
    "billion",
];

#[cfg(test)]
mod tests {
    use super::*;

    /// The three English articles must all be recognised as stop-words.
    #[test]
    fn contains_common_articles() {
        for article in ["the", "a", "an"] {
            assert!(
                STOPWORDS.contains(article),
                "expected article {article:?} in STOPWORDS"
            );
        }
    }

    /// A sample of auxiliaries (be, have, modal) must be present.
    #[test]
    fn contains_common_auxiliaries() {
        for auxiliary in ["is", "have", "will"] {
            assert!(
                STOPWORDS.contains(auxiliary),
                "expected auxiliary {auxiliary:?} in STOPWORDS"
            );
        }
    }

    /// Content words must never be treated as stop-words.
    #[test]
    fn does_not_contain_content_words() {
        for content_word in ["accessibility", "linter"] {
            assert!(
                !STOPWORDS.contains(content_word),
                "content word {content_word:?} must not be in STOPWORDS"
            );
        }
    }
}