// lucid-lint 0.2.5

//! English-specific language data.
//!
//! Used by [`crate::language::detect_language`] for language detection,
//! and by lexical rules to exclude function words from content analysis.

use std::sync::LazyLock;

use std::collections::HashSet;

/// Common English stop-words (function words), stored lowercased.
///
/// The set is deliberately limited to highly frequent function words so that
/// mixed-language texts are less likely to be misdetected as English. It is
/// NOT a full stop-word list suitable for information retrieval.
///
/// Every word appears exactly once in the literal below, filed under the
/// first category it belongs to (for example `that` is both a demonstrative
/// and a conjunction, and is listed with the pronouns).
///
/// Contractions are spelled with the ASCII apostrophe (`'`); a token written
/// with a typographic apostrophe will not match as-is.
/// NOTE(review): confirm that callers normalize apostrophes and case before
/// looking words up.
pub static STOPWORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    HashSet::from([
        // Articles
        "the",
        "a",
        "an",
        // Pronouns, possessives and demonstratives
        "i",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "me",
        "him",
        "her",
        "us",
        "them",
        "my",
        "your",
        "his",
        "its",
        "our",
        "their",
        "this",
        "that",
        "these",
        "those",
        // Auxiliaries and be-forms
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "having",
        "do",
        "does",
        "did",
        "done",
        "will",
        "would",
        "shall",
        "should",
        "can",
        "could",
        "may",
        "might",
        "must",
        // Prepositions
        "of",
        "in",
        "on",
        "at",
        "to",
        "from",
        "for",
        "by",
        "with",
        "about",
        "into",
        "through",
        "during",
        "before",
        "after",
        "above",
        "below",
        "between",
        "under",
        "over",
        // Conjunctions (`that` is already listed with the demonstratives)
        "and",
        "or",
        "but",
        "nor",
        "so",
        "yet",
        "if",
        "because",
        "as",
        "than",
        "while",
        "when",
        "where",
        "whether",
        "although",
        // Common adverbs and fillers
        "not",
        "no",
        "yes",
        "only",
        "just",
        "also",
        "very",
        "too",
        "here",
        "there",
        "now",
        "then",
        // Common contractions (lowercased, ASCII apostrophe)
        "don't",
        "doesn't",
        "didn't",
        "won't",
        "wouldn't",
        "can't",
        "couldn't",
        "shouldn't",
        "isn't",
        "aren't",
        "wasn't",
        "weren't",
        "haven't",
        "hasn't",
        "hadn't",
        "i'm",
        "you're",
        "he's",
        "she's",
        "it's",
        "we're",
        "they're",
        "i've",
        "you've",
        "we've",
        "they've",
        "i'll",
        "you'll",
        "he'll",
        "she'll",
        "we'll",
        "they'll",
    ])
});

/// Built-in list of English weasel words and phrases, all lowercased.
///
/// A weasel word dilutes a claim without telling the reader anything: it is
/// left to the reader to work out, unaided, whether the qualification is
/// significant. See [`RULES.md`](../../RULES.md#weasel-words).
///
/// Entries are either single words or short multi-word phrases.
pub static WEASELS: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // Vague quantity and frequency
        "some",
        "many",
        "often",
        // Minimizers
        "just",
        "simply",
        // Appeals to obviousness
        "clearly",
        "obviously",
        // Hedged assertions
        "seemingly",
        "arguably",
        // Approximators
        "basically",
        "essentially",
        "virtually",
        // Unspecified amounts
        "various",
        "numerous",
        // Degree hedges
        "rather",
        "quite",
        // Multi-word hedges
        "sort of",
        "kind of",
        "a bit",
    ])
});

/// Lowercased English word forms that mark a negation.
///
/// Only standalone lexical negations are enumerated here. Contractions that
/// end in `n't` (`don't`, `can't`, `isn't`, `won't`, `doesn't`, …) are not
/// listed: the `nested-negation` rule recognizes them through a suffix check
/// instead.
pub static NEGATIONS: &[&str] = &[
    // Negative particles
    "not",
    "no",
    // Negative adverbs and pronouns
    "never",
    "none",
    "nothing",
    "nobody",
    "no-one",
    "noone",
    "nowhere",
    // Negative correlatives
    "neither",
    "nor",
    // Inherently negative modal and preposition
    "cannot",
    "without",
];

/// Lowercased English connectors that open a conditional or
/// temporal-conditional clause.
///
/// The `conditional-stacking` rule counts these to measure how many
/// branching constructs pile up in one sentence and add to its inferential
/// load. The list combines pure conditionals (`if`, `unless`, `provided`)
/// with temporal conjunctions whose sub-clause behaves like a condition
/// (`when`, `while`). Entries may be single words or multi-word phrases.
pub static CONDITIONALS: &[&str] = &[
    // Pure conditionals
    "if",
    "unless",
    // Temporal conjunctions with a conditional reading
    "when",
    "whenever",
    "while",
    "until",
    // Participle-based conditionals
    "provided",
    "assuming",
    // Multi-word connectors
    "in case",
    "as long as",
    "as soon as",
    "even if",
    "only if",
];

/// Lowercased English intensifiers that add emphasis but little content.
///
/// An intensifier attempts to *raise* the confidence of a statement yet
/// seldom carries information: "very important" says no more than
/// "important", and a quantified claim would say more than either.
/// plainlanguage.gov (Chapter 4) and the CDC Clear Communication Index both
/// treat these words as plain-language anti-patterns.
///
/// This list is intentionally disjoint from [`WEASELS`], which holds hedges
/// that *lower* confidence (`rather`, `quite`, `arguably`). Any given word
/// lives in exactly one of the two lists.
pub static INTENSIFIERS: &[&str] = &[
    // General-purpose boosters
    "very",
    "really",
    "extremely",
    // Maximizers
    "absolutely",
    "totally",
    "completely",
    "utterly",
    // Emotive boosters
    "terribly",
    "awfully",
    "incredibly",
    // Collocational boosters
    "highly",
    "deeply",
    // Informal
    "super",
];

/// Groups of English homophones, lowercased.
///
/// Every inner slice holds spellings that are pronounced the same, or very
/// nearly so. The `lexicon.homophone-density` rule uses the groups to flag
/// paragraphs in which homophones cluster: dyslexic readers decode
/// phonologically and have to disambiguate the spelling from context, which
/// increases cognitive load (BDA Dyslexia Style Guide; plainlanguage.gov on
/// avoiding ambiguous wording).
///
/// Two kinds of group are included: function-word sets such as `their` /
/// `there` / `they're`, which are frequent enough to cluster, and
/// content-word pairs such as `affect` / `effect` or `principal` /
/// `principle`, where a spelling mix-up also changes the meaning.
/// Calibration takes place during the v0.2.x dogfood window, while the rule
/// ships as `Status::Experimental`.
pub static HOMOPHONE_GROUPS_EN: &[&[&str]] = &[
    // Function-word groups
    &["their", "there", "they're"],
    &["your", "you're"],
    &["to", "too", "two"],
    &["its", "it's"],
    // Pairs where the confusion also changes meaning
    &["affect", "effect"],
    &["principal", "principle"],
    &["weather", "whether"],
    &["lose", "loose"],
];

/// Lowercased English cardinal numerals written out as words.
///
/// The `mixed-numeric-format` rule consults this list to spot sentences that
/// combine digits with spelled-out numerals (CDC Clear Communication Index,
/// plainlanguage.gov: present numbers consistently throughout).
///
/// `one` is left out on purpose: it also serves as an indefinite pronoun
/// (`one of the readers`, `no one`), and including it would produce a
/// prohibitive number of false positives. The list therefore begins at `two`.
pub static SPELLED_NUMERALS: &[&str] = &[
    // Units (starting at two)
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    // Ten through nineteen
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "fifteen",
    "sixteen",
    "seventeen",
    "eighteen",
    "nineteen",
    // Tens
    "twenty",
    "thirty",
    "forty",
    "fifty",
    "sixty",
    "seventy",
    "eighty",
    "ninety",
    // Scale words
    "hundred",
    "thousand",
    "million",
    "billion",
];

/// English comparator phrases that give a large number a point of reference.
///
/// Consumed by the `readability.large-number-unanchored` rule: a sentence
/// that contains any one of these phrases counts as anchored, wherever the
/// phrase sits relative to the numeral. The list is short and hand-curated
/// by design — recall is limited, but the risk of false positives stays low.
///
/// Matching is a case-insensitive substring test, so every entry is written
/// in lowercase. Some single-word entries end in a space that is part of the
/// literal.
/// NOTE(review): the trailing space presumably stops the substring from
/// matching inside a longer word — confirm against the rule implementation.
pub static ANCHOR_COMPARATORS_EN: &[&str] = &[
    // Ratios and counts
    "out of",
    "of every",
    "as many as",
    // Comparison with a reference quantity
    "the size of",
    "the population of",
    "compared to",
    "compared with",
    "equivalent to",
    "the equivalent of",
    "equal to",
    // Approximation
    "roughly",
    "approximately",
    "about ",
    "around ",
    // Bounds
    "more than",
    "less than",
    "no more than",
    "no less than",
    "at least",
    "at most",
    "up to",
    // Averages and conversions
    "averaging",
    "an average of",
    "translates to",
    "amounts to",
    // Proximity
    "near ",
    "nearly ",
];

#[cfg(test)]
mod tests {
    use super::*;

    /// The three English articles must all be recognized as stop-words.
    #[test]
    fn contains_common_articles() {
        for article in ["the", "a", "an"] {
            assert!(STOPWORDS.contains(article), "missing article: {article}");
        }
    }

    /// A sample of auxiliaries (be-form, perfect, modal) must be present.
    #[test]
    fn contains_common_auxiliaries() {
        for auxiliary in ["is", "have", "will"] {
            assert!(
                STOPWORDS.contains(auxiliary),
                "missing auxiliary: {auxiliary}"
            );
        }
    }

    /// Content words must never be treated as stop-words.
    #[test]
    fn does_not_contain_content_words() {
        for content_word in ["accessibility", "linter"] {
            assert!(
                !STOPWORDS.contains(content_word),
                "unexpected stop-word: {content_word}"
            );
        }
    }
}