harper_core/linting/
an_a.rs

1use std::borrow::Cow;
2
3use itertools::Itertools;
4
5use crate::char_ext::CharExt;
6use crate::linting::{Lint, LintKind, Linter, Suggestion};
7use crate::{Document, TokenStringExt};
8
/// Linter for the indefinite articles "a" and "an".
///
/// It is a stateless unit type; all of the work happens in its [`Linter`]
/// implementation, which flags an article whose form does not agree with the
/// sound that begins the word following it.
#[derive(Debug, Default)]
pub struct AnA;
11
12impl Linter for AnA {
13    fn lint(&mut self, document: &Document) -> Vec<Lint> {
14        let mut lints = Vec::new();
15
16        for chunk in document.iter_chunks() {
17            for (first_idx, second_idx) in chunk.iter_word_indices().tuple_windows() {
18                // [`TokenKind::Unlintable`] might have semantic meaning.
19                if chunk[first_idx..second_idx].iter_unlintables().count() > 0
20                    || chunk[first_idx + 1..second_idx]
21                        .iter_word_like_indices()
22                        .count()
23                        > 0
24                {
25                    continue;
26                }
27
28                let first = &chunk[first_idx];
29                let second = &chunk[second_idx];
30
31                let chars_first = document.get_span_content(&first.span);
32                let chars_second = document.get_span_content(&second.span);
33                // Break the second word on hyphens for this lint.
34                // Example: "An ML-based" is an acceptable noun phrase.
35                let chars_second = chars_second
36                    .split(|c| !c.is_alphanumeric())
37                    .next()
38                    .unwrap_or(chars_second);
39
40                let is_a_an = match chars_first {
41                    ['a'] => Some(true),
42                    ['A'] => Some(true),
43                    ['a', 'n'] => Some(false),
44                    ['A', 'n'] => Some(false),
45                    _ => None,
46                };
47
48                let Some(a_an) = is_a_an else {
49                    continue;
50                };
51
52                let should_be_a_an = !starts_with_vowel(chars_second);
53
54                if a_an != should_be_a_an {
55                    let replacement = match a_an {
56                        true => vec!['a', 'n'],
57                        false => vec!['a'],
58                    };
59
60                    lints.push(Lint {
61                        span: first.span,
62                        lint_kind: LintKind::Miscellaneous,
63                        suggestions: vec![Suggestion::replace_with_match_case(
64                            replacement,
65                            chars_first,
66                        )],
67                        message: "Incorrect indefinite article.".to_string(),
68                        priority: 31,
69                    })
70                }
71            }
72        }
73
74        lints
75    }
76
77    fn description(&self) -> &'static str {
78        "A rule that looks for incorrect indefinite articles. For example, `this is an mule` would be flagged as incorrect."
79    }
80}
81
/// Returns `word` in lowercase, allocating only when necessary.
///
/// When `word` contains no uppercase character the input slice is handed back
/// borrowed. Otherwise every character is replaced by its full lowercase
/// mapping (which may expand to more than one `char`) in a newly allocated
/// vector.
fn to_lower_word(word: &[char]) -> Cow<'_, [char]> {
    let has_uppercase = word.iter().any(|ch| ch.is_uppercase());

    // Fast path: nothing to convert, so no allocation is needed.
    if !has_uppercase {
        return Cow::Borrowed(word);
    }

    let lowered: Vec<char> = word.iter().flat_map(|ch| ch.to_lowercase()).collect();
    Cow::Owned(lowered)
}
93
94/// Checks whether a provided word begins with a vowel _sound_.
95///
96/// It was produced through trial and error.
97/// Matches with 99.71% and 99.77% of vowels and non-vowels in the
98/// Carnegie-Mellon University word -> pronunciation dataset.
99fn starts_with_vowel(word: &[char]) -> bool {
100    let is_likely_initialism = word.iter().all(|c| !c.is_alphabetic() || c.is_uppercase());
101
102    if is_likely_initialism && !word.is_empty() && !is_likely_acronym(word) {
103        return matches!(
104            word[0],
105            'A' | 'E' | 'F' | 'H' | 'I' | 'L' | 'M' | 'N' | 'O' | 'R' | 'S' | 'X'
106        );
107    }
108
109    let word = to_lower_word(word);
110    let word = word.as_ref();
111
112    if matches!(
113        word,
114        [] | ['u', 'k', ..]
115            | ['e', 'u', 'p', 'h', ..]
116            | ['e', 'u', 'g' | 'l' | 'c', ..]
117            | ['o', 'n', 'e']
118            | ['o', 'n', 'c', 'e']
119    ) {
120        return false;
121    }
122
123    if matches!(word, |['h', 'o', 'u', 'r', ..]| ['h', 'o', 'n', ..]
124        | ['u', 'n', 'i', 'n' | 'm', ..]
125        | ['u', 'n', 'a' | 'u', ..]
126        | ['h', 'e', 'r', 'b', ..]
127        | ['u', 'r', 'b', ..]
128        | ['i', 'n', 't', ..])
129    {
130        return true;
131    }
132
133    if matches!(word, ['u', 'n' | 's', 'i' | 'a' | 'u', ..]) {
134        return false;
135    }
136
137    if matches!(word, ['u', 'n', ..]) {
138        return true;
139    }
140
141    if matches!(word, ['u', 'r', 'g', ..]) {
142        return true;
143    }
144
145    if matches!(word, ['u', 't', 't', ..]) {
146        return true;
147    }
148
149    if matches!(
150        word,
151        ['u', 't' | 'r' | 'n', ..] | ['e', 'u', 'r', ..] | ['u', 'w', ..] | ['u', 's', 'e', ..]
152    ) {
153        return false;
154    }
155
156    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u', 'l' | 'd', ..]) {
157        return true;
158    }
159
160    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u' | '-' | 's', ..]) {
161        return false;
162    }
163
164    if matches!(
165        word,
166        ['s', 'o', 's']
167            | ['r', 'z', ..]
168            | ['n', 'g', ..]
169            | ['n', 'v', ..]
170            | ['x']
171            | ['x', 'b', 'o', 'x']
172            | ['h', 'e', 'i', 'r', ..]
173            | ['h', 'o', 'n', 'o', 'r', ..]
174    ) {
175        return true;
176    }
177
178    if matches!(
179        word,
180        ['j', 'u' | 'o', 'n', ..] | ['j', 'u', 'r', 'a' | 'i' | 'o', ..]
181    ) {
182        return false;
183    }
184
185    if matches!(word, ['x', '-' | '\'' | '.' | 'o' | 's', ..]) {
186        return true;
187    }
188
189    matches!(
190        word,
191        ['a', ..] | ['e', ..] | ['i', ..] | ['o', ..] | ['u', ..]
192    )
193}
194
195fn is_likely_acronym(word: &[char]) -> bool {
196    // If it's three letters or longer, and the first two letters are not consonants, the initialism might be an acronym.
197    // (Like MAC, NASA, LAN, etc.)
198    word.get(..3).is_some_and(|first_chars| {
199        first_chars
200            .iter()
201            .take(2)
202            .fold(0, |acc, char| acc + !char.is_vowel() as u8)
203            < 2
204    })
205}
206
#[cfg(test)]
mod tests {
    use super::AnA;
    use crate::linting::tests::assert_lint_count;

    /// "a" before the spelled-out initialism "HTML" is reported.
    #[test]
    fn detects_html_as_vowel() {
        let text = "Here is a HTML document.";
        assert_lint_count(text, AnA, 1);
    }

    /// "a" before the spelled-out initialism "LLM" is reported.
    #[test]
    fn detects_llm_as_vowel() {
        let text = "Here is a LLM document.";
        assert_lint_count(text, AnA, 1);
    }

    /// A hyphenated suffix does not hide the initialism in front of it.
    #[test]
    fn detects_llm_hyphen_as_vowel() {
        let text = "Here is a LLM-based system.";
        assert_lint_count(text, AnA, 1);
    }

    /// A capitalized ordinary word is not mistaken for an initialism.
    #[test]
    fn capitalized_fourier() {
        let text = "Then, perform a Fourier transform.";
        assert_lint_count(text, AnA, 0);
    }

    /// "once" starts with a consonant sound even when hyphenated.
    #[test]
    fn once_over() {
        let text = "give this a once-over.";
        assert_lint_count(text, AnA, 0);
    }

    /// Regression test: an article followed by inline code is left alone.
    #[test]
    fn issue_196() {
        let text = "This is formatted as an `ext4` file system.";
        assert_lint_count(text, AnA, 0);
    }

    /// Correct "an" before a lowercase vowel-initial word.
    #[test]
    fn allows_lowercase_vowels() {
        let text = "not an error";
        assert_lint_count(text, AnA, 0);
    }

    /// Correct "a" before a lowercase consonant-initial word.
    #[test]
    fn allows_lowercase_consonants() {
        let text = "not a crash";
        assert_lint_count(text, AnA, 0);
    }

    /// Wrong "a" before a lowercase vowel-initial word.
    #[test]
    fn disallows_lowercase_vowels() {
        let text = "not a error";
        assert_lint_count(text, AnA, 1);
    }

    /// Wrong "an" before a lowercase consonant-initial word.
    #[test]
    fn disallows_lowercase_consonants() {
        let text = "not an crash";
        assert_lint_count(text, AnA, 1);
    }

    /// Correct "an" before a capitalized vowel-initial word.
    #[test]
    fn allows_uppercase_vowels() {
        let text = "not an Error";
        assert_lint_count(text, AnA, 0);
    }

    /// Correct "a" before a capitalized consonant-initial word.
    #[test]
    fn allows_uppercase_consonants() {
        let text = "not a Crash";
        assert_lint_count(text, AnA, 0);
    }

    /// Wrong "a" before a capitalized vowel-initial word.
    #[test]
    fn disallows_uppercase_vowels() {
        let text = "not a Error";
        assert_lint_count(text, AnA, 1);
    }

    /// Wrong "an" before a capitalized consonant-initial word.
    #[test]
    fn disallows_uppercase_consonants() {
        let text = "not an Crash";
        assert_lint_count(text, AnA, 1);
    }

    /// Only the sentence-initial "A interface" is reported; "an object" is fine.
    #[test]
    fn disallows_a_interface() {
        let text = "A interface for an object that can perform linting actions.";
        assert_lint_count(text, AnA, 1);
    }

    /// Regression test: an article followed by a number is left alone.
    #[test]
    fn allow_issue_751() {
        let text = "He got a 52% approval rating.";
        assert_lint_count(text, AnA, 0);
    }

    /// Two-letter initialisms, with or without a trailing digit, take "an".
    #[test]
    fn allow_an_mp_and_an_mp3() {
        let text = "an MP and an MP3?";
        assert_lint_count(text, AnA, 0);
    }

    /// Both wrong articles in the sentence are reported.
    #[test]
    fn disallow_a_mp_and_a_mp3() {
        let text = "a MP and a MP3?";
        assert_lint_count(text, AnA, 2);
    }

    /// All-caps words are split into acronyms and spelled-out initialisms.
    #[test]
    fn recognize_acronyms() {
        // a
        let accepted_with_a = [
            "using a MAC address",
            "a NASA spacecraft",
            "a NAT",
            "a REST API",
            "a LIBERO",
            "a README",
            "a LAN",
        ];
        for text in accepted_with_a {
            assert_lint_count(text, AnA, 0);
        }

        // an
        let accepted_with_an = [
            "an RA message",
            "an SI unit",
            "he is an MA of both Oxford and Cambridge",
            "in an FA Cup 6th Round match",
        ];
        for text in accepted_with_an {
            assert_lint_count(text, AnA, 0);
        }

        // A two-letter initialism is spelled out, so "a AM" is reported.
        assert_lint_count("a AM transmitter", AnA, 1);
    }
}