harper_core/linting/
an_a.rs

1use std::borrow::Cow;
2
3use itertools::Itertools;
4
5use crate::char_ext::CharExt;
6use crate::linting::{Lint, LintKind, Linter, Suggestion};
7use crate::{Document, TokenStringExt};
8
/// Linter that looks for incorrect indefinite articles, i.e. an `a` / `an`
/// that does not agree with the sound the following word starts with.
///
/// The type is a stateless unit struct.
#[derive(Debug, Default)]
pub struct AnA;
11
12impl Linter for AnA {
13    fn lint(&mut self, document: &Document) -> Vec<Lint> {
14        let mut lints = Vec::new();
15
16        for chunk in document.iter_chunks() {
17            for (first_idx, second_idx) in chunk.iter_word_indices().tuple_windows() {
18                // [`TokenKind::Unlintable`] might have semantic meaning.
19                if chunk[first_idx..second_idx].iter_unlintables().count() > 0
20                    || chunk[first_idx + 1..second_idx]
21                        .iter_word_like_indices()
22                        .count()
23                        > 0
24                {
25                    continue;
26                }
27
28                let first = &chunk[first_idx];
29                let second = &chunk[second_idx];
30
31                let chars_first = document.get_span_content(&first.span);
32                let chars_second = document.get_span_content(&second.span);
33                // Break the second word on hyphens for this lint.
34                // Example: "An ML-based" is an acceptable noun phrase.
35                let chars_second = chars_second
36                    .split(|c| !c.is_alphanumeric())
37                    .next()
38                    .unwrap_or(chars_second);
39
40                let is_a_an = match chars_first {
41                    ['a'] => Some(true),
42                    ['A'] => Some(true),
43                    ['a', 'n'] => Some(false),
44                    ['A', 'n'] => Some(false),
45                    _ => None,
46                };
47
48                let Some(a_an) = is_a_an else {
49                    continue;
50                };
51
52                let should_be_a_an = !starts_with_vowel(chars_second);
53
54                if a_an != should_be_a_an {
55                    let replacement = match a_an {
56                        true => vec!['a', 'n'],
57                        false => vec!['a'],
58                    };
59
60                    lints.push(Lint {
61                        span: first.span,
62                        lint_kind: LintKind::Miscellaneous,
63                        suggestions: vec![Suggestion::replace_with_match_case(
64                            replacement,
65                            chars_first,
66                        )],
67                        message: "Incorrect indefinite article.".to_string(),
68                        priority: 31,
69                    })
70                }
71            }
72        }
73
74        lints
75    }
76
77    fn description(&self) -> &'static str {
78        "A rule that looks for incorrect indefinite articles. For example, `this is an mule` would be flagged as incorrect."
79    }
80}
81
/// Returns `word` in lowercase.
///
/// When `word` contains no uppercase character the input slice is handed back
/// borrowed, without allocating. Otherwise a new vector is built in which every
/// character is replaced by its lowercase mapping (which may be more than one
/// character long).
fn to_lower_word(word: &[char]) -> Cow<'_, [char]> {
    let has_uppercase = word.iter().any(|c| c.is_uppercase());
    if !has_uppercase {
        return Cow::Borrowed(word);
    }

    let mut lowered = Vec::new();
    for ch in word {
        lowered.extend(ch.to_lowercase());
    }
    Cow::Owned(lowered)
}
93
94/// Checks whether a provided word begins with a vowel _sound_.
95///
96/// It was produced through trial and error.
97/// Matches with 99.71% and 99.77% of vowels and non-vowels in the
98/// Carnegie-Mellon University word -> pronunciation dataset.
99fn starts_with_vowel(word: &[char]) -> bool {
100    let is_likely_initialism = word.iter().all(|c| !c.is_alphabetic() || c.is_uppercase());
101
102    if is_likely_initialism && !word.is_empty() && !is_likely_acronym(word) {
103        return matches!(
104            word[0],
105            'A' | 'E' | 'F' | 'H' | 'I' | 'L' | 'M' | 'N' | 'O' | 'R' | 'S' | 'X'
106        );
107    }
108
109    let word = to_lower_word(word);
110    let word = word.as_ref();
111
112    if matches!(word, ['e', 'u', 'l', 'e', ..]) {
113        return true;
114    }
115
116    if matches!(
117        word,
118        [] | ['u', 'k', ..]
119            | ['e', 'u', 'p', 'h', ..]
120            | ['e', 'u', 'g' | 'l' | 'c', ..]
121            | ['o', 'n', 'e']
122            | ['o', 'n', 'c', 'e']
123    ) {
124        return false;
125    }
126
127    if matches!(word, |['h', 'o', 'u', 'r', ..]| ['h', 'o', 'n', ..]
128        | ['u', 'n', 'i', 'n' | 'm', ..]
129        | ['u', 'n', 'a' | 'u', ..]
130        | ['h', 'e', 'r', 'b', ..]
131        | ['u', 'r', 'b', ..]
132        | ['i', 'n', 't', ..])
133    {
134        return true;
135    }
136
137    if matches!(word, ['u', 'n' | 's', 'i' | 'a' | 'u', ..]) {
138        return false;
139    }
140
141    if matches!(word, ['u', 'n', ..]) {
142        return true;
143    }
144
145    if matches!(word, ['u', 'r', 'g', ..]) {
146        return true;
147    }
148
149    if matches!(word, ['u', 't', 't', ..]) {
150        return true;
151    }
152
153    if matches!(
154        word,
155        ['u', 't' | 'r' | 'n', ..] | ['e', 'u', 'r', ..] | ['u', 'w', ..] | ['u', 's', 'e', ..]
156    ) {
157        return false;
158    }
159
160    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u', 'l' | 'd', ..]) {
161        return true;
162    }
163
164    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u' | '-' | 's', ..]) {
165        return false;
166    }
167
168    if matches!(
169        word,
170        ['s', 'o', 's']
171            | ['r', 'z', ..]
172            | ['n', 'g', ..]
173            | ['n', 'v', ..]
174            | ['x']
175            | ['x', 'b', 'o', 'x']
176            | ['h', 'e', 'i', 'r', ..]
177            | ['h', 'o', 'n', 'o', 'r', ..]
178    ) {
179        return true;
180    }
181
182    if matches!(
183        word,
184        ['j', 'u' | 'o', 'n', ..] | ['j', 'u', 'r', 'a' | 'i' | 'o', ..]
185    ) {
186        return false;
187    }
188
189    if matches!(word, ['x', '-' | '\'' | '.' | 'o' | 's', ..]) {
190        return true;
191    }
192
193    matches!(
194        word,
195        ['a', ..] | ['e', ..] | ['i', ..] | ['o', ..] | ['u', ..]
196    )
197}
198
199fn is_likely_acronym(word: &[char]) -> bool {
200    // If it's three letters or longer, and the first two letters are not consonants, the initialism might be an acronym.
201    // (Like MAC, NASA, LAN, etc.)
202    word.get(..3).is_some_and(|first_chars| {
203        first_chars
204            .iter()
205            .take(2)
206            .fold(0, |acc, char| acc + !char.is_vowel() as u8)
207            < 2
208    })
209}
210
// Each test feeds a short text to the `AnA` linter and asserts how many lints
// it produces.
#[cfg(test)]
mod tests {
    use super::AnA;
    use crate::linting::tests::assert_lint_count;

    // "HTML" is spelled out letter by letter and "H" is treated as starting
    // with a vowel sound, so "a HTML" is flagged.
    #[test]
    fn detects_html_as_vowel() {
        assert_lint_count("Here is a HTML document.", AnA, 1);
    }

    // Same as above for an initialism starting with "L".
    #[test]
    fn detects_llm_as_vowel() {
        assert_lint_count("Here is a LLM document.", AnA, 1);
    }

    // Only the part before the hyphen ("LLM") is judged.
    #[test]
    fn detects_llm_hyphen_as_vowel() {
        assert_lint_count("Here is a LLM-based system.", AnA, 1);
    }

    // Words starting with "Eule" are treated as vowel sounds, so "an Euler"
    // and "an Eulerian" are accepted.
    #[test]
    fn detects_euler_as_vowel() {
        assert_lint_count("This is an Euler brick.", AnA, 0);
        assert_lint_count("The graph has an Eulerian tour.", AnA, 0);
    }

    // A capitalized (but not all-caps) word is not treated as an initialism.
    #[test]
    fn capitalized_fourier() {
        assert_lint_count("Then, perform a Fourier transform.", AnA, 0);
    }

    // "once" is treated as starting with a consonant sound.
    #[test]
    fn once_over() {
        assert_lint_count("give this a once-over.", AnA, 0);
    }

    // Regression test: an article followed by inline code must not be linted.
    // NOTE(review): presumably the backticked span is tokenized as unlintable
    // -- confirm against the parser.
    #[test]
    fn issue_196() {
        assert_lint_count("This is formatted as an `ext4` file system.", AnA, 0);
    }

    #[test]
    fn allows_lowercase_vowels() {
        assert_lint_count("not an error", AnA, 0);
    }

    #[test]
    fn allows_lowercase_consonants() {
        assert_lint_count("not a crash", AnA, 0);
    }

    #[test]
    fn disallows_lowercase_vowels() {
        assert_lint_count("not a error", AnA, 1);
    }

    #[test]
    fn disallows_lowercase_consonants() {
        assert_lint_count("not an crash", AnA, 1);
    }

    #[test]
    fn allows_uppercase_vowels() {
        assert_lint_count("not an Error", AnA, 0);
    }

    #[test]
    fn allows_uppercase_consonants() {
        assert_lint_count("not a Crash", AnA, 0);
    }

    #[test]
    fn disallows_uppercase_vowels() {
        assert_lint_count("not a Error", AnA, 1);
    }

    #[test]
    fn disallows_uppercase_consonants() {
        assert_lint_count("not an Crash", AnA, 1);
    }

    // A capitalized article at the start of a sentence is checked too; the
    // later "an object" in the same sentence is correct, hence exactly one lint.
    #[test]
    fn disallows_a_interface() {
        assert_lint_count(
            "A interface for an object that can perform linting actions.",
            AnA,
            1,
        );
    }

    // Regression test: an article followed by a number must not be linted.
    #[test]
    fn allow_issue_751() {
        assert_lint_count("He got a 52% approval rating.", AnA, 0);
    }

    // Initialisms starting with "M" take "an", with or without trailing digits.
    #[test]
    fn allow_an_mp_and_an_mp3() {
        assert_lint_count("an MP and an MP3?", AnA, 0);
    }

    #[test]
    fn disallow_a_mp_and_a_mp3() {
        assert_lint_count("a MP and a MP3?", AnA, 2);
    }

    #[test]
    fn recognize_acronyms() {
        // All-caps words judged to be acronyms are read as ordinary words, so
        // a leading consonant takes "a".
        assert_lint_count("using a MAC address", AnA, 0);
        assert_lint_count("a NASA spacecraft", AnA, 0);
        assert_lint_count("a NAT", AnA, 0);
        assert_lint_count("a REST API", AnA, 0);
        assert_lint_count("a LIBERO", AnA, 0);
        assert_lint_count("a README", AnA, 0);
        assert_lint_count("a LAN", AnA, 0);

        // Two-letter all-caps words are always spelled out, so the article
        // follows the name of the first letter ("an" here).
        assert_lint_count("an RA message", AnA, 0);
        assert_lint_count("an SI unit", AnA, 0);
        assert_lint_count("he is an MA of both Oxford and Cambridge", AnA, 0);
        assert_lint_count("in an FA Cup 6th Round match", AnA, 0);
        assert_lint_count("a AM transmitter", AnA, 1);
    }
}
harper_core/linting/an_a.rs

harper_core/linting/
an_a.rs