Skip to main content

harper_core/
indefinite_article.rs

1use std::borrow::Cow;
2
3use itertools::Itertools;
4
5use crate::case::Case::Upper;
6use crate::char_ext::CharExt;
7use crate::{CaseIterExt, Dialect};
8
/// The kind of sound a word starts with, which decides between the
/// indefinite articles "a" and "an".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InitialSound {
    /// The word starts with a vowel sound.
    Vowel,
    /// The word starts with a consonant sound.
    Consonant,
    /// The word can be pronounced either way, e.g. `SQL`
    /// (spelled out letter by letter, or read as a word).
    Either,
}
15
16/// Checks whether a provided word begins with a vowel _sound_. Returns `None` if `word` is empty.
17///
18/// It was produced through trial and error.
19/// Matches with 99.71% and 99.77% of vowels and non-vowels in the
20/// Carnegie-Mellon University word -> pronunciation dataset.
21pub fn starts_with_vowel(word: &[char], dialect: Dialect) -> Option<InitialSound> {
22    if word.is_empty() {
23        return None;
24    }
25
26    if matches!(word, ['L', 'E', 'D'] | ['S', 'Q', 'L'] | ['U', 'R', 'L']) {
27        return Some(InitialSound::Either);
28    }
29
30    // Try to get the first chunk of a word that appears to be a partial initialism.
31    // For example:
32    // - `RFL` from `RFLink`
33    // - `m` from `mDNS`
34    let word = {
35        let word_casing = word.get_casing_unfiltered();
36        match word_casing.as_slice() {
37            // Lower-upper or upper-upper, possibly a (partial) initialism.
38            [Some(first_char_case), Some(Upper), ..] => {
39                &word[0..word_casing
40                    .iter()
41                    .position(|c| *c != Some(*first_char_case))
42                    .unwrap_or(word.len())]
43            }
44            // Lower-lower or upper-lower, unlikely to be a partial initialism.
45            _ => word,
46        }
47    };
48
49    let is_likely_initialism = word.iter().all(|c| !c.is_alphabetic() || c.is_uppercase());
50
51    if word.len() == 1 || (is_likely_initialism && !is_likely_acronym(word)) {
52        return Some(
53            if matches!(
54                word[0].to_ascii_uppercase(),
55                'A' | 'E' | 'F' | 'H' | 'I' | 'L' | 'M' | 'N' | 'O' | 'R' | 'S' | 'X'
56            ) {
57                InitialSound::Vowel
58            } else {
59                InitialSound::Consonant
60            },
61        );
62    }
63
64    let word = to_lower_word(word);
65    let word = word.as_ref();
66
67    if matches!(word, ['u', 'b', 'i', ..]) {
68        return Some(InitialSound::Either);
69    }
70
71    if matches!(word, ['e', 'u', 'l', 'e', ..]) {
72        return Some(InitialSound::Vowel);
73    }
74
75    if matches!(
76        word,
77        ['u', 'k', ..]
78            | ['u', 'd', 'e', ..] // for 'udev'
79            | ['e', 'u', 'p', 'h', ..]
80            | ['e', 'u', 'g' | 'l' | 'c', ..]
81            | ['o', 'n', 'e', ..]
82            | ['o', 'n', 'c', 'e']
83    ) {
84        return Some(InitialSound::Consonant);
85    }
86
87    if matches!(
88        word,
89        ['h', 'o', 'u', 'r', ..]
90            | ['u', 'n', 'i', 'n' | 'm', ..]
91            | ['u', 'n', 'a' | 'u', ..]
92            | ['u', 'r', 'b', ..]
93            | ['i', 'n', 't', ..]
94    ) {
95        return Some(InitialSound::Vowel);
96    }
97
98    if matches!(word, ['h', 'e', 'r', 'b', ..] if dialect == Dialect::American || dialect == Dialect::Canadian)
99    {
100        return Some(InitialSound::Vowel);
101    }
102
103    if matches!(word, ['u', 'n' | 's', 'i' | 'a' | 'u', ..]) {
104        return Some(InitialSound::Consonant);
105    }
106
107    if matches!(word, ['u', 'n', ..]) {
108        return Some(InitialSound::Vowel);
109    }
110
111    if matches!(word, ['u', 'r', 'g', ..]) {
112        return Some(InitialSound::Vowel);
113    }
114
115    if matches!(word, ['u', 't', 't', ..]) {
116        return Some(InitialSound::Vowel);
117    }
118
119    if matches!(
120        word,
121        ['u', 't' | 'r' | 'n', ..] | ['e', 'u', 'r', ..] | ['u', 'w', ..] | ['u', 's', 'e', ..]
122    ) {
123        return Some(InitialSound::Consonant);
124    }
125
126    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u', 'l' | 'd', ..]) {
127        return Some(InitialSound::Vowel);
128    }
129
130    if matches!(word, ['o', 'n', 'e', 'a' | 'e' | 'i' | 'u' | '-' | 's', ..]) {
131        return Some(InitialSound::Consonant);
132    }
133
134    if matches!(
135        word,
136        ['s', 'o', 's']
137            | ['r', 'z', ..]
138            | ['n', 'g', ..]
139            | ['n', 'v', ..]
140            | ['x', 'b', 'o', 'x']
141            | ['h', 'e', 'i', 'r', ..]
142            | ['h', 'o', 'n', 'o', 'r', ..]
143            | ['h', 'o', 'n', 'e', 's', ..]
144    ) {
145        return Some(InitialSound::Vowel);
146    }
147
148    if matches!(
149        word,
150        ['j', 'u' | 'o', 'n', ..] | ['j', 'u', 'r', 'a' | 'i' | 'o', ..]
151    ) {
152        return Some(InitialSound::Consonant);
153    }
154
155    if matches!(word, ['x', '-' | '\'' | '.' | 'o' | 's', ..]) {
156        return Some(InitialSound::Vowel);
157    }
158
159    if word[0].is_vowel() {
160        return Some(InitialSound::Vowel);
161    }
162
163    Some(InitialSound::Consonant)
164}
165
/// Returns `word` converted to lowercase.
///
/// The input slice is borrowed unchanged when it contains no uppercase
/// characters; otherwise a new vector is allocated. A single character may
/// lowercase to more than one character, so the result can be longer than
/// the input.
fn to_lower_word(word: &[char]) -> Cow<'_, [char]> {
    let has_uppercase = word.iter().copied().any(char::is_uppercase);
    if !has_uppercase {
        return Cow::Borrowed(word);
    }

    let mut lowered = Vec::with_capacity(word.len());
    for ch in word {
        lowered.extend(ch.to_lowercase());
    }
    Cow::Owned(lowered)
}
177
178fn is_likely_acronym(word: &[char]) -> bool {
179    /// Does the word contain any sequences that might indicate it's not an acronym?
180    fn word_contains_false_positive_sequence(word: &[char]) -> bool {
181        let likely_false_positive_sequences = [['V', 'C']];
182        for fp_sequence in likely_false_positive_sequences {
183            if word
184                .windows(fp_sequence.len())
185                .any(|subslice| subslice == fp_sequence)
186            {
187                return true;
188            }
189        }
190        false
191    }
192
193    // If the initialism is shorter than this, skip it.
194    const MIN_LEN: usize = 3;
195
196    if let Some(first_chars) = word.get(..MIN_LEN)
197        // Unlikely to be an acronym if it contains non-alphabetic characters.
198        && first_chars.iter().copied().all(char::is_alphabetic)
199        && !word_contains_false_positive_sequence(word)
200    {
201        let vowel_map = first_chars
202            .iter()
203            .map(CharExt::is_vowel)
204            .collect_array::<MIN_LEN>()
205            .unwrap();
206        matches!(vowel_map, [false, true, false] | [false, true, true])
207    } else {
208        false
209    }
210}