// harper_core/char_string.rs

1use crate::char_ext::CharExt;
2use std::borrow::Cow;
3
4use smallvec::SmallVec;
5
6// TODO: remove this when `SmallVec` allows retrieving this value in a const context.
7pub(crate) const CHAR_STRING_INLINE_SIZE: usize = 16;
8
9/// A char sequence that improves cache locality.
10/// Most English words are fewer than 12 characters.
11pub type CharString = SmallVec<[char; CHAR_STRING_INLINE_SIZE]>;
12
/// Support module for sealing `CharStringExt`.
mod private {
    /// Marker supertrait. Because this module is private, code outside the
    /// enclosing module cannot name `Sealed` and so cannot implement any
    /// trait that requires it.
    pub trait Sealed {}

    // Character slices are the only type sealed in.
    impl Sealed for [char] {}
}
18
19/// Extensions to character sequences that make them easier to wrangle.
20pub trait CharStringExt: private::Sealed {
21    /// Convert all characters to lowercase, returning a new owned vector if any changes were made.
22    fn to_lower(&'_ self) -> Cow<'_, [char]>;
23
24    /// Normalize the character sequence according to the dictionary's standard character set.
25    fn normalized(&'_ self) -> Cow<'_, [char]>;
26
27    /// Convert the character sequence to a String.
28    fn to_string(&self) -> String;
29
30    /// Case-insensitive comparison with a character slice, assuming the right-hand side is lowercase ASCII.
31    /// Only normalizes the left side to lowercase and avoids allocations.
32    fn eq_ch(&self, other: &[char]) -> bool;
33
34    /// Case-insensitive comparison with a string slice, assuming the right-hand side is lowercase ASCII.
35    /// Only normalizes the left side to lowercase and avoids allocations.
36    fn eq_str(&self, other: &str) -> bool;
37
38    /// Case-insensitive comparison with any of a list of string slices, assuming the right-hand side is lowercase ASCII.
39    /// Only normalizes the left side to lowercase and avoids allocations.
40    fn eq_any_ignore_ascii_case_str(&self, others: &[&str]) -> bool;
41
42    /// Case-insensitive comparison with any of a list of character slices, assuming the right-hand side is lowercase ASCII.
43    /// Only normalizes the left side to lowercase and avoids allocations.
44    fn eq_any_ignore_ascii_case_chars(&self, others: &[&[char]]) -> bool;
45
46    /// Case-insensitive check if the string starts with the given ASCII prefix.
47    /// The prefix is assumed to be lowercase.
48    fn starts_with_ignore_ascii_case_str(&self, prefix: &str) -> bool;
49
50    /// Case-insensitive check if the string starts with any of the given ASCII prefixes.
51    /// The prefixes are assumed to be lowercase.
52    fn starts_with_any_ignore_ascii_case_str(&self, prefixes: &[&str]) -> bool;
53
54    /// Case-insensitive check if the string ends with the given ASCII suffix.
55    /// The suffix is assumed to be lowercase.
56    fn ends_with_ignore_ascii_case_chars(&self, suffix: &[char]) -> bool;
57
58    /// Case-insensitive check if the string ends with the given ASCII suffix.
59    /// The suffix is assumed to be lowercase.
60    fn ends_with_ignore_ascii_case_str(&self, suffix: &str) -> bool;
61
62    /// Case-insensitive check if the string ends with any of the given ASCII suffixes.
63    /// The suffixes are assumed to be lowercase.
64    fn ends_with_any_ignore_ascii_case_chars(&self, suffixes: &[&[char]]) -> bool;
65
66    /// Check if the string contains any vowels
67    fn contains_vowel(&self) -> bool;
68}
69
70impl CharStringExt for [char] {
71    fn to_lower(&'_ self) -> Cow<'_, [char]> {
72        if self.iter().all(|c| c.is_lowercase()) {
73            return Cow::Borrowed(self);
74        }
75
76        let mut out = CharString::with_capacity(self.len());
77
78        out.extend(self.iter().flat_map(|v| v.to_lowercase()));
79
80        Cow::Owned(out.to_vec())
81    }
82
83    fn to_string(&self) -> String {
84        self.iter().collect()
85    }
86
87    /// Convert a given character sequence to the standard character set
88    /// the dictionary is in.
89    fn normalized(&'_ self) -> Cow<'_, [char]> {
90        if self.as_ref().iter().any(|c| c.normalized() != *c) {
91            Cow::Owned(
92                self.as_ref()
93                    .iter()
94                    .copied()
95                    .map(|c| c.normalized())
96                    .collect(),
97            )
98        } else {
99            Cow::Borrowed(self)
100        }
101    }
102
103    fn eq_str(&self, other: &str) -> bool {
104        let mut chit = self.iter();
105        let mut strit = other.chars();
106
107        loop {
108            let (c, s) = (chit.next(), strit.next());
109            match (c, s) {
110                (Some(c), Some(s)) => {
111                    if c.to_ascii_lowercase() != s {
112                        return false;
113                    }
114                }
115                (None, None) => return true,
116                _ => return false,
117            }
118        }
119    }
120
121    fn eq_ch(&self, other: &[char]) -> bool {
122        self.len() == other.len()
123            && self
124                .iter()
125                .zip(other.iter())
126                .all(|(a, b)| a.to_ascii_lowercase() == *b)
127    }
128
129    fn eq_any_ignore_ascii_case_str(&self, others: &[&str]) -> bool {
130        others.iter().any(|str| self.eq_str(str))
131    }
132
133    fn eq_any_ignore_ascii_case_chars(&self, others: &[&[char]]) -> bool {
134        others.iter().any(|chars| self.eq_ch(chars))
135    }
136
137    fn starts_with_ignore_ascii_case_str(&self, prefix: &str) -> bool {
138        let prefix_len = prefix.chars().count();
139        if self.len() < prefix_len {
140            return false;
141        }
142        self.iter()
143            .take(prefix_len)
144            .zip(prefix.chars())
145            .all(|(a, b)| a.to_ascii_lowercase() == b)
146    }
147
148    fn starts_with_any_ignore_ascii_case_str(&self, prefixes: &[&str]) -> bool {
149        prefixes
150            .iter()
151            .any(|prefix| self.starts_with_ignore_ascii_case_str(prefix))
152    }
153
154    fn ends_with_ignore_ascii_case_str(&self, suffix: &str) -> bool {
155        let suffix_len = suffix.chars().count();
156        if self.len() < suffix_len {
157            return false;
158        }
159        self.iter()
160            .rev()
161            .take(suffix_len)
162            .rev()
163            .zip(suffix.chars())
164            .all(|(a, b)| a.to_ascii_lowercase() == b)
165    }
166
167    fn ends_with_ignore_ascii_case_chars(&self, suffix: &[char]) -> bool {
168        let suffix_len = suffix.len();
169        if self.len() < suffix_len {
170            return false;
171        }
172        self.iter()
173            .rev()
174            .take(suffix_len)
175            .rev()
176            .zip(suffix.iter())
177            .all(|(a, b)| a.to_ascii_lowercase() == *b)
178    }
179
180    fn ends_with_any_ignore_ascii_case_chars(&self, suffixes: &[&[char]]) -> bool {
181        suffixes
182            .iter()
183            .any(|suffix| self.ends_with_ignore_ascii_case_chars(suffix))
184    }
185
186    fn contains_vowel(&self) -> bool {
187        self.iter().any(|c| c.is_vowel())
188    }
189}
190
/// Build a `CharString` from a string literal by collecting its characters.
macro_rules! char_string {
    ($string:literal) => {
        $string
            .chars()
            .collect::<crate::char_string::CharString>()
    };
}
198
199pub(crate) use char_string;
200
#[cfg(test)]
mod tests {
    //! Unit tests for `CharStringExt` on `[char]`.
    //!
    //! Test names are prefixed with the method they exercise.

    use super::CharStringExt;

    const HELLO: [char; 5] = ['H', 'e', 'l', 'l', 'o'];

    #[test]
    fn eq_ch_matches_lowercase() {
        assert!(HELLO.eq_ch(&['h', 'e', 'l', 'l', 'o']));
    }

    #[test]
    fn eq_ch_does_not_match_different_word() {
        assert!(!HELLO.eq_ch(&['w', 'o', 'r', 'l', 'd']));
    }

    #[test]
    fn eq_ch_does_not_match_different_length() {
        assert!(!HELLO.eq_ch(&['h', 'e', 'l', 'l']));
    }

    #[test]
    fn eq_str_matches_lowercase() {
        assert!(HELLO.eq_str("hello"));
    }

    #[test]
    fn eq_str_does_not_match_different_word() {
        assert!(!HELLO.eq_str("world"));
    }

    #[test]
    fn eq_str_differs_only_by_length_1() {
        assert!(!['b', 'b'].eq_str("b"));
    }

    #[test]
    fn eq_str_differs_only_by_length_2() {
        assert!(!['c'].eq_str("cc"));
    }

    #[test]
    fn eq_any_ignore_ascii_case_str_matches_one_of_many() {
        assert!(HELLO.eq_any_ignore_ascii_case_str(&["world", "hello"]));
        assert!(!HELLO.eq_any_ignore_ascii_case_str(&["world", "hell"]));
        assert!(!HELLO.eq_any_ignore_ascii_case_str(&[]));
    }

    #[test]
    fn eq_any_ignore_ascii_case_chars_matches_one_of_many() {
        assert!(
            HELLO.eq_any_ignore_ascii_case_chars(&[&['h', 'i'][..], &['h', 'e', 'l', 'l', 'o'][..]])
        );
        assert!(!HELLO.eq_any_ignore_ascii_case_chars(&[&['h', 'i'][..]]));
    }

    #[test]
    fn starts_with_ignore_ascii_case_str_matches_prefix() {
        assert!(HELLO.starts_with_ignore_ascii_case_str("he"));
        assert!(HELLO.starts_with_ignore_ascii_case_str(""));
    }

    #[test]
    fn starts_with_ignore_ascii_case_str_does_not_match_other_prefix() {
        assert!(!HELLO.starts_with_ignore_ascii_case_str("lo"));
        // A prefix longer than the word can never match.
        assert!(!HELLO.starts_with_ignore_ascii_case_str("hello!"));
    }

    #[test]
    fn starts_with_any_ignore_ascii_case_str_matches_one_of_many() {
        assert!(HELLO.starts_with_any_ignore_ascii_case_str(&["wo", "he"]));
        assert!(!HELLO.starts_with_any_ignore_ascii_case_str(&["wo", "lo"]));
    }

    #[test]
    fn ends_with_ignore_ascii_case_chars_matches_suffix() {
        assert!(HELLO.ends_with_ignore_ascii_case_chars(&['l', 'o']));
        assert!(HELLO.ends_with_ignore_ascii_case_chars(&[]));
    }

    #[test]
    fn ends_with_ignore_ascii_case_chars_does_not_match_different_suffix() {
        assert!(!HELLO.ends_with_ignore_ascii_case_chars(&['w', 'o', 'r', 'l', 'd']));
        // A suffix longer than the word can never match.
        assert!(!HELLO.ends_with_ignore_ascii_case_chars(&['x', 'h', 'e', 'l', 'l', 'o']));
    }

    #[test]
    fn ends_with_ignore_ascii_case_str_matches_suffix() {
        assert!(HELLO.ends_with_ignore_ascii_case_str("lo"));
        assert!(HELLO.ends_with_ignore_ascii_case_str(""));
    }

    #[test]
    fn ends_with_ignore_ascii_case_str_does_not_match_different_suffix() {
        assert!(!HELLO.ends_with_ignore_ascii_case_str("world"));
        assert!(!HELLO.ends_with_ignore_ascii_case_str("xhello"));
    }

    #[test]
    fn ends_with_any_ignore_ascii_case_chars_matches_one_of_many() {
        assert!(HELLO.ends_with_any_ignore_ascii_case_chars(&[&['x'][..], &['l', 'o'][..]]));
        assert!(!HELLO.ends_with_any_ignore_ascii_case_chars(&[&['x'][..], &['l', 'y'][..]]));
    }

    #[test]
    fn to_lower_lowercases_every_character() {
        assert_eq!(&*HELLO.to_lower(), &['h', 'e', 'l', 'l', 'o'][..]);
    }

    #[test]
    fn to_string_collects_characters() {
        // Fully qualified so the call cannot resolve to `ToString::to_string`.
        assert_eq!(CharStringExt::to_string(&HELLO[..]), "Hello");
    }
}
253    #[test]
254    fn differs_only_by_length_2() {
255        assert!(!['c'].eq_str("cc"));
256    }
257}