nucleo_matcher/
utf32_str.rs

1use std::borrow::Cow;
2use std::ops::{Bound, RangeBounds};
3use std::{fmt, slice};
4
5use crate::chars;
6
/// A UTF-32 encoded string (an array of `char`s) that is used as an input to
/// (fuzzy) matching.
///
/// Rust's UTF-8 encoded strings are usually great. Fuzzy matching, however,
/// operates on codepoints (it should operate on graphemes, but that is too
/// much hassle to deal with), and those codepoints need to be iterated quickly
/// (up to 5 times) during matching.
///
/// Doing codepoint segmentation on the fly not only blows through the cache
/// (lookup tables and instruction cache) but also has a nontrivial runtime
/// compared to the matching itself. Furthermore, there are a lot of extra
/// optimizations available for ASCII-only text (but checking for it during
/// each match has too much overhead).
///
/// Of course this comes at an extra memory cost, as the UTF-8 encoded variant
/// is usually still needed for rendering. In the (dominant) case of ASCII-only
/// text no copy is required. Furthermore, fuzzy matching is usually applied on
/// the fly while the user is typing, so the same item is potentially matched
/// many times (making the upfront cost more worthwhile). That means it is
/// basically always worth it to presegment the string.
///
/// For use cases that match (a lot of) strings only once, it is possible to
/// keep a char buffer around that is filled with the presegmented chars.
///
/// Another advantage of this approach is that the matcher naturally produces
/// char indices (instead of UTF-8 offsets) anyway. With a codepoint-based
/// representation like this one, the indices can be used directly.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Utf32Str<'a> {
    /// Text stored as borrowed ASCII bytes (one byte per character).
    /// Correctness invariant: must only contain valid ASCII (<=127).
    Ascii(&'a [u8]),
    /// Text stored as a borrowed array of unicode codepoints (basically UTF-32).
    Unicode(&'a [char]),
}
41
42impl<'a> Utf32Str<'a> {
43    /// Convenience method to construct a `Utf32Str` from a normal utf8 str
44    pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
45        if str.is_ascii() {
46            Utf32Str::Ascii(str.as_bytes())
47        } else {
48            buf.clear();
49            buf.extend(crate::chars::graphemes(str));
50            if buf.iter().all(|c| c.is_ascii()) {
51                return Utf32Str::Ascii(str.as_bytes());
52            }
53            Utf32Str::Unicode(&*buf)
54        }
55    }
56
57    /// Returns the number of characters in this string.
58    #[inline]
59    pub fn len(self) -> usize {
60        match self {
61            Utf32Str::Unicode(codepoints) => codepoints.len(),
62            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
63        }
64    }
65
66    /// Returns whether this string is empty.
67    #[inline]
68    pub fn is_empty(self) -> bool {
69        match self {
70            Utf32Str::Unicode(codepoints) => codepoints.is_empty(),
71            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
72        }
73    }
74
75    /// Creates a slice with a string that contains the characters in
76    /// the specified **character range**.
77    #[inline]
78    pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
79        let start = match range.start_bound() {
80            Bound::Included(&start) => start,
81            Bound::Excluded(&start) => start + 1,
82            Bound::Unbounded => 0,
83        };
84        let end = match range.end_bound() {
85            Bound::Included(&end) => end + 1,
86            Bound::Excluded(&end) => end,
87            Bound::Unbounded => self.len(),
88        };
89        match self {
90            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
91            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
92        }
93    }
94
95    /// Returns the number of leading whitespaces in this string
96    #[inline]
97    pub(crate) fn leading_white_space(self) -> usize {
98        match self {
99            Utf32Str::Ascii(bytes) => bytes
100                .iter()
101                .position(|b| !b.is_ascii_whitespace())
102                .unwrap_or(0),
103            Utf32Str::Unicode(codepoints) => codepoints
104                .iter()
105                .position(|c| !c.is_whitespace())
106                .unwrap_or(0),
107        }
108    }
109
110    /// Returns the number of leading whitespaces in this string
111    #[inline]
112    pub(crate) fn trailing_white_space(self) -> usize {
113        match self {
114            Utf32Str::Ascii(bytes) => bytes
115                .iter()
116                .rev()
117                .position(|b| !b.is_ascii_whitespace())
118                .unwrap_or(0),
119            Utf32Str::Unicode(codepoints) => codepoints
120                .iter()
121                .rev()
122                .position(|c| !c.is_whitespace())
123                .unwrap_or(0),
124        }
125    }
126
127    /// Same as `slice` but accepts a u32 range for convenience since
128    /// those are the indices returned by the matcher.
129    #[inline]
130    pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
131        let start = match range.start_bound() {
132            Bound::Included(&start) => start as usize,
133            Bound::Excluded(&start) => start as usize + 1,
134            Bound::Unbounded => 0,
135        };
136        let end = match range.end_bound() {
137            Bound::Included(&end) => end as usize + 1,
138            Bound::Excluded(&end) => end as usize,
139            Bound::Unbounded => self.len(),
140        };
141        match self {
142            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
143            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
144        }
145    }
146
147    /// Returns whether this string only contains ascii text.
148    pub fn is_ascii(self) -> bool {
149        matches!(self, Utf32Str::Ascii(_))
150    }
151
152    /// Returns the `n`th character in this string.
153    pub fn get(self, n: u32) -> char {
154        match self {
155            Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
156            Utf32Str::Unicode(codepoints) => codepoints[n as usize],
157        }
158    }
159    pub(crate) fn last(self) -> char {
160        match self {
161            Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
162            Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
163        }
164    }
165
166    pub(crate) fn first(self) -> char {
167        match self {
168            Utf32Str::Ascii(bytes) => bytes[0] as char,
169            Utf32Str::Unicode(codepoints) => codepoints[0],
170        }
171    }
172
173    /// Returns an iterator over the characters in this string
174    pub fn chars(self) -> Chars<'a> {
175        match self {
176            Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
177            Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
178        }
179    }
180}
181
182impl fmt::Debug for Utf32Str<'_> {
183    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
184        write!(f, "\"")?;
185        for c in self.chars() {
186            for c in c.escape_debug() {
187                write!(f, "{c}")?
188            }
189        }
190        write!(f, "\"")
191    }
192}
193
194impl fmt::Display for Utf32Str<'_> {
195    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
196        for c in self.chars() {
197            write!(f, "{c}")?
198        }
199        Ok(())
200    }
201}
202
/// An iterator over the characters of a [`Utf32Str`], created by
/// [`Utf32Str::chars`].
///
/// Each variant wraps the slice iterator of the matching `Utf32Str` variant.
#[derive(Debug, Clone)]
pub enum Chars<'a> {
    /// Iterates over ASCII bytes; every byte is yielded as a `char`.
    Ascii(slice::Iter<'a, u8>),
    /// Iterates over an array of unicode codepoints.
    Unicode(slice::Iter<'a, char>),
}
207impl<'a> Iterator for Chars<'a> {
208    type Item = char;
209
210    fn next(&mut self) -> Option<Self::Item> {
211        match self {
212            Chars::Ascii(iter) => iter.next().map(|&c| c as char),
213            Chars::Unicode(iter) => iter.next().copied(),
214        }
215    }
216}
217
218impl DoubleEndedIterator for Chars<'_> {
219    fn next_back(&mut self) -> Option<Self::Item> {
220        match self {
221            Chars::Ascii(iter) => iter.next_back().map(|&c| c as char),
222            Chars::Unicode(iter) => iter.next_back().copied(),
223        }
224    }
225}
226
/// An owned version of [`Utf32Str`].
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Utf32String {
    /// Text stored as owned ASCII (one byte per character).
    /// Correctness invariant: must only contain valid ASCII (<=127).
    Ascii(Box<str>),
    /// Text stored as an owned array of unicode codepoints (basically UTF-32).
    Unicode(Box<[char]>),
}
236
237impl Default for Utf32String {
238    fn default() -> Self {
239        Self::Ascii(String::new().into_boxed_str())
240    }
241}
242
243impl Utf32String {
244    /// Returns the number of characters in this string.
245    #[inline]
246    pub fn len(&self) -> usize {
247        match self {
248            Utf32String::Unicode(codepoints) => codepoints.len(),
249            Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(),
250        }
251    }
252
253    /// Returns whether this string is empty.
254    #[inline]
255    pub fn is_empty(&self) -> bool {
256        match self {
257            Utf32String::Unicode(codepoints) => codepoints.is_empty(),
258            Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
259        }
260    }
261
262    /// Creates a slice with a string that contains the characters in
263    /// the specified **character range**.
264    #[inline]
265    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str {
266        let start = match range.start_bound() {
267            Bound::Included(&start) => start,
268            Bound::Excluded(&start) => start + 1,
269            Bound::Unbounded => 0,
270        };
271        let end = match range.end_bound() {
272            Bound::Included(&end) => end + 1,
273            Bound::Excluded(&end) => end,
274            Bound::Unbounded => self.len(),
275        };
276        match self {
277            Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]),
278            Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
279        }
280    }
281
282    /// Same as `slice` but accepts a u32 range for convenience since
283    /// those are the indices returned by the matcher.
284    #[inline]
285    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str {
286        let start = match range.start_bound() {
287            Bound::Included(&start) => start,
288            Bound::Excluded(&start) => start + 1,
289            Bound::Unbounded => 0,
290        };
291        let end = match range.end_bound() {
292            Bound::Included(&end) => end + 1,
293            Bound::Excluded(&end) => end,
294            Bound::Unbounded => self.len() as u32,
295        };
296        match self {
297            Utf32String::Ascii(bytes) => {
298                Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize])
299            }
300            Utf32String::Unicode(codepoints) => {
301                Utf32Str::Unicode(&codepoints[start as usize..end as usize])
302            }
303        }
304    }
305}
306
307impl From<&str> for Utf32String {
308    #[inline]
309    fn from(value: &str) -> Self {
310        if value.is_ascii() {
311            Self::Ascii(value.to_owned().into_boxed_str())
312        } else {
313            Self::Unicode(chars::graphemes(value).collect())
314        }
315    }
316}
317
318impl From<Box<str>> for Utf32String {
319    fn from(value: Box<str>) -> Self {
320        if value.is_ascii() {
321            Self::Ascii(value)
322        } else {
323            Self::Unicode(chars::graphemes(&value).collect())
324        }
325    }
326}
327
328impl From<String> for Utf32String {
329    #[inline]
330    fn from(value: String) -> Self {
331        value.into_boxed_str().into()
332    }
333}
334
335impl<'a> From<Cow<'a, str>> for Utf32String {
336    #[inline]
337    fn from(value: Cow<'a, str>) -> Self {
338        match value {
339            Cow::Borrowed(value) => value.into(),
340            Cow::Owned(value) => value.into(),
341        }
342    }
343}
344
345impl fmt::Debug for Utf32String {
346    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
347        write!(f, "{:?}", self.slice(..))
348    }
349}
350
351impl fmt::Display for Utf32String {
352    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
353        write!(f, "{}", self.slice(..))
354    }
355}