ncp_matcher/
utf32_str.rs

1#[cfg(test)]
2mod tests;
3
4use std::borrow::Cow;
5use std::ops::{Bound, RangeBounds};
6use std::{fmt, slice};
7
8use memchr::memmem;
9
10use crate::chars;
11
12/// Check if a given string can be represented internally as the `Ascii` variant in a
13/// [`Utf32String`] or a [`Utf32Str`].
14///
15/// This returns true if the string is ASCII and does not contain a windows-style newline
16/// `'\r'`.
17/// The additional carriage return check is required since even for strings consisting only
18/// of ASCII, the windows-style newline `\r\n` is treated as a single grapheme.
19#[inline]
20fn has_ascii_graphemes(string: &str) -> bool {
21    string.is_ascii() && memmem::find(string.as_bytes(), b"\r\n").is_none()
22}
23
24/// A UTF-32 encoded (char array) string that is used as an input to (fuzzy) matching.
25///
26/// This is mostly intended as an internal string type, but some methods are exposed for
27/// convenience. We make the following API guarantees for `Utf32Str(ing)`s produced from a string
28/// using one of its `From<T>` constructors for string types `T` or from the
29/// [`Utf32Str::new`] method.
30///
31/// 1. The `Ascii` variant contains a byte buffer which is guaranteed to be a valid string
32///    slice.
33/// 2. It is guaranteed that the string slice internal to the `Ascii` variant is identical
34///    to the original string.
35/// 3. The length of a `Utf32Str(ing)` is exactly the number of graphemes in the original string.
36///
37/// Since `Utf32Str(ing)`s variants may be constructed directly, you **must not** make these
38/// assumptions when handling `Utf32Str(ing)`s of unknown origin.
39///
40/// ## Caveats
41/// Despite the name, this type is quite far from being a true string type. Here are some
42/// examples demonstrating this.
43///
44/// ### String conversions are not round-trip
45/// In the presence of a multi-codepoint grapheme (e.g. `"u\u{0308}"` which is `u +
46/// COMBINING_DIAERESIS`), the trailing codepoints are truncated.
47/// ```
48/// # use ncp_matcher::Utf32String;
49/// assert_eq!(Utf32String::from("u\u{0308}").to_string(), "u");
50/// ```
51///
52/// ### Indexing is done by grapheme
53/// Indexing into a string is done by grapheme rather than by codepoint.
54/// ```
55/// # use ncp_matcher::Utf32String;
56/// assert!(Utf32String::from("au\u{0308}").len() == 2);
57/// ```
58///
59/// ### A `Unicode` variant may be produced by all-ASCII characters.
60/// Since the windows-style newline `\r\n` is ASCII only but considered to be a single grapheme,
61/// strings containing `\r\n` will still result in a `Unicode` variant.
62/// ```
63/// # use ncp_matcher::Utf32String;
64/// let s = Utf32String::from("\r\n");
65/// assert!(!s.slice(..).is_ascii());
66/// assert!(s.len() == 1);
67/// assert!(s.slice(..).get(0) == '\n');
68/// ```
69///
70/// ## Design rationale
71/// Usually Rust's UTF-8 encoded strings are great. However, since fuzzy matching
72/// operates on codepoints (ideally, it should operate on graphemes but that's too
73/// much hassle to deal with), we want to quickly iterate over codepoints (up to 5
74/// times) during matching.
75///
76/// Doing codepoint segmentation on the fly not only blows trough the cache
77/// (lookup tables and I-cache) but also has nontrivial runtime compared to the
78/// matching itself. Furthermore there are many extra optimizations available
79/// for ASCII only text, but checking each match has too much overhead.
80///
81/// Of course, this comes at extra memory cost as we usually still need the UTF-8
82/// encoded variant for rendering. In the (dominant) case of ASCII-only text
83/// we don't require a copy. Furthermore fuzzy matching usually is applied while
84/// the user is typing on the fly so the same item is potentially matched many
85/// times (making the the up-front cost more worth it). That means that its
86/// basically always worth it to pre-segment the string.
87///
88/// For usecases that only match (a lot of) strings once its possible to keep
89/// char buffer around that is filled with the presegmented chars.
90///
91/// Another advantage of this approach is that the matcher will naturally
92/// produce grapheme indices (instead of utf8 offsets) anyway. With a
93/// codepoint basic representation like this the indices can be used
94/// directly
95#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
96pub enum Utf32Str<'a> {
97    /// A string represented as ASCII encoded bytes.
98    ///
99    /// Correctness invariant: must only contain valid ASCII (`<= 127`)
100    Ascii(&'a [u8]),
101    /// A string represented as an array of unicode codepoints (basically UTF-32).
102    Unicode(&'a [char]),
103}
104
105impl<'a> Utf32Str<'a> {
106    /// Convenience method to construct a `Utf32Str` from a normal UTF-8 str
107    pub fn new(str: &'a str, buf: &'a mut Vec<char>) -> Self {
108        if has_ascii_graphemes(str) {
109            Utf32Str::Ascii(str.as_bytes())
110        } else {
111            buf.clear();
112            buf.extend(crate::chars::graphemes(str));
113            Utf32Str::Unicode(buf)
114        }
115    }
116
117    /// Returns the number of characters in this string.
118    #[inline]
119    pub fn len(self) -> usize {
120        match self {
121            Utf32Str::Unicode(codepoints) => codepoints.len(),
122            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.len(),
123        }
124    }
125
126    /// Returns whether this string is empty.
127    #[inline]
128    pub fn is_empty(self) -> bool {
129        match self {
130            Utf32Str::Unicode(codepoints) => codepoints.is_empty(),
131            Utf32Str::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
132        }
133    }
134
135    /// Creates a slice with a string that contains the characters in
136    /// the specified **character range**.
137    #[inline]
138    pub fn slice(self, range: impl RangeBounds<usize>) -> Utf32Str<'a> {
139        let start = match range.start_bound() {
140            Bound::Included(&start) => start,
141            Bound::Excluded(&start) => start + 1,
142            Bound::Unbounded => 0,
143        };
144        let end = match range.end_bound() {
145            Bound::Included(&end) => end + 1,
146            Bound::Excluded(&end) => end,
147            Bound::Unbounded => self.len(),
148        };
149        match self {
150            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
151            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
152        }
153    }
154
155    /// Returns the number of leading whitespaces in this string
156    #[inline]
157    pub(crate) fn leading_white_space(self) -> usize {
158        match self {
159            Utf32Str::Ascii(bytes) => bytes
160                .iter()
161                .position(|b| !b.is_ascii_whitespace())
162                .unwrap_or(0),
163            Utf32Str::Unicode(codepoints) => codepoints
164                .iter()
165                .position(|c| !c.is_whitespace())
166                .unwrap_or(0),
167        }
168    }
169
170    /// Returns the number of trailing whitespaces in this string
171    #[inline]
172    pub(crate) fn trailing_white_space(self) -> usize {
173        match self {
174            Utf32Str::Ascii(bytes) => bytes
175                .iter()
176                .rev()
177                .position(|b| !b.is_ascii_whitespace())
178                .unwrap_or(0),
179            Utf32Str::Unicode(codepoints) => codepoints
180                .iter()
181                .rev()
182                .position(|c| !c.is_whitespace())
183                .unwrap_or(0),
184        }
185    }
186
187    /// Same as `slice` but accepts a u32 range for convenience since
188    /// those are the indices returned by the matcher.
189    #[inline]
190    pub fn slice_u32(self, range: impl RangeBounds<u32>) -> Utf32Str<'a> {
191        let start = match range.start_bound() {
192            Bound::Included(&start) => start as usize,
193            Bound::Excluded(&start) => start as usize + 1,
194            Bound::Unbounded => 0,
195        };
196        let end = match range.end_bound() {
197            Bound::Included(&end) => end as usize + 1,
198            Bound::Excluded(&end) => end as usize,
199            Bound::Unbounded => self.len(),
200        };
201        match self {
202            Utf32Str::Ascii(bytes) => Utf32Str::Ascii(&bytes[start..end]),
203            Utf32Str::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
204        }
205    }
206
207    /// Returns whether this string only contains graphemes which are single ASCII chars.
208    ///
209    /// This is almost equivalent to the string being ASCII, except with the additional requirement
210    /// that the string cannot contain a windows-style newline `\r\n` which is treated as a single
211    /// grapheme.
212    pub fn is_ascii(self) -> bool {
213        matches!(self, Utf32Str::Ascii(_))
214    }
215
216    /// Returns the `n`th character in this string, zero-indexed
217    pub fn get(self, n: u32) -> char {
218        match self {
219            Utf32Str::Ascii(bytes) => bytes[n as usize] as char,
220            Utf32Str::Unicode(codepoints) => codepoints[n as usize],
221        }
222    }
223
224    /// Returns the last character in this string.
225    ///
226    /// Panics if the string is empty.
227    pub(crate) fn last(self) -> char {
228        match self {
229            Utf32Str::Ascii(bytes) => bytes[bytes.len() - 1] as char,
230            Utf32Str::Unicode(codepoints) => codepoints[codepoints.len() - 1],
231        }
232    }
233
234    /// Returns the first character in this string.
235    ///
236    /// Panics if the string is empty.
237    pub(crate) fn first(self) -> char {
238        match self {
239            Utf32Str::Ascii(bytes) => bytes[0] as char,
240            Utf32Str::Unicode(codepoints) => codepoints[0],
241        }
242    }
243
244    /// Returns an iterator over the characters in this string
245    pub fn chars(self) -> Chars<'a> {
246        match self {
247            Utf32Str::Ascii(bytes) => Chars::Ascii(bytes.iter()),
248            Utf32Str::Unicode(codepoints) => Chars::Unicode(codepoints.iter()),
249        }
250    }
251}
252
253impl fmt::Debug for Utf32Str<'_> {
254    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
255        use std::fmt::Write;
256        f.write_char('"')?;
257        for c in self.chars() {
258            for c in c.escape_debug() {
259                f.write_char(c)?;
260            }
261        }
262        f.write_char('"')
263    }
264}
265
266impl fmt::Display for Utf32Str<'_> {
267    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
268        use std::fmt::Write;
269        for c in self.chars() {
270            f.write_char(c)?;
271        }
272        Ok(())
273    }
274}
275
276pub enum Chars<'a> {
277    Ascii(slice::Iter<'a, u8>),
278    Unicode(slice::Iter<'a, char>),
279}
280
281impl Iterator for Chars<'_> {
282    type Item = char;
283
284    fn next(&mut self) -> Option<Self::Item> {
285        match self {
286            Chars::Ascii(iter) => iter.next().map(|&c| c as char),
287            Chars::Unicode(iter) => iter.next().copied(),
288        }
289    }
290}
291
292impl DoubleEndedIterator for Chars<'_> {
293    fn next_back(&mut self) -> Option<Self::Item> {
294        match self {
295            Chars::Ascii(iter) => iter.next_back().map(|&c| c as char),
296            Chars::Unicode(iter) => iter.next_back().copied(),
297        }
298    }
299}
300
301/// An owned version of [`Utf32Str`].
302///
303/// See the API documentation for [`Utf32Str`] for more detail.
304#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Hash)]
305pub enum Utf32String {
306    /// A string represented as ASCII encoded bytes.
307    ///
308    /// Correctness invariant: must only contain valid ASCII (`<= 127`)
309    Ascii(Box<str>),
310    /// A string represented as an array of unicode codepoints (basically UTF-32).
311    Unicode(Box<[char]>),
312}
313
314impl Default for Utf32String {
315    fn default() -> Self {
316        Self::Ascii(String::new().into_boxed_str())
317    }
318}
319
320impl Utf32String {
321    /// Returns the number of characters in this string.
322    #[inline]
323    pub fn len(&self) -> usize {
324        match self {
325            Utf32String::Unicode(codepoints) => codepoints.len(),
326            Utf32String::Ascii(ascii_bytes) => ascii_bytes.len(),
327        }
328    }
329
330    /// Returns whether this string is empty.
331    #[inline]
332    pub fn is_empty(&self) -> bool {
333        match self {
334            Utf32String::Unicode(codepoints) => codepoints.is_empty(),
335            Utf32String::Ascii(ascii_bytes) => ascii_bytes.is_empty(),
336        }
337    }
338
339    /// Creates a slice with a string that contains the characters in
340    /// the specified **character range**.
341    #[inline]
342    pub fn slice(&self, range: impl RangeBounds<usize>) -> Utf32Str<'_> {
343        let start = match range.start_bound() {
344            Bound::Included(&start) => start,
345            Bound::Excluded(&start) => start + 1,
346            Bound::Unbounded => 0,
347        };
348        let end = match range.end_bound() {
349            Bound::Included(&end) => end + 1,
350            Bound::Excluded(&end) => end,
351            Bound::Unbounded => self.len(),
352        };
353        match self {
354            Utf32String::Ascii(bytes) => Utf32Str::Ascii(&bytes.as_bytes()[start..end]),
355            Utf32String::Unicode(codepoints) => Utf32Str::Unicode(&codepoints[start..end]),
356        }
357    }
358
359    /// Same as `slice` but accepts a u32 range for convenience since
360    /// those are the indices returned by the matcher.
361    #[inline]
362    pub fn slice_u32(&self, range: impl RangeBounds<u32>) -> Utf32Str<'_> {
363        let start = match range.start_bound() {
364            Bound::Included(&start) => start,
365            Bound::Excluded(&start) => start + 1,
366            Bound::Unbounded => 0,
367        };
368        let end = match range.end_bound() {
369            Bound::Included(&end) => end + 1,
370            Bound::Excluded(&end) => end,
371            Bound::Unbounded => self.len() as u32,
372        };
373        match self {
374            Utf32String::Ascii(bytes) => {
375                Utf32Str::Ascii(&bytes.as_bytes()[start as usize..end as usize])
376            }
377            Utf32String::Unicode(codepoints) => {
378                Utf32Str::Unicode(&codepoints[start as usize..end as usize])
379            }
380        }
381    }
382}
383
384impl From<&str> for Utf32String {
385    #[inline]
386    fn from(value: &str) -> Self {
387        if has_ascii_graphemes(value) {
388            Self::Ascii(value.to_owned().into_boxed_str())
389        } else {
390            Self::Unicode(chars::graphemes(value).collect())
391        }
392    }
393}
394
395impl From<Box<str>> for Utf32String {
396    fn from(value: Box<str>) -> Self {
397        if has_ascii_graphemes(&value) {
398            Self::Ascii(value)
399        } else {
400            Self::Unicode(chars::graphemes(&value).collect())
401        }
402    }
403}
404
405impl From<String> for Utf32String {
406    #[inline]
407    fn from(value: String) -> Self {
408        value.into_boxed_str().into()
409    }
410}
411
412impl<'a> From<Cow<'a, str>> for Utf32String {
413    #[inline]
414    fn from(value: Cow<'a, str>) -> Self {
415        match value {
416            Cow::Borrowed(value) => value.into(),
417            Cow::Owned(value) => value.into(),
418        }
419    }
420}
421
422impl fmt::Debug for Utf32String {
423    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
424        fmt::Debug::fmt(&self.slice(..), f)
425    }
426}
427
428impl fmt::Display for Utf32String {
429    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
430        fmt::Display::fmt(&self.slice(..), f)
431    }
432}