ncp_matcher/
chars.rs

1//! Utilities for working with (Unicode) characters and codepoints.
2
3use std::fmt::{self, Debug, Display};
4
5use crate::Config;
6#[cfg(feature = "unicode-casefold")]
7use crate::chars::case_fold::CASE_FOLDING_SIMPLE;
8
9// autogenerated by generate-ucd
10#[allow(warnings)]
11#[rustfmt::skip]
12#[cfg(feature = "unicode-casefold")]
13mod case_fold;
14#[cfg(feature = "unicode-normalization")]
15mod normalize;
16
17pub(crate) trait Char: Copy + Eq + Ord + fmt::Display {
18    const ASCII: bool;
19    fn char_class(self, config: &Config) -> CharClass;
20    fn char_class_and_normalize(self, config: &Config) -> (Self, CharClass);
21    fn normalize(self, config: &Config) -> Self;
22}
23
/// A `#[repr(transparent)]` wrapper around `u8` that formats as a character and
/// can be compared against `char` (see the `PartialEq<AsciiChar>` impl for `char`).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub(crate) struct AsciiChar(pub u8);

impl AsciiChar {
    /// Reinterprets a byte slice as a slice of [`AsciiChar`] without copying.
    ///
    /// The bytes are not inspected; no check that they are ASCII is performed.
    pub fn cast(bytes: &[u8]) -> &[AsciiChar] {
        let data = bytes.as_ptr().cast::<AsciiChar>();
        // SAFETY: `AsciiChar` is `#[repr(transparent)]` over `u8`, so both element
        // types have identical size, alignment and validity. The pointer and length
        // come from a live `&[u8]`, and the returned slice inherits its lifetime.
        unsafe { std::slice::from_raw_parts(data, bytes.len()) }
    }
}
34
35impl fmt::Display for AsciiChar {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        Display::fmt(&(self.0 as char), f)
38    }
39}
40
41impl PartialEq<AsciiChar> for char {
42    fn eq(&self, other: &AsciiChar) -> bool {
43        other.0 as char == *self
44    }
45}
46
47impl Char for AsciiChar {
48    const ASCII: bool = true;
49    #[inline]
50    fn char_class(self, config: &Config) -> CharClass {
51        let c = self.0;
52        // using manual if conditions instead optimizes better
53        if c >= b'a' && c <= b'z' {
54            CharClass::Lower
55        } else if c >= b'A' && c <= b'Z' {
56            CharClass::Upper
57        } else if c >= b'0' && c <= b'9' {
58            CharClass::Number
59        } else if c.is_ascii_whitespace() {
60            CharClass::Whitespace
61        } else if config.delimiter_chars.contains(&c) {
62            CharClass::Delimiter
63        } else {
64            CharClass::NonWord
65        }
66    }
67
68    #[inline(always)]
69    fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
70        let char_class = self.char_class(config);
71        if config.ignore_case && char_class == CharClass::Upper {
72            self.0 += 32
73        }
74        (self, char_class)
75    }
76
77    #[inline(always)]
78    fn normalize(mut self, config: &Config) -> Self {
79        if config.ignore_case && self.0 >= b'A' && self.0 <= b'Z' {
80            self.0 += 32
81        }
82        self
83    }
84}
85
86fn char_class_non_ascii(c: char) -> CharClass {
87    if c.is_lowercase() {
88        CharClass::Lower
89    } else if is_upper_case(c) {
90        CharClass::Upper
91    } else if c.is_numeric() {
92        CharClass::Number
93    } else if c.is_alphabetic() {
94        CharClass::Letter
95    } else if c.is_whitespace() {
96        CharClass::Whitespace
97    } else {
98        CharClass::NonWord
99    }
100}
101
102impl Char for char {
103    const ASCII: bool = false;
104    #[inline(always)]
105    fn char_class(self, config: &Config) -> CharClass {
106        if self.is_ascii() {
107            return AsciiChar(self as u8).char_class(config);
108        }
109        char_class_non_ascii(self)
110    }
111
112    #[inline(always)]
113    fn char_class_and_normalize(mut self, config: &Config) -> (Self, CharClass) {
114        if self.is_ascii() {
115            let (c, class) = AsciiChar(self as u8).char_class_and_normalize(config);
116            return (c.0 as char, class);
117        }
118        let char_class = char_class_non_ascii(self);
119        #[cfg(feature = "unicode-casefold")]
120        let mut case_fold = char_class == CharClass::Upper;
121        #[cfg(feature = "unicode-normalization")]
122        if config.normalize {
123            self = normalize::normalize(self);
124            case_fold = true
125        }
126        #[cfg(feature = "unicode-casefold")]
127        if case_fold && config.ignore_case {
128            self = CASE_FOLDING_SIMPLE
129                .binary_search_by_key(&self, |(upper, _)| *upper)
130                .map_or(self, |idx| CASE_FOLDING_SIMPLE[idx].1)
131        }
132        (self, char_class)
133    }
134
135    #[inline(always)]
136    fn normalize(mut self, config: &Config) -> Self {
137        #[cfg(feature = "unicode-normalization")]
138        if config.normalize {
139            self = normalize::normalize(self);
140        }
141        #[cfg(feature = "unicode-casefold")]
142        if config.ignore_case {
143            self = to_lower_case(self)
144        }
145        self
146    }
147}
148
149#[cfg(feature = "unicode-normalization")]
150pub use normalize::normalize;
151#[cfg(feature = "unicode-segmentation")]
152use unicode_segmentation::UnicodeSegmentation;
153
/// Maps `c` to its simple Unicode case folding.
///
/// Characters without an entry in the case folding table are returned unchanged.
#[cfg(feature = "unicode-casefold")]
#[inline(always)]
pub fn to_lower_case(c: char) -> char {
    match CASE_FOLDING_SIMPLE.binary_search_by_key(&c, |&(upper, _)| upper) {
        Ok(idx) => CASE_FOLDING_SIMPLE[idx].1,
        Err(_) => c,
    }
}
162
/// Reports whether `c` is upper case, i.e. whether the simple Unicode case
/// folding table has an entry for it.
///
/// When the `unicode-casefold` feature is disabled, this falls back to
/// [`char::is_uppercase`] from the standard library.
#[inline(always)]
pub fn is_upper_case(c: char) -> bool {
    #[cfg(feature = "unicode-casefold")]
    return CASE_FOLDING_SIMPLE
        .binary_search_by_key(&c, |&(upper, _)| upper)
        .is_ok();
    #[cfg(not(feature = "unicode-casefold"))]
    return c.is_uppercase();
}
176
/// Coarse classification of a character.
///
/// The declaration order of the variants defines the derived ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub(crate) enum CharClass {
    /// Whitespace characters.
    Whitespace,
    /// Anything that does not fall into one of the other classes.
    NonWord,
    /// An ASCII byte listed in the configured delimiter characters.
    Delimiter,
    /// Lowercase letters.
    Lower,
    /// Uppercase letters.
    Upper,
    /// Alphabetic characters that are neither lowercase nor uppercase.
    Letter,
    /// ASCII digits and other numeric characters.
    Number,
}
187
/// Yields one `char` for every grapheme of `text`.
///
/// With the `unicode-segmentation` feature enabled, each grapheme is represented
/// by its first `char`, except for the windows-style newline `\r\n`, which is
/// represented by `'\n'`. Without the feature, the `char`s of `text` are yielded
/// as-is.
///
/// This workaround mainly exists since Nucleo cannot match graphemes as single
/// units, so each grapheme must internally be mapped to a simpler in-memory
/// representation. This method is used when constructing `Utf32Str(ing)`.
pub fn graphemes(text: &str) -> impl Iterator<Item = char> + '_ {
    #[cfg(feature = "unicode-segmentation")]
    let codepoints = text.graphemes(true).map(|cluster| match cluster {
        // `\r\n` is a single grapheme and the one exception to the rule that a
        // grapheme is represented by its first character.
        "\r\n" => '\n',
        _ => cluster
            .chars()
            .next()
            .expect("graphemes must be non-empty"),
    });
    #[cfg(not(feature = "unicode-segmentation"))]
    let codepoints = text.chars();
    codepoints
}
213    let res = text.chars();
214    res
215}