ib_matcher/matcher/
encoding.rs

1use std::{ops::RangeFrom, slice::SliceIndex};
2
3/// ## Performance
4/// Although multiple encodings are supported, UTF-8 (`str`) is most optimized.
5///
6/// TODO: Extended ASCII code pages
7/// TODO: Index/SliceIndex
8pub trait EncodedStr: Sealed {
9    type CHAR;
10    type SLICE: ?Sized;
11
12    const ELEMENT_LEN_BYTE: usize = core::mem::size_of::<Self::CHAR>();
13    const CHAR: usize = Self::ELEMENT_LEN_BYTE;
14    const UTF8: bool = false;
15
16    fn is_ascii(&self) -> bool;
17    fn as_bytes(&self) -> &[u8];
18
19    unsafe fn get_unchecked<I: SliceIndex<Self::SLICE, Output = Self::SLICE>>(&self, i: I)
20        -> &Self;
21    unsafe fn get_unchecked_from(&self, range: RangeFrom<usize>) -> &Self;
22
23    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)>;
24    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)>;
25    fn chars_count(&self) -> usize {
26        self.char_index_strs().count()
27    }
28}
29
/// Home of the sealing trait.
///
/// `Sealed` itself is `pub`, but because this module is private the trait can
/// only be named (and therefore implemented) from places that can see
/// `private`. As written here, that keeps the set of `EncodedStr`
/// implementors limited to the impls below.
mod private {
    /// Marker supertrait of `EncodedStr`; it has no items of its own.
    pub trait Sealed {}
}
use private::Sealed;

// The complete list of types allowed to implement `EncodedStr`.
impl Sealed for str {}
#[cfg(feature = "encoding")]
impl Sealed for widestring::U16Str {}
#[cfg(feature = "encoding")]
impl Sealed for widestring::U32Str {}
40
41impl EncodedStr for str {
42    type CHAR = u8;
43    type SLICE = str;
44
45    const UTF8: bool = true;
46
47    fn is_ascii(&self) -> bool {
48        self.is_ascii()
49    }
50
51    fn as_bytes(&self) -> &[u8] {
52        self.as_bytes()
53    }
54
55    unsafe fn get_unchecked<I: SliceIndex<Self::SLICE, Output = Self::SLICE>>(
56        &self,
57        i: I,
58    ) -> &Self {
59        self.get_unchecked(i)
60    }
61
62    unsafe fn get_unchecked_from(&self, range: RangeFrom<usize>) -> &Self {
63        self.get_unchecked(range)
64    }
65
66    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
67        self.char_indices().map(|(i, c)| (i, c, &self[i..]))
68    }
69
70    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
71        self.char_indices().map(|(i, c)| {
72            let len = c.len_utf8();
73            (c, len, &self[i + len..])
74        })
75    }
76
77    fn chars_count(&self) -> usize {
78        self.chars().count()
79    }
80}
81
/// UTF-16 implementation backed by `widestring::U16Str`.
///
/// Offsets and lengths are counted in `u16` code units. Decoding is lossy:
/// the iterators come from `char_indices_lossy`, which substitutes the
/// replacement character for invalid input (see the widestring docs).
#[cfg(feature = "encoding")]
impl EncodedStr for widestring::U16Str {
    type CHAR = u16;
    type SLICE = [u16];

    fn is_ascii(&self) -> bool {
        // An ASCII character is always a single code unit below 0x80, and
        // every other unit (non-ASCII BMP values, surrogates) is >= 0x80, so
        // the raw units can be tested without decoding them into `char`s.
        self.as_slice().iter().all(|&unit| unit < 0x80)
    }

    /// Reinterprets the `u16` storage as bytes, i.e. in the platform's native
    /// byte order.
    fn as_bytes(&self) -> &[u8] {
        let units = self.as_slice();
        // SAFETY: `units` is a valid, initialized `[u16]`; `u8` has alignment 1
        // and no invalid bit patterns; the byte length is exactly the size of
        // the slice; the result borrows from `self`, so it cannot outlive it.
        unsafe {
            core::slice::from_raw_parts(
                units.as_ptr().cast::<u8>(),
                units.len() * core::mem::size_of::<u16>(),
            )
        }
    }

    /// # Safety
    /// Same contract as the inherent `U16Str::get_unchecked`: `i` must be in
    /// bounds.
    unsafe fn get_unchecked<I>(&self, i: I) -> &Self
    where
        I: SliceIndex<Self::SLICE, Output = Self::SLICE>,
    {
        // SAFETY: the caller upholds the contract documented above. The call
        // resolves to the inherent `U16Str::get_unchecked`, not to this method.
        unsafe { self.get_unchecked(i) }
    }

    /// # Safety
    /// `range.start` must not exceed the length of the string.
    unsafe fn get_unchecked_from(&self, range: RangeFrom<usize>) -> &Self {
        // SAFETY: the caller upholds the contract documented above.
        unsafe { self.get_unchecked(range) }
    }

    /// Yields `(code-unit offset, char, tail starting at that char)`.
    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
        self.char_indices_lossy()
            .map(move |(start, ch)| (start, ch, &self[start..]))
    }

    /// Yields `(char, UTF-16 length, tail starting after that char)`.
    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
        self.char_indices_lossy().map(move |(start, ch)| {
            // A replaced lone surrogate becomes U+FFFD, which is one unit long
            // just like the unit it replaces, so `len_utf16` stays in sync.
            let width = ch.len_utf16();
            (ch, width, &self[start + width..])
        })
    }
}
123
/// UTF-32 implementation backed by `widestring::U32Str`.
///
/// Every element is one character, so offsets are element indices and every
/// character has length 1. Decoding is lossy: the iterators come from
/// `char_indices_lossy`, which substitutes the replacement character for
/// invalid input (see the widestring docs).
#[cfg(feature = "encoding")]
impl EncodedStr for widestring::U32Str {
    type CHAR = u32;
    type SLICE = [u32];

    fn is_ascii(&self) -> bool {
        // A unit is an ASCII character exactly when its value is below 0x80;
        // anything else decodes to a non-ASCII char (or to U+FFFD), so the raw
        // units can be tested without decoding.
        self.as_slice().iter().all(|&unit| unit < 0x80)
    }

    /// Reinterprets the `u32` storage as bytes, i.e. in the platform's native
    /// byte order.
    fn as_bytes(&self) -> &[u8] {
        let units = self.as_slice();
        // SAFETY: `units` is a valid, initialized `[u32]`; `u8` has alignment 1
        // and no invalid bit patterns; the byte length is exactly the size of
        // the slice; the result borrows from `self`, so it cannot outlive it.
        unsafe {
            core::slice::from_raw_parts(
                units.as_ptr().cast::<u8>(),
                units.len() * core::mem::size_of::<u32>(),
            )
        }
    }

    /// # Safety
    /// Same contract as the inherent `U32Str::get_unchecked`: `i` must be in
    /// bounds.
    unsafe fn get_unchecked<I>(&self, i: I) -> &Self
    where
        I: SliceIndex<Self::SLICE, Output = Self::SLICE>,
    {
        // SAFETY: the caller upholds the contract documented above. The call
        // resolves to the inherent `U32Str::get_unchecked`, not to this method.
        unsafe { self.get_unchecked(i) }
    }

    /// # Safety
    /// `range.start` must not exceed the length of the string.
    unsafe fn get_unchecked_from(&self, range: RangeFrom<usize>) -> &Self {
        // SAFETY: the caller upholds the contract documented above.
        unsafe { self.get_unchecked(range) }
    }

    /// Yields `(element offset, char, tail starting at that char)`.
    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
        self.char_indices_lossy()
            .map(move |(start, ch)| (start, ch, &self[start..]))
    }

    /// Yields `(char, 1, tail starting after that char)`.
    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
        self.char_indices_lossy()
            .map(move |(start, ch)| (ch, 1, &self[start + 1..]))
    }

    /// One element is one character (invalid elements are replaced, not
    /// dropped), so the count is simply the slice length.
    fn chars_count(&self) -> usize {
        self.len()
    }
}
163
/// Unit tests for the wide-string `is_ascii` implementations.
#[cfg(test)]
mod tests {
    // Every test below is gated on the `encoding` feature; without it the
    // glob import would be reported as unused.
    #[allow(unused_imports)]
    use super::*;

    /// `is_ascii` on UTF-16 input: empty, pure ASCII, pure non-ASCII, mixed.
    #[cfg(feature = "encoding")]
    #[test]
    fn u16_is_ascii() {
        use widestring::u16str;

        assert!(u16str!("").is_ascii());
        assert!(u16str!("abc").is_ascii());
        assert!(!u16str!("协作").is_ascii());
        assert!(!u16str!("a协").is_ascii());
    }

    /// `is_ascii` on UTF-32 input: empty, pure ASCII, pure non-ASCII, mixed.
    #[cfg(feature = "encoding")]
    #[test]
    fn u32_is_ascii() {
        use widestring::u32str;

        assert!(u32str!("").is_ascii());
        assert!(u32str!("abc").is_ascii());
        assert!(!u32str!("协作").is_ascii());
        assert!(!u32str!("a协").is_ascii());
    }
}