dangerous/input/
pattern.rs

1// # Safety note about searching for substrings for `Pattern`.
2//
3// As we are looking for exact slice matches, the needle and haystack are both
4// UTF-8, we are searching forward and the fact the first byte of a UTF-8
5// character cannot clash with a continuation byte we can safely search as just
6// raw bytes and return those indexes.
7//
8// For searching within a string we care that we don't return an index that
9// isn't a char boundary, but with the above conditions this is impossible.
10//
11// Note in the below UTF-8 byte sequence table continuation bytes (bytes 2-4)
12// cannot clash with the first byte. This means when the first byte of the
13// needle string matches the first byte of the haystack string, we are at a
14// valid char boundary and if rest of the needle matches the indexes we return
15// are valid.
16//
17// | Length | Byte 1   | Byte 2   | Byte 3   | Byte 4   |
18// | ------ | -------- | -------- | -------- | -------- |
19// | 1      | 0xxxxxxx |          |          |          |
20// | 2      | 110xxxxx | 10xxxxxx |          |          |
21// | 3      | 1110xxxx | 10xxxxxx | 10xxxxxx |          |
22// | 4      | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
23
24use crate::input::{Bytes, String};
25use crate::util::fast;
26
27/// Implemented for structures that can be found within an
28/// [`Input`](crate::Input).
29///
30/// You can search for a `char` or `&str` within either `Bytes` or `String`, but
31/// only a `u8` and `&[u8]` within `Bytes`.
32///
33/// Empty slices are invalid patterns and have the following behaviour:
34///
35/// - Finding a match of a empty slice pattern will return `None`.
36/// - Finding a reject of a empty slice pattern will return `Some(0)`.
37///
38/// With the `simd` feature enabled pattern searches are SIMD optimised where
39/// possible.
40///
41/// With the `regex` feature enabled, you can search for regex patterns.
42///
43/// # Safety
44///
45/// The implementation must return valid indexes and lengths for splitting input
46/// as these are not checked.
47pub unsafe trait Pattern<I> {
48    /// Returns the byte index and byte length of the first match and `None` if
49    /// there was no match.
50    fn find_match(self, input: &I) -> Option<(usize, usize)>;
51
52    /// Returns the byte index of the first reject and `None` if there was no
53    /// reject.
54    fn find_reject(self, input: &I) -> Option<usize>;
55}
56
57///////////////////////////////////////////////////////////////////////////////
58// Fn pattern
59
60unsafe impl<'i, F> Pattern<Bytes<'i>> for F
61where
62    F: FnMut(u8) -> bool,
63{
64    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
65        input
66            .as_dangerous()
67            .iter()
68            .copied()
69            .position(self)
70            .map(|i| (i, 1))
71    }
72
73    fn find_reject(mut self, input: &Bytes<'i>) -> Option<usize> {
74        input
75            .as_dangerous()
76            .iter()
77            .copied()
78            .enumerate()
79            .find_map(|(i, b)| if (self)(b) { None } else { Some(i) })
80    }
81}
82
83unsafe impl<'i, F> Pattern<String<'i>> for F
84where
85    F: FnMut(char) -> bool,
86{
87    fn find_match(mut self, input: &String<'i>) -> Option<(usize, usize)> {
88        for (i, c) in input.as_dangerous().char_indices() {
89            if !(self)(c) {
90                return Some((i, c.len_utf8()));
91            }
92        }
93        None
94    }
95
96    fn find_reject(mut self, input: &String<'i>) -> Option<usize> {
97        input.as_dangerous().char_indices().find_map(
98            |(i, b)| {
99                if (self)(b) {
100                    None
101                } else {
102                    Some(i)
103                }
104            },
105        )
106    }
107}
108
109///////////////////////////////////////////////////////////////////////////////
110// Token pattern
111
112unsafe impl<'i> Pattern<Bytes<'i>> for u8 {
113    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
114        fast::find_u8_match(self, input.as_dangerous()).map(|index| (index, 1))
115    }
116
117    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
118        fast::find_u8_reject(self, input.as_dangerous())
119    }
120}
121
122unsafe impl<'i> Pattern<Bytes<'i>> for char {
123    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
124        fast::find_char_match(self, input.as_dangerous()).map(|index| (index, self.len_utf8()))
125    }
126
127    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
128        fast::find_char_reject(self, input.as_dangerous())
129    }
130}
131
132unsafe impl<'i> Pattern<String<'i>> for char {
133    fn find_match(self, input: &String<'i>) -> Option<(usize, usize)> {
134        fast::find_char_match(self, input.as_dangerous().as_bytes())
135            .map(|index| (index, self.len_utf8()))
136    }
137
138    fn find_reject(self, input: &String<'i>) -> Option<usize> {
139        fast::find_char_reject(self, input.as_dangerous().as_bytes())
140    }
141}
142
143///////////////////////////////////////////////////////////////////////////////
144// Sub-slice pattern
145
146unsafe impl<'i> Pattern<Bytes<'i>> for &[u8] {
147    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
148        fast::find_slice_match(self, input.as_dangerous()).map(|index| (index, self.len()))
149    }
150
151    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
152        fast::find_slice_reject(self, input.as_dangerous())
153    }
154}
155
156unsafe impl<'i, const N: usize> Pattern<Bytes<'i>> for &[u8; N] {
157    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
158        fast::find_slice_match(self, input.as_dangerous()).map(|index| (index, self.len()))
159    }
160
161    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
162        fast::find_slice_reject(self, input.as_dangerous())
163    }
164}
165
166unsafe impl<'i> Pattern<Bytes<'i>> for &str {
167    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
168        fast::find_slice_match(self.as_bytes(), input.as_dangerous())
169            .map(|index| (index, self.len()))
170    }
171
172    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
173        fast::find_slice_reject(self.as_bytes(), input.as_dangerous())
174    }
175}
176
177unsafe impl<'i> Pattern<String<'i>> for &str {
178    #[inline]
179    fn find_match(self, input: &String<'i>) -> Option<(usize, usize)> {
180        fast::find_slice_match(self.as_bytes(), input.as_dangerous().as_bytes())
181            .map(|index| (index, self.len()))
182    }
183
184    #[inline]
185    fn find_reject(self, input: &String<'i>) -> Option<usize> {
186        fast::find_slice_reject(self.as_bytes(), input.as_dangerous().as_bytes())
187    }
188}
189
190///////////////////////////////////////////////////////////////////////////////
191// Regex pattern
192
193#[cfg(feature = "regex")]
194unsafe impl<'i> Pattern<String<'i>> for &regex::Regex {
195    fn find_match(self, input: &String<'i>) -> Option<(usize, usize)> {
196        regex::Regex::find(self, input.as_dangerous()).map(|m| (m.start(), m.end() - m.start()))
197    }
198
199    fn find_reject(self, input: &String<'i>) -> Option<usize> {
200        let mut maybe_reject = 0;
201        loop {
202            match regex::Regex::find_at(self, input.as_dangerous(), maybe_reject) {
203                Some(m) if input.as_dangerous().len() == m.end() => return None,
204                Some(m) => {
205                    maybe_reject = m.end();
206                }
207                None => return Some(maybe_reject),
208            }
209        }
210    }
211}
212
213#[cfg(feature = "regex")]
214unsafe impl<'i> Pattern<Bytes<'i>> for &regex::bytes::Regex {
215    fn find_match(self, input: &Bytes<'i>) -> Option<(usize, usize)> {
216        regex::bytes::Regex::find(self, input.as_dangerous())
217            .map(|m| (m.start(), m.end() - m.start()))
218    }
219
220    fn find_reject(self, input: &Bytes<'i>) -> Option<usize> {
221        let mut maybe_reject = 0;
222        loop {
223            match regex::bytes::Regex::find_at(self, input.as_dangerous(), maybe_reject) {
224                Some(m) if input.len() == m.end() => return None,
225                Some(m) => {
226                    maybe_reject = m.end();
227                }
228                None => return Some(maybe_reject),
229            }
230        }
231    }
232}