1use needle::*;
2use haystack::{Haystack, Span};
3use std::ops::Range;
4use slices::slice::{TwoWaySearcher, SliceSearcher, NaiveSearcher};
5#[cfg(test)]
6use ext::{match_ranges, rmatch_ranges, starts_with, ends_with};
7
8use super::wtf8::{HighSurrogate, LowSurrogate, ThreeByteSeq, Wtf8};
9
10unsafe impl<'p> Searcher<Wtf8> for TwoWaySearcher<'p, u8> {
14 #[inline]
15 fn search(&mut self, span: Span<&Wtf8>) -> Option<Range<usize>> {
16 let (hay, range) = span.into_parts();
17 self.next(hay.as_inner(), range)
18 }
19}
20
21unsafe impl<'p> ReverseSearcher<Wtf8> for TwoWaySearcher<'p, u8> {
22 #[inline]
23 fn rsearch(&mut self, span: Span<&Wtf8>) -> Option<Range<usize>> {
24 let (hay, range) = span.into_parts();
25 self.next_back(hay.as_inner(), range)
26 }
27}
28
29fn span_as_inner(span: Span<&Wtf8>) -> Span<&[u8]> {
30 let (hay, range) = span.into_parts();
31 unsafe { Span::from_parts(hay.as_inner(), range) }
32}
33
34unsafe impl<'p> Consumer<Wtf8> for NaiveSearcher<'p, u8> {
35 #[inline]
36 fn consume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
37 self.consume(span_as_inner(span))
38 }
39
40 #[inline]
41 fn trim_start(&mut self, hay: &Wtf8) -> usize {
42 self.trim_start(hay.as_inner())
43 }
44}
45
46unsafe impl<'p> ReverseConsumer<Wtf8> for NaiveSearcher<'p, u8> {
47 #[inline]
48 fn rconsume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
49 self.rconsume(span_as_inner(span))
50 }
51
52 #[inline]
53 fn trim_end(&mut self, hay: &Wtf8) -> usize {
54 self.trim_end(hay.as_inner())
55 }
56}
57
58#[derive(Debug)]
59enum SurrogateType {
60 Split,
61 Canonical,
62 Empty,
63}
64
65fn extend_subrange(
66 range: Range<usize>,
67 mut subrange: Range<usize>,
68 low_type: SurrogateType,
69 high_type: SurrogateType,
70) -> Range<usize> {
71 subrange.start -= match low_type {
72 SurrogateType::Empty => 0,
73 SurrogateType::Split if subrange.start != range.start + 3 => 2,
74 _ => 3,
75 };
76 subrange.end += match high_type {
77 SurrogateType::Empty => 0,
78 SurrogateType::Split if subrange.end + 3 != range.end => 2,
79 _ => 3,
80 };
81 subrange
82}
83
84#[derive(Debug, Clone)]
85pub struct LowSurrogateSearcher {
86 canonical: u16,
87}
88
89impl LowSurrogateSearcher {
90 #[inline]
91 fn new(ls: LowSurrogate) -> Self {
92 Self {
93 canonical: ls.value() & 0xcfff,
94 }
95 }
96
97 #[inline]
98 fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> {
99 let tbs = tbs.value();
100 if (tbs & 0xcfff) as u16 != self.canonical {
101 return None;
102 }
103 match tbs >> 12 {
104 0xedb => Some(SurrogateType::Canonical),
105 0x800..=0xbff => Some(SurrogateType::Split),
106 _ => None,
107 }
108 }
109}
110
111#[derive(Debug, Clone)]
112pub struct HighSurrogateSearcher {
113 canonical: u32,
114 split: u32,
115}
116
117impl HighSurrogateSearcher {
118 #[inline]
119 fn new(hs: HighSurrogate) -> Self {
120 let c = hs.value();
141 let s = ((c & 0xf03) | (c & 0x3c) << 2) + 0x100;
142 let s = (s & 0x3ff) | (s & 0xfc00) << 2 | 0x808;
143 Self {
144 canonical: c as u32 | 0xed0000,
145 split: s as u32 | 0xf0000,
146 }
147 }
148
149 #[inline]
150 fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> {
151 let tbs = tbs.value();
152 if tbs == self.canonical {
153 Some(SurrogateType::Canonical)
154 } else if tbs >> 4 == self.split {
155 Some(SurrogateType::Split)
156 } else {
157 None
158 }
159 }
160}
161
162#[derive(Debug, Clone)]
163pub struct Wtf8Searcher<S> {
164 low: Option<LowSurrogateSearcher>,
165 middle: S,
166 high: Option<HighSurrogateSearcher>,
167}
168
169fn compare_boundary_surrogates(
170 low: &Option<LowSurrogateSearcher>,
171 high: &Option<HighSurrogateSearcher>,
172 bytes: &[u8],
173 range: Range<usize>,
174 subrange: Range<usize>,
175) -> Option<(SurrogateType, SurrogateType)> {
176 let low_type = if let Some(low) = low {
177 if subrange.start - range.start < 3 {
178 return None;
179 }
180 let tbs = unsafe { bytes.get_unchecked((subrange.start - 3)..subrange.start) };
181 low.is_match(ThreeByteSeq::new(tbs))?
182 } else {
183 SurrogateType::Empty
184 };
185
186 let high_type = if let Some(high) = high {
187 if range.end - subrange.end < 3 {
188 return None;
189 }
190 let tbs = unsafe { bytes.get_unchecked(subrange.end..(subrange.end + 3)) };
191 high.is_match(ThreeByteSeq::new(tbs))?
192 } else {
193 SurrogateType::Empty
194 };
195
196 Some((low_type, high_type))
197}
198
199unsafe impl<'p> Searcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> {
200 #[inline]
201 fn search(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> {
202 let (hay, range) = span.clone().into_parts();
203 while let Some(subrange) = self.middle.search(span.clone()) {
204 if let Some((low_type, high_type)) = compare_boundary_surrogates(
205 &self.low,
206 &self.high,
207 hay.as_inner(),
208 range.clone(),
209 subrange.clone(),
210 ) {
211 return Some(extend_subrange(range, subrange, low_type, high_type));
212 } else {
213 span = unsafe { Span::from_parts(hay, subrange.end..range.end) };
214 }
215 }
216 None
217 }
218}
219
220unsafe impl<'p> Consumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> {
221 #[inline]
222 fn consume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
223 let (hay, range) = span.into_parts();
224 let bytes = hay[range.clone()].as_inner();
225 let low_len = if self.low.is_some() { 3 } else { 0 };
226 let high_len = if self.high.is_some() { 3 } else { 0 };
227 let middle = self.middle.needle();
228 let mut match_len = low_len + middle.len() + high_len;
229 if bytes.len() < match_len {
230 return None;
231 }
232 let middle_start = low_len;
233 let middle_end = match_len - high_len;
234 if &bytes[middle_start..middle_end] != middle {
235 return None;
236 }
237 if let Some(high) = &self.high {
238 if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? {
239 if bytes.len() != match_len {
240 match_len -= 1;
241 }
242 }
243 }
244 if let Some(low) = &self.low {
245 if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(bytes))? {
246 if range.start != 0 {
247 match_len -= 1;
248 }
249 }
250 }
251 Some(range.start + match_len)
252 }
253}
254
255unsafe impl<'p> ReverseSearcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> {
256 #[inline]
257 fn rsearch(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> {
258 let (hay, range) = span.clone().into_parts();
259 while let Some(subrange) = self.middle.rsearch(span.clone()) {
260 if let Some((low_type, high_type)) = compare_boundary_surrogates(
261 &self.low,
262 &self.high,
263 hay.as_inner(),
264 range.clone(),
265 subrange.clone(),
266 ) {
267 return Some(extend_subrange(range, subrange, low_type, high_type));
268 } else {
269 span = unsafe { Span::from_parts(hay, range.start..subrange.start) };
270 }
271 }
272 None
273 }
274}
275
276unsafe impl<'p> ReverseConsumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> {
277 #[inline]
278 fn rconsume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
279 let (hay, range) = span.into_parts();
280 let bytes = hay[range.clone()].as_inner();
281 let low_len = if self.low.is_some() { 3 } else { 0 };
282 let high_len = if self.high.is_some() { 3 } else { 0 };
283 let middle = self.middle.needle();
284 let mut match_len = low_len + middle.len() + high_len;
285 if bytes.len() < match_len {
286 return None;
287 }
288 let middle_end = bytes.len() - high_len;
289 let middle_start = middle_end - middle.len();
290 if &bytes[middle_start..middle_end] != middle {
291 return None;
292 }
293 if let Some(low) = &self.low {
294 let start_index = bytes.len() - match_len;
295 if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(&bytes[start_index..]))? {
296 if start_index != 0 {
297 match_len -= 1;
298 }
299 }
300 }
301 if let Some(high) = &self.high {
302 if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? {
303 if bytes.len() != range.end {
304 match_len -= 1;
305 }
306 }
307 }
308 Some(range.end - match_len)
309 }
310}
311
312impl<'p, H: Haystack<Target = Wtf8>> Needle<H> for &'p Wtf8 {
313 type Searcher = Wtf8Searcher<SliceSearcher<'p, u8>>;
314 type Consumer = Wtf8Searcher<NaiveSearcher<'p, u8>>;
315
316 fn into_searcher(self) -> Self::Searcher {
317 let (low, middle, high) = self.canonicalize();
318 Wtf8Searcher {
319 low: low.map(LowSurrogateSearcher::new),
320 middle: SliceSearcher::new(middle),
321 high: high.map(HighSurrogateSearcher::new),
322 }
323 }
324
325 fn into_consumer(self) -> Self::Consumer {
326 let (low, middle, high) = self.canonicalize();
327 Wtf8Searcher {
328 low: low.map(LowSurrogateSearcher::new),
329 middle: NaiveSearcher::new(middle),
330 high: high.map(HighSurrogateSearcher::new),
331 }
332 }
333}
334
335impl<'h, 'p> Needle<&'h Wtf8> for &'p str {
338 type Searcher = SliceSearcher<'p, u8>;
339 type Consumer = NaiveSearcher<'p, u8>;
340
341 fn into_searcher(self) -> Self::Searcher {
342 SliceSearcher::new(self.as_bytes())
343 }
344
345 fn into_consumer(self) -> Self::Consumer {
346 NaiveSearcher::new(self.as_bytes())
347 }
348}