pattern_3/omgwtf8/
wtf8_pat.rs

1use needle::*;
2use haystack::{Haystack, Span};
3use std::ops::Range;
4use slices::slice::{TwoWaySearcher, SliceSearcher, NaiveSearcher};
5#[cfg(test)]
6use ext::{match_ranges, rmatch_ranges, starts_with, ends_with};
7
8use super::wtf8::{HighSurrogate, LowSurrogate, ThreeByteSeq, Wtf8};
9
10// The TwoWaySearcher will not match unpaired surrogate at boundary, so no need
11// to convert logical range to physical range.
12
13unsafe impl<'p> Searcher<Wtf8> for TwoWaySearcher<'p, u8> {
14    #[inline]
15    fn search(&mut self, span: Span<&Wtf8>) -> Option<Range<usize>> {
16        let (hay, range) = span.into_parts();
17        self.next(hay.as_inner(), range)
18    }
19}
20
21unsafe impl<'p> ReverseSearcher<Wtf8> for TwoWaySearcher<'p, u8> {
22    #[inline]
23    fn rsearch(&mut self, span: Span<&Wtf8>) -> Option<Range<usize>> {
24        let (hay, range) = span.into_parts();
25        self.next_back(hay.as_inner(), range)
26    }
27}
28
29fn span_as_inner(span: Span<&Wtf8>) -> Span<&[u8]> {
30    let (hay, range) = span.into_parts();
31    unsafe { Span::from_parts(hay.as_inner(), range) }
32}
33
34unsafe impl<'p> Consumer<Wtf8> for NaiveSearcher<'p, u8> {
35    #[inline]
36    fn consume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
37        self.consume(span_as_inner(span))
38    }
39
40    #[inline]
41    fn trim_start(&mut self, hay: &Wtf8) -> usize {
42        self.trim_start(hay.as_inner())
43    }
44}
45
46unsafe impl<'p> ReverseConsumer<Wtf8> for NaiveSearcher<'p, u8> {
47    #[inline]
48    fn rconsume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
49        self.rconsume(span_as_inner(span))
50    }
51
52    #[inline]
53    fn trim_end(&mut self, hay: &Wtf8) -> usize {
54        self.trim_end(hay.as_inner())
55    }
56}
57
58#[derive(Debug)]
59enum SurrogateType {
60    Split,
61    Canonical,
62    Empty,
63}
64
65fn extend_subrange(
66    range: Range<usize>,
67    mut subrange: Range<usize>,
68    low_type: SurrogateType,
69    high_type: SurrogateType,
70) -> Range<usize> {
71    subrange.start -= match low_type {
72        SurrogateType::Empty => 0,
73        SurrogateType::Split if subrange.start != range.start + 3 => 2,
74        _ => 3,
75    };
76    subrange.end += match high_type {
77        SurrogateType::Empty => 0,
78        SurrogateType::Split if subrange.end + 3 != range.end => 2,
79        _ => 3,
80    };
81    subrange
82}
83
84#[derive(Debug, Clone)]
85pub struct LowSurrogateSearcher {
86    canonical: u16,
87}
88
89impl LowSurrogateSearcher {
90    #[inline]
91    fn new(ls: LowSurrogate) -> Self {
92        Self {
93            canonical: ls.value() & 0xcfff,
94        }
95    }
96
97    #[inline]
98    fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> {
99        let tbs = tbs.value();
100        if (tbs & 0xcfff) as u16 != self.canonical {
101            return None;
102        }
103        match tbs >> 12 {
104            0xedb => Some(SurrogateType::Canonical),
105            0x800..=0xbff => Some(SurrogateType::Split),
106            _ => None,
107        }
108    }
109}
110
111#[derive(Debug, Clone)]
112pub struct HighSurrogateSearcher {
113    canonical: u32,
114    split: u32,
115}
116
117impl HighSurrogateSearcher {
118    #[inline]
119    fn new(hs: HighSurrogate) -> Self {
120        // the canonical representation
121        //
122        //          c = 1010 jihg 10fe dcba
123        //
124        // rearrange
125        //
126        //  c & 0xf03 = 0000 jihg 0000 00ba
127        //   c & 0xfc = 0000 0000 00fe dc00
128        // ...|...<<2 = 0000 jihg fedc 00ba
129        //  ...+0x100 = 000K JIHG fedc 00ba
130        //
131        // rearrange again
132        //
133        //  s & 0x3ff = 0000 00HG fedc 00ba
134        // s & 0xfc00 = 000K JI00 0000 0000
135        // ...|...<<2 = 0KJI 00HG fedc 00ba
136        //  ...|0x808 = 0KJI 10HG fedc 10ba
137        //
138        // this will be the split representation shifted right by 4 bits.
139
140        let c = hs.value();
141        let s = ((c & 0xf03) | (c & 0x3c) << 2) + 0x100;
142        let s = (s & 0x3ff) | (s & 0xfc00) << 2 | 0x808;
143        Self {
144            canonical: c as u32 | 0xed0000,
145            split: s as u32 | 0xf0000,
146        }
147    }
148
149    #[inline]
150    fn is_match(&self, tbs: ThreeByteSeq) -> Option<SurrogateType> {
151        let tbs = tbs.value();
152        if tbs == self.canonical {
153            Some(SurrogateType::Canonical)
154        } else if tbs >> 4 == self.split {
155            Some(SurrogateType::Split)
156        } else {
157            None
158        }
159    }
160}
161
162#[derive(Debug, Clone)]
163pub struct Wtf8Searcher<S> {
164    low: Option<LowSurrogateSearcher>,
165    middle: S,
166    high: Option<HighSurrogateSearcher>,
167}
168
169fn compare_boundary_surrogates(
170    low: &Option<LowSurrogateSearcher>,
171    high: &Option<HighSurrogateSearcher>,
172    bytes: &[u8],
173    range: Range<usize>,
174    subrange: Range<usize>,
175) -> Option<(SurrogateType, SurrogateType)> {
176    let low_type = if let Some(low) = low {
177        if subrange.start - range.start < 3 {
178            return None;
179        }
180        let tbs = unsafe { bytes.get_unchecked((subrange.start - 3)..subrange.start) };
181        low.is_match(ThreeByteSeq::new(tbs))?
182    } else {
183        SurrogateType::Empty
184    };
185
186    let high_type = if let Some(high) = high {
187        if range.end - subrange.end < 3 {
188            return None;
189        }
190        let tbs = unsafe { bytes.get_unchecked(subrange.end..(subrange.end + 3)) };
191        high.is_match(ThreeByteSeq::new(tbs))?
192    } else {
193        SurrogateType::Empty
194    };
195
196    Some((low_type, high_type))
197}
198
199unsafe impl<'p> Searcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> {
200    #[inline]
201    fn search(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> {
202        let (hay, range) = span.clone().into_parts();
203        while let Some(subrange) = self.middle.search(span.clone()) {
204            if let Some((low_type, high_type)) = compare_boundary_surrogates(
205                &self.low,
206                &self.high,
207                hay.as_inner(),
208                range.clone(),
209                subrange.clone(),
210            ) {
211                return Some(extend_subrange(range, subrange, low_type, high_type));
212            } else {
213                span = unsafe { Span::from_parts(hay, subrange.end..range.end) };
214            }
215        }
216        None
217    }
218}
219
220unsafe impl<'p> Consumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> {
221    #[inline]
222    fn consume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
223        let (hay, range) = span.into_parts();
224        let bytes = hay[range.clone()].as_inner();
225        let low_len = if self.low.is_some() { 3 } else { 0 };
226        let high_len = if self.high.is_some() { 3 } else { 0 };
227        let middle = self.middle.needle();
228        let mut match_len = low_len + middle.len() + high_len;
229        if bytes.len() < match_len {
230            return None;
231        }
232        let middle_start = low_len;
233        let middle_end = match_len - high_len;
234        if &bytes[middle_start..middle_end] != middle {
235            return None;
236        }
237        if let Some(high) = &self.high {
238            if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? {
239                if bytes.len() != match_len {
240                    match_len -= 1;
241                }
242            }
243        }
244        if let Some(low) = &self.low {
245            if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(bytes))? {
246                if range.start != 0 {
247                    match_len -= 1;
248                }
249            }
250        }
251        Some(range.start + match_len)
252    }
253}
254
255unsafe impl<'p> ReverseSearcher<Wtf8> for Wtf8Searcher<SliceSearcher<'p, u8>> {
256    #[inline]
257    fn rsearch(&mut self, mut span: Span<&Wtf8>) -> Option<Range<usize>> {
258        let (hay, range) = span.clone().into_parts();
259        while let Some(subrange) = self.middle.rsearch(span.clone()) {
260            if let Some((low_type, high_type)) = compare_boundary_surrogates(
261                &self.low,
262                &self.high,
263                hay.as_inner(),
264                range.clone(),
265                subrange.clone(),
266            ) {
267                return Some(extend_subrange(range, subrange, low_type, high_type));
268            } else {
269                span = unsafe { Span::from_parts(hay, range.start..subrange.start) };
270            }
271        }
272        None
273    }
274}
275
276unsafe impl<'p> ReverseConsumer<Wtf8> for Wtf8Searcher<NaiveSearcher<'p, u8>> {
277    #[inline]
278    fn rconsume(&mut self, span: Span<&Wtf8>) -> Option<usize> {
279        let (hay, range) = span.into_parts();
280        let bytes = hay[range.clone()].as_inner();
281        let low_len = if self.low.is_some() { 3 } else { 0 };
282        let high_len = if self.high.is_some() { 3 } else { 0 };
283        let middle = self.middle.needle();
284        let mut match_len = low_len + middle.len() + high_len;
285        if bytes.len() < match_len {
286            return None;
287        }
288        let middle_end = bytes.len() - high_len;
289        let middle_start = middle_end - middle.len();
290        if &bytes[middle_start..middle_end] != middle {
291            return None;
292        }
293        if let Some(low) = &self.low {
294            let start_index = bytes.len() - match_len;
295            if let SurrogateType::Split = low.is_match(ThreeByteSeq::new(&bytes[start_index..]))? {
296                if start_index != 0 {
297                    match_len -= 1;
298                }
299            }
300        }
301        if let Some(high) = &self.high {
302            if let SurrogateType::Split = high.is_match(ThreeByteSeq::new(&bytes[middle_end..]))? {
303                if bytes.len() != range.end {
304                    match_len -= 1;
305                }
306            }
307        }
308        Some(range.end - match_len)
309    }
310}
311
312impl<'p, H: Haystack<Target = Wtf8>> Needle<H> for &'p Wtf8 {
313    type Searcher = Wtf8Searcher<SliceSearcher<'p, u8>>;
314    type Consumer = Wtf8Searcher<NaiveSearcher<'p, u8>>;
315
316    fn into_searcher(self) -> Self::Searcher {
317        let (low, middle, high) = self.canonicalize();
318        Wtf8Searcher {
319            low: low.map(LowSurrogateSearcher::new),
320            middle: SliceSearcher::new(middle),
321            high: high.map(HighSurrogateSearcher::new),
322        }
323    }
324
325    fn into_consumer(self) -> Self::Consumer {
326        let (low, middle, high) = self.canonicalize();
327        Wtf8Searcher {
328            low: low.map(LowSurrogateSearcher::new),
329            middle: NaiveSearcher::new(middle),
330            high: high.map(HighSurrogateSearcher::new),
331        }
332    }
333}
334
335// FIXME cannot impl `Needle<(_: Haystack<Target = Wtf8>)>` due to RFC 1672 being postponed.
336// (need to wait for chalk)
337impl<'h, 'p> Needle<&'h Wtf8> for &'p str {
338    type Searcher = SliceSearcher<'p, u8>;
339    type Consumer = NaiveSearcher<'p, u8>;
340
341    fn into_searcher(self) -> Self::Searcher {
342        SliceSearcher::new(self.as_bytes())
343    }
344
345    fn into_consumer(self) -> Self::Consumer {
346        NaiveSearcher::new(self.as_bytes())
347    }
348}