pattern_3/slices/
slice.rs

1use needle::*;
2use haystack::{Hay, Span};
3use std::cmp::{Ordering, max, min};
4use std::usize;
5use std::ops::Range;
6
7//------------------------------------------------------------------------------
8// Two way searcher helpers
9//------------------------------------------------------------------------------
10
11type FastSkipByteset = u64;
12
13trait FastSkipOptimization {
14    fn byteset_mask(&self) -> FastSkipByteset;
15}
16
17impl<T: ?Sized> FastSkipOptimization for T {
18    #[inline]
19    default fn byteset_mask(&self) -> FastSkipByteset { !0 }
20}
21
22impl FastSkipOptimization for u8 {
23    #[inline]
24    fn byteset_mask(&self) -> FastSkipByteset { 1 << (self & 63) }
25}
26
27trait MaximalSuffix: Sized {
28    // Compute the maximal suffix of `&[T]`.
29    //
30    // The maximal suffix is a possible critical factorization (u, v) of `arr`.
31    //
32    // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
33    // period of v.
34    //
35    // `order` determines if lexical order is `<` or `>`. Both
36    // orders must be computed -- the ordering with the largest `i` gives
37    // a critical factorization.
38    //
39    // For long period cases, the resulting period is not exact (it is too short).
40    fn maximal_suffix(arr: &[Self], order: Ordering) -> (usize, usize);
41
42    // Compute the maximal suffix of the reverse of `arr`.
43    //
44    // The maximal suffix is a possible critical factorization (u', v') of `arr`.
45    //
46    // Returns `i` where `i` is the starting index of v', from the back;
47    // returns immediately when a period of `known_period` is reached.
48    //
49    // `order_greater` determines if lexical order is `<` or `>`. Both
50    // orders must be computed -- the ordering with the largest `i` gives
51    // a critical factorization.
52    //
53    // For long period cases, the resulting period is not exact (it is too short).
54    fn reverse_maximal_suffix(arr: &[Self], known_period: usize, order: Ordering) -> usize;
55}
56
57// fallback to naive search for non-Ord slices.
58impl<T: PartialEq> MaximalSuffix for T {
59    default fn maximal_suffix(_: &[Self], _: Ordering) -> (usize, usize) {
60        (0, 1)
61    }
62
63    default fn reverse_maximal_suffix(_: &[Self], _: usize, _: Ordering) -> usize {
64        0
65    }
66}
67
68impl<T: Ord> MaximalSuffix for T {
69    fn maximal_suffix(arr: &[Self], order: Ordering) -> (usize, usize) {
70        let mut left = 0; // Corresponds to i in the paper
71        let mut right = 1; // Corresponds to j in the paper
72        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
73                            // to match 0-based indexing.
74        let mut period = 1; // Corresponds to p in the paper
75
76        while let Some(a) = arr.get(right + offset) {
77            // `left` will be inbounds when `right` is.
78            let b = &arr[left + offset];
79            match a.cmp(b) {
80                Ordering::Equal => {
81                    // Advance through repetition of the current period.
82                    if offset + 1 == period {
83                        right += offset + 1;
84                        offset = 0;
85                    } else {
86                        offset += 1;
87                    }
88                }
89                o if o == order => {
90                    // Suffix is smaller, period is entire prefix so far.
91                    right += offset + 1;
92                    offset = 0;
93                    period = right - left;
94                }
95                _ => {
96                    // Suffix is larger, start over from current location.
97                    left = right;
98                    right += 1;
99                    offset = 0;
100                    period = 1;
101                }
102            };
103        }
104        (left, period)
105    }
106
107    fn reverse_maximal_suffix(arr: &[Self], known_period: usize, order: Ordering) -> usize {
108        let mut left = 0; // Corresponds to i in the paper
109        let mut right = 1; // Corresponds to j in the paper
110        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
111                            // to match 0-based indexing.
112        let mut period = 1; // Corresponds to p in the paper
113        let n = arr.len();
114
115        while right + offset < n {
116            let a = &arr[n - (1 + right + offset)];
117            let b = &arr[n - (1 + left + offset)];
118            match a.cmp(b) {
119                Ordering::Equal => {
120                    // Advance through repetition of the current period.
121                    if offset + 1 == period {
122                        right += offset + 1;
123                        offset = 0;
124                    } else {
125                        offset += 1;
126                    }
127                }
128                o if o == order => {
129                    // Suffix is smaller, period is entire prefix so far.
130                    right += offset + 1;
131                    offset = 0;
132                    period = right - left;
133                }
134                _ => {
135                    // Suffix is larger, start over from current location.
136                    left = right;
137                    right += 1;
138                    offset = 0;
139                    period = 1;
140                }
141            }
142            if period == known_period {
143                break;
144            }
145        }
146        debug_assert!(period <= known_period);
147        left
148    }
149}
150
151//------------------------------------------------------------------------------
152// Two way searcher
153//------------------------------------------------------------------------------
154
155struct LongPeriod;
156struct ShortPeriod;
157
158trait Period {
159    const IS_LONG_PERIOD: bool;
160}
161impl Period for LongPeriod {
162    const IS_LONG_PERIOD: bool = true;
163}
164impl Period for ShortPeriod {
165    const IS_LONG_PERIOD: bool = false;
166}
167
168#[derive(Debug)]
169pub struct TwoWaySearcher<'p, T: 'p> {
170    // constants
171    /// critical factorization index
172    crit_pos: usize,
173    /// critical factorization index for reversed needle
174    crit_pos_back: usize,
175
176    period: usize,
177
178    /// `byteset` is an extension (not part of the two way algorithm);
179    /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
180    /// to a (byte & 63) == j present in the needle.
181    byteset: FastSkipByteset,
182
183    needle: &'p [T],
184
185    // variables
186    /// index into needle before which we have already matched
187    memory: usize,
188    /// index into needle after which we have already matched
189    memory_back: usize,
190}
191
192impl<'p, T: 'p> Clone for TwoWaySearcher<'p, T> {
193    fn clone(&self) -> Self {
194        *self
195    }
196}
197
198impl<'p, T: 'p> Copy for TwoWaySearcher<'p, T> {}
199
200impl<'p, T> TwoWaySearcher<'p, T>
201where
202    T: PartialEq + 'p,
203{
204    #[inline]
205    fn do_next<P: Period>(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> {
206        let needle = self.needle;
207
208        let mut position = range.start;
209        'search: loop {
210            // Check that we have room to search in
211            // position + needle_last can not overflow if we assume slices
212            // are bounded by isize's range.
213            let i = position + (needle.len() - 1);
214            if i >= range.end {
215                return None;
216            }
217            // let tail_item = &hay[i]; // using get_unchecked here would be slower
218            let tail_item = unsafe { hay.get_unchecked(i) };
219
220            // Quickly skip by large portions unrelated to our substring
221            if !self.byteset_contains(tail_item) {
222                position += needle.len();
223                if !P::IS_LONG_PERIOD {
224                    self.memory = 0;
225                }
226                continue 'search;
227            }
228
229            // See if the right part of the needle matches
230            let start = if P::IS_LONG_PERIOD {
231                self.crit_pos
232            } else {
233                max(self.crit_pos, self.memory)
234            };
235            for i in start..needle.len() {
236                if unsafe { needle.get_unchecked(i) != hay.get_unchecked(position + i) } {
237                    position += i - self.crit_pos + 1;
238                    if !P::IS_LONG_PERIOD {
239                        self.memory = 0;
240                    }
241                    continue 'search;
242                }
243            }
244
245            // See if the left part of the needle matches
246            let start = if P::IS_LONG_PERIOD { 0 } else { self.memory };
247            for i in (start..self.crit_pos).rev() {
248                if unsafe { needle.get_unchecked(i) != hay.get_unchecked(position + i) } {
249                    position += self.period;
250                    if !P::IS_LONG_PERIOD {
251                        self.memory = needle.len() - self.period;
252                    }
253                    continue 'search;
254                }
255            }
256
257            // We have found a match!
258            // Note: add self.period instead of needle.len() to have overlapping matches
259            if !P::IS_LONG_PERIOD {
260                self.memory = 0; // set to needle.len() - self.period for overlapping matches
261            }
262            return Some(position..(position + needle.len()));
263        }
264    }
265
266    #[inline]
267    pub(crate) fn next(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> {
268        if self.memory != usize::MAX {
269            self.do_next::<ShortPeriod>(hay, range)
270        } else {
271            self.do_next::<LongPeriod>(hay, range)
272        }
273    }
274
275    #[inline]
276    fn do_next_back<P: Period>(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> {
277        let needle = self.needle;
278        let mut end = range.end;
279        'search: loop {
280            // Check that we have room to search in
281            // end - needle.len() will wrap around when there is no more room,
282            // but due to slice length limits it can never wrap all the way back
283            // into the length of hay.
284            if needle.len() + range.start > end {
285                return None;
286            }
287            let front_item = unsafe { hay.get_unchecked(end.wrapping_sub(needle.len())) };
288
289            // Quickly skip by large portions unrelated to our substring
290            if !self.byteset_contains(front_item) {
291                end -= needle.len();
292                if !P::IS_LONG_PERIOD {
293                    self.memory_back = needle.len();
294                }
295                continue 'search;
296            }
297
298            // See if the left part of the needle matches
299            let crit = if P::IS_LONG_PERIOD {
300                self.crit_pos_back
301            } else {
302                min(self.crit_pos_back, self.memory_back)
303            };
304            for i in (0..crit).rev() {
305                if unsafe { needle.get_unchecked(i) != hay.get_unchecked(end - needle.len() + i) } {
306                    end -= self.crit_pos_back - i;
307                    if !P::IS_LONG_PERIOD {
308                        self.memory_back = needle.len();
309                    }
310                    continue 'search;
311                }
312            }
313
314            // See if the right part of the needle matches
315            let needle_end = if P::IS_LONG_PERIOD { needle.len() } else { self.memory_back };
316            for i in self.crit_pos_back..needle_end {
317                if unsafe { needle.get_unchecked(i) != hay.get_unchecked(end - needle.len() + i) } {
318                    end -= self.period;
319                    if !P::IS_LONG_PERIOD {
320                        self.memory_back = self.period;
321                    }
322                    continue 'search;
323                }
324            }
325
326            // We have found a match!
327            if !P::IS_LONG_PERIOD {
328                self.memory_back = needle.len();
329            }
330            return Some((end - needle.len())..end);
331        }
332    }
333
334    #[inline]
335    pub(crate) fn next_back(&mut self, hay: &[T], range: Range<usize>) -> Option<Range<usize>> {
336        if self.memory != usize::MAX {
337            self.do_next_back::<ShortPeriod>(hay, range)
338        } else {
339            self.do_next_back::<LongPeriod>(hay, range)
340        }
341    }
342
343    #[inline]
344    pub(crate) fn new(needle: &'p [T]) -> Self {
345        let res_lt = T::maximal_suffix(needle, Ordering::Less);
346        let res_gt = T::maximal_suffix(needle, Ordering::Greater);
347        let (crit_pos, period) = max(res_lt, res_gt);
348
349        let byteset = Self::byteset_create(needle);
350
351        // A particularly readable explanation of what's going on here can be found
352        // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
353        // see the code for "Algorithm CP" on p. 323.
354        //
355        // What's going on is we have some critical factorization (u, v) of the
356        // needle, and we want to determine whether u is a suffix of
357        // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
358        // "Algorithm CP2", which is optimized for when the period of the needle
359        // is large.
360        if needle[..crit_pos] == needle[period..(period + crit_pos)] {
361            // short period case -- the period is exact
362            // compute a separate critical factorization for the reversed needle
363            // x = u' v' where |v'| < period(x).
364            //
365            // This is sped up by the period being known already.
366            // Note that a case like x = "acba" may be factored exactly forwards
367            // (crit_pos = 1, period = 3) while being factored with approximate
368            // period in reverse (crit_pos = 2, period = 2). We use the given
369            // reverse factorization but keep the exact period.
370            let crit_pos_back = needle.len() - max(
371                T::reverse_maximal_suffix(needle, period, Ordering::Greater),
372                T::reverse_maximal_suffix(needle, period, Ordering::Less),
373            );
374
375            Self {
376                crit_pos,
377                crit_pos_back,
378                period,
379                byteset,
380                needle,
381                memory: 0,
382                memory_back: needle.len(),
383            }
384        } else {
385            Self {
386                crit_pos,
387                crit_pos_back: crit_pos,
388                period: max(crit_pos, needle.len() - crit_pos) + 1,
389                byteset,
390                needle,
391                memory: usize::MAX, // Dummy value to signify that the period is long
392                memory_back: usize::MAX,
393            }
394        }
395    }
396
397    #[inline]
398    fn byteset_create(needle: &[T]) -> FastSkipByteset {
399        needle.iter().fold(0, |a, b| b.byteset_mask() | a)
400    }
401    #[inline]
402    fn byteset_contains(&self, item: &T) -> bool {
403        (self.byteset & item.byteset_mask()) != 0
404    }
405}
406
407unsafe impl<'p, T> Searcher<[T]> for TwoWaySearcher<'p, T>
408where
409    T: PartialEq + 'p,
410{
411    #[inline]
412    fn search(&mut self, span: Span<&[T]>) -> Option<Range<usize>> {
413        let (hay, range) = span.into_parts();
414        self.next(hay, range)
415    }
416}
417
418unsafe impl<'p, T> ReverseSearcher<[T]> for TwoWaySearcher<'p, T>
419where
420    T: PartialEq + 'p,
421{
422    #[inline]
423    fn rsearch(&mut self, span: Span<&[T]>) -> Option<Range<usize>> {
424        let (hay, range) = span.into_parts();
425        self.next_back(hay, range)
426    }
427}
428
429//------------------------------------------------------------------------------
430// Naive (state-less) searcher
431//------------------------------------------------------------------------------
432
433#[derive(Debug)]
434pub struct NaiveSearcher<'p, T: 'p>(&'p [T]);
435
436impl<'p, T: 'p> Clone for NaiveSearcher<'p, T> {
437    fn clone(&self) -> Self {
438        *self
439    }
440}
441
442impl<'p, T: 'p> Copy for NaiveSearcher<'p, T> {}
443
444unsafe impl<'p, T> Consumer<[T]> for NaiveSearcher<'p, T>
445where
446    T: PartialEq + 'p,
447{
448    #[inline]
449    fn consume(&mut self, span: Span<&[T]>) -> Option<usize> {
450        let (hay, range) = span.into_parts();
451        let check_end = range.start + self.0.len();
452        if range.end < check_end {
453            return None;
454        }
455        if unsafe { hay.get_unchecked(range.start..check_end) } == self.0 {
456            Some(check_end)
457        } else {
458            None
459        }
460    }
461}
462
463unsafe impl<'p, T> ReverseConsumer<[T]> for NaiveSearcher<'p, T>
464where
465    T: PartialEq + 'p,
466{
467    #[inline]
468    fn rconsume(&mut self, span: Span<&[T]>) -> Option<usize> {
469        let (hay, range) = span.into_parts();
470        if range.start + self.0.len() > range.end {
471            return None;
472        }
473        let index = range.end - self.0.len();
474        if unsafe { hay.get_unchecked(index..range.end) } == self.0 {
475            Some(index)
476        } else {
477            None
478        }
479    }
480}
481
482impl<'p, T: 'p> NaiveSearcher<'p, T> {
483    #[inline]
484    pub fn new(slice: &'p [T]) -> Self {
485        NaiveSearcher(slice)
486    }
487
488    #[inline]
489    pub fn needle(&self) -> &'p [T] {
490        self.0
491    }
492}
493
494//------------------------------------------------------------------------------
495// Slice searcher
496//------------------------------------------------------------------------------
497
498#[derive(Debug)]
499pub enum SliceSearcher<'p, T: 'p> {
500    TwoWay(TwoWaySearcher<'p, T>),
501    Empty(EmptySearcher),
502}
503
504impl<'p, T: PartialEq + 'p> SliceSearcher<'p, T> {
505    #[inline]
506    pub fn new(slice: &'p [T]) -> Self {
507        if slice.is_empty() {
508            SliceSearcher::Empty(EmptySearcher::default())
509        } else {
510            SliceSearcher::TwoWay(TwoWaySearcher::new(slice))
511        }
512    }
513
514    #[inline]
515    pub fn needle(&self) -> &'p [T] {
516        match self {
517            SliceSearcher::TwoWay(s) => s.needle,
518            SliceSearcher::Empty(_) => &[],
519        }
520    }
521}
522
523impl<'p, T: 'p> Clone for SliceSearcher<'p, T> {
524    #[inline]
525    fn clone(&self) -> Self {
526        match self {
527            SliceSearcher::TwoWay(s) => SliceSearcher::TwoWay(*s),
528            SliceSearcher::Empty(s) => SliceSearcher::Empty(s.clone()),
529        }
530    }
531}
532
533macro_rules! forward {
534    (searcher: $self:expr, $s:ident => $e:expr) => {
535        match $self {
536            SliceSearcher::TwoWay($s) => $e,
537            SliceSearcher::Empty($s) => $e,
538        }
539    };
540}
541
542unsafe impl<'p, T, A> Searcher<A> for SliceSearcher<'p, T>
543where
544    A: Hay<Index = usize> + ?Sized,
545    TwoWaySearcher<'p, T>: Searcher<A>,
546{
547    #[inline]
548    fn search(&mut self, span: Span<&A>) -> Option<Range<usize>> {
549        forward!(searcher: self, s => s.search(span))
550    }
551}
552
553unsafe impl<'p, T, A> ReverseSearcher<A> for SliceSearcher<'p, T>
554where
555    A: Hay<Index = usize> + ?Sized,
556    TwoWaySearcher<'p, T>: ReverseSearcher<A>,
557{
558    #[inline]
559    fn rsearch(&mut self, span: Span<&A>) -> Option<Range<usize>> {
560        forward!(searcher: self, s => s.rsearch(span))
561    }
562}
563
564macro_rules! impl_needle {
565    (<[$($gen:tt)*]> $ty:ty) => {
566        impl<$($gen)*> Needle<$ty> for &'p [T]
567        where
568            T: PartialEq + 'p,
569        {
570            type Searcher = SliceSearcher<'p, T>;
571            type Consumer = NaiveSearcher<'p, T>;
572
573            #[inline]
574            fn into_searcher(self) -> Self::Searcher {
575                SliceSearcher::new(self)
576            }
577
578            #[inline]
579            fn into_consumer(self) -> Self::Consumer {
580                NaiveSearcher::new(self)
581            }
582        }
583    }
584}
585
586impl_needle!(<['p, 'h, T]> &'h [T]);
587impl_needle!(<['p, 'h, T]> &'h mut [T]);
588#[cfg(feature = "std")]
589impl_needle!(<['p, T]> Vec<T>);