subslice/
lib.rs

1#![cfg_attr(not(test), no_std)]
2#![cfg_attr(feature = "pattern", feature(pattern))]
3
4//! Generalization of `str::find` to both `str` and `[_]`, see [`SubsliceExt`](trait.SubsliceExt.html) for docs.
5
6#[cfg(test)]
7extern crate core;
8
9use core::cmp;
10use core::usize;
11
12extern crate memchr;
13
14mod tw;
15pub mod bmh;
16#[cfg(feature = "test-set")]
17pub mod set;
18
19#[cfg(feature = "pattern")]
20use core::str::pattern::{
21    Pattern,
22    Searcher,
23    ReverseSearcher,
24    SearchStep,
25};
26
27mod private { pub trait Sealed {} }
28
29impl<A> private::Sealed for [A] {}
30impl private::Sealed for str {}
31
32/// Trait of types which can be searched for subsequences
33pub trait SubsliceExt: private::Sealed {
34    /// Find the first subslice of `self` which is equal to `other`, and return the index of its
35    /// start.
36    ///
37    /// *O*(`self.len() + other.len()`) time
38    fn find(&self, other: &Self) -> Option<usize>;
39
40    /// Find the last subslice of `self` which is equal to `other`, and return the index of its
41    /// start.
42    ///
43    /// *O*(`self.len() + other.len()`) time
44    fn rfind(&self, other: &Self) -> Option<usize>;
45}
46
47impl<A: Ord> SubsliceExt for [A] {
48    #[inline]
49    fn find(&self, other: &Self) -> Option<usize> {
50        if other.is_empty() { Some(0) }
51        else if other.len() == 1 { self.iter().position(|a| *a == other[0]) }
52        else {
53            let mut searcher = TwoWaySearcher::new(other, self.len());
54            let is_long = searcher.memory == usize::MAX;
55            // write out `true` and `false` cases to encourage the compiler
56            // to specialize the two cases separately.
57            if is_long {
58                searcher.next::<_, MatchOnly>(self, other, true).map(|t| t.0)
59            } else {
60                searcher.next::<_, MatchOnly>(self, other, false).map(|t| t.0)
61            }
62        }
63    }
64
65    #[inline]
66    fn rfind(&self, other: &Self) -> Option<usize> {
67        if other.is_empty() { Some(self.len()) }
68        else if other.len() == 1 { self.iter().rposition(|a| *a == other[0]) }
69        else {
70            let mut searcher = TwoWaySearcher::new(other, self.len());
71            let is_long = searcher.memory == usize::MAX;
72            // write out `true` and `false` cases to encourage the compiler
73            // to specialize the two cases separately.
74            if is_long {
75                searcher.next_back::<_, MatchOnly>(self, other, true).map(|t| t.0)
76            } else {
77                searcher.next_back::<_, MatchOnly>(self, other, false).map(|t| t.0)
78            }
79        }
80    }
81}
82
83impl SubsliceExt for str {
84    #[inline]
85    fn find(&self, other: &Self) -> Option<usize> { self.as_bytes().find(other.as_bytes()) }
86
87    #[inline]
88    fn rfind(&self, other: &Self) -> Option<usize> { self.as_bytes().rfind(other.as_bytes()) }
89}
90
91/// Dummy wrapper for &str
92#[doc(hidden)]
93pub struct Str<'a>(pub &'a str);
94
95#[cfg(feature = "pattern")]
96/// Non-allocating substring search.
97///
98/// Will handle the pattern `""` as returning empty matches at each character
99/// boundary.
100impl<'a, 'b> Pattern<'a> for Str<'b> {
101    type Searcher = StrSearcher<'a, 'b>;
102
103    #[inline]
104    fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
105        StrSearcher::new(haystack, self.0)
106    }
107
108    /// Checks whether the pattern matches at the front of the haystack
109    #[inline]
110    fn is_prefix_of(self, haystack: &'a str) -> bool {
111        let self_ = self.0;
112        haystack.is_char_boundary(self_.len()) &&
113            self_ == &haystack[..self_.len()]
114    }
115
116    /// Checks whether the pattern matches at the back of the haystack
117    #[inline]
118    fn is_suffix_of(self, haystack: &'a str) -> bool {
119        let self_ = self.0;
120        self_.len() <= haystack.len() &&
121            haystack.is_char_boundary(haystack.len() - self_.len()) &&
122            self_ == &haystack[haystack.len() - self_.len()..]
123    }
124
125}
126
127#[derive(Clone, Debug)]
128#[doc(hidden)]
129/// Associated type for `<&str as Pattern<'a>>::Searcher`.
130pub struct StrSearcher<'a, 'b> {
131    haystack: &'a str,
132    needle: &'b str,
133
134    searcher: StrSearcherImpl,
135}
136
137#[derive(Clone, Debug)]
138enum StrSearcherImpl {
139    Empty(EmptyNeedle),
140    TwoWay(TwoWaySearcher),
141}
142
143#[derive(Clone, Debug)]
144struct EmptyNeedle {
145    position: usize,
146    end: usize,
147    is_match_fw: bool,
148    is_match_bw: bool,
149}
150
151impl<'a, 'b> StrSearcher<'a, 'b> {
152    pub fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
153        if needle.is_empty() {
154            StrSearcher {
155                haystack, needle,
156                searcher: StrSearcherImpl::Empty(EmptyNeedle {
157                    position: 0,
158                    end: haystack.len(),
159                    is_match_fw: true,
160                    is_match_bw: true,
161                }),
162            }
163        } else {
164            StrSearcher {
165                haystack, needle,
166                searcher: StrSearcherImpl::TwoWay(
167                    TwoWaySearcher::new(needle.as_bytes(), haystack.len())
168                ),
169            }
170        }
171    }
172}
173
174#[cfg(feature = "pattern")]
175unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
176    fn haystack(&self) -> &'a str { self.haystack }
177
178    #[inline]
179    fn next(&mut self) -> SearchStep {
180        match self.searcher {
181            StrSearcherImpl::Empty(ref mut searcher) => {
182                // empty needle rejects every char and matches every empty string between them
183                let is_match = searcher.is_match_fw;
184                searcher.is_match_fw = !searcher.is_match_fw;
185                let pos = searcher.position;
186                match self.haystack[pos..].chars().next() {
187                    _ if is_match => SearchStep::Match(pos, pos),
188                    None => SearchStep::Done,
189                    Some(ch) => {
190                        searcher.position += ch.len_utf8();
191                        SearchStep::Reject(pos, searcher.position)
192                    }
193                }
194            }
195            StrSearcherImpl::TwoWay(ref mut searcher) => {
196                // TwoWaySearcher produces valid *Match* indices that split at char boundaries
197                // as long as it does correct matching and that haystack and needle are
198                // valid UTF-8
199                // *Rejects* from the algorithm can fall on any indices, but we will walk them
200                // manually to the next character boundary, so that they are utf-8 safe.
201                if searcher.position == self.haystack.len() {
202                    return SearchStep::Done;
203                }
204                let is_long = searcher.memory == usize::MAX;
205                match searcher.next::<_, RejectAndMatch>(self.haystack.as_bytes(),
206                                                      self.needle.as_bytes(),
207                                                      is_long)
208                {
209                    SearchStep::Reject(a, mut b) => {
210                        // skip to next char boundary
211                        while !self.haystack.is_char_boundary(b) {
212                            b += 1;
213                        }
214                        searcher.position = cmp::max(b, searcher.position);
215                        SearchStep::Reject(a, b)
216                    }
217                    otherwise => otherwise,
218                }
219            }
220        }
221    }
222
223    #[inline(always)]
224    fn next_match(&mut self) -> Option<(usize, usize)> {
225        match self.searcher {
226            StrSearcherImpl::Empty(..) => {
227                loop {
228                    match self.next() {
229                        SearchStep::Match(a, b) => return Some((a, b)),
230                        SearchStep::Done => return None,
231                        SearchStep::Reject(..) => { }
232                    }
233                }
234            }
235
236            StrSearcherImpl::TwoWay(ref mut searcher) => {
237                let is_long = searcher.memory == usize::MAX;
238                // write out `true` and `false` cases to encourage the compiler
239                // to specialize the two cases separately.
240                if is_long {
241                    searcher.next::<_, MatchOnly>(self.haystack.as_bytes(),
242                                                  self.needle.as_bytes(),
243                                                  true)
244                } else {
245                    searcher.next::<_, MatchOnly>(self.haystack.as_bytes(),
246                                                  self.needle.as_bytes(),
247                                                  false)
248                }
249            }
250        }
251    }
252}
253
254#[cfg(feature = "pattern")]
255unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
256    #[inline]
257    fn next_back(&mut self) -> SearchStep {
258        match self.searcher {
259            StrSearcherImpl::Empty(ref mut searcher) => {
260                let is_match = searcher.is_match_bw;
261                searcher.is_match_bw = !searcher.is_match_bw;
262                let end = searcher.end;
263                match self.haystack[..end].chars().next_back() {
264                    _ if is_match => SearchStep::Match(end, end),
265                    None => SearchStep::Done,
266                    Some(ch) => {
267                        searcher.end -= ch.len_utf8();
268                        SearchStep::Reject(searcher.end, end)
269                    }
270                }
271            }
272            StrSearcherImpl::TwoWay(ref mut searcher) => {
273                if searcher.end == 0 {
274                    return SearchStep::Done;
275                }
276                let is_long = searcher.memory == usize::MAX;
277                match searcher.next_back::<_, RejectAndMatch>(self.haystack.as_bytes(),
278                                                              self.needle.as_bytes(),
279                                                              is_long)
280                {
281                    SearchStep::Reject(mut a, b) => {
282                        // skip to next char boundary
283                        while !self.haystack.is_char_boundary(a) {
284                            a -= 1;
285                        }
286                        searcher.end = cmp::min(a, searcher.end);
287                        SearchStep::Reject(a, b)
288                    }
289                    otherwise => otherwise,
290                }
291            }
292        }
293    }
294
295    #[inline]
296    fn next_match_back(&mut self) -> Option<(usize, usize)> {
297        match self.searcher {
298            StrSearcherImpl::Empty(..) => {
299                loop {
300                    match self.next_back() {
301                        SearchStep::Match(a, b) => return Some((a, b)),
302                        SearchStep::Done => return None,
303                        SearchStep::Reject(..) => { }
304                    }
305                }
306            }
307            StrSearcherImpl::TwoWay(ref mut searcher) => {
308                let is_long = searcher.memory == usize::MAX;
309                // write out `true` and `false`, like `next_match`
310                if is_long {
311                    searcher.next_back::<_, MatchOnly>(self.haystack.as_bytes(),
312                                                       self.needle.as_bytes(),
313                                                       true)
314                } else {
315                    searcher.next_back::<_, MatchOnly>(self.haystack.as_bytes(),
316                                                       self.needle.as_bytes(),
317                                                       false)
318                }
319            }
320        }
321    }
322}
323
324/// The internal state of the two-way substring search algorithm.
325#[derive(Clone, Debug)]
326#[doc(hidden)]
327pub struct TwoWaySearcher {
328    // constants
329    /// critical factorization index
330    crit_pos: usize,
331    /// critical factorization index for reversed needle
332    crit_pos_back: usize,
333    period: usize,
334
335    // variables
336    position: usize,
337    end: usize,
338    /// index into needle before which we have already matched
339    memory: usize,
340    /// index into needle after which we have already matched
341    memory_back: usize,
342}
343
344/*
345    This is the Two-Way search algorithm, which was introduced in the paper:
346    Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
347
348    Here's some background information.
349
350    A *word* is a string of symbols. The *length* of a word should be a familiar
351    notion, and here we denote it for any word x by |x|.
352    (We also allow for the possibility of the *empty word*, a word of length zero).
353
354    If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
355    *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
356    For example, both 1 and 2 are periods for the string "aa". As another example,
357    the only period of the string "abcd" is 4.
358
359    We denote by period(x) the *smallest* period of x (provided that x is non-empty).
360    This is always well-defined since every non-empty word x has at least one period,
361    |x|. We sometimes call this *the period* of x.
362
363    If u, v and x are words such that x = uv, where uv is the concatenation of u and
364    v, then we say that (u, v) is a *factorization* of x.
365
366    Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
367    that both of the following hold
368
369      - either w is a suffix of u or u is a suffix of w
370      - either w is a prefix of v or v is a prefix of w
371
372    then w is said to be a *repetition* for the factorization (u, v).
373
374    Just to unpack this, there are four possibilities here. Let w = "abc". Then we
375    might have:
376
377      - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
378      - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
379      - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
380      - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
381
382    Note that the word vu is a repetition for any factorization (u,v) of x = uv,
383    so every factorization has at least one repetition.
384
385    If x is a string and (u, v) is a factorization for x, then a *local period* for
386    (u, v) is an integer r such that there is some word w such that |w| = r and w is
387    a repetition for (u, v).
388
389    We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
390    call this *the local period* of (u, v). Provided that x = uv is non-empty, this
391    is well-defined (because each non-empty word has at least one factorization, as
392    noted above).
393
394    It can be proven that the following is an equivalent definition of a local period
395    for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
396    all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
397    defined. (i.e. i > 0 and i + r < |x|).
398
399    Using the above reformulation, it is easy to prove that
400
401        1 <= local_period(u, v) <= period(uv)
402
403    A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
404    *critical factorization*.
405
406    The algorithm hinges on the following theorem, which is stated without proof:
407
408    **Critical Factorization Theorem** Any word x has at least one critical
409    factorization (u, v) such that |u| < period(x).
410
411    The purpose of maximal_suffix is to find such a critical factorization.
412
413    If the period is short, compute another factorization x = u' v' to use
414    for reverse search, chosen instead so that |v'| < period(x).
415
416*/
417impl TwoWaySearcher {
418    pub fn new<A: Ord>(needle: &[A], end: usize) -> TwoWaySearcher {
419        let (crit_pos, period) = TwoWaySearcher::crit_params(needle);
420
421        // A particularly readable explanation of what's going on here can be found
422        // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
423        // see the code for "Algorithm CP" on p. 323.
424        //
425        // What's going on is we have some critical factorization (u, v) of the
426        // needle, and we want to determine whether u is a suffix of
427        // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
428        // "Algorithm CP2", which is optimized for when the period of the needle
429        // is large.
430        if needle[..crit_pos] == needle[period.. period + crit_pos] {
431            // short period case -- the period is exact
432            // compute a separate critical factorization for the reversed needle
433            // x = u' v' where |v'| < period(x).
434            //
435            // This is sped up by the period being known already.
436            // Note that a case like x = "acba" may be factored exactly forwards
437            // (crit_pos = 1, period = 3) while being factored with approximate
438            // period in reverse (crit_pos = 2, period = 2). We use the given
439            // reverse factorization but keep the exact period.
440            let crit_pos_back = needle.len() - cmp::max(
441                TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
442                TwoWaySearcher::reverse_maximal_suffix(needle, period, true));
443
444            TwoWaySearcher {
445                crit_pos, crit_pos_back, period,
446
447                position: 0,
448                end,
449                memory: 0,
450                memory_back: needle.len(),
451            }
452        } else {
453            // long period case -- we have an approximation to the actual period,
454            // and don't use memorization.
455            //
456            // Approximate the period by lower bound max(|u|, |v|) + 1.
457            // The critical factorization is efficient to use for both forward and
458            // reverse search.
459
460            TwoWaySearcher {
461                crit_pos, crit_pos_back: crit_pos,
462                period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
463
464                position: 0,
465                end,
466                memory: usize::MAX, // Dummy value to signify that the period is long
467                memory_back: usize::MAX,
468            }
469        }
470    }
471
472    /// Return the zero-based critical position and period of the provided needle.
473    /// 
474    /// The returned period is incorrect when the actual period is "long." In
475    /// that case the approximation must be computed separately.
476    #[inline(always)]
477    fn crit_params<A: Ord>(needle: &[A]) -> (usize, usize) {
478        let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
479        let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
480
481        if crit_pos_false > crit_pos_true {
482            (crit_pos_false, period_false)
483        } else {
484            (crit_pos_true, period_true)
485        }
486    }
487    // One of the main ideas of Two-Way is that we factorize the needle into
488    // two halves, (u, v), and begin trying to find v in the haystack by scanning
489    // left to right. If v matches, we try to match u by scanning right to left.
490    // How far we can jump when we encounter a mismatch is all based on the fact
491    // that (u, v) is a critical factorization for the needle.
492    #[inline(always)]
493    fn next<A: Eq, S>(&mut self, haystack: &[A], needle: &[A], long_period: bool)
494        -> S::Output
495        where S: TwoWayStrategy
496    {
497        // `next()` uses `self.position` as its cursor
498        let old_pos = self.position;
499        let needle_last = needle.len() - 1;
500        'search: loop {
501            // Check that we have room to search in
502            // position + needle_last can not overflow if we assume slices
503            // are bounded by isize's range.
504            match haystack.get(self.position + needle_last) {
505                Some(_) => (),
506                None => {
507                    self.position = haystack.len();
508                    return S::rejecting(old_pos, self.position);
509                }
510            };
511
512            if S::use_early_reject() && old_pos != self.position {
513                return S::rejecting(old_pos, self.position);
514            }
515
516            // See if the right part of the needle matches
517            let start = if long_period { self.crit_pos }
518                        else { cmp::max(self.crit_pos, self.memory) };
519            for i in start..needle.len() {
520                if needle[i] != haystack[self.position + i] {
521                    self.position += i - self.crit_pos + 1;
522                    if !long_period {
523                        self.memory = 0;
524                    }
525                    continue 'search;
526                }
527            }
528
529            // See if the left part of the needle matches
530            let start = if long_period { 0 } else { self.memory };
531            for i in (start..self.crit_pos).rev() {
532                if needle[i] != haystack[self.position + i] {
533                    self.position += self.period;
534                    if !long_period {
535                        self.memory = needle.len() - self.period;
536                    }
537                    continue 'search;
538                }
539            }
540
541            // We have found a match!
542            let match_pos = self.position;
543
544            // Note: add self.period instead of needle.len() to have overlapping matches
545            self.position += needle.len();
546            if !long_period {
547                self.memory = 0; // set to needle.len() - self.period for overlapping matches
548            }
549
550            return S::matching(match_pos, match_pos + needle.len());
551        }
552    }
553
554    // Follows the ideas in `next()`.
555    //
556    // The definitions are symmetrical, with period(x) = period(reverse(x))
557    // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
558    // is a critical factorization, so is (reverse(v), reverse(u)).
559    //
560    // For the reverse case we have computed a critical factorization x = u' v'
561    // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
562    // thus |v'| < period(x) for the reverse.
563    //
564    // To search in reverse through the haystack, we search forward through
565    // a reversed haystack with a reversed needle, matching first u' and then v'.
566    #[inline]
567    fn next_back<A: Eq, S>(&mut self, haystack: &[A], needle: &[A], long_period: bool)
568        -> S::Output
569        where S: TwoWayStrategy
570    {
571        // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
572        // are independent.
573        let old_end = self.end;
574        'search: loop {
575            // Check that we have room to search in
576            // end - needle.len() will wrap around when there is no more room,
577            // but due to slice length limits it can never wrap all the way back
578            // into the length of haystack.
579            match haystack.get(self.end.wrapping_sub(needle.len())) {
580                Some(_) => (),
581                None => {
582                    self.end = 0;
583                    return S::rejecting(0, old_end);
584                }
585            };
586
587            if S::use_early_reject() && old_end != self.end {
588                return S::rejecting(self.end, old_end);
589            }
590
591            // See if the left part of the needle matches
592            let crit = if long_period { self.crit_pos_back }
593                       else { cmp::min(self.crit_pos_back, self.memory_back) };
594            for i in (0..crit).rev() {
595                if needle[i] != haystack[self.end - needle.len() + i] {
596                    self.end -= self.crit_pos_back - i;
597                    if !long_period {
598                        self.memory_back = needle.len();
599                    }
600                    continue 'search;
601                }
602            }
603
604            // See if the right part of the needle matches
605            let needle_end = if long_period { needle.len() }
606                             else { self.memory_back };
607            for i in self.crit_pos_back..needle_end {
608                if needle[i] != haystack[self.end - needle.len() + i] {
609                    self.end -= self.period;
610                    if !long_period {
611                        self.memory_back = self.period;
612                    }
613                    continue 'search;
614                }
615            }
616
617            // We have found a match!
618            let match_pos = self.end - needle.len();
619            // Note: sub self.period instead of needle.len() to have overlapping matches
620            self.end -= needle.len();
621            if !long_period {
622                self.memory_back = needle.len();
623            }
624
625            return S::matching(match_pos, match_pos + needle.len());
626        }
627    }
628
629    // Compute the maximal suffix of `arr`.
630    //
631    // The maximal suffix is a possible critical factorization (u, v) of `arr`.
632    //
633    // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
634    // period of v.
635    //
636    // `order_greater` determines if lexical order is `<` or `>`. Both
637    // orders must be computed -- the ordering with the largest `i` gives
638    // a critical factorization.
639    //
640    // For long period cases, the resulting period is not exact (it is too short).
641    #[inline]
642    pub fn maximal_suffix<A: Ord>(arr: &[A], order_greater: bool) -> (usize, usize) {
643        let mut left = 0; // Corresponds to i in the paper
644        let mut right = 1; // Corresponds to j in the paper
645        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
646                            // to match 0-based indexing.
647        let mut period = 1; // Corresponds to p in the paper
648
649        while let Some(a) = arr.get(right + offset) {
650            // `left` will be inbounds when `right` is.
651            let b = &arr[left + offset];
652            if (*a < *b && !order_greater) || (*a > *b && order_greater) {
653                // Suffix is smaller, period is entire prefix so far.
654                right += offset + 1;
655                offset = 0;
656                period = right - left;
657            } else if *a == *b {
658                // Advance through repetition of the current period.
659                if offset + 1 == period {
660                    right += offset + 1;
661                    offset = 0;
662                } else {
663                    offset += 1;
664                }
665            } else {
666                // Suffix is larger, start over from current location.
667                left = right;
668                right += 1;
669                offset = 0;
670                period = 1;
671            }
672        }
673        (left, period)
674    }
675
676    // Compute the maximal suffix of the reverse of `arr`.
677    //
678    // The maximal suffix is a possible critical factorization (u', v') of `arr`.
679    //
680    // Returns `i` where `i` is the starting index of v', from the back;
681    // returns immedately when a period of `known_period` is reached.
682    //
683    // `order_greater` determines if lexical order is `<` or `>`. Both
684    // orders must be computed -- the ordering with the largest `i` gives
685    // a critical factorization.
686    //
687    // For long period cases, the resulting period is not exact (it is too short).
688    pub fn reverse_maximal_suffix<A: Ord>(arr: &[A], known_period: usize,
689                                          order_greater: bool) -> usize
690    {
691        let mut left = 0; // Corresponds to i in the paper
692        let mut right = 1; // Corresponds to j in the paper
693        let mut offset = 0; // Corresponds to k in the paper, but starting at 0
694                            // to match 0-based indexing.
695        let mut period = 1; // Corresponds to p in the paper
696        let n = arr.len();
697
698        while right + offset < n {
699            let a = &arr[n - (1 + right + offset)];
700            let b = &arr[n - (1 + left + offset)];
701            if (*a < *b && !order_greater) || (*a > *b && order_greater) {
702                // Suffix is smaller, period is entire prefix so far.
703                right += offset + 1;
704                offset = 0;
705                period = right - left;
706            } else if *a == *b {
707                // Advance through repetition of the current period.
708                if offset + 1 == period {
709                    right += offset + 1;
710                    offset = 0;
711                } else {
712                    offset += 1;
713                }
714            } else {
715                // Suffix is larger, start over from current location.
716                left = right;
717                right += 1;
718                offset = 0;
719                period = 1;
720            }
721            if period == known_period {
722                break;
723            }
724        }
725        debug_assert!(period <= known_period);
726        left
727    }
728}
729
730// TwoWayStrategy allows the algorithm to either skip non-matches as quickly
731// as possible, or to work in a mode where it emits Rejects relatively quickly.
732trait TwoWayStrategy {
733    type Output;
734    fn use_early_reject() -> bool;
735    fn rejecting(usize, usize) -> Self::Output;
736    fn matching(usize, usize) -> Self::Output;
737}
738
739/// Skip to match intervals as quickly as possible
740enum MatchOnly { }
741
742impl TwoWayStrategy for MatchOnly {
743    type Output = Option<(usize, usize)>;
744
745    #[inline]
746    fn use_early_reject() -> bool { false }
747    #[inline]
748    fn rejecting(_a: usize, _b: usize) -> Self::Output { None }
749    #[inline]
750    fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) }
751}
752
753#[cfg(feature = "pattern")]
754/// Emit Rejects regularly
755enum RejectAndMatch { }
756
757#[cfg(feature = "pattern")]
758impl TwoWayStrategy for RejectAndMatch {
759    type Output = SearchStep;
760
761    #[inline]
762    fn use_early_reject() -> bool { true }
763    #[inline]
764    fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) }
765    #[inline]
766    fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) }
767}
768
769
770#[cfg(feature = "pattern")]
771#[cfg(test)]
772impl<'a, 'b> StrSearcher<'a, 'b> {
773    fn twoway(&self) -> &TwoWaySearcher {
774        match self.searcher {
775            StrSearcherImpl::TwoWay(ref inner) => inner,
776            _ => panic!("Not a TwoWaySearcher"),
777        }
778    }
779}
780
781#[cfg(feature = "pattern")]
782#[test]
783fn test_basic() {
784    let t = StrSearcher::new("", "aab");
785    println!("{:?}", t);
786    let t = StrSearcher::new("", "abaaaba");
787    println!("{:?}", t);
788    let mut t = StrSearcher::new("GCATCGCAGAGAGTATACAGTACG", "GCAGAGAG");
789    println!("{:?}", t);
790
791    loop {
792        match t.next() {
793            SearchStep::Done => break,
794            m => println!("{:?}", m),
795        }
796    }
797
798    let mut t = StrSearcher::new("GCATCGCAGAGAGTATACAGTACG", "GCAGAGAG");
799    println!("{:?}", t);
800
801    loop {
802        match t.next_back() {
803            SearchStep::Done => break,
804            m => println!("{:?}", m),
805        }
806    }
807
808    let mut t = StrSearcher::new("banana", "nana");
809    println!("{:?}", t);
810
811    loop {
812        match t.next() {
813            SearchStep::Done => break,
814            m => println!("{:?}", m),
815        }
816    }
817}
818
819#[cfg(feature = "pattern")]
820#[cfg(test)]
821fn contains(hay: &str, n: &str) -> bool {
822    let mut tws = StrSearcher::new(hay, n);
823    loop {
824        match tws.next() {
825            SearchStep::Done => return false,
826            SearchStep::Match(..) => return true,
827            _ => { }
828        }
829    }
830}
831
832#[cfg(feature = "pattern")]
833#[cfg(test)]
834fn contains_rev(hay: &str, n: &str) -> bool {
835    let mut tws = StrSearcher::new(hay, n);
836    loop {
837        match tws.next_back() {
838            SearchStep::Done => return false,
839            SearchStep::Match(..) => return true,
840            rej => { println!("{:?}", rej); }
841        }
842    }
843}
844
845
846#[cfg(feature = "pattern")]
847#[test]
848fn test_contains() {
849    let h = "";
850    let n = "";
851    assert!(contains(h, n));
852    assert!(contains_rev(h, n));
853
854    let h = "BDC\0\0\0";
855    let n = "BDC\u{0}";
856    assert!(contains(h, n));
857    assert!(contains_rev(h, n));
858
859
860    let h = "ADA\0";
861    let n = "ADA";
862    assert!(contains(h, n));
863    assert!(contains_rev(h, n));
864
865    let h = "\u{0}\u{0}\u{0}\u{0}"; 
866    let n = "\u{0}";
867    assert!(contains(h, n));
868    assert!(contains_rev(h, n));
869}
870
871#[cfg(feature = "pattern")]
872#[test]
873fn test_rev_2() {
874    let h = "BDC\0\0\0";
875    let n = "BDC\u{0}";
876    let mut t = StrSearcher::new(h, n);
877    println!("{:?}", t);
878    println!("{:?}", h.contains(&n));
879
880    loop {
881        match t.next_back() {
882            SearchStep::Done => break,
883            m => println!("{:?}", m),
884        }
885    }
886
887    let h = "aabaabx";
888    let n = "aabaab";
889    let mut t = StrSearcher::new(h, n);
890    println!("{:?}", t);
891    assert_eq!(t.twoway().crit_pos, 2);
892    assert_eq!(t.twoway().crit_pos_back, 5);
893
894    loop {
895        match t.next_back() {
896            SearchStep::Done => break,
897            m => println!("{:?}", m),
898        }
899    }
900
901    let h = "abababac";
902    let n = "ababab";
903    let mut t = StrSearcher::new(h, n);
904    println!("{:?}", t);
905    assert_eq!(t.twoway().crit_pos, 1);
906    assert_eq!(t.twoway().crit_pos_back, 5);
907
908    loop {
909        match t.next_back() {
910            SearchStep::Done => break,
911            m => println!("{:?}", m),
912        }
913    }
914
915    let h = "abababac";
916    let n = "abab";
917    let mut t = StrSearcher::new(h, n);
918    println!("{:?}", t);
919
920    loop {
921        match t.next_back() {
922            SearchStep::Done => break,
923            m => println!("{:?}", m),
924        }
925    }
926
927    let h = "baabbbaabc";
928    let n = "baabb";
929    let t = StrSearcher::new(h, n);
930    println!("{:?}", t);
931    assert_eq!(t.twoway().crit_pos, 3);
932    assert_eq!(t.twoway().crit_pos_back, 3);
933
934    let h = "aabaaaabaabxx";
935    let n = "aabaaaabaa";
936    let mut t = StrSearcher::new(h, n);
937    println!("{:?}", t);
938
939    loop {
940        match t.next_back() {
941            SearchStep::Done => break,
942            m => println!("{:?}", m),
943        }
944    }
945
946    let h = "babbabax";
947    let n = "babbab";
948    let mut t = StrSearcher::new(h, n);
949    println!("{:?}", t);
950    assert_eq!(t.twoway().crit_pos, 2);
951    assert_eq!(t.twoway().crit_pos_back, 4);
952
953    loop {
954        match t.next_back() {
955            SearchStep::Done => break,
956            m => println!("{:?}", m),
957        }
958    }
959
960    let h = "xacbaabcax";
961    let n = "abca";
962    let mut t = StrSearcher::new(h, n);
963    assert_eq!(t.next_match_back(), Some((5, 9)));
964
965    let h = "xacbaacbxxcba";
966    let m = "acba";
967    let mut s = StrSearcher::new(h, m);
968    assert_eq!(s.next_match_back(), Some((1, 5)));
969    assert_eq!(s.twoway().crit_pos, 1);
970    assert_eq!(s.twoway().crit_pos_back, 2);
971}
972
973#[cfg(feature = "pattern")]
974#[test]
975fn test_rev_unicode() {
976    let h = "ααααααβ";
977    let n = "αβ";
978    let mut t = StrSearcher::new(h, n);
979    println!("{:?}", t);
980
981    loop {
982        match t.next() {
983            SearchStep::Done => break,
984            m => println!("{:?}", m),
985        }
986    }
987
988    let mut t = StrSearcher::new(h, n);
989    loop {
990        match t.next_back() {
991            SearchStep::Done => break,
992            m => println!("{:?}", m),
993        }
994    }
995}
996
997#[test]
998fn maximal_suffix() {
999    assert_eq!((2, 1), TwoWaySearcher::maximal_suffix(b"aab", false));
1000    assert_eq!((0, 3), TwoWaySearcher::maximal_suffix(b"aab", true));
1001
1002    assert_eq!((0, 3), TwoWaySearcher::maximal_suffix(b"aabaa", true));
1003    assert_eq!((2, 3), TwoWaySearcher::maximal_suffix(b"aabaa", false));
1004
1005    assert_eq!((0, 7), TwoWaySearcher::maximal_suffix(b"gcagagag", false));
1006    assert_eq!((2, 2), TwoWaySearcher::maximal_suffix(b"gcagagag", true));
1007
1008    // both of these factorizations are critial factorizations
1009    assert_eq!((2, 2), TwoWaySearcher::maximal_suffix(b"banana", false));
1010    assert_eq!((1, 2), TwoWaySearcher::maximal_suffix(b"banana", true));
1011    assert_eq!((0, 6), TwoWaySearcher::maximal_suffix(b"zanana", false));
1012    assert_eq!((1, 2), TwoWaySearcher::maximal_suffix(b"zanana", true));
1013}
1014
1015#[test]
1016fn maximal_suffix_verbose() {
1017    fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1018        let mut left: usize = 0; // Corresponds to i in the paper
1019        let mut right = 1; // Corresponds to j in the paper
1020        let mut offset = 0; // Corresponds to k in the paper
1021        let mut period = 1; // Corresponds to p in the paper
1022
1023        macro_rules! asstr {
1024            ($e:expr) => (::std::str::from_utf8($e).unwrap())
1025        }
1026
1027        while let Some(&a) = arr.get(right + offset) {
1028            // `left` will be inbounds when `right` is.
1029            debug_assert!(left <= right);
1030            let b = unsafe { *arr.get_unchecked(left + offset) };
1031            println!("str={}, l={}, r={}, offset={}, p={}", asstr!(arr), left, right, offset, period);
1032            if (a < b && !order_greater) || (a > b && order_greater) {
1033                // Suffix is smaller, period is entire prefix so far.
1034                right += offset + 1;
1035                offset = 0;
1036                period = right - left;
1037            } else if a == b {
1038                // Advance through repetition of the current period.
1039                if offset + 1 == period {
1040                    right += offset + 1;
1041                    offset = 0;
1042                } else {
1043                    offset += 1;
1044                }
1045            } else {
1046                // Suffix is larger, start over from current location.
1047                left = right;
1048                right += 1;
1049                offset = 0;
1050                period = 1;
1051            }
1052        }
1053        println!("str={}, l={}, r={}, offset={}, p={} ==END==", asstr!(arr), left, right, offset, period);
1054        (left, period)
1055    }
1056
1057    fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize {
1058        let n = arr.len();
1059        let mut left: usize = 0; // Corresponds to i in the paper
1060        let mut right = 1; // Corresponds to j in the paper
1061        let mut offset = 0; // Corresponds to k in the paper
1062        let mut period = 1; // Corresponds to p in the paper
1063
1064        macro_rules! asstr {
1065            ($e:expr) => (::std::str::from_utf8($e).unwrap())
1066        }
1067
1068        while right + offset < n {
1069            // `left` will be inbounds when `right` is.
1070            debug_assert!(left <= right);
1071            let a = unsafe { *arr.get_unchecked(n - (1 + right + offset)) };
1072            let b = unsafe { *arr.get_unchecked(n - (1 + left + offset)) };
1073            println!("str={}, l={}, r={}, offset={}, p={}", asstr!(arr), left, right, offset, period);
1074            if (a < b && !order_greater) || (a > b && order_greater) {
1075                // Suffix is smaller, period is entire prefix so far.
1076                right += offset + 1;
1077                offset = 0;
1078                period = right - left;
1079                if period == known_period {
1080                    break;
1081                }
1082            } else if a == b {
1083                // Advance through repetition of the current period.
1084                if offset + 1 == period {
1085                    right += offset + 1;
1086                    offset = 0;
1087                } else {
1088                    offset += 1;
1089                }
1090            } else {
1091                // Suffix is larger, start over from current location.
1092                left = right;
1093                right += 1;
1094                offset = 0;
1095                period = 1;
1096            }
1097        }
1098        println!("str={}, l={}, r={}, offset={}, p={} ==END==", asstr!(arr), left, right, offset, period);
1099        debug_assert!(period == known_period);
1100        left
1101    }
1102
1103    assert_eq!((2, 2), maximal_suffix(b"banana", false));
1104    assert_eq!((1, 2), maximal_suffix(b"banana", true));
1105    assert_eq!((0, 7), maximal_suffix(b"gcagagag", false));
1106    assert_eq!((2, 2), maximal_suffix(b"gcagagag", true));
1107    assert_eq!((2, 1), maximal_suffix(b"bac", false));
1108    assert_eq!((1, 2), maximal_suffix(b"bac", true));
1109    assert_eq!((0, 9), maximal_suffix(b"baaaaaaaa", false));
1110    assert_eq!((1, 1), maximal_suffix(b"baaaaaaaa", true));
1111
1112    assert_eq!((2, 3), maximal_suffix(b"babbabbab", false));
1113    assert_eq!((1, 3), maximal_suffix(b"babbabbab", true));
1114
1115    assert_eq!(2, reverse_maximal_suffix(b"babbabbab", 3, false));
1116    assert_eq!(1, reverse_maximal_suffix(b"babbabbab", 3, true));
1117
1118    assert_eq!((0, 2), maximal_suffix(b"bababa", false));
1119    assert_eq!((1, 2), maximal_suffix(b"bababa", true));
1120
1121    assert_eq!(1, reverse_maximal_suffix(b"bababa", 2, false));
1122    assert_eq!(0, reverse_maximal_suffix(b"bababa", 2, true));
1123
1124    // NOTE: returns "long period" case per = 2, which is an approximation
1125    assert_eq!((2, 2), maximal_suffix(b"abca", false));
1126    assert_eq!((0, 3), maximal_suffix(b"abca", true));
1127
1128    assert_eq!((3, 2), maximal_suffix(b"abcda", false));
1129    assert_eq!((0, 4), maximal_suffix(b"abcda", true));
1130
1131    // "aöa"
1132    assert_eq!((1, 3), maximal_suffix(b"acba", false));
1133    assert_eq!((0, 3), maximal_suffix(b"acba", true));
1134    //assert_eq!(2, reverse_maximal_suffix(b"acba", 3, false));
1135    //assert_eq!(0, reverse_maximal_suffix(b"acba", 3, true));
1136}
1137
1138#[cfg(feature = "pattern")]
1139#[test]
1140fn test_find_rfind() {
1141    fn find(hay: &str, pat: &str) -> Option<usize> {
1142        let mut t = pat.into_searcher(hay);
1143        t.next_match().map(|(x, _)| x)
1144    }
1145
1146    fn rfind(hay: &str, pat: &str) -> Option<usize> {
1147        let mut t = pat.into_searcher(hay);
1148        t.next_match_back().map(|(x, _)| x)
1149    }
1150
1151    // find every substring -- assert that it finds it, or an earlier occurence.
1152    let string = "Việt Namacbaabcaabaaba";
1153    for (i, ci) in string.char_indices() {
1154        let ip = i + ci.len_utf8();
1155        for j in string[ip..].char_indices()
1156                             .map(|(i, _)| i)
1157                             .chain(Some(string.len() - ip))
1158        {
1159            let pat = &string[i..ip + j];
1160            assert!(match find(string, pat) {
1161                None => false,
1162                Some(x) => x <= i,
1163            });
1164            assert!(match rfind(string, pat) {
1165                None => false,
1166                Some(x) => x >= i,
1167            });
1168        }
1169    }
1170}