pattern_3/
needle.rs

1//! Needle traits.
2
3use haystack::{Haystack, Hay, Span};
4
5use std::ops::Range;
6
7/// A searcher, for searching a [`Needle`] from a [`Hay`].
8///
9/// This trait provides methods for searching for non-overlapping matches of a
10/// needle starting from the front (left) of a hay.
11///
12/// # Safety
13///
14/// This trait is marked unsafe because the range returned by its methods are
15/// required to lie on valid codeword boundaries in the haystack. This enables
16/// users of this trait to slice the haystack without additional runtime checks.
17///
18/// # Examples
19///
20/// Implement a searcher and consumer which matches `b"Aaaa"` from a byte string.
21///
22/// ```rust
23/// extern crate pattern_3;
24/// use pattern_3::*;
25/// use std::ops::Range;
26///
27/// // The searcher for searching `b"Aaaa"`, using naive search.
28/// // We are going to use this as a needle too.
29/// struct Aaaa;
30///
31/// unsafe impl Searcher<[u8]> for Aaaa {
32///     // search for an `b"Aaaa"` in the middle of the string, returns its range.
33///     fn search(&mut self, span: Span<&[u8]>) -> Option<Range<usize>> {
34///         let (hay, range) = span.into_parts();
35///
36///         let start = range.start;
37///         for (i, window) in hay[range].windows(4).enumerate() {
38///             if *window == b"Aaaa"[..] {
39///                 // remember to include the range offset
40///                 return Some((start + i)..(start + i + 4));
41///             }
42///         }
43///
44///         None
45///     }
46/// }
47///
48/// unsafe impl Consumer<[u8]> for Aaaa {
49///     // checks if an `b"Aaaa" is at the beginning of the string, returns the end index.
50///     fn consume(&mut self, span: Span<&[u8]>) -> Option<usize> {
51///         let (hay, range) = span.into_parts();
52///         let end = range.start.checked_add(4)?;
53///         if end <= range.end && hay[range.start..end] == b"Aaaa"[..] {
54///             Some(end)
55///         } else {
56///             None
57///         }
58///     }
59/// }
60///
61/// impl<H: Haystack<Target = [u8]>> pattern_3::Needle<H> for Aaaa {
62///     type Searcher = Self;
63///     type Consumer = Self;
64///     fn into_searcher(self) -> Self { self }
65///     fn into_consumer(self) -> Self { self }
66/// }
67///
68/// // test with some standard algorithms.
69/// let haystack = &b"Aaaaa!!!Aaa!!!Aaaaaaaaa!!!"[..];
70/// assert_eq!(
71///     ext::split(haystack, Aaaa).collect::<Vec<_>>(),
72///     vec![
73///         &b""[..],
74///         &b"a!!!Aaa!!!"[..],
75///         &b"aaaaa!!!"[..],
76///     ]
77/// );
78/// assert_eq!(
79///     ext::match_ranges(haystack, Aaaa).collect::<Vec<_>>(),
80///     vec![
81///         (0..4, &b"Aaaa"[..]),
82///         (14..18, &b"Aaaa"[..]),
83///     ]
84/// );
85/// assert_eq!(
86///     ext::trim_start(haystack, Aaaa),
87///     &b"a!!!Aaa!!!Aaaaaaaaa!!!"[..]
88/// );
89/// ```
90pub unsafe trait Searcher<A: Hay + ?Sized> {
91    /// Searches for the first range which the needle can be found in the span.
92    ///
93    /// This method is used to support the following standard algorithms:
94    ///
95    /// * [`matches`](::ext::matches)
96    /// * [`contains`](::ext::contains)
97    /// * [`match_indices`](::ext::match_indices)
98    /// * [`find`](::ext::find)
99    /// * [`match_ranges`](::ext::match_ranges)
100    /// * [`find_range`](::ext::find_range)
101    /// * [`split`](::ext::split)
102    /// * [`split_terminator`](::ext::split_terminator)
103    /// * [`splitn`](::ext::splitn)
104    /// * [`replace_with`](::ext::replace_with)
105    /// * [`replacen_with`](::ext::replacen_with)
106    ///
107    /// The hay and the restricted range for searching can be recovered by
108    /// calling `span`[`.into_parts()`](Span::into_parts). The range returned
109    /// by this method
110    /// should be relative to the hay and must be contained within the
111    /// restricted range from the span.
112    ///
113    /// If the needle is not found, this method should return `None`.
114    ///
115    /// The reason this method takes a `Span<&A>` instead of just `&A` is
116    /// because some needles need context information provided by
117    /// the position of the current slice and the content around the slice.
118    /// Regex components like the start-/end-of-text anchors `^`/`$`
119    /// and word boundary `\b` are primary examples.
120    ///
121    /// # Examples
122    ///
123    /// Search for the locations of a substring inside a string, using the
124    /// searcher primitive.
125    ///
126    /// ```
127    /// extern crate pattern_3;
128    /// use pattern_3::{Searcher, Needle, Span};
129    ///
130    /// let mut searcher = Needle::<&str>::into_searcher("::");
131    /// let span = Span::from("lion::tiger::leopard");
132    /// //                     ^   ^      ^        ^
133    /// // string indices:     0   4     11       20
134    ///
135    /// // found the first "::".
136    /// assert_eq!(searcher.search(span.clone()), Some(4..6));
137    ///
138    /// // slice the span to skip the first match.
139    /// let span = unsafe { span.slice_unchecked(6..20) };
140    ///
141    /// // found the second "::".
142    /// assert_eq!(searcher.search(span.clone()), Some(11..13));
143    ///
144    /// // should find nothing now.
145    /// let span = unsafe { span.slice_unchecked(13..20) };
146    /// assert_eq!(searcher.search(span.clone()), None);
147    /// ```
148    fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>>;
149}
150
151/// A consumer, for searching a [`Needle`] from a [`Hay`] anchored at the
152/// beginnning.
153///
154/// This trait provides methods for matching a needle anchored at the beginning
155/// of a hay.
156///
157/// See documentation of [`Searcher`] for an example.
158///
159/// # Safety
160///
161/// This trait is marked unsafe because the range returned by its methods are
162/// required to lie on valid codeword boundaries in the haystack. This enables
163/// users of this trait to slice the haystack without additional runtime checks.
164pub unsafe trait Consumer<A: Hay + ?Sized> {
165    /// Checks if the needle can be found at the beginning of the span.
166    ///
167    /// This method is used to implement the standard algorithm
168    /// [`starts_with()`](::ext::starts_with) as well as providing the default
169    /// implementation for [`.trim_start()`](Consumer::trim_start).
170    ///
171    /// The hay and the restricted range for searching can be recovered by
172    /// calling `span`[`.into_parts()`](Span::into_parts). If a needle can be
173    /// found starting at `range.start`, this method should return the end index
174    /// of the needle relative to the hay.
175    ///
176    /// If the needle cannot be found at the beginning of the span, this method
177    /// should return `None`.
178    ///
179    /// # Examples
180    ///
181    /// Consumes ASCII characters from the beginning.
182    ///
183    /// ```
184    /// extern crate pattern_3;
185    /// use pattern_3::{Consumer, Needle, Span};
186    ///
187    /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii());
188    /// let span = Span::from("Hi😋!!");
189    ///
190    /// // consumes the first ASCII character
191    /// assert_eq!(consumer.consume(span.clone()), Some(1));
192    ///
193    /// // slice the span to skip the first match.
194    /// let span = unsafe { span.slice_unchecked(1..8) };
195    ///
196    /// // matched the second ASCII character
197    /// assert_eq!(consumer.consume(span.clone()), Some(2));
198    ///
199    /// // should match nothing now.
200    /// let span = unsafe { span.slice_unchecked(2..8) };
201    /// assert_eq!(consumer.consume(span.clone()), None);
202    /// ```
203    fn consume(&mut self, span: Span<&A>) -> Option<A::Index>;
204
205    /// Repeatedly removes prefixes of the hay which matches the needle.
206    ///
207    /// This method is used to implement the standard algorithm
208    /// [`trim_start()`](::ext::trim_start).
209    ///
210    /// Returns the start index of the slice after all prefixes are removed.
211    ///
212    /// A fast generic implementation in terms of
213    /// [`.consume()`](Consumer::consume) is provided by default. Nevertheless,
214    /// many needles allow a higher-performance specialization.
215    ///
216    /// # Examples
217    ///
218    /// ```rust
219    /// extern crate pattern_3;
220    /// use pattern_3::{Consumer, Needle, Span};
221    ///
222    /// let mut consumer = Needle::<&str>::into_consumer('x');
223    /// assert_eq!(consumer.trim_start("xxxyy"), 3);
224    ///
225    /// let mut consumer = Needle::<&str>::into_consumer('x');
226    /// assert_eq!(consumer.trim_start("yyxxx"), 0);
227    /// ```
228    #[inline]
229    fn trim_start(&mut self, hay: &A) -> A::Index {
230        let mut offset = hay.start_index();
231        let mut span = Span::from(hay);
232        while let Some(pos) = self.consume(span.clone()) {
233            offset = pos;
234            let (hay, range) = span.into_parts();
235            if pos == range.start {
236                break;
237            }
238            span = unsafe { Span::from_parts(hay, pos..range.end) };
239        }
240        offset
241    }
242}
243
244/// A searcher which can be searched from the end.
245///
246/// This trait provides methods for searching for non-overlapping matches of a
247/// needle starting from the back (right) of a hay.
248///
249/// # Safety
250///
251/// This trait is marked unsafe because the range returned by its methods are
252/// required to lie on valid codeword boundaries in the haystack. This enables
253/// users of this trait to slice the haystack without additional runtime checks.
254pub unsafe trait ReverseSearcher<A: Hay + ?Sized>: Searcher<A> {
255    /// Searches for the last range which the needle can be found in the span.
256    ///
257    /// This method is used to support the following standard algorithms:
258    ///
259    /// * [`rmatches`](::ext::rmatches)
260    /// * [`rmatch_indices`](::ext::rmatch_indices)
261    /// * [`rfind`](::ext::find)
262    /// * [`rmatch_ranges`](::ext::rmatch_ranges)
263    /// * [`rfind_range`](::ext::rfind_range)
264    /// * [`rsplit`](::ext::rsplit)
265    /// * [`rsplit_terminator`](::ext::rsplit_terminator)
266    /// * [`rsplitn`](::ext::rsplitn)
267    ///
268    /// The hay and the restricted range for searching can be recovered by
269    /// calling `span`[`.into_parts()`](Span::into_parts). The returned range
270    /// should be relative to the hay and must be contained within the
271    /// restricted range from the span.
272    ///
273    /// If the needle is not found, this method should return `None`.
274    ///
275    /// # Examples
276    ///
277    /// Search for the locations of a substring inside a string, using the
278    /// searcher primitive.
279    ///
280    /// ```
281    /// extern crate pattern_3;
282    /// use pattern_3::{ReverseSearcher, Needle, Span};
283    ///
284    /// let mut searcher = Needle::<&str>::into_searcher("::");
285    /// let span = Span::from("lion::tiger::leopard");
286    /// //                     ^   ^      ^
287    /// // string indices:     0   4     11
288    ///
289    /// // found the last "::".
290    /// assert_eq!(searcher.rsearch(span.clone()), Some(11..13));
291    ///
292    /// // slice the span to skip the last match.
293    /// let span = unsafe { span.slice_unchecked(0..11) };
294    ///
295    /// // found the second to last "::".
296    /// assert_eq!(searcher.rsearch(span.clone()), Some(4..6));
297    ///
298    /// // should found nothing now.
299    /// let span = unsafe { span.slice_unchecked(0..4) };
300    /// assert_eq!(searcher.rsearch(span.clone()), None);
301    /// ```
302    fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>>;
303}
304
305/// A consumer which can be searched from the end.
306///
307/// This trait provides methods for matching a needle anchored at the end of a
308/// hay.
309///
310/// # Safety
311///
312/// This trait is marked unsafe because the range returned by its methods are
313/// required to lie on valid codeword boundaries in the haystack. This enables
314/// users of this trait to slice the haystack without additional runtime checks.
315pub unsafe trait ReverseConsumer<A: Hay + ?Sized>: Consumer<A> {
316    /// Checks if the needle can be found at the end of the span.
317    ///
318    /// This method is used to implement the standard algorithm
319    /// [`ends_with()`](::ext::ends_with) as well as providing the default
320    /// implementation for [`.trim_end()`](ReverseConsumer::trim_end).
321    ///
322    /// The hay and the restricted range for searching can be recovered by
323    /// calling `span`[`.into_parts()`](Span::into_parts). If a needle can be
324    /// found ending at `range.end`, this method should return the start index
325    /// of the needle relative to the hay.
326    ///
327    /// If the needle cannot be found at the end of the span, this method
328    /// should return `None`.
329    ///
330    /// # Examples
331    ///
332    /// Consumes ASCII characters from the end.
333    ///
334    /// ```
335    /// extern crate pattern_3;
336    /// use pattern_3::{ReverseConsumer, Needle, Span};
337    ///
338    /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii());
339    /// let span = Span::from("Hi😋!!");
340    ///
341    /// // consumes the last ASCII character
342    /// assert_eq!(consumer.rconsume(span.clone()), Some(7));
343    ///
344    /// // slice the span to skip the first match.
345    /// let span = unsafe { span.slice_unchecked(0..7) };
346    ///
347    /// // matched the second to last ASCII character
348    /// assert_eq!(consumer.rconsume(span.clone()), Some(6));
349    ///
350    /// // should match nothing now.
351    /// let span = unsafe { span.slice_unchecked(0..6) };
352    /// assert_eq!(consumer.rconsume(span.clone()), None);
353    /// ```
354    fn rconsume(&mut self, hay: Span<&A>) -> Option<A::Index>;
355
356    /// Repeatedly removes suffixes of the hay which matches the needle.
357    ///
358    /// This method is used to implement the standard algorithm
359    /// [`trim_end()`](::ext::trim_end).
360    ///
361    /// A fast generic implementation in terms of
362    /// [`.rconsume()`](ReverseConsumer::rconsume) is provided by default.
363    /// Nevertheless, many needles allow a higher-performance specialization.
364    ///
365    /// # Examples
366    ///
367    /// ```rust
368    /// extern crate pattern_3;
369    /// use pattern_3::{ReverseConsumer, Needle, Span};
370    ///
371    /// let mut consumer = Needle::<&str>::into_consumer('x');
372    /// assert_eq!(consumer.trim_end("yyxxx"), 2);
373    ///
374    /// let mut consumer = Needle::<&str>::into_consumer('x');
375    /// assert_eq!(consumer.trim_end("xxxyy"), 5);
376    /// ```
377    #[inline]
378    fn trim_end(&mut self, hay: &A) -> A::Index {
379        let mut offset = hay.end_index();
380        let mut span = Span::from(hay);
381        while let Some(pos) = self.rconsume(span.clone()) {
382            offset = pos;
383            let (hay, range) = span.into_parts();
384            if pos == range.end {
385                break;
386            }
387            span = unsafe { Span::from_parts(hay, range.start..pos) };
388        }
389        offset
390    }
391}
392
393/// A searcher which can be searched from both end with consistent results.
394///
395/// Implementing this marker trait enables the following standard algorithms to
396/// return [`DoubleEndedIterator`](std::iter::DoubleEndedIterator)s:
397///
398/// * [`matches`](::ext::matches) / [`rmatches`](::ext::rmatches)
399/// * [`match_indices`](::ext::match_indices) / [`rmatch_indices`](::ext::rmatch_indices)
400/// * [`match_ranges`](::ext::match_ranges) / [`rmatch_ranges`](::ext::rmatch_ranges)
401/// * [`split`](::ext::split) / [`rsplit`](::ext::rsplit)
402/// * [`split_terminator`](::ext::split_terminator) / [`rsplit_terminator`](::ext::rsplit_terminator)
403/// * [`splitn`](::ext::splitn) / [`rsplitn`](::ext::rsplitn)
404///
405/// # Examples
406///
407/// The searcher of a character implements `DoubleEndedSearcher`, while that of
408/// a string does not.
409///
410/// `match_indices` and `rmatch_indices` are reverse of each other only for a
411/// `DoubleEndedSearcher`.
412///
413/// ```rust
414/// extern crate pattern_3;
415/// use pattern_3::ext::{match_indices, rmatch_indices};
416///
417/// // `match_indices` and `rmatch_indices` are exact reverse of each other for a `char` needle.
418/// let forward = match_indices("xxxxx", 'x').collect::<Vec<_>>();
419/// let mut rev_backward = rmatch_indices("xxxxx", 'x').collect::<Vec<_>>();
420/// rev_backward.reverse();
421///
422/// assert_eq!(forward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]);
423/// assert_eq!(rev_backward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]);
424/// assert_eq!(forward, rev_backward);
425///
426/// // this property does not exist on a `&str` needle in general.
427/// let forward = match_indices("xxxxx", "xx").collect::<Vec<_>>();
428/// let mut rev_backward = rmatch_indices("xxxxx", "xx").collect::<Vec<_>>();
429/// rev_backward.reverse();
430///
431/// assert_eq!(forward, vec![(0, "xx"), (2, "xx")]);
432/// assert_eq!(rev_backward, vec![(1, "xx"), (3, "xx")]);
433/// assert_ne!(forward, rev_backward);
434/// ```
435pub unsafe trait DoubleEndedSearcher<A: Hay + ?Sized>: ReverseSearcher<A> {}
436
437/// A consumer which can be searched from both end with consistent results.
438///
439/// It is used to support the following standard algorithm:
440///
441/// * [`trim`](::ext::trim)
442///
443/// The `trim` function is implemented by calling
444/// [`trim_start`](::ext::trim_start) and [`trim_end`](::ext::trim_end) together.
445/// This trait encodes the fact that we can call these two functions in any order.
446///
447/// # Examples
448///
449/// The consumer of a character implements `DoubleEndedConsumer`, while that of
450/// a string does not. `trim` is implemented only for a `DoubleEndedConsumer`.
451///
452/// ```rust
453/// extern crate pattern_3;
454/// use pattern_3::ext::{trim_start, trim_end, trim};
455///
456/// // for a `char`, we get the same trim result no matter which function is called first.
457/// let trim_start_first = trim_end(trim_start("xyxyx", 'x'), 'x');
458/// let trim_end_first = trim_start(trim_end("xyxyx", 'x'), 'x');
459/// let trim_together = trim("xyxyx", 'x');
460/// assert_eq!(trim_start_first, "yxy");
461/// assert_eq!(trim_end_first, "yxy");
462/// assert_eq!(trim_together, "yxy");
463///
464/// // this property does not exist for a `&str` in general.
465/// let trim_start_first = trim_end(trim_start("xyxyx", "xyx"), "xyx");
466/// let trim_end_first = trim_start(trim_end("xyxyx", "xyx"), "xyx");
467/// // let trim_together = trim("xyxyx", 'x'); // cannot be defined
468/// assert_eq!(trim_start_first, "yx");
469/// assert_eq!(trim_end_first, "xy");
470/// // assert_eq!(trim_together, /*????*/); // cannot be defined
471/// ```
472pub unsafe trait DoubleEndedConsumer<A: Hay + ?Sized>: ReverseConsumer<A> {}
473
474/// A needle, a type which can be converted into a searcher.
475///
476/// When using search algorithms like [`split()`](::ext::split), users will
477/// search with a `Needle` e.g. a `&str`. A needle is usually stateless,
478/// however for efficient searching, we often need some preprocessing and
479/// maintain a mutable state. The preprocessed structure is called the
480/// [`Searcher`] of this needle.
481///
482/// The relationship between `Searcher` and `Needle` is similar to `Iterator`
483/// and `IntoIterator`.
484pub trait Needle<H: Haystack>: Sized
485where H::Target: Hay // FIXME: RFC 2089 or 2289
486{
487    /// The searcher associated with this needle.
488    type Searcher: Searcher<H::Target>;
489
490    /// The consumer associated with this needle.
491    type Consumer: Consumer<H::Target>;
492
493    /// Produces a searcher for this needle.
494    fn into_searcher(self) -> Self::Searcher;
495
496    /// Produces a consumer for this needle.
497    ///
498    /// Usually a consumer and a searcher can be the same type.
499    /// Some needles may require different types
500    /// when the two need different optimization strategies. String searching
501    /// is an example of this: we use the Two-Way Algorithm when searching for
502    /// substrings, which needs to preprocess the needle. However this is
503    /// irrelevant for consuming, which only needs to check for string equality
504    /// once. Therefore the Consumer for a string would be a distinct type
505    /// using naive search.
506    fn into_consumer(self) -> Self::Consumer;
507}
508
509/// Searcher of an empty needle.
510///
511/// This searcher will find all empty subslices between any codewords in a
512/// haystack.
513#[derive(Clone, Debug, Default)]
514pub struct EmptySearcher {
515    consumed_start: bool,
516    consumed_end: bool,
517}
518
519unsafe impl<A: Hay + ?Sized> Searcher<A> for EmptySearcher {
520    #[inline]
521    fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>> {
522        let (hay, range) = span.into_parts();
523        let start = if !self.consumed_start {
524            self.consumed_start = true;
525            range.start
526        } else if range.start == range.end {
527            return None;
528        } else {
529            unsafe { hay.next_index(range.start) }
530        };
531        Some(start..start)
532    }
533}
534
535unsafe impl<A: Hay + ?Sized> Consumer<A> for EmptySearcher {
536    #[inline]
537    fn consume(&mut self, span: Span<&A>) -> Option<A::Index> {
538        let (_, range) = span.into_parts();
539        Some(range.start)
540    }
541
542    #[inline]
543    fn trim_start(&mut self, hay: &A) -> A::Index {
544        hay.start_index()
545    }
546}
547
548unsafe impl<A: Hay + ?Sized> ReverseSearcher<A> for EmptySearcher {
549    #[inline]
550    fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>> {
551        let (hay, range) = span.into_parts();
552        let end = if !self.consumed_end {
553            self.consumed_end = true;
554            range.end
555        } else if range.start == range.end {
556            return None;
557        } else {
558            unsafe { hay.prev_index(range.end) }
559        };
560        Some(end..end)
561    }
562}
563
564unsafe impl<A: Hay + ?Sized> ReverseConsumer<A> for EmptySearcher {
565    #[inline]
566    fn rconsume(&mut self, span: Span<&A>) -> Option<A::Index> {
567        let (_, range) = span.into_parts();
568        Some(range.end)
569    }
570
571    #[inline]
572    fn trim_end(&mut self, hay: &A) -> A::Index {
573        hay.end_index()
574    }
575}
576
577unsafe impl<A: Hay + ?Sized> DoubleEndedSearcher<A> for EmptySearcher {}
578unsafe impl<A: Hay + ?Sized> DoubleEndedConsumer<A> for EmptySearcher {}