pattern_3/needle.rs
1//! Needle traits.
2
3use haystack::{Haystack, Hay, Span};
4
5use std::ops::Range;
6
7/// A searcher, for searching a [`Needle`] from a [`Hay`].
8///
9/// This trait provides methods for searching for non-overlapping matches of a
10/// needle starting from the front (left) of a hay.
11///
12/// # Safety
13///
14/// This trait is marked unsafe because the range returned by its methods are
15/// required to lie on valid codeword boundaries in the haystack. This enables
16/// users of this trait to slice the haystack without additional runtime checks.
17///
18/// # Examples
19///
20/// Implement a searcher and consumer which matches `b"Aaaa"` from a byte string.
21///
22/// ```rust
23/// extern crate pattern_3;
24/// use pattern_3::*;
25/// use std::ops::Range;
26///
27/// // The searcher for searching `b"Aaaa"`, using naive search.
28/// // We are going to use this as a needle too.
29/// struct Aaaa;
30///
31/// unsafe impl Searcher<[u8]> for Aaaa {
32/// // search for an `b"Aaaa"` in the middle of the string, returns its range.
33/// fn search(&mut self, span: Span<&[u8]>) -> Option<Range<usize>> {
34/// let (hay, range) = span.into_parts();
35///
36/// let start = range.start;
37/// for (i, window) in hay[range].windows(4).enumerate() {
38/// if *window == b"Aaaa"[..] {
39/// // remember to include the range offset
40/// return Some((start + i)..(start + i + 4));
41/// }
42/// }
43///
44/// None
45/// }
46/// }
47///
48/// unsafe impl Consumer<[u8]> for Aaaa {
49/// // checks if an `b"Aaaa" is at the beginning of the string, returns the end index.
50/// fn consume(&mut self, span: Span<&[u8]>) -> Option<usize> {
51/// let (hay, range) = span.into_parts();
52/// let end = range.start.checked_add(4)?;
53/// if end <= range.end && hay[range.start..end] == b"Aaaa"[..] {
54/// Some(end)
55/// } else {
56/// None
57/// }
58/// }
59/// }
60///
61/// impl<H: Haystack<Target = [u8]>> pattern_3::Needle<H> for Aaaa {
62/// type Searcher = Self;
63/// type Consumer = Self;
64/// fn into_searcher(self) -> Self { self }
65/// fn into_consumer(self) -> Self { self }
66/// }
67///
68/// // test with some standard algorithms.
69/// let haystack = &b"Aaaaa!!!Aaa!!!Aaaaaaaaa!!!"[..];
70/// assert_eq!(
71/// ext::split(haystack, Aaaa).collect::<Vec<_>>(),
72/// vec![
73/// &b""[..],
74/// &b"a!!!Aaa!!!"[..],
75/// &b"aaaaa!!!"[..],
76/// ]
77/// );
78/// assert_eq!(
79/// ext::match_ranges(haystack, Aaaa).collect::<Vec<_>>(),
80/// vec![
81/// (0..4, &b"Aaaa"[..]),
82/// (14..18, &b"Aaaa"[..]),
83/// ]
84/// );
85/// assert_eq!(
86/// ext::trim_start(haystack, Aaaa),
87/// &b"a!!!Aaa!!!Aaaaaaaaa!!!"[..]
88/// );
89/// ```
90pub unsafe trait Searcher<A: Hay + ?Sized> {
91 /// Searches for the first range which the needle can be found in the span.
92 ///
93 /// This method is used to support the following standard algorithms:
94 ///
95 /// * [`matches`](::ext::matches)
96 /// * [`contains`](::ext::contains)
97 /// * [`match_indices`](::ext::match_indices)
98 /// * [`find`](::ext::find)
99 /// * [`match_ranges`](::ext::match_ranges)
100 /// * [`find_range`](::ext::find_range)
101 /// * [`split`](::ext::split)
102 /// * [`split_terminator`](::ext::split_terminator)
103 /// * [`splitn`](::ext::splitn)
104 /// * [`replace_with`](::ext::replace_with)
105 /// * [`replacen_with`](::ext::replacen_with)
106 ///
107 /// The hay and the restricted range for searching can be recovered by
108 /// calling `span`[`.into_parts()`](Span::into_parts). The range returned
109 /// by this method
110 /// should be relative to the hay and must be contained within the
111 /// restricted range from the span.
112 ///
113 /// If the needle is not found, this method should return `None`.
114 ///
115 /// The reason this method takes a `Span<&A>` instead of just `&A` is
116 /// because some needles need context information provided by
117 /// the position of the current slice and the content around the slice.
118 /// Regex components like the start-/end-of-text anchors `^`/`$`
119 /// and word boundary `\b` are primary examples.
120 ///
121 /// # Examples
122 ///
123 /// Search for the locations of a substring inside a string, using the
124 /// searcher primitive.
125 ///
126 /// ```
127 /// extern crate pattern_3;
128 /// use pattern_3::{Searcher, Needle, Span};
129 ///
130 /// let mut searcher = Needle::<&str>::into_searcher("::");
131 /// let span = Span::from("lion::tiger::leopard");
132 /// // ^ ^ ^ ^
133 /// // string indices: 0 4 11 20
134 ///
135 /// // found the first "::".
136 /// assert_eq!(searcher.search(span.clone()), Some(4..6));
137 ///
138 /// // slice the span to skip the first match.
139 /// let span = unsafe { span.slice_unchecked(6..20) };
140 ///
141 /// // found the second "::".
142 /// assert_eq!(searcher.search(span.clone()), Some(11..13));
143 ///
144 /// // should find nothing now.
145 /// let span = unsafe { span.slice_unchecked(13..20) };
146 /// assert_eq!(searcher.search(span.clone()), None);
147 /// ```
148 fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>>;
149}
150
151/// A consumer, for searching a [`Needle`] from a [`Hay`] anchored at the
152/// beginnning.
153///
154/// This trait provides methods for matching a needle anchored at the beginning
155/// of a hay.
156///
157/// See documentation of [`Searcher`] for an example.
158///
159/// # Safety
160///
161/// This trait is marked unsafe because the range returned by its methods are
162/// required to lie on valid codeword boundaries in the haystack. This enables
163/// users of this trait to slice the haystack without additional runtime checks.
164pub unsafe trait Consumer<A: Hay + ?Sized> {
165 /// Checks if the needle can be found at the beginning of the span.
166 ///
167 /// This method is used to implement the standard algorithm
168 /// [`starts_with()`](::ext::starts_with) as well as providing the default
169 /// implementation for [`.trim_start()`](Consumer::trim_start).
170 ///
171 /// The hay and the restricted range for searching can be recovered by
172 /// calling `span`[`.into_parts()`](Span::into_parts). If a needle can be
173 /// found starting at `range.start`, this method should return the end index
174 /// of the needle relative to the hay.
175 ///
176 /// If the needle cannot be found at the beginning of the span, this method
177 /// should return `None`.
178 ///
179 /// # Examples
180 ///
181 /// Consumes ASCII characters from the beginning.
182 ///
183 /// ```
184 /// extern crate pattern_3;
185 /// use pattern_3::{Consumer, Needle, Span};
186 ///
187 /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii());
188 /// let span = Span::from("Hi😋!!");
189 ///
190 /// // consumes the first ASCII character
191 /// assert_eq!(consumer.consume(span.clone()), Some(1));
192 ///
193 /// // slice the span to skip the first match.
194 /// let span = unsafe { span.slice_unchecked(1..8) };
195 ///
196 /// // matched the second ASCII character
197 /// assert_eq!(consumer.consume(span.clone()), Some(2));
198 ///
199 /// // should match nothing now.
200 /// let span = unsafe { span.slice_unchecked(2..8) };
201 /// assert_eq!(consumer.consume(span.clone()), None);
202 /// ```
203 fn consume(&mut self, span: Span<&A>) -> Option<A::Index>;
204
205 /// Repeatedly removes prefixes of the hay which matches the needle.
206 ///
207 /// This method is used to implement the standard algorithm
208 /// [`trim_start()`](::ext::trim_start).
209 ///
210 /// Returns the start index of the slice after all prefixes are removed.
211 ///
212 /// A fast generic implementation in terms of
213 /// [`.consume()`](Consumer::consume) is provided by default. Nevertheless,
214 /// many needles allow a higher-performance specialization.
215 ///
216 /// # Examples
217 ///
218 /// ```rust
219 /// extern crate pattern_3;
220 /// use pattern_3::{Consumer, Needle, Span};
221 ///
222 /// let mut consumer = Needle::<&str>::into_consumer('x');
223 /// assert_eq!(consumer.trim_start("xxxyy"), 3);
224 ///
225 /// let mut consumer = Needle::<&str>::into_consumer('x');
226 /// assert_eq!(consumer.trim_start("yyxxx"), 0);
227 /// ```
228 #[inline]
229 fn trim_start(&mut self, hay: &A) -> A::Index {
230 let mut offset = hay.start_index();
231 let mut span = Span::from(hay);
232 while let Some(pos) = self.consume(span.clone()) {
233 offset = pos;
234 let (hay, range) = span.into_parts();
235 if pos == range.start {
236 break;
237 }
238 span = unsafe { Span::from_parts(hay, pos..range.end) };
239 }
240 offset
241 }
242}
243
244/// A searcher which can be searched from the end.
245///
246/// This trait provides methods for searching for non-overlapping matches of a
247/// needle starting from the back (right) of a hay.
248///
249/// # Safety
250///
251/// This trait is marked unsafe because the range returned by its methods are
252/// required to lie on valid codeword boundaries in the haystack. This enables
253/// users of this trait to slice the haystack without additional runtime checks.
254pub unsafe trait ReverseSearcher<A: Hay + ?Sized>: Searcher<A> {
255 /// Searches for the last range which the needle can be found in the span.
256 ///
257 /// This method is used to support the following standard algorithms:
258 ///
259 /// * [`rmatches`](::ext::rmatches)
260 /// * [`rmatch_indices`](::ext::rmatch_indices)
261 /// * [`rfind`](::ext::find)
262 /// * [`rmatch_ranges`](::ext::rmatch_ranges)
263 /// * [`rfind_range`](::ext::rfind_range)
264 /// * [`rsplit`](::ext::rsplit)
265 /// * [`rsplit_terminator`](::ext::rsplit_terminator)
266 /// * [`rsplitn`](::ext::rsplitn)
267 ///
268 /// The hay and the restricted range for searching can be recovered by
269 /// calling `span`[`.into_parts()`](Span::into_parts). The returned range
270 /// should be relative to the hay and must be contained within the
271 /// restricted range from the span.
272 ///
273 /// If the needle is not found, this method should return `None`.
274 ///
275 /// # Examples
276 ///
277 /// Search for the locations of a substring inside a string, using the
278 /// searcher primitive.
279 ///
280 /// ```
281 /// extern crate pattern_3;
282 /// use pattern_3::{ReverseSearcher, Needle, Span};
283 ///
284 /// let mut searcher = Needle::<&str>::into_searcher("::");
285 /// let span = Span::from("lion::tiger::leopard");
286 /// // ^ ^ ^
287 /// // string indices: 0 4 11
288 ///
289 /// // found the last "::".
290 /// assert_eq!(searcher.rsearch(span.clone()), Some(11..13));
291 ///
292 /// // slice the span to skip the last match.
293 /// let span = unsafe { span.slice_unchecked(0..11) };
294 ///
295 /// // found the second to last "::".
296 /// assert_eq!(searcher.rsearch(span.clone()), Some(4..6));
297 ///
298 /// // should found nothing now.
299 /// let span = unsafe { span.slice_unchecked(0..4) };
300 /// assert_eq!(searcher.rsearch(span.clone()), None);
301 /// ```
302 fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>>;
303}
304
305/// A consumer which can be searched from the end.
306///
307/// This trait provides methods for matching a needle anchored at the end of a
308/// hay.
309///
310/// # Safety
311///
312/// This trait is marked unsafe because the range returned by its methods are
313/// required to lie on valid codeword boundaries in the haystack. This enables
314/// users of this trait to slice the haystack without additional runtime checks.
315pub unsafe trait ReverseConsumer<A: Hay + ?Sized>: Consumer<A> {
316 /// Checks if the needle can be found at the end of the span.
317 ///
318 /// This method is used to implement the standard algorithm
319 /// [`ends_with()`](::ext::ends_with) as well as providing the default
320 /// implementation for [`.trim_end()`](ReverseConsumer::trim_end).
321 ///
322 /// The hay and the restricted range for searching can be recovered by
323 /// calling `span`[`.into_parts()`](Span::into_parts). If a needle can be
324 /// found ending at `range.end`, this method should return the start index
325 /// of the needle relative to the hay.
326 ///
327 /// If the needle cannot be found at the end of the span, this method
328 /// should return `None`.
329 ///
330 /// # Examples
331 ///
332 /// Consumes ASCII characters from the end.
333 ///
334 /// ```
335 /// extern crate pattern_3;
336 /// use pattern_3::{ReverseConsumer, Needle, Span};
337 ///
338 /// let mut consumer = Needle::<&str>::into_consumer(|c: char| c.is_ascii());
339 /// let span = Span::from("Hi😋!!");
340 ///
341 /// // consumes the last ASCII character
342 /// assert_eq!(consumer.rconsume(span.clone()), Some(7));
343 ///
344 /// // slice the span to skip the first match.
345 /// let span = unsafe { span.slice_unchecked(0..7) };
346 ///
347 /// // matched the second to last ASCII character
348 /// assert_eq!(consumer.rconsume(span.clone()), Some(6));
349 ///
350 /// // should match nothing now.
351 /// let span = unsafe { span.slice_unchecked(0..6) };
352 /// assert_eq!(consumer.rconsume(span.clone()), None);
353 /// ```
354 fn rconsume(&mut self, hay: Span<&A>) -> Option<A::Index>;
355
356 /// Repeatedly removes suffixes of the hay which matches the needle.
357 ///
358 /// This method is used to implement the standard algorithm
359 /// [`trim_end()`](::ext::trim_end).
360 ///
361 /// A fast generic implementation in terms of
362 /// [`.rconsume()`](ReverseConsumer::rconsume) is provided by default.
363 /// Nevertheless, many needles allow a higher-performance specialization.
364 ///
365 /// # Examples
366 ///
367 /// ```rust
368 /// extern crate pattern_3;
369 /// use pattern_3::{ReverseConsumer, Needle, Span};
370 ///
371 /// let mut consumer = Needle::<&str>::into_consumer('x');
372 /// assert_eq!(consumer.trim_end("yyxxx"), 2);
373 ///
374 /// let mut consumer = Needle::<&str>::into_consumer('x');
375 /// assert_eq!(consumer.trim_end("xxxyy"), 5);
376 /// ```
377 #[inline]
378 fn trim_end(&mut self, hay: &A) -> A::Index {
379 let mut offset = hay.end_index();
380 let mut span = Span::from(hay);
381 while let Some(pos) = self.rconsume(span.clone()) {
382 offset = pos;
383 let (hay, range) = span.into_parts();
384 if pos == range.end {
385 break;
386 }
387 span = unsafe { Span::from_parts(hay, range.start..pos) };
388 }
389 offset
390 }
391}
392
393/// A searcher which can be searched from both end with consistent results.
394///
395/// Implementing this marker trait enables the following standard algorithms to
396/// return [`DoubleEndedIterator`](std::iter::DoubleEndedIterator)s:
397///
398/// * [`matches`](::ext::matches) / [`rmatches`](::ext::rmatches)
399/// * [`match_indices`](::ext::match_indices) / [`rmatch_indices`](::ext::rmatch_indices)
400/// * [`match_ranges`](::ext::match_ranges) / [`rmatch_ranges`](::ext::rmatch_ranges)
401/// * [`split`](::ext::split) / [`rsplit`](::ext::rsplit)
402/// * [`split_terminator`](::ext::split_terminator) / [`rsplit_terminator`](::ext::rsplit_terminator)
403/// * [`splitn`](::ext::splitn) / [`rsplitn`](::ext::rsplitn)
404///
405/// # Examples
406///
407/// The searcher of a character implements `DoubleEndedSearcher`, while that of
408/// a string does not.
409///
410/// `match_indices` and `rmatch_indices` are reverse of each other only for a
411/// `DoubleEndedSearcher`.
412///
413/// ```rust
414/// extern crate pattern_3;
415/// use pattern_3::ext::{match_indices, rmatch_indices};
416///
417/// // `match_indices` and `rmatch_indices` are exact reverse of each other for a `char` needle.
418/// let forward = match_indices("xxxxx", 'x').collect::<Vec<_>>();
419/// let mut rev_backward = rmatch_indices("xxxxx", 'x').collect::<Vec<_>>();
420/// rev_backward.reverse();
421///
422/// assert_eq!(forward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]);
423/// assert_eq!(rev_backward, vec![(0, "x"), (1, "x"), (2, "x"), (3, "x"), (4, "x")]);
424/// assert_eq!(forward, rev_backward);
425///
426/// // this property does not exist on a `&str` needle in general.
427/// let forward = match_indices("xxxxx", "xx").collect::<Vec<_>>();
428/// let mut rev_backward = rmatch_indices("xxxxx", "xx").collect::<Vec<_>>();
429/// rev_backward.reverse();
430///
431/// assert_eq!(forward, vec![(0, "xx"), (2, "xx")]);
432/// assert_eq!(rev_backward, vec![(1, "xx"), (3, "xx")]);
433/// assert_ne!(forward, rev_backward);
434/// ```
435pub unsafe trait DoubleEndedSearcher<A: Hay + ?Sized>: ReverseSearcher<A> {}
436
437/// A consumer which can be searched from both end with consistent results.
438///
439/// It is used to support the following standard algorithm:
440///
441/// * [`trim`](::ext::trim)
442///
443/// The `trim` function is implemented by calling
444/// [`trim_start`](::ext::trim_start) and [`trim_end`](::ext::trim_end) together.
445/// This trait encodes the fact that we can call these two functions in any order.
446///
447/// # Examples
448///
449/// The consumer of a character implements `DoubleEndedConsumer`, while that of
450/// a string does not. `trim` is implemented only for a `DoubleEndedConsumer`.
451///
452/// ```rust
453/// extern crate pattern_3;
454/// use pattern_3::ext::{trim_start, trim_end, trim};
455///
456/// // for a `char`, we get the same trim result no matter which function is called first.
457/// let trim_start_first = trim_end(trim_start("xyxyx", 'x'), 'x');
458/// let trim_end_first = trim_start(trim_end("xyxyx", 'x'), 'x');
459/// let trim_together = trim("xyxyx", 'x');
460/// assert_eq!(trim_start_first, "yxy");
461/// assert_eq!(trim_end_first, "yxy");
462/// assert_eq!(trim_together, "yxy");
463///
464/// // this property does not exist for a `&str` in general.
465/// let trim_start_first = trim_end(trim_start("xyxyx", "xyx"), "xyx");
466/// let trim_end_first = trim_start(trim_end("xyxyx", "xyx"), "xyx");
467/// // let trim_together = trim("xyxyx", 'x'); // cannot be defined
468/// assert_eq!(trim_start_first, "yx");
469/// assert_eq!(trim_end_first, "xy");
470/// // assert_eq!(trim_together, /*????*/); // cannot be defined
471/// ```
472pub unsafe trait DoubleEndedConsumer<A: Hay + ?Sized>: ReverseConsumer<A> {}
473
474/// A needle, a type which can be converted into a searcher.
475///
476/// When using search algorithms like [`split()`](::ext::split), users will
477/// search with a `Needle` e.g. a `&str`. A needle is usually stateless,
478/// however for efficient searching, we often need some preprocessing and
479/// maintain a mutable state. The preprocessed structure is called the
480/// [`Searcher`] of this needle.
481///
482/// The relationship between `Searcher` and `Needle` is similar to `Iterator`
483/// and `IntoIterator`.
484pub trait Needle<H: Haystack>: Sized
485where H::Target: Hay // FIXME: RFC 2089 or 2289
486{
487 /// The searcher associated with this needle.
488 type Searcher: Searcher<H::Target>;
489
490 /// The consumer associated with this needle.
491 type Consumer: Consumer<H::Target>;
492
493 /// Produces a searcher for this needle.
494 fn into_searcher(self) -> Self::Searcher;
495
496 /// Produces a consumer for this needle.
497 ///
498 /// Usually a consumer and a searcher can be the same type.
499 /// Some needles may require different types
500 /// when the two need different optimization strategies. String searching
501 /// is an example of this: we use the Two-Way Algorithm when searching for
502 /// substrings, which needs to preprocess the needle. However this is
503 /// irrelevant for consuming, which only needs to check for string equality
504 /// once. Therefore the Consumer for a string would be a distinct type
505 /// using naive search.
506 fn into_consumer(self) -> Self::Consumer;
507}
508
509/// Searcher of an empty needle.
510///
511/// This searcher will find all empty subslices between any codewords in a
512/// haystack.
513#[derive(Clone, Debug, Default)]
514pub struct EmptySearcher {
515 consumed_start: bool,
516 consumed_end: bool,
517}
518
519unsafe impl<A: Hay + ?Sized> Searcher<A> for EmptySearcher {
520 #[inline]
521 fn search(&mut self, span: Span<&A>) -> Option<Range<A::Index>> {
522 let (hay, range) = span.into_parts();
523 let start = if !self.consumed_start {
524 self.consumed_start = true;
525 range.start
526 } else if range.start == range.end {
527 return None;
528 } else {
529 unsafe { hay.next_index(range.start) }
530 };
531 Some(start..start)
532 }
533}
534
535unsafe impl<A: Hay + ?Sized> Consumer<A> for EmptySearcher {
536 #[inline]
537 fn consume(&mut self, span: Span<&A>) -> Option<A::Index> {
538 let (_, range) = span.into_parts();
539 Some(range.start)
540 }
541
542 #[inline]
543 fn trim_start(&mut self, hay: &A) -> A::Index {
544 hay.start_index()
545 }
546}
547
548unsafe impl<A: Hay + ?Sized> ReverseSearcher<A> for EmptySearcher {
549 #[inline]
550 fn rsearch(&mut self, span: Span<&A>) -> Option<Range<A::Index>> {
551 let (hay, range) = span.into_parts();
552 let end = if !self.consumed_end {
553 self.consumed_end = true;
554 range.end
555 } else if range.start == range.end {
556 return None;
557 } else {
558 unsafe { hay.prev_index(range.end) }
559 };
560 Some(end..end)
561 }
562}
563
564unsafe impl<A: Hay + ?Sized> ReverseConsumer<A> for EmptySearcher {
565 #[inline]
566 fn rconsume(&mut self, span: Span<&A>) -> Option<A::Index> {
567 let (_, range) = span.into_parts();
568 Some(range.end)
569 }
570
571 #[inline]
572 fn trim_end(&mut self, hay: &A) -> A::Index {
573 hay.end_index()
574 }
575}
576
577unsafe impl<A: Hay + ?Sized> DoubleEndedSearcher<A> for EmptySearcher {}
578unsafe impl<A: Hay + ?Sized> DoubleEndedConsumer<A> for EmptySearcher {}