pattern_3/
haystack.rs

1//! Haystacks.
2//!
3//! A *haystack* refers to any linear structure which can be split or sliced
4//! into smaller, non-overlapping parts. Examples are strings and vectors.
5//!
6//! ```rust
7//! let haystack: &str = "hello";       // a string slice (`&str`) is a haystack.
8//! let (a, b) = haystack.split_at(4);  // it can be split into two strings.
9//! let c = &a[1..3];                   // it can be sliced.
10//! ```
11//!
12//! The minimal haystack which cannot be further sliced is called a *codeword*.
13//! For instance, the codeword of a string would be a UTF-8 sequence. A haystack
14//! can therefore be viewed as a consecutive list of codewords.
15//!
16//! The boundary between codewords can be addressed using an *index*. The
17//! numbers 1, 3 and 4 in the snippet above are sample indices of a string. An
18//! index is usually a `usize`.
19//!
20//! An arbitrary number may point outside of a haystack, or in the interior of a
21//! codeword. These indices are invalid. A *valid index* of a certain haystack
22//! would only point to the boundaries.
23
24use std::ops::{Deref, Range};
25use std::fmt::Debug;
26use std::mem;
27
28/// Borrowed [`Haystack`].
29///
30/// Every `Haystack` type can be borrowed as references to `Hay` types. This
31/// allows multiple similar types to share the same implementation (e.g. the
32/// haystacks `&[T]`, `&mut [T]` and `Vec<T>` all have the same corresponding
33/// hay type `[T]`).
34///
35/// In the other words, a `Haystack` is a generalized reference to `Hay`.
36/// `Hay`s are typically implemented on unsized slice types like `str` and `[T]`.
37///
38/// # Safety
39///
40/// This trait is unsafe as there are some unchecked requirements which the
41/// implementor must uphold. Failing to meet these requirements would lead to
42/// out-of-bound access. The safety requirements are written in each member of
43/// this trait.
44pub unsafe trait Hay {
45    /// The index type of the haystack. Typically a `usize`.
46    ///
47    /// Splitting a hay must be sublinear using this index type. For instance,
48    /// if we implement `Hay` for a linked list, the index should not be an
49    /// integer offset (`usize`) as this would require O(n) time to chase the
50    /// pointer and find the split point. Instead, for a linked list we should
51    /// directly use the node pointer as the index.
52    ///
53    /// # Safety
54    ///
55    /// Valid indices of a single hay have a total order, even this type does
56    /// not require an `Ord` bound — for instance, to order two linked list
57    /// cursors, we need to chase the links and see if they meet; this is slow
58    /// and not suitable for implementing `Ord`, but conceptually an ordering
59    /// can be defined on linked list cursors.
60    type Index: Copy + Debug + Eq;
61
62    /// Creates an empty hay.
63    ///
64    /// # Safety
65    ///
66    /// An empty hay's start and end indices must be the same, e.g.
67    ///
68    /// ```rust
69    /// extern crate pattern_3;
70    /// use pattern_3::Hay;
71    ///
72    /// let empty = <str>::empty();
73    /// assert_eq!(empty.start_index(), empty.end_index());
74    /// ```
75    ///
76    /// This also suggests that there is exactly one valid index for an empty
77    /// hay.
78    ///
79    /// There is no guarantee that two separate calls to `.empty()` will produce
80    /// the same hay reference.
81    fn empty<'a>() -> &'a Self;
82
83    /// Obtains the index to the start of the hay.
84    ///
85    /// Usually this method returns `0`.
86    ///
87    /// # Safety
88    ///
89    /// Implementation must ensure that the start index of hay is the first
90    /// valid index, i.e. for all valid indices `i` of `self`, we have
91    /// `self.start_index() <= i`.
92    fn start_index(&self) -> Self::Index;
93
94    /// Obtains the index to the end of the hay.
95    ///
96    /// Usually this method returns the length of the hay.
97    ///
98    /// # Safety
99    ///
100    /// Implementation must ensure that the end index of hay is the last valid
101    /// index, i.e. for all valid indices `i` of `self`, we have
102    /// `i <= self.end_index()`.
103    fn end_index(&self) -> Self::Index;
104
105    /// Returns the next immediate index in this haystack.
106    ///
107    /// # Safety
108    ///
109    /// The `index` must be a valid index, and also must not equal to
110    /// `self.end_index()`.
111    ///
112    /// Implementation must ensure that if `j = self.next_index(i)`, then `j`
113    /// is also a valid index satisfying `j > i`.
114    ///
115    /// # Examples
116    ///
117    /// ```rust
118    /// use pattern_3::Hay;
119    ///
120    /// let sample = "A→😀";
121    /// unsafe {
122    ///     assert_eq!(sample.next_index(0), 1);
123    ///     assert_eq!(sample.next_index(1), 4);
124    ///     assert_eq!(sample.next_index(4), 8);
125    /// }
126    /// ```
127    unsafe fn next_index(&self, index: Self::Index) -> Self::Index;
128
129    /// Returns the previous immediate index in this haystack.
130    ///
131    /// # Safety
132    ///
133    /// The `index` must be a valid index, and also must not equal to
134    /// `self.start_index()`.
135    ///
136    /// Implementation must ensure that if `j = self.prev_index(i)`, then `j`
137    /// is also a valid index satisfying `j < i`.
138    ///
139    /// # Examples
140    ///
141    /// ```rust
142    /// use pattern_3::Hay;
143    ///
144    /// let sample = "A→😀";
145    /// unsafe {
146    ///     assert_eq!(sample.prev_index(8), 4);
147    ///     assert_eq!(sample.prev_index(4), 1);
148    ///     assert_eq!(sample.prev_index(1), 0);
149    /// }
150    /// ```
151    unsafe fn prev_index(&self, index: Self::Index) -> Self::Index;
152
153    /// Obtains a child hay by slicing `self`.
154    ///
155    /// # Safety
156    ///
157    /// The two ends of the range must be valid indices. The start of the range
158    /// must be before the end of the range (`range.start <= range.end`).
159    unsafe fn slice_unchecked(&self, range: Range<Self::Index>) -> &Self;
160}
161
162/// Linear splittable structure.
163///
164/// A `Haystack` is implemented for reference and collection types such as
165/// `&str`, `&mut [T]` and `Vec<T>`. Every haystack can be borrowed as an
166/// underlying representation called a [`Hay`]. Multiple haystacks may share the
167/// same hay type, and thus share the same implementation of string search
168/// algorithms.
169///
170/// In the other words, a `Haystack` is a generalized reference to `Hay`.
171///
172/// # Safety
173///
174/// This trait is unsafe as there are some unchecked requirements which the
175/// implementor must uphold. Failing to meet these requirements would lead to
176/// out-of-bound access. The safety requirements are written in each member of
177/// this trait.
178pub unsafe trait Haystack: Deref + Sized where Self::Target: Hay {
179    /// Creates an empty haystack.
180    fn empty() -> Self;
181
182    /// Splits the haystack into 3 slices around the given range.
183    ///
184    /// This method splits `self` into 3 non-overlapping parts:
185    ///
186    /// 1. Before the range (`self[..range.start]`),
187    /// 2. Inside the range (`self[range]`), and
188    /// 3. After the range (`self[range.end..]`)
189    ///
190    /// The returned array contains these 3 parts in order.
191    ///
192    /// # Safety
193    ///
194    /// Caller should ensure that the starts and end indices of `range` are
195    /// valid indices for the haystack `self` with `range.start <= range.end`.
196    ///
197    /// If the haystack is a mutable reference (`&mut A`), implementation must
198    /// ensure that the 3 returned haystack are truly non-overlapping in memory.
199    /// This is required to uphold the "Aliasing XOR Mutability" guarantee. If a
200    /// haystack cannot be physically split into non-overlapping parts (e.g. in
201    /// `OsStr`), then `&mut A` should not implement `Haystack` either.
202    ///
203    /// # Examples
204    ///
205    /// ```rust
206    /// use pattern_3::Haystack;
207    ///
208    /// let haystack = &mut [0, 1, 2, 3, 4, 5, 6];
209    /// let [left, middle, right] = unsafe { haystack.split_around(2..6) };
210    /// assert_eq!(left, &mut [0, 1]);
211    /// assert_eq!(middle, &mut [2, 3, 4, 5]);
212    /// assert_eq!(right, &mut [6]);
213    /// ```
214    unsafe fn split_around(self, range: Range<<Self::Target as Hay>::Index>) -> [Self; 3];
215
216    /// Subslices this haystack.
217    ///
218    /// # Safety
219    ///
220    /// The starts and end indices of `range` must be valid indices for the
221    /// haystack `self` with `range.start <= range.end`.
222    unsafe fn slice_unchecked(self, range: Range<<Self::Target as Hay>::Index>) -> Self {
223        let [_, middle, _] = self.split_around(range);
224        middle
225    }
226
227    /// Transforms the range from relative to self's parent to the original
228    /// haystack it was sliced from.
229    ///
230    /// Typically this method can be simply implemented as
231    ///
232    /// ```text
233    /// (original.start + parent.start)..(original.start + parent.end)
234    /// ```
235    ///
236    /// If this haystack is a [`SharedHaystack`], this method would never be
237    /// called.
238    ///
239    /// # Safety
240    ///
241    /// The `parent` range should be a valid range relative to a hay *a*, which
242    /// was used to slice out *self*: `self == &a[parent]`.
243    ///
244    /// Similarly, the `original` range should be a valid range relative to
245    /// another hay *b* used to slice out *a*: `a == &b[original]`.
246    ///
247    /// The distance of `parent` must be consistent with the length of `self`.
248    ///
249    /// This method should return a range which satisfies:
250    ///
251    /// ```text
252    /// self == &b[parent][original] == &b[range]
253    /// ```
254    ///
255    /// Slicing can be destructive and *invalidates* some indices, in particular
256    /// for owned type with a pointer-like index, e.g. linked list. In this
257    /// case, one should derive an entirely new index range from `self`, e.g.
258    /// returning `self.start_index()..self.end_index()`.
259    ///
260    /// # Examples
261    ///
262    /// ```rust
263    /// use pattern_3::Haystack;
264    ///
265    /// let hay = b"This is a sample haystack";
266    /// let this = hay[2..23][3..19].to_vec();
267    /// assert_eq!(&*this, &hay[this.restore_range(2..23, 3..19)]);
268    /// ```
269    fn restore_range(
270        &self,
271        original: Range<<Self::Target as Hay>::Index>,
272        parent: Range<<Self::Target as Hay>::Index>,
273    ) -> Range<<Self::Target as Hay>::Index>;
274}
275
276/// A [`Haystack`] which can be shared and cheaply cloned (e.g. `&H`, `Rc<H>`).
277///
278/// If a haystack implements this marker trait, during internal operations the
279/// original haystack will be retained in full and cloned, rather than being
280/// sliced and splitted. Being a shared haystack allows searcher to see the
281/// entire haystack, including the consumed portion.
282pub trait SharedHaystack: Haystack + Clone
283where Self::Target: Hay // FIXME: RFC 2089 or 2289
284{}
285
286/// The borrowing behavior differs between a (unique) haystack and shared
287/// haystack. We use *specialization* to distinguish between these behavior:
288///
289/// * When using `split_around()` and `slice_unchecked()` with a unique
290///     haystack, the original haystack will be splitted or sliced accordingly
291///     to maintain unique ownership.
292/// * When using these functions with a shared haystack, the original haystack
293///     will be cloned in full as that could provide more context into
294///     searchers.
295///
296/// This trait will never be public.
297trait SpanBehavior: Haystack
298where Self::Target: Hay // FIXME: RFC 2089 or 2289
299{
300    fn take(&mut self) -> Self;
301
302    fn from_span(span: Span<Self>) -> Self;
303
304    unsafe fn split_around_for_span(self, subrange: Range<<Self::Target as Hay>::Index>) -> [Self; 3];
305
306    unsafe fn slice_unchecked_for_span(self, subrange: Range<<Self::Target as Hay>::Index>) -> Self;
307
308    fn borrow_range(
309        &self,
310        range: Range<<Self::Target as Hay>::Index>,
311    ) -> Range<<Self::Target as Hay>::Index>;
312
313    fn do_restore_range(
314        &self,
315        range: Range<<Self::Target as Hay>::Index>,
316        subrange: Range<<Self::Target as Hay>::Index>,
317    ) -> Range<<Self::Target as Hay>::Index>;
318}
319
320impl<H: Haystack> SpanBehavior for H
321where H::Target: Hay // FIXME: RFC 2089 or 2289
322{
323    #[inline]
324    default fn take(&mut self) -> Self {
325        mem::replace(self, Self::empty())
326    }
327
328    #[inline]
329    default fn from_span(span: Span<Self>) -> Self {
330        span.haystack
331    }
332
333    #[inline]
334    default fn borrow_range(&self, _: Range<<Self::Target as Hay>::Index>) -> Range<<Self::Target as Hay>::Index> {
335        self.start_index()..self.end_index()
336    }
337
338    #[inline]
339    default fn do_restore_range(
340        &self,
341        range: Range<<Self::Target as Hay>::Index>,
342        subrange: Range<<Self::Target as Hay>::Index>,
343    ) -> Range<<Self::Target as Hay>::Index> {
344        self.restore_range(range, subrange)
345    }
346
347    #[inline]
348    default unsafe fn split_around_for_span(self, subrange: Range<<Self::Target as Hay>::Index>) -> [Self; 3] {
349        self.split_around(subrange)
350    }
351
352    #[inline]
353    default unsafe fn slice_unchecked_for_span(self, subrange: Range<<Self::Target as Hay>::Index>) -> Self {
354        self.slice_unchecked(subrange)
355    }
356}
357
358impl<H: SharedHaystack> SpanBehavior for H
359where H::Target: Hay // FIXME: RFC 2089 or 2289
360{
361    #[inline]
362    fn take(&mut self) -> Self {
363        self.clone()
364    }
365
366    #[inline]
367    fn from_span(span: Span<Self>) -> Self {
368        unsafe {
369            span.haystack.slice_unchecked(span.range)
370        }
371    }
372
373    #[inline]
374    fn borrow_range(&self, range: Range<<Self::Target as Hay>::Index>) -> Range<<Self::Target as Hay>::Index> {
375        range
376    }
377
378    #[inline]
379    fn do_restore_range(
380        &self,
381        _: Range<<Self::Target as Hay>::Index>,
382        subrange: Range<<Self::Target as Hay>::Index>,
383    ) -> Range<<Self::Target as Hay>::Index> {
384        subrange
385    }
386
387    #[inline]
388    unsafe fn split_around_for_span(self, _: Range<<Self::Target as Hay>::Index>) -> [Self; 3] {
389        [self.clone(), self.clone(), self]
390    }
391
392    #[inline]
393    unsafe fn slice_unchecked_for_span(self, _: Range<<Self::Target as Hay>::Index>) -> Self {
394        self
395    }
396}
397
398
399
400/// A span is a haystack coupled with the original range where the haystack is found.
401///
402/// It can be considered as a tuple `(H, Range<H::Target::Index>)`
403/// where the range is guaranteed to be valid for the haystack.
404///
405/// # Examples
406///
407/// ```
408/// use pattern_3::Span;
409///
410/// let orig_str = "Hello世界";
411/// let orig_span = Span::<&str>::from(orig_str);
412///
413/// // slice a span.
414/// let span = unsafe { orig_span.slice_unchecked(3..8) };
415///
416/// // further slicing (note the range is relative to the original span)
417/// let subspan = unsafe { span.slice_unchecked(4..8) };
418///
419/// // obtains the substring.
420/// let substring = subspan.into();
421/// assert_eq!(substring, "o世");
422/// ```
423///
424/// Visualizing the spans:
425///
426/// ```text
427///
428/// 0   1   2   3   4   5   6   7   8   9  10  11
429/// +---+---+---+---+---+---+---+---+---+---+---+
430/// | H | e | l | l | o | U+4E16    | U+754C    |    orig_str
431/// +---+---+---+---+---+---+---+---+---+---+---+
432///
433/// ^___________________________________________^    orig_span = (orig_str, 0..11)
434///
435///             ^___________________^                span = (orig_str, 3..8)
436///
437///                 ^_______________^                subspan = (orig_str, 4..8)
438/// ```
439#[derive(Debug, Clone)]
440pub struct Span<H: Haystack>
441where H::Target: Hay // FIXME: RFC 2089 or 2289
442{
443    haystack: H,
444    range: Range<<<H as Deref>::Target as Hay>::Index>,
445    //^ The `<H as Trait>` is to trick `#[derive]` not to generate
446    //  the where bound for `H::Hay`.
447}
448
449/// Creates a span which covers the entire haystack.
450impl<H: Haystack> From<H> for Span<H>
451where H::Target: Hay // FIXME: RFC 2089 or 2289
452{
453    #[inline]
454    fn from(haystack: H) -> Self {
455        let range = haystack.start_index()..haystack.end_index();
456        Self { haystack, range }
457    }
458}
459
460impl<H: SharedHaystack> Span<H>
461where H::Target: Hay // FIXME: RFC 2089 or 2289
462{
463    /// Decomposes this span into the original haystack, and the range it focuses on.
464    #[inline]
465    pub fn into_parts(self) -> (H, Range<<H::Target as Hay>::Index>) {
466        (self.haystack, self.range)
467    }
468
469    /// Creates a span from a haystack, and a range it should focus on.
470    ///
471    /// # Safety
472    ///
473    /// The `range` must be a valid range relative to `haystack`.
474    #[inline]
475    pub unsafe fn from_parts(haystack: H, range: Range<<H::Target as Hay>::Index>) -> Self {
476        Self { haystack, range }
477    }
478}
479
480impl<'h> Span<&'h str> {
481    /// Reinterprets the string span as a byte-array span.
482    #[inline]
483    pub fn as_bytes(self) -> Span<&'h [u8]> {
484        Span {
485            haystack: self.haystack.as_bytes(),
486            range: self.range,
487        }
488    }
489}
490
491impl<H: Haystack> Span<H>
492where H::Target: Hay // FIXME: RFC 2089 or 2289
493{
494    /// The range of the span, relative to the ultimate original haystack it was sliced from.
495    #[inline]
496    pub fn original_range(&self) -> Range<<H::Target as Hay>::Index> {
497        self.range.clone()
498    }
499
500    /// Borrows a shared span.
501    #[inline]
502    pub fn borrow(&self) -> Span<&H::Target> {
503        Span {
504            haystack: &*self.haystack,
505            range: self.haystack.borrow_range(self.range.clone()),
506        }
507    }
508
509    /// Checks whether this span is empty.
510    #[inline]
511    pub fn is_empty(&self) -> bool {
512        self.range.start == self.range.end
513    }
514
515    /// Returns this span by value, and replaces the original span by an empty
516    /// span.
517    #[inline]
518    pub fn take(&mut self) -> Self {
519        let haystack = self.haystack.take();
520        let range = self.range.clone();
521        self.range.end = self.range.start;
522        Span { haystack, range }
523    }
524
525    // FIXME: This should be changed to an `impl From<Span<H>> for H`.
526    /// Slices the original haystack to the focused range.
527    #[inline]
528    pub fn into(self) -> H {
529        H::from_span(self)
530    }
531
532    /// Splits this span into 3 spans around the given range.
533    ///
534    /// # Safety
535    ///
536    /// `subrange` must be a valid range relative to `self.borrow()`. A safe
537    /// usage is like:
538    ///
539    /// ```rust
540    /// # use pattern_3::{Span, Needle, Searcher};
541    /// # let span = Span::from("foo");
542    /// # let mut searcher = <&str as Needle<&str>>::into_searcher("o");
543    /// # (|| -> Option<()> {
544    /// let range = searcher.search(span.borrow())?;
545    /// let [left, middle, right] = unsafe { span.split_around(range) };
546    /// # Some(()) })();
547    /// ```
548    #[inline]
549    pub unsafe fn split_around(self, subrange: Range<<H::Target as Hay>::Index>) -> [Self; 3] {
550        let self_range = self.haystack.borrow_range(self.range.clone());
551        let [left, middle, right] = self.haystack.split_around_for_span(subrange.clone());
552
553        let left_range = left.do_restore_range(self.range.clone(), self_range.start..subrange.start);
554        let right_range = right.do_restore_range(self.range.clone(), subrange.end..self_range.end);
555        let middle_range = middle.do_restore_range(self.range, subrange);
556
557        [
558            Self { haystack: left, range: left_range },
559            Self { haystack: middle, range: middle_range },
560            Self { haystack: right, range: right_range },
561        ]
562    }
563
564    /// Slices this span to the given range.
565    ///
566    /// # Safety
567    ///
568    /// `subrange` must be a valid range relative to `self.borrow()`.
569    #[inline]
570    pub unsafe fn slice_unchecked(self, subrange: Range<<H::Target as Hay>::Index>) -> Self {
571        let haystack = self.haystack.slice_unchecked_for_span(subrange.clone());
572        let range = haystack.do_restore_range(self.range, subrange);
573        Self { haystack, range }
574    }
575}
576
577unsafe impl<'a, A: Hay + ?Sized + 'a> Haystack for &'a A {
578    #[inline]
579    fn empty() -> Self {
580        A::empty()
581    }
582
583    #[inline]
584    unsafe fn split_around(self, range: Range<A::Index>) -> [Self; 3] {
585        [
586            self.slice_unchecked(self.start_index()..range.start),
587            self.slice_unchecked(range.clone()),
588            self.slice_unchecked(range.end..self.end_index()),
589        ]
590    }
591
592    #[inline]
593    unsafe fn slice_unchecked(self, range: Range<A::Index>) -> Self {
594        A::slice_unchecked(self, range)
595    }
596
597    #[inline]
598    fn restore_range(&self, _: Range<A::Index>, _: Range<A::Index>) -> Range<A::Index> {
599        unreachable!()
600    }
601}
602
603impl<'a, A: Hay + ?Sized + 'a> SharedHaystack for &'a A {}