regex_cursor/
cursor.rs

/// Conversion of a value into a [`Cursor`].
///
/// Every type that implements [`Cursor`] converts into itself through the
/// blanket implementation in this file. With the `ropey` feature enabled,
/// `ropey::RopeSlice` and `&ropey::Rope` additionally convert into a
/// `RopeyCursor`.
pub trait IntoCursor {
    /// The cursor type produced by [`into_cursor`](IntoCursor::into_cursor).
    type Cursor: Cursor;
    /// Consumes `self` and returns a cursor over its data.
    fn into_cursor(self) -> Self::Cursor;
}
5
6impl<C: Cursor> IntoCursor for C {
7    type Cursor = Self;
8
9    fn into_cursor(self) -> Self {
10        self
11    }
12}
13
/// A cursor that allows traversing a discontiguous string like a rope.
///
/// The haystack is exposed one chunk at a time: [`chunk`](Cursor::chunk)
/// returns the current chunk while [`advance`](Cursor::advance) and
/// [`backtrack`](Cursor::backtrack) move to the neighbouring chunks.
pub trait Cursor {
    /// Returns the current chunk. If [`utf8_aware`](Cursor::utf8_aware) returns true then this function
    /// must **never** return a chunk that splits a unicode codepoint.
    /// See [`utf8_aware`](Cursor::utf8_aware) for details.
    ///
    /// Must never return an empty byteslice unless the underlying collection is empty.
    fn chunk(&self) -> &[u8];
    /// Whether this cursor is aware of utf-8 codepoint boundaries.
    ///
    /// **`true`** means that this cursor must never split a unicode codepoint at a
    /// chunk boundary. In that case all regex features are supported.
    ///
    /// **`false`** means that this cursor can not be used for utf-8 mode
    /// matching (only affects empty strings) and can not be used to match
    /// unicode word boundaries.
    ///
    /// The default implementation returns `true`.
    fn utf8_aware(&self) -> bool {
        true
    }
    /// Advances the cursor to the next chunk if possible. In that case `true`
    /// must be returned. If the end of data is reached this function should
    /// return `false` and **not change the chunk**.
    fn advance(&mut self) -> bool;
    /// Moves the cursor to the previous chunk if possible. In that case `true`
    /// must be returned. If the start of data is reached this function should
    /// return `false` and **not change the chunk**.
    fn backtrack(&mut self) -> bool;
    /// Returns the total length of the data in bytes. This does not
    /// take the current cursor position into account and should
    /// not change with calls to [`advance`](Cursor::advance) and [`backtrack`](Cursor::backtrack).
    // NOTE(review): the `Option` presumably lets cursors that do not know
    // their total length return `None` -- confirm against the callers.
    fn total_bytes(&self) -> Option<usize>;
    /// The offset of the current chunk from the start of the haystack in bytes.
    fn offset(&self) -> usize;
}
48
49impl<C: Cursor> Cursor for &mut C {
50    fn chunk(&self) -> &[u8] {
51        C::chunk(self)
52    }
53
54    fn utf8_aware(&self) -> bool {
55        C::utf8_aware(self)
56    }
57
58    fn advance(&mut self) -> bool {
59        C::advance(self)
60    }
61
62    fn backtrack(&mut self) -> bool {
63        C::backtrack(self)
64    }
65
66    fn total_bytes(&self) -> Option<usize> {
67        C::total_bytes(self)
68    }
69
70    fn offset(&self) -> usize {
71        C::offset(self)
72    }
73}
74
75impl Cursor for &[u8] {
76    fn chunk(&self) -> &[u8] {
77        self
78    }
79
80    // true since there are no chunk bounderies
81    fn utf8_aware(&self) -> bool {
82        true
83    }
84
85    fn advance(&mut self) -> bool {
86        false
87    }
88
89    fn backtrack(&mut self) -> bool {
90        false
91    }
92
93    fn total_bytes(&self) -> Option<usize> {
94        Some(self.len())
95    }
96    fn offset(&self) -> usize {
97        0
98    }
99}
100
101impl Cursor for &str {
102    fn chunk(&self) -> &[u8] {
103        self.as_bytes()
104    }
105
106    // true since there are no chunk bounderies
107    fn utf8_aware(&self) -> bool {
108        true
109    }
110
111    fn advance(&mut self) -> bool {
112        false
113    }
114
115    fn backtrack(&mut self) -> bool {
116        false
117    }
118    fn total_bytes(&self) -> Option<usize> {
119        Some(<str>::len(self))
120    }
121
122    fn offset(&self) -> usize {
123        0
124    }
125}
126
/// Records on which side of the current chunk the chunk iterator of a
/// `RopeyCursor` is positioned.
///
/// The iterator is bidirectional, so before moving in one direction the
/// cursor may first have to step over the chunk it is currently exposing.
#[cfg(feature = "ropey")]
#[derive(Clone, Copy)]
enum Pos {
    /// The iterator sits in front of the current chunk: `advance` first steps
    /// the iterator forward once (discarding the result) before fetching the
    /// next chunk, while `backtrack` can fetch the previous chunk directly.
    ChunkStart,
    /// The iterator sits behind the current chunk: `backtrack` first steps
    /// the iterator backward once (discarding the result) before fetching the
    /// previous chunk, while `advance` can fetch the next chunk directly.
    ChunkEnd,
}
133
/// A [`Cursor`] over the chunks of a `ropey::RopeSlice`.
///
/// Created with `RopeyCursor::new` (start of the slice) or `RopeyCursor::at`
/// (chunk located via a byte index).
#[cfg(feature = "ropey")]
#[derive(Clone)]
pub struct RopeyCursor<'a> {
    /// Bidirectional iterator over the chunks of the underlying slice.
    iter: ropey::iter::Chunks<'a>,
    /// Bytes of the chunk currently exposed through `Cursor::chunk`.
    current: &'a [u8],
    /// Side of `current` on which `iter` is positioned.
    pos: Pos,
    /// Total length of the slice in bytes, captured at construction time.
    len: usize,
    /// Byte offset of `current` from the start of the slice.
    offset: usize,
}
143
#[cfg(feature = "ropey")]
impl<'a> RopeyCursor<'a> {
    /// Creates a cursor positioned on the first non-empty chunk of `slice`.
    ///
    /// For a slice without any non-empty chunk the cursor exposes an empty
    /// chunk.
    pub fn new(slice: ropey::RopeSlice<'a>) -> Self {
        // Start with an empty placeholder chunk and let `advance` pull the
        // first real chunk out of the iterator.
        let mut cursor = Self {
            current: &[],
            iter: slice.chunks(),
            pos: Pos::ChunkEnd,
            len: slice.len_bytes(),
            offset: 0,
        };
        cursor.advance();
        cursor
    }

    /// Creates a cursor positioned on the chunk that `chunks_at_byte(at)`
    /// locates for the byte index `at`.
    ///
    /// When the located chunk offset equals the length of the slice (the
    /// iterator is at the very end) the cursor steps backwards onto the last
    /// non-empty chunk instead, since there is nothing to advance to.
    pub fn at(slice: ropey::RopeSlice<'a>, at: usize) -> Self {
        let (iter, offset, _, _) = slice.chunks_at_byte(at);
        let len = slice.len_bytes();
        let at_end = offset == len;

        // The placeholder chunk is empty; the initial `pos` is chosen so that
        // the first move below does not skip a chunk.
        let mut cursor = Self {
            current: &[],
            iter,
            pos: if at_end { Pos::ChunkStart } else { Pos::ChunkEnd },
            len,
            offset,
        };
        if at_end {
            cursor.backtrack();
        } else {
            cursor.advance();
        }
        cursor
    }
}
169
#[cfg(feature = "ropey")]
impl Cursor for RopeyCursor<'_> {
    /// Bytes of the chunk the cursor is currently positioned on.
    fn chunk(&self) -> &[u8] {
        self.current
    }

    /// Moves to the next non-empty chunk. Returns `false` and leaves the
    /// current chunk and offset untouched when there is none.
    fn advance(&mut self) -> bool {
        // If the iterator still sits in front of the current chunk, step over
        // it first so that it is not yielded a second time.
        if let Pos::ChunkStart = self.pos {
            self.iter.next();
            self.pos = Pos::ChunkEnd;
        }
        // Empty chunks are skipped.
        match self.iter.find(|chunk| !chunk.is_empty()) {
            Some(next) => {
                // The new chunk starts right after the one being left.
                self.offset += self.current.len();
                self.current = next.as_bytes();
                true
            }
            None => false,
        }
    }

    /// Moves to the previous non-empty chunk. Returns `false` and leaves the
    /// current chunk and offset untouched when there is none.
    fn backtrack(&mut self) -> bool {
        // If the iterator sits behind the current chunk, step back over it
        // first so that it is not yielded a second time.
        if let Pos::ChunkEnd = self.pos {
            self.iter.prev();
            self.pos = Pos::ChunkStart;
        }
        loop {
            match self.iter.prev() {
                // Empty chunks are skipped.
                Some(prev) if prev.is_empty() => continue,
                Some(prev) => {
                    // The new chunk ends where the one being left starts.
                    self.offset -= prev.len();
                    self.current = prev.as_bytes();
                    return true;
                }
                None => return false,
            }
        }
    }

    /// Always `true`; chunks are obtained from `&str` values yielded by the
    /// chunk iterator.
    fn utf8_aware(&self) -> bool {
        true
    }

    /// Byte length of the whole slice, as captured at construction.
    fn total_bytes(&self) -> Option<usize> {
        Some(self.len)
    }

    /// Byte offset of the current chunk from the start of the slice.
    fn offset(&self) -> usize {
        self.offset
    }
}
226
/// A rope slice converts into a cursor positioned at its first chunk.
#[cfg(feature = "ropey")]
impl<'a> IntoCursor for ropey::RopeSlice<'a> {
    type Cursor = RopeyCursor<'a>;

    /// Equivalent to `RopeyCursor::new(self)`.
    fn into_cursor(self) -> RopeyCursor<'a> {
        <RopeyCursor<'a>>::new(self)
    }
}
235
/// A borrowed rope converts into a cursor over its entire contents,
/// positioned at the first chunk.
#[cfg(feature = "ropey")]
impl<'a> IntoCursor for &'a ropey::Rope {
    type Cursor = RopeyCursor<'a>;

    /// Builds the cursor from a slice spanning the whole rope.
    fn into_cursor(self) -> RopeyCursor<'a> {
        let whole = self.slice(..);
        RopeyCursor::new(whole)
    }
}
#[cfg(all(feature = "ropey", test))]
mod ropey_test {
    use ropey::Rope;

    use crate::cursor::IntoCursor;
    use crate::Cursor;

    /// Exercises a single-chunk rope and a multi-chunk rope, checking that
    /// the reported offset stays consistent while walking forwards and back.
    #[test]
    fn smoke_test() {
        // A short rope: moving in either direction must fail and must not
        // change the exposed chunk.
        let small = Rope::from_str("abc");
        let mut cursor = small.into_cursor();
        assert_eq!(cursor.chunk(), "abc".as_bytes());
        assert!(!cursor.advance());
        assert_eq!(cursor.chunk(), "abc".as_bytes());
        assert!(!cursor.backtrack());
        assert_eq!(cursor.chunk(), "abc".as_bytes());

        // A larger rope: walk to the end, then all the way back.
        let large = Rope::from("abc".repeat(5000));
        let mut cursor = large.into_cursor();

        // Forwards: the offset of each chunk equals the summed length of all
        // chunks visited before it.
        let mut expected = 0;
        assert_eq!(cursor.offset(), expected);
        expected += cursor.chunk().len();
        while cursor.advance() {
            assert_eq!(cursor.offset(), expected);
            expected += cursor.chunk().len();
        }

        // Backwards: subtracting each chunk's length must reproduce its
        // offset again.
        expected -= cursor.chunk().len();
        assert_eq!(cursor.offset(), expected);
        while cursor.backtrack() {
            expected -= cursor.chunk().len();
            assert_eq!(cursor.offset(), expected);
        }

        assert_eq!(cursor.offset(), 0);
        assert_eq!(expected, 0);
    }
}