oak_core/source/
cursor.rs

1use crate::source::{Source, TextChunk, simd::SimdScanner};
2use core::range::Range;
3use std::fmt;
4
5/// A cursor over a source that allows for efficient navigation and scanning.
6pub struct SourceCursor<'s, S: Source + ?Sized> {
7    source: &'s S,
8    offset: usize,
9    chunk: TextChunk<'s>,
10    scratch: String,
11}
12
13impl<'s, S: Source + ?Sized> fmt::Debug for SourceCursor<'s, S> {
14    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
15        f.debug_struct("SourceCursor").field("offset", &self.offset).field("chunk_start", &self.chunk.start).field("chunk_end", &self.chunk.end()).finish()
16    }
17}
18
19impl<'s, S: Source + ?Sized> SourceCursor<'s, S> {
20    /// Creates a new SourceCursor at the start of the source.
21    pub fn new(source: &'s S) -> Self {
22        Self::new_at(source, 0)
23    }
24
25    /// Creates a new SourceCursor at the specified offset.
26    pub fn new_at(source: &'s S, offset: usize) -> Self {
27        let end = source.length();
28        let offset = offset.min(end);
29        let chunk = source.chunk_at(offset);
30        Self { source, offset, chunk, scratch: String::new() }
31    }
32
33    /// Returns the current byte offset of the cursor.
34    #[inline]
35    pub fn position(&self) -> usize {
36        self.offset
37    }
38
39    /// Sets the current byte offset of the cursor.
40    /// Returns the previous offset.
41    #[inline]
42    pub fn set_position(&mut self, offset: usize) -> usize {
43        let last = self.offset;
44        self.offset = offset.min(self.source.length());
45        last
46    }
47
48    /// Returns the source that this cursor is iterating over.
49    #[inline]
50    pub fn source(&self) -> &'s S {
51        self.source
52    }
53
54    /// Ensures that the current chunk is valid for the current offset.
55    fn ensure_chunk(&mut self) {
56        let end = self.source.length();
57        if self.offset > end {
58            self.offset = end;
59        }
60        if self.offset < self.chunk.start || self.offset > self.chunk.end() {
61            self.chunk = self.source.chunk_at(self.offset);
62        }
63    }
64
65    /// Returns the remaining text in the current chunk.
66    pub fn rest(&mut self) -> &str {
67        self.ensure_chunk();
68        self.chunk.slice_from(self.offset)
69    }
70
71    /// Returns the end byte offset of the current chunk.
72    pub fn chunk_end(&mut self) -> usize {
73        self.ensure_chunk();
74        self.chunk.end()
75    }
76
77    /// Peeks at the next character without advancing the cursor.
78    pub fn peek_char(&mut self) -> Option<char> {
79        if self.offset >= self.chunk.start {
80            let rel = self.offset - self.chunk.start;
81            if rel < self.chunk.text.len() {
82                // Safety: rel is checked to be less than text length
83                let text = unsafe { self.chunk.text.get_unchecked(rel..) };
84                return text.chars().next();
85            }
86        }
87        self.rest().chars().next()
88    }
89
90    /// Skips common ASCII whitespace using SIMD if possible.
91    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
92        let start = self.offset;
93        loop {
94            self.ensure_chunk();
95            let rel = self.offset.saturating_sub(self.chunk.start);
96            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
97
98            if bytes.is_empty() {
99                if self.offset >= self.source.length() {
100                    break;
101                }
102                self.chunk = self.source.chunk_at(self.offset);
103                continue;
104            }
105
106            let skipped = SimdScanner::skip_ascii_whitespace(bytes);
107            self.offset += skipped;
108
109            if skipped < bytes.len() || self.offset >= self.source.length() {
110                break;
111            }
112        }
113        Range { start, end: self.offset }
114    }
115
116    /// Skips ASCII digits using SIMD if possible.
117    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
118        let start = self.offset;
119        loop {
120            self.ensure_chunk();
121            let rel = self.offset.saturating_sub(self.chunk.start);
122            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
123
124            if bytes.is_empty() {
125                if self.offset >= self.source.length() {
126                    break;
127                }
128                self.chunk = self.source.chunk_at(self.offset);
129                continue;
130            }
131
132            let skipped = SimdScanner::skip_ascii_digits(bytes);
133            self.offset += skipped;
134
135            if skipped < bytes.len() || self.offset >= self.source.length() {
136                break;
137            }
138        }
139        Range { start, end: self.offset }
140    }
141
142    /// Skips ASCII identifier continue characters using SIMD if possible.
143    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
144        let start = self.offset;
145        loop {
146            self.ensure_chunk();
147            let rel = self.offset.saturating_sub(self.chunk.start);
148            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
149
150            if bytes.is_empty() {
151                if self.offset >= self.source.length() {
152                    break;
153                }
154                self.chunk = self.source.chunk_at(self.offset);
155                continue;
156            }
157
158            let skipped = SimdScanner::skip_ascii_ident_continue(bytes);
159            self.offset += skipped;
160
161            if skipped < bytes.len() || self.offset >= self.source.length() {
162                break;
163            }
164        }
165        Range { start, end: self.offset }
166    }
167
168    /// Skips until the specified byte is found.
169    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
170        let start = self.offset;
171        loop {
172            self.ensure_chunk();
173            let rel = self.offset.saturating_sub(self.chunk.start);
174            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
175
176            if bytes.is_empty() {
177                if self.offset >= self.source.length() {
178                    break;
179                }
180                self.chunk = self.source.chunk_at(self.offset);
181                continue;
182            }
183
184            let skipped = SimdScanner::skip_until(bytes, target);
185            self.offset += skipped;
186
187            if skipped < bytes.len() || self.offset >= self.source.length() {
188                break;
189            }
190        }
191        Range { start, end: self.offset }
192    }
193
194    /// Peeks at the next byte without advancing. the cursor.
195    #[inline(always)]
196    pub fn peek_byte(&mut self) -> Option<u8> {
197        if self.offset >= self.chunk.start {
198            let rel = self.offset - self.chunk.start;
199            let bytes = self.chunk.text.as_bytes();
200            if rel < bytes.len() {
201                return Some(unsafe { *bytes.get_unchecked(rel) });
202            }
203        }
204        self.ensure_chunk();
205        let rel = self.offset - self.chunk.start;
206        let bytes = self.chunk.text.as_bytes();
207        bytes.get(rel).copied()
208    }
209
210    /// Advances the cursor by the specified number of bytes.
211    pub fn advance_bytes(&mut self, len: usize) -> usize {
212        self.offset = (self.offset + len).min(self.source.length());
213        self.offset
214    }
215
216    /// Advances the cursor by one character and returns it.
217    pub fn advance_char(&mut self) -> Option<char> {
218        let ch = self.peek_char()?;
219        self.advance_bytes(ch.len_utf8());
220        Some(ch)
221    }
222
223    /// Advances the cursor by one byte and returns it.
224    #[inline(always)]
225    pub fn advance_byte(&mut self) -> Option<u8> {
226        let b = self.peek_byte()?;
227        self.offset += 1;
228        Some(b)
229    }
230
231    /// Advances the cursor while the predicate is true and returns the range.
232    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
233        let start = self.offset;
234
235        loop {
236            // Ensure we have a valid chunk for current offset
237            self.ensure_chunk();
238
239            // Get text slice from current offset
240            let rel = self.offset.saturating_sub(self.chunk.start);
241            let text = if rel < self.chunk.text.len() { unsafe { self.chunk.text.get_unchecked(rel..) } } else { "" };
242
243            if text.is_empty() {
244                // If text is empty, it means we are at the end of the chunk (or source).
245                // If we are at the end of source, break.
246                if self.offset >= self.source.length() {
247                    break;
248                }
249                // Otherwise force move to next chunk
250                self.chunk = self.source.chunk_at(self.offset);
251                // Continue loop to process next chunk
252                continue;
253            }
254
255            let mut advanced = 0;
256            let mut stop = false;
257
258            // Iterate over characters in the current chunk slice
259            for (i, ch) in text.char_indices() {
260                if !pred(ch) {
261                    advanced = i;
262                    stop = true;
263                    break;
264                }
265                advanced = i + ch.len_utf8();
266            }
267
268            self.offset += advanced;
269
270            if stop {
271                break;
272            }
273
274            // If we consumed the whole chunk but didn't stop, we need to check if we are at EOF
275            if self.offset >= self.source.length() {
276                break;
277            }
278            // If not at EOF, the loop will continue, ensure_chunk will get the next chunk
279        }
280
281        Range { start, end: self.offset }
282    }
283
284    /// Advances the cursor while the byte predicate is true and returns the range.
285    #[inline(always)]
286    pub fn take_while_byte(&mut self, mut pred: impl FnMut(u8) -> bool) -> Range<usize> {
287        let start = self.offset;
288
289        loop {
290            self.ensure_chunk();
291            let rel = self.offset.saturating_sub(self.chunk.start);
292            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
293
294            if bytes.is_empty() {
295                if self.offset >= self.source.length() {
296                    break;
297                }
298                self.chunk = self.source.chunk_at(self.offset);
299                continue;
300            }
301
302            let mut advanced = 0;
303            let mut stop = false;
304
305            for (i, &b) in bytes.iter().enumerate() {
306                if !pred(b) {
307                    advanced = i;
308                    stop = true;
309                    break;
310                }
311                advanced = i + 1;
312            }
313
314            self.offset += advanced;
315            if stop || self.offset >= self.source.length() {
316                break;
317            }
318        }
319
320        Range { start, end: self.offset }
321    }
322
323    /// Returns `true` if the source text at the current position starts with the given pattern.
324    pub fn starts_with(&mut self, pattern: &str) -> bool {
325        self.ensure_chunk();
326        let chunk_text = self.chunk.text;
327        let rest = chunk_text.get(self.offset.saturating_sub(self.chunk.start)..).unwrap_or("");
328        if rest.len() >= pattern.len() {
329            return rest.as_bytes().get(..pattern.len()) == Some(pattern.as_bytes());
330        }
331
332        self.scratch.clear();
333        self.scratch.push_str(rest);
334        let mut next = self.chunk.end();
335        let end = self.source.length();
336        while self.scratch.len() < pattern.len() && next < end {
337            let chunk = self.source.chunk_at(next);
338            self.scratch.push_str(chunk.text);
339            next = chunk.end();
340        }
341        self.scratch.as_bytes().get(..pattern.len()) == Some(pattern.as_bytes())
342    }
343
344    /// Consumes the given pattern if it matches at the current position.
345    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
346        if !self.starts_with(pattern) {
347            return false;
348        }
349        self.advance_bytes(pattern.len());
350        true
351    }
352
353    /// Finds the first occurrence of the given pattern in the source text starting from the current position.
354    pub fn find_str(&mut self, pattern: &str) -> Option<usize> {
355        if pattern.is_empty() {
356            return Some(self.offset);
357        }
358
359        let pat_len = pattern.len();
360        let mut offset = self.offset;
361        let end = self.source.length();
362        while offset < end {
363            self.offset = offset;
364            self.ensure_chunk();
365            let text = self.chunk.slice_from(offset);
366            if let Some(pos) = text.find(pattern) {
367                return Some(offset + pos);
368            }
369            let chunk_end = self.chunk.end();
370            if chunk_end >= end {
371                return None;
372            }
373
374            if pat_len > 1 {
375                let keep = pat_len - 1;
376                self.scratch.clear();
377                let tail = text.get(text.len().saturating_sub(keep)..).unwrap_or("");
378                self.scratch.push_str(tail);
379                let tail_abs_start = chunk_end.saturating_sub(tail.len());
380                let next_chunk = self.source.chunk_at(chunk_end);
381                self.scratch.push_str(next_chunk.text);
382                if let Some(pos) = self.scratch.find(pattern) {
383                    return Some(tail_abs_start + pos);
384                }
385            }
386
387            offset = chunk_end;
388        }
389        None
390    }
391}