Skip to main content

oak_core/source/
cursor.rs

1use crate::source::{Source, TextChunk, simd::SimdScanner};
2use core::range::Range;
3use std::fmt;
4
5/// A cursor over a source that allows for efficient navigation and scanning.
6pub struct SourceCursor<'s, S: Source + ?Sized> {
7    source: &'s S,
8    offset: usize,
9    chunk: TextChunk<'s>,
10    scratch: String,
11}
12
13impl<'s, S: Source + ?Sized> fmt::Debug for SourceCursor<'s, S> {
14    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
15        f.debug_struct("SourceCursor").field("offset", &self.offset).field("chunk_start", &self.chunk.start).field("chunk_end", &self.chunk.end()).finish()
16    }
17}
18
19impl<'s, S: Source + ?Sized> SourceCursor<'s, S> {
20    /// Creates a new SourceCursor at the start of the source.
21    pub fn new(source: &'s S) -> Self {
22        Self::new_at(source, 0)
23    }
24
25    /// Creates a new SourceCursor at the specified offset.
26    pub fn new_at(source: &'s S, offset: usize) -> Self {
27        let end = source.length();
28        let offset = offset.min(end);
29        let chunk = source.chunk_at(offset);
30        Self { source, offset, chunk, scratch: String::new() }
31    }
32
33    /// Returns the current byte offset of the cursor.
34    #[inline]
35    pub fn position(&self) -> usize {
36        self.offset
37    }
38
39    /// Sets the current byte offset of the cursor.
40    /// Returns the previous offset.
41    #[inline]
42    pub fn set_position(&mut self, offset: usize) -> usize {
43        let last = self.offset;
44        self.offset = offset.min(self.source.length());
45        last
46    }
47
48    /// Returns the source that this cursor is iterating over.
49    #[inline]
50    pub fn source(&self) -> &'s S {
51        self.source
52    }
53
54    /// Ensures that the current chunk is valid for the current offset.
55    fn ensure_chunk(&mut self) {
56        let end = self.source.length();
57        if self.offset > end {
58            self.offset = end;
59        }
60        if self.offset < self.chunk.start || self.offset > self.chunk.end() {
61            self.chunk = self.source.chunk_at(self.offset);
62        }
63    }
64
65    /// Returns the remaining text in the current chunk.
66    pub fn rest(&mut self) -> &str {
67        self.ensure_chunk();
68        self.chunk.slice_from(self.offset)
69    }
70
71    /// Returns the end byte offset of the current chunk.
72    pub fn chunk_end(&mut self) -> usize {
73        self.ensure_chunk();
74        self.chunk.end()
75    }
76
77    /// Peeks at the next character without advancing the cursor.
78    pub fn peek_char(&mut self) -> Option<char> {
79        if self.offset >= self.chunk.start {
80            let rel = self.offset - self.chunk.start;
81            if rel < self.chunk.text.len() {
82                // Safety: rel is checked to be less than text length
83                let text = unsafe { self.chunk.text.get_unchecked(rel..) };
84                return text.chars().next();
85            }
86        }
87        self.rest().chars().next()
88    }
89
90    /// Peeks at the character at the specified byte offset relative to the current position.
91    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
92        let target_offset = self.offset + n;
93        if target_offset >= self.source.length() {
94            return None;
95        }
96        if target_offset >= self.chunk.start && target_offset < self.chunk.end() {
97            let rel = target_offset - self.chunk.start;
98            let text = unsafe { self.chunk.text.get_unchecked(rel..) };
99            return text.chars().next();
100        }
101        self.source.get_char_at(target_offset)
102    }
103
104    /// Skips common ASCII whitespace using SIMD if possible.
105    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
106        let start = self.offset;
107        loop {
108            self.ensure_chunk();
109            let rel = self.offset.saturating_sub(self.chunk.start);
110            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
111
112            if bytes.is_empty() {
113                if self.offset >= self.source.length() {
114                    break;
115                }
116                self.chunk = self.source.chunk_at(self.offset);
117                continue;
118            }
119
120            let skipped = SimdScanner::skip_ascii_whitespace(bytes);
121            self.offset += skipped;
122
123            if skipped < bytes.len() || self.offset >= self.source.length() {
124                break;
125            }
126        }
127        Range { start, end: self.offset }
128    }
129
130    /// Skips ASCII digits using SIMD if possible.
131    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
132        let start = self.offset;
133        loop {
134            self.ensure_chunk();
135            let rel = self.offset.saturating_sub(self.chunk.start);
136            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
137
138            if bytes.is_empty() {
139                if self.offset >= self.source.length() {
140                    break;
141                }
142                self.chunk = self.source.chunk_at(self.offset);
143                continue;
144            }
145
146            let skipped = SimdScanner::skip_ascii_digits(bytes);
147            self.offset += skipped;
148
149            if skipped < bytes.len() || self.offset >= self.source.length() {
150                break;
151            }
152        }
153        Range { start, end: self.offset }
154    }
155
156    /// Skips ASCII identifier continue characters using SIMD if possible.
157    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
158        let start = self.offset;
159        loop {
160            self.ensure_chunk();
161            let rel = self.offset.saturating_sub(self.chunk.start);
162            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
163
164            if bytes.is_empty() {
165                if self.offset >= self.source.length() {
166                    break;
167                }
168                self.chunk = self.source.chunk_at(self.offset);
169                continue;
170            }
171
172            let skipped = SimdScanner::skip_ascii_ident_continue(bytes);
173            self.offset += skipped;
174
175            if skipped < bytes.len() || self.offset >= self.source.length() {
176                break;
177            }
178        }
179        Range { start, end: self.offset }
180    }
181
182    /// Skips until the specified byte is found.
183    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
184        let start = self.offset;
185        loop {
186            self.ensure_chunk();
187            let rel = self.offset.saturating_sub(self.chunk.start);
188            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
189
190            if bytes.is_empty() {
191                if self.offset >= self.source.length() {
192                    break;
193                }
194                self.chunk = self.source.chunk_at(self.offset);
195                continue;
196            }
197
198            let skipped = SimdScanner::skip_until(bytes, target);
199            self.offset += skipped;
200
201            if skipped < bytes.len() || self.offset >= self.source.length() {
202                break;
203            }
204        }
205        Range { start, end: self.offset }
206    }
207
208    /// Peeks at the next byte without advancing. the cursor.
209    #[inline(always)]
210    pub fn peek_byte(&mut self) -> Option<u8> {
211        if self.offset >= self.chunk.start {
212            let rel = self.offset - self.chunk.start;
213            let bytes = self.chunk.text.as_bytes();
214            if rel < bytes.len() {
215                return Some(unsafe { *bytes.get_unchecked(rel) });
216            }
217        }
218        self.ensure_chunk();
219        let rel = self.offset - self.chunk.start;
220        let bytes = self.chunk.text.as_bytes();
221        bytes.get(rel).copied()
222    }
223
224    /// Advances the cursor by the specified number of bytes.
225    pub fn advance_bytes(&mut self, len: usize) -> usize {
226        self.offset = (self.offset + len).min(self.source.length());
227        self.offset
228    }
229
230    /// Advances the cursor by one character and returns it.
231    pub fn advance_char(&mut self) -> Option<char> {
232        let ch = self.peek_char()?;
233        self.advance_bytes(ch.len_utf8());
234        Some(ch)
235    }
236
237    /// Advances the cursor by one byte and returns it.
238    #[inline(always)]
239    pub fn advance_byte(&mut self) -> Option<u8> {
240        let b = self.peek_byte()?;
241        self.offset += 1;
242        Some(b)
243    }
244
245    /// Advances the cursor while the predicate is true and returns the range.
246    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
247        let start = self.offset;
248
249        loop {
250            // Ensure we have a valid chunk for current offset
251            self.ensure_chunk();
252
253            // Get text slice from current offset
254            let rel = self.offset.saturating_sub(self.chunk.start);
255            let text = if rel < self.chunk.text.len() { unsafe { self.chunk.text.get_unchecked(rel..) } } else { "" };
256
257            if text.is_empty() {
258                // If text is empty, it means we are at the end of the chunk (or source).
259                // If we are at the end of source, break.
260                if self.offset >= self.source.length() {
261                    break;
262                }
263                // Otherwise force move to next chunk
264                self.chunk = self.source.chunk_at(self.offset);
265                // Continue loop to process next chunk
266                continue;
267            }
268
269            let mut advanced = 0;
270            let mut stop = false;
271
272            // Iterate over characters in the current chunk slice
273            for (i, ch) in text.char_indices() {
274                if !pred(ch) {
275                    advanced = i;
276                    stop = true;
277                    break;
278                }
279                advanced = i + ch.len_utf8();
280            }
281
282            self.offset += advanced;
283
284            if stop {
285                break;
286            }
287
288            // If we consumed the whole chunk but didn't stop, we need to check if we are at EOF
289            if self.offset >= self.source.length() {
290                break;
291            }
292            // If not at EOF, the loop will continue, ensure_chunk will get the next chunk
293        }
294
295        Range { start, end: self.offset }
296    }
297
298    /// Advances the cursor while the byte predicate is true and returns the range.
299    #[inline(always)]
300    pub fn take_while_byte(&mut self, mut pred: impl FnMut(u8) -> bool) -> Range<usize> {
301        let start = self.offset;
302
303        loop {
304            self.ensure_chunk();
305            let rel = self.offset.saturating_sub(self.chunk.start);
306            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
307
308            if bytes.is_empty() {
309                if self.offset >= self.source.length() {
310                    break;
311                }
312                self.chunk = self.source.chunk_at(self.offset);
313                continue;
314            }
315
316            let mut advanced = 0;
317            let mut stop = false;
318
319            for (i, &b) in bytes.iter().enumerate() {
320                if !pred(b) {
321                    advanced = i;
322                    stop = true;
323                    break;
324                }
325                advanced = i + 1;
326            }
327
328            self.offset += advanced;
329            if stop || self.offset >= self.source.length() {
330                break;
331            }
332        }
333
334        Range { start, end: self.offset }
335    }
336
337    /// Returns `true` if the source text at the current position starts with the given pattern.
338    pub fn starts_with(&mut self, pattern: &str) -> bool {
339        self.ensure_chunk();
340        let chunk_text = self.chunk.text;
341        let rest = chunk_text.get(self.offset.saturating_sub(self.chunk.start)..).unwrap_or("");
342        if rest.len() >= pattern.len() {
343            return rest.as_bytes().get(..pattern.len()) == Some(pattern.as_bytes());
344        }
345
346        self.scratch.clear();
347        self.scratch.push_str(rest);
348        let mut next = self.chunk.end();
349        let end = self.source.length();
350        while self.scratch.len() < pattern.len() && next < end {
351            let chunk = self.source.chunk_at(next);
352            self.scratch.push_str(chunk.text);
353            next = chunk.end();
354        }
355        self.scratch.as_bytes().get(..pattern.len()) == Some(pattern.as_bytes())
356    }
357
358    /// Consumes the given pattern if it matches at the current position.
359    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
360        if !self.starts_with(pattern) {
361            return false;
362        }
363        self.advance_bytes(pattern.len());
364        true
365    }
366
367    /// Finds the first occurrence of the given pattern in the source text starting from the current position.
368    pub fn find_str(&mut self, pattern: &str) -> Option<usize> {
369        if pattern.is_empty() {
370            return Some(self.offset);
371        }
372
373        let pat_len = pattern.len();
374        let mut offset = self.offset;
375        let end = self.source.length();
376        while offset < end {
377            self.offset = offset;
378            self.ensure_chunk();
379            let text = self.chunk.slice_from(offset);
380            if let Some(pos) = text.find(pattern) {
381                return Some(offset + pos);
382            }
383            let chunk_end = self.chunk.end();
384            if chunk_end >= end {
385                return None;
386            }
387
388            if pat_len > 1 {
389                let keep = pat_len - 1;
390                self.scratch.clear();
391                let tail = text.get(text.len().saturating_sub(keep)..).unwrap_or("");
392                self.scratch.push_str(tail);
393                let tail_abs_start = chunk_end.saturating_sub(tail.len());
394                let next_chunk = self.source.chunk_at(chunk_end);
395                self.scratch.push_str(next_chunk.text);
396                if let Some(pos) = self.scratch.find(pattern) {
397                    return Some(tail_abs_start + pos);
398                }
399            }
400
401            offset = chunk_end;
402        }
403        None
404    }
405}