Skip to main content

oak_core/source/
cursor.rs

1use crate::source::{Source, TextChunk, simd::SimdScanner};
2use core::range::Range;
3use std::fmt;
4
5/// A cursor over a source that allows for efficient navigation and scanning.
6///
7/// # Examples
8///
9/// ```rust
10/// # #![feature(new_range_api)]
11/// # use oak_core::source::{SourceCursor, SourceText};
12/// let source = SourceText::new("hello world");
13/// let mut cursor = SourceCursor::new(&source);
14///
15/// assert_eq!(cursor.peek_char(), Some('h'));
16/// cursor.set_position(6);
17/// assert_eq!(cursor.peek_char(), Some('w'));
18/// ```
19pub struct SourceCursor<'s, S: Source + ?Sized> {
20    source: &'s S,
21    offset: usize,
22    chunk: TextChunk<'s>,
23    scratch: String,
24}
25
26impl<'s, S: Source + ?Sized> fmt::Debug for SourceCursor<'s, S> {
27    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
28        f.debug_struct("SourceCursor").field("offset", &self.offset).field("chunk_start", &self.chunk.start).field("chunk_end", &self.chunk.end()).finish()
29    }
30}
31
32impl<'s, S: Source + ?Sized> SourceCursor<'s, S> {
33    /// Creates a new SourceCursor at the start of the source.
34    pub fn new(source: &'s S) -> Self {
35        Self::new_at(source, 0)
36    }
37
38    /// Creates a new SourceCursor at the specified offset.
39    pub fn new_at(source: &'s S, offset: usize) -> Self {
40        let end = source.length();
41        let offset = offset.min(end);
42        let chunk = source.chunk_at(offset);
43        Self { source, offset, chunk, scratch: String::new() }
44    }
45
46    /// Returns the current byte offset of the cursor.
47    #[inline]
48    pub fn position(&self) -> usize {
49        self.offset
50    }
51
52    /// Sets the current byte offset of the cursor.
53    /// Returns the previous offset.
54    #[inline]
55    pub fn set_position(&mut self, offset: usize) -> usize {
56        let last = self.offset;
57        self.offset = offset.min(self.source.length());
58        last
59    }
60
61    /// Returns the source that this cursor is iterating over.
62    #[inline]
63    pub fn source(&self) -> &'s S {
64        self.source
65    }
66
67    /// Ensures that the current chunk is valid for the current offset.
68    fn ensure_chunk(&mut self) {
69        let end = self.source.length();
70        if self.offset > end {
71            self.offset = end
72        }
73        // If the offset is outside the current chunk, or at the very end of the current chunk
74        // (but not at the end of the source), we need to fetch a new chunk.
75        if self.offset < self.chunk.start || self.offset > self.chunk.end() || (self.offset == self.chunk.end() && self.offset < end) {
76            self.chunk = self.source.chunk_at(self.offset)
77        }
78    }
79
80    /// Returns the remaining text in the current chunk.
81    pub fn rest(&mut self) -> &str {
82        self.ensure_chunk();
83        self.chunk.slice_from(self.offset)
84    }
85
86    /// Returns the end byte offset of the current chunk.
87    pub fn chunk_end(&mut self) -> usize {
88        self.ensure_chunk();
89        self.chunk.end()
90    }
91
92    /// Peeks at the next character without advancing the cursor.
93    pub fn peek_char(&mut self) -> Option<char> {
94        if self.offset >= self.chunk.start {
95            let rel = self.offset - self.chunk.start;
96            if rel < self.chunk.text.len() {
97                // Ensure rel is at a character boundary
98                if self.chunk.text.is_char_boundary(rel) {
99                    let text = unsafe { self.chunk.text.get_unchecked(rel..) };
100                    return text.chars().next();
101                }
102                else {
103                    // If not at a boundary, something is wrong with the offset
104                    // We should probably advance to the next boundary
105                    let mut i = rel;
106                    while i < self.chunk.text.len() && !self.chunk.text.is_char_boundary(i) {
107                        i += 1
108                    }
109                    if i < self.chunk.text.len() {
110                        let text = unsafe { self.chunk.text.get_unchecked(i..) };
111                        return text.chars().next();
112                    }
113                }
114            }
115        }
116        self.rest().chars().next()
117    }
118
119    /// Peeks at the character at the specified byte offset relative to the current position.
120    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
121        let target_offset = self.offset + n;
122        if target_offset >= self.source.length() {
123            return None;
124        }
125        if target_offset >= self.chunk.start && target_offset < self.chunk.end() {
126            let rel = target_offset - self.chunk.start;
127            let text = self.chunk.text.get(rel..).unwrap_or("");
128            return text.chars().next();
129        }
130        self.source.get_char_at(target_offset)
131    }
132
133    /// Peeks at the character immediately following the current character.
134    pub fn peek_next_char(&mut self) -> Option<char> {
135        let ch = self.peek_char()?;
136        self.peek_next_n(ch.len_utf8())
137    }
138
139    /// Skips common ASCII whitespace using SIMD if possible.
140    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
141        let start = self.offset;
142        loop {
143            self.ensure_chunk();
144            let rel = self.offset.saturating_sub(self.chunk.start);
145            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
146
147            if bytes.is_empty() {
148                if self.offset >= self.source.length() {
149                    break;
150                }
151                self.chunk = self.source.chunk_at(self.offset);
152                continue;
153            }
154
155            let skipped = SimdScanner::skip_ascii_whitespace(bytes);
156            self.offset += skipped;
157
158            if skipped < bytes.len() || self.offset >= self.source.length() {
159                break;
160            }
161        }
162        Range { start, end: self.offset }
163    }
164
165    /// Skips ASCII digits using SIMD if possible.
166    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
167        let start = self.offset;
168        loop {
169            self.ensure_chunk();
170            let rel = self.offset.saturating_sub(self.chunk.start);
171            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
172
173            if bytes.is_empty() {
174                if self.offset >= self.source.length() {
175                    break;
176                }
177                self.chunk = self.source.chunk_at(self.offset);
178                continue;
179            }
180
181            let skipped = SimdScanner::skip_ascii_digits(bytes);
182            self.offset += skipped;
183
184            if skipped < bytes.len() || self.offset >= self.source.length() {
185                break;
186            }
187        }
188        Range { start, end: self.offset }
189    }
190
191    /// Skips ASCII identifier continue characters using SIMD if possible.
192    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
193        let start = self.offset;
194        loop {
195            self.ensure_chunk();
196            let rel = self.offset.saturating_sub(self.chunk.start);
197            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
198
199            if bytes.is_empty() {
200                if self.offset >= self.source.length() {
201                    break;
202                }
203                self.chunk = self.source.chunk_at(self.offset);
204                continue;
205            }
206
207            let skipped = SimdScanner::skip_ascii_ident_continue(bytes);
208            self.offset += skipped;
209
210            if skipped < bytes.len() || self.offset >= self.source.length() {
211                break;
212            }
213        }
214        Range { start, end: self.offset }
215    }
216
217    /// Skips until the specified byte is found.
218    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
219        let start = self.offset;
220        loop {
221            self.ensure_chunk();
222            let rel = self.offset.saturating_sub(self.chunk.start);
223            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
224
225            if bytes.is_empty() {
226                if self.offset >= self.source.length() {
227                    break;
228                }
229                self.chunk = self.source.chunk_at(self.offset);
230                continue;
231            }
232
233            let skipped = SimdScanner::skip_until(bytes, target);
234            self.offset += skipped;
235
236            if skipped < bytes.len() || self.offset >= self.source.length() {
237                break;
238            }
239        }
240        Range { start, end: self.offset }
241    }
242
243    /// Peeks at the next byte without advancing. the cursor.
244    #[inline(always)]
245    pub fn peek_byte(&mut self) -> Option<u8> {
246        if self.offset >= self.chunk.start {
247            let rel = self.offset - self.chunk.start;
248            let bytes = self.chunk.text.as_bytes();
249            if rel < bytes.len() {
250                return Some(unsafe { *bytes.get_unchecked(rel) });
251            }
252        }
253        self.ensure_chunk();
254        let rel = self.offset - self.chunk.start;
255        let bytes = self.chunk.text.as_bytes();
256        bytes.get(rel).copied()
257    }
258
259    /// Advances the cursor by the specified number of bytes.
260    pub fn advance_bytes(&mut self, len: usize) -> usize {
261        self.offset = (self.offset + len).min(self.source.length());
262        self.offset
263    }
264
265    /// Advances the cursor by one character and returns it.
266    pub fn advance_char(&mut self) -> Option<char> {
267        let ch = self.peek_char()?;
268        self.advance_bytes(ch.len_utf8());
269        Some(ch)
270    }
271
272    /// Advances the cursor by one byte and returns it.
273    #[inline(always)]
274    pub fn advance_byte(&mut self) -> Option<u8> {
275        let b = self.peek_byte()?;
276        self.offset += 1;
277        Some(b)
278    }
279
280    /// Advances the cursor while the predicate is true and returns the range.
281    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
282        let start = self.offset;
283
284        loop {
285            // Ensure we have a valid chunk for current offset
286            self.ensure_chunk();
287
288            // Get text slice from current offset
289            let rel = self.offset.saturating_sub(self.chunk.start);
290            let text = if rel < self.chunk.text.len() { unsafe { self.chunk.text.get_unchecked(rel..) } } else { "" };
291
292            if text.is_empty() {
293                // If text is empty, it means we are at the end of the chunk (or source).
294                // If we are at the end of source, break.
295                if self.offset >= self.source.length() {
296                    break;
297                }
298                // Otherwise force move to next chunk
299                self.chunk = self.source.chunk_at(self.offset);
300                // Continue loop to process next chunk
301                continue;
302            }
303
304            let mut advanced = 0;
305            let mut stop = false;
306
307            // Iterate over characters in the current chunk slice
308            for (i, ch) in text.char_indices() {
309                if !pred(ch) {
310                    advanced = i;
311                    stop = true;
312                    break;
313                }
314                advanced = i + ch.len_utf8()
315            }
316
317            self.offset += advanced;
318
319            if stop {
320                break;
321            }
322
323            // If we consumed the whole chunk but didn't stop, we need to check if we are at EOF
324            if self.offset >= self.source.length() {
325                break;
326            }
327            // If not at EOF, the loop will continue, ensure_chunk will get the next chunk
328        }
329
330        Range { start, end: self.offset }
331    }
332
333    /// Advances the cursor while the byte predicate is true and returns the range.
334    #[inline(always)]
335    pub fn take_while_byte(&mut self, mut pred: impl FnMut(u8) -> bool) -> Range<usize> {
336        let start = self.offset;
337
338        loop {
339            self.ensure_chunk();
340            let rel = self.offset.saturating_sub(self.chunk.start);
341            let bytes = if rel < self.chunk.text.len() { unsafe { self.chunk.text.as_bytes().get_unchecked(rel..) } } else { &[] };
342
343            if bytes.is_empty() {
344                if self.offset >= self.source.length() {
345                    break;
346                }
347                self.chunk = self.source.chunk_at(self.offset);
348                continue;
349            }
350
351            let mut advanced = 0;
352            let mut stop = false;
353
354            for (i, &b) in bytes.iter().enumerate() {
355                if !pred(b) {
356                    advanced = i;
357                    stop = true;
358                    break;
359                }
360                advanced = i + 1
361            }
362
363            self.offset += advanced;
364            if stop || self.offset >= self.source.length() {
365                break;
366            }
367        }
368
369        Range { start, end: self.offset }
370    }
371
372    /// Returns `true` if the source text at the current position starts with the given pattern.
373    pub fn starts_with(&mut self, pattern: &str) -> bool {
374        self.ensure_chunk();
375        let chunk_text = self.chunk.text;
376        let offset_in_chunk = self.offset.saturating_sub(self.chunk.start);
377
378        // Ensure offset_in_chunk is on a character boundary
379        let rest = if chunk_text.is_char_boundary(offset_in_chunk) {
380            chunk_text.get(offset_in_chunk..).unwrap_or("")
381        }
382        else {
383            // If not on a boundary, try to find the next boundary
384            let mut i = offset_in_chunk;
385            while i < chunk_text.len() && !chunk_text.is_char_boundary(i) {
386                i += 1
387            }
388            chunk_text.get(i..).unwrap_or("")
389        };
390
391        if rest.len() >= pattern.len() {
392            return rest.starts_with(pattern);
393        }
394
395        self.scratch.clear();
396        self.scratch.push_str(rest);
397        let mut next = self.chunk.end();
398        let end = self.source.length();
399        while self.scratch.len() < pattern.len() && next < end {
400            let chunk = self.source.chunk_at(next);
401            self.scratch.push_str(chunk.text);
402            next = chunk.end()
403        }
404        self.scratch.starts_with(pattern)
405    }
406
407    /// Consumes the given pattern if it matches at the current position.
408    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
409        if !self.starts_with(pattern) {
410            return false;
411        }
412        self.advance_bytes(pattern.len());
413        true
414    }
415
416    /// Finds the first occurrence of the given pattern in the source text starting from the current position.
417    pub fn find_str(&mut self, pattern: &str) -> Option<usize> {
418        if pattern.is_empty() {
419            return Some(self.offset);
420        }
421
422        let pat_len = pattern.len();
423        let mut offset = self.offset;
424        let end = self.source.length();
425        while offset < end {
426            self.offset = offset;
427            self.ensure_chunk();
428            let text = self.chunk.slice_from(offset);
429            if let Some(pos) = text.find(pattern) {
430                return Some(offset + pos);
431            }
432            let chunk_end = self.chunk.end();
433            if chunk_end >= end {
434                return None;
435            }
436
437            if pat_len > 1 {
438                let keep = pat_len - 1;
439                self.scratch.clear();
440                let tail = text.get(text.len().saturating_sub(keep)..).unwrap_or("");
441                self.scratch.push_str(tail);
442                let tail_abs_start = chunk_end.saturating_sub(tail.len());
443                let next_chunk = self.source.chunk_at(chunk_end);
444                self.scratch.push_str(next_chunk.text);
445                if let Some(pos) = self.scratch.find(pattern) {
446                    return Some(tail_abs_start + pos);
447                }
448            }
449
450            offset = chunk_end
451        }
452        None
453    }
454}