oak_core/lexer/state.rs
1use crate::{
2 Language, TokenType,
3 errors::OakError,
4 lexer::{LexOutput, LexerCache, Token, Tokens},
5 source::{Source, SourceCursor},
6};
7pub use core::range::Range;
8use std::borrow::Cow;
9
/// Represents the state of the lexer during a tokenization session.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
/// It tracks the current position in the source text, collected tokens,
/// and any errors encountered.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    // Cursor tracking the current byte offset within the source text.
    pub(crate) cursor: SourceCursor<'s, S>,
    // Tokens collected so far, in source order.
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    // Non-fatal diagnostics accumulated while lexing.
    pub(crate) errors: Vec<OakError>,
}
22
23impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
24 /// Creates a new lexer state with the given source text.
25 ///
26 /// # Arguments
27 ///
28 /// * `source` - The source text to lex
29 ///
30 /// # Returns
31 ///
32 /// A new `LexerState` initialized at the beginning of the source
33 pub fn new(source: &'s S) -> Self {
34 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
35 }
36
    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        // No previous result to reuse: start a full lex from offset 0.
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let len = source.length();
        // Clamp so a stale offset from the caller can never exceed the new length.
        let relex_from = relex_from.min(len);

        if relex_from >= len {
            // Nothing at or after `relex_from` exists, i.e. "no edits":
            // reuse every cached token and resume the cursor after the last one.
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                tokens.extend_from_slice(cached)
            }
            else {
                // Cache cannot expose a contiguous slice; copy token by token.
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            // Resume right after the last reused token (clamped to the source end).
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
        }

        if relex_from == 0 {
            // Edit touches the very beginning: nothing can be reused.
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let mut reused_tokens = Vec::new();
        // Safety margin: drop this many tokens immediately before the edit point,
        // since an edit can change how the token just preceding it would be lexed.
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // Index of the first cached token whose end lies past the edit point.
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Slower path: walk tokens one by one until we pass the edit point.
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        // Relexing resumes at the end of the last reused token (0 when none survive).
        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
    }
110
    /// Creates a sub-state for scanning a sub-range of the source.
    ///
    /// NOTE(review): `_end` is currently ignored — the sub-state's cursor starts
    /// at `start` but is not bounded, so callers must stop at the intended end
    /// themselves. Confirm whether an upper bound was meant to be enforced here.
    pub fn sub_state(&mut self, start: usize, _end: usize) -> Self {
        Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![] }
    }
115
    /// Returns the source text provider.
    ///
    /// Equivalent to [`Self::source`]; both return the cursor's underlying source.
    pub fn get_source(&self) -> &'s S {
        self.cursor.source()
    }
120
    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text, starting at the current
    /// cursor position.
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }
129
130 /// Gets the remaining text as a byte slice.
131 ///
132 /// Useful for byte-oriented scanning operations.
133 #[inline]
134 pub fn rest_bytes(&mut self) -> &[u8] {
135 self.cursor.rest().as_bytes()
136 }
137
138 /// Checks if the lexer has consumed all input from the source.
139 ///
140 /// Returns `true` if the current position is at or beyond the end of the source.
141 pub fn fully_reused(&self) -> bool {
142 self.cursor.position() >= self.cursor.source().length()
143 }
144
    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text (0-based).
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }
154
155 /// Checks if the lexer has NOT consumed all input from the source.
156 ///
157 /// Returns `true` if there are still bytes left to be scanned.
158 #[inline]
159 pub fn not_at_end(&self) -> bool {
160 self.cursor.position() < self.cursor.source().length()
161 }
162
    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Peeks at the character immediately following the current character.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }

    /// Peeks at the character at the specified byte offset relative to the current position.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }

    /// Advances the cursor by the specified number of bytes.
    ///
    /// NOTE(review): `len` is assumed to land on a character boundary — confirm
    /// callers never split a multi-byte UTF-8 sequence.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }

    /// Gets the total length of the source text in bytes.
    #[inline]
    pub fn get_length(&self) -> usize {
        self.cursor.source().length()
    }

    /// Gets a single character at the specified absolute byte offset.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }

    /// Peeks at the next byte without advancing the cursor.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }

    /// Advances the cursor by one byte and returns it.
    ///
    /// Returns `None` at the end of the source.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }
212
    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }

    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace.
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }

    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits (empty when none matched).
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }

    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }

    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }
255
256 /// Skips all ASCII hex digits (0-9, a-f, A-F).
257 ///
258 /// Uses SIMD acceleration if available on the platform.
259 /// Returns the range of the skipped hex digits.
260 #[inline]
261 pub fn skip_ascii_hexdigits(&mut self) -> std::range::Range<usize> {
262 let start = self.get_position();
263 let rest = self.rest_bytes();
264 let skipped = crate::source::SimdScanner::skip_ascii_hexdigits(rest);
265 self.advance(skipped);
266 (start..self.get_position()).into()
267 }
268
    /// Finds the first occurrence of the target byte in the remaining text.
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the byte offset relative to the current position, or None if not found.
    /// The cursor itself is not moved.
    #[inline]
    pub fn find_byte(&mut self, target: u8) -> Option<usize> {
        let rest = self.rest_bytes();
        crate::source::SimdScanner::find_byte(rest, target)
    }

    /// Finds the first occurrence of any of the 4 bytes in the remaining text.
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the byte offset relative to the current position, or None if not found.
    /// The cursor itself is not moved.
    #[inline]
    pub fn find_first_of_4(&mut self, a: u8, b: u8, c: u8, d: u8) -> Option<usize> {
        let rest = self.rest_bytes();
        crate::source::SimdScanner::find_first_of_4(rest, a, b, c, d)
    }
288
289 /// Scans an ASCII identifier.
290 ///
291 /// An identifier must start with an alphabetic character or an underscore,
292 /// and can be followed by any number of alphanumeric characters or underscores.
293 ///
294 /// # Arguments
295 ///
296 /// * `kind` - The token type to assign if an identifier is found.
297 ///
298 /// # Returns
299 ///
300 /// `true` if an identifier was successfully scanned and added.
301 #[inline]
302 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
303 let start = self.get_position();
304 if let Some(b) = self.peek_byte() {
305 if b == b'_' || b.is_ascii_alphabetic() {
306 self.advance_byte();
307 self.skip_ascii_ident_continue();
308 self.add_token(kind, start, self.get_position());
309 return true;
310 }
311 }
312 false
313 }
314
315 /// Scans a line comment starting with the given prefix.
316 ///
317 /// Consumes the prefix and all characters until the next newline or EOF.
318 ///
319 /// # Arguments
320 ///
321 /// * `kind` - The token type for the line comment.
322 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
323 #[inline]
324 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
325 let start = self.get_position();
326 if self.consume_if_starts_with(prefix) {
327 self.skip_until(b'\n');
328 self.add_token(kind, start, self.get_position());
329 return true;
330 }
331 false
332 }
333
    /// Scans a block comment with given start and end sequences.
    ///
    /// Handles nested comments if the underlying implementation supports it,
    /// though this basic implementation is non-recursive.
    ///
    /// If the end sequence is never found, the rest of the source is consumed
    /// and emitted as an (unterminated) comment token; no error is reported here.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
    ///
    /// # Panics
    ///
    /// Panics if `end_seq` is empty: its first byte is indexed unconditionally.
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                // Jump to the next candidate end marker (first byte of `end_seq`).
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                // False alarm: step over the matched byte and keep searching.
                self.advance_byte();
            }
            // EOF reached without the end sequence: emit what we have.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
361
    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }

    /// Returns a reference to the underlying source.
    ///
    /// Equivalent to [`Self::get_source`].
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }

    /// Returns the text in the specified byte range.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }

    /// Returns the text from the specified byte offset to the end of the source.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }

    /// Checks if the source starts with the given pattern at the current position.
    ///
    /// Does not advance the cursor.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }

    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor.
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }
412
    /// Adds an error to the lexer state's diagnostics.
    ///
    /// Errors collected here are non-fatal: they are carried through to the
    /// final [`Self::finish`] output as diagnostics.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }
422
423 /// Adds a token to the lexer state.
424 ///
425 /// # Arguments
426 ///
427 /// * `kind` - The kind/type of the token.
428 /// * `start` - The starting byte offset.
429 /// * `end` - The ending byte offset.
430 #[inline]
431 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
432 self.tokens.push(Token { kind, span: Range { start, end } });
433 }
434
435 /// Adds an end-of-file (EOF) token to the lexer state.
436 ///
437 /// This method creates and adds an `END_OF_STREAM` token at the current position.
438 /// It is typically called when the lexer reaches the end of the source text.
439 ///
440 /// # Examples
441 ///
442 /// ```ignore
443 /// #![feature(new_range_api)]
444 /// # use core::range::Range;
445 /// # use oak_core::lexer::{LexerState, Token};
446 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
447 /// #
448 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
449 /// # enum SimpleToken {
450 /// # End,
451 /// # }
452 /// #
453 /// # impl TokenType for SimpleToken {
454 /// # const END_OF_STREAM: Self = SimpleToken::End;
455 /// # type Role = UniversalTokenRole;
456 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
457 /// # }
458 /// #
459 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
460 /// # enum SimpleElement {}
461 /// #
462 /// # impl ElementType for SimpleElement {
463 /// # type Role = UniversalElementRole;
464 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
465 /// # }
466 /// #
467 /// # #[derive(Clone)]
468 /// # struct SimpleLanguage;
469 /// #
470 /// # impl Language for SimpleLanguage {
471 /// # const NAME: &'static str = "simple";
472 /// # type TokenType = SimpleToken;
473 /// # type ElementType = SimpleElement;
474 /// # type TypedRoot = ();
475 /// # }
476 /// #
477 /// let source = SourceText::new("test");
478 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
479 /// state.take_while(|_| true);
480 /// state.add_eof();
481 ///
482 /// assert_eq!(state.get_tokens().len(), 1);
483 /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
484 /// ```
485 #[inline]
486 pub fn add_eof(&mut self) {
487 let end = self.get_position();
488 self.add_token(L::TokenType::END_OF_STREAM, end, end)
489 }
490
491 /// Gets the current character at the current position.
492 ///
493 /// # Returns
494 ///
495 /// The current character, or `None` if at the end of the source
496 #[inline]
497 pub fn current(&mut self) -> Option<char> {
498 self.cursor.peek_char()
499 }
500
501 /// Advances the position by the current character's length.
502 ///
503 /// # Returns
504 ///
505 /// The character that was skipped, or `None` if at the end of the source
506 #[inline]
507 pub fn bump(&mut self) -> Option<char> {
508 let ch = self.peek()?;
509 self.advance(ch.len_utf8());
510 Some(ch)
511 }
512
513 /// Advances the position by the token's length and adds the token to the lexer state.
514 ///
515 /// This method combines two common operations: advancing the lexer position
516 /// and adding a token to the token list. It calculates the advance distance
517 /// from the token's span, ensuring consistent positioning.
518 ///
519 /// # Arguments
520 ///
521 /// * `token` - The token to add to the lexer state
522 ///
523 /// # Returns
524 ///
525 /// The new byte offset position after advancing
526 ///
527 /// # Examples
528 ///
529 /// ```ignore
530 /// #![feature(new_range_api)]
531 /// # use core::range::Range;
532 /// # use oak_core::lexer::{LexerState, Token};
533 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
534 /// #
535 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
536 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
537 /// # enum SimpleToken { Identifier, End }
538 /// #
539 /// # impl TokenType for SimpleToken {
540 /// # const END_OF_STREAM: Self = SimpleToken::End;
541 /// # type Role = UniversalTokenRole;
542 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
543 /// # }
544 /// #
545 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
546 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
547 /// # enum SimpleElement {}
548 /// #
549 /// # impl ElementType for SimpleElement {
550 /// # type Role = UniversalElementRole;
551 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
552 /// # }
553 /// #
554 /// # #[derive(Clone)]
555 /// # struct SimpleLanguage;
556 /// #
557 /// # impl Language for SimpleLanguage {
558 /// # const NAME: &'static str = "simple";
559 /// # type TokenType = SimpleToken;
560 /// # type ElementType = SimpleElement;
561 /// # type TypedRoot = ();
562 /// # }
563 /// #
564 /// let source = SourceText::new("hello world");
565 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
566 ///
567 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } }
568 ///
569 /// assert_eq!(state.get_position(), 0);
570 ///
571 /// let new_pos = state.advance_with(token);
572 ///
573 /// assert_eq!(new_pos, 5);
574 /// assert_eq!(state.get_position(), 5);
575 /// assert_eq!(state.get_tokens().len(), 1);
576 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
577 /// ```
578 ///
579 /// # Note
580 ///
581 /// The caller must ensure that the token's span is valid and that the advance
582 /// does not split multi-byte UTF-8 characters. The token should be created
583 /// with proper character boundaries.
584 #[inline]
585 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
586 self.cursor.advance_bytes(token.length());
587 self.tokens.push(token);
588 self.cursor.position()
589 }
590
    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// Iterates from the current position, consuming characters as long as
    /// `pred` returns `true`. Commonly used to recognize identifiers, numbers,
    /// or whitespace runs.
    ///
    /// # Arguments
    ///
    /// * `pred` - A closure that takes a character and returns true if the character
    ///   should be consumed, false otherwise
    ///
    /// # Returns
    ///
    /// A byte range representing the span of consumed characters
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let source = SourceText::new("hello123world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// let range = state.take_while(|c| c.is_alphabetic());
    /// assert_eq!(range, Range { start: 0, end: 5 });
    /// assert_eq!(state.get_position(), 5);
    ///
    /// let range = state.take_while(|c| c.is_numeric());
    /// assert_eq!(range, Range { start: 5, end: 8 });
    /// assert_eq!(state.get_position(), 8);
    /// ```
    ///
    /// # Performance Note
    ///
    /// This method operates on a character-by-character basis, which means it
    /// correctly handles multi-byte UTF-8 characters. For performance-critical
    /// code, consider using byte-based methods when working with ASCII-only text.
    pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
        self.cursor.take_while(pred)
    }
666
667 /// Performs a safety check to prevent infinite loops during lexing.
668 ///
669 /// This method ensures that the lexer always makes progress by forcing
670 /// advancement when stuck at the same position. It's used as a safeguard
671 /// against infinite loops in lexer implementations.
672 ///
673 /// The method compares the current position with a previously saved "safe point"
674 /// position. If they're the same, it means the lexer hasn't made progress since
675 /// that safe point, potentially indicating an infinite loop. In this case, the
676 /// method forces advancement by at least one character.
677 ///
678 /// # Arguments
679 ///
680 /// * `safe_point` - The position to check against for potential deadlock
681 ///
682 /// # Examples
683 ///
684 /// ```ignore
685 /// #![feature(new_range_api)]
686 /// # use oak_core::lexer::{LexerState, Token};
687 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
688 /// #
689 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
690 /// # enum SimpleToken { End }
691 /// #
692 /// # impl TokenType for SimpleToken {
693 /// # const END_OF_STREAM: Self = SimpleToken::End;
694 /// # type Role = UniversalTokenRole;
695 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
696 /// # }
697 /// #
698 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
699 /// # enum SimpleElement {}
700 /// #
701 /// # impl ElementType for SimpleElement {
702 /// # type Role = UniversalElementRole;
703 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
704 /// # }
705 /// #
706 /// # struct SimpleLanguage;
707 /// #
708 /// # impl Language for SimpleLanguage {
709 /// # const NAME: &'static str = "simple";
710 /// # type TokenType = SimpleToken;
711 /// # type ElementType = SimpleElement;
712 /// # type TypedRoot = ();
713 /// # }
714 /// #
715 /// let source = SourceText::new("test");
716 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
717 ///
718 /// let safe_point = state.get_position();
719 ///
720 /// state.advance_if_dead_lock(safe_point);
721 ///
722 /// assert!(state.get_position() >= safe_point);
723 /// ```
724 ///
725 /// # Usage in Lexer Implementations
726 ///
727 /// This method is typically used at the beginning or end of lexing loops:
728 ///
729 /// ```ignore
730 /// loop {
731 /// let safe_point = state.get_position();
732 ///
733 /// if let Some(token) = try_recognize_token(&mut state) {
734 /// continue;
735 /// }
736 ///
737 /// state.advance_if_dead_lock(safe_point);
738 ///
739 /// if state.not_at_end() {
740 /// continue;
741 /// } else {
742 /// break;
743 /// }
744 /// }
745 /// ```
746 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
747 if self.cursor.position() == safe_point {
748 if let Some(ch) = self.peek() { self.advance(ch.len_utf8()) } else { self.advance(1) }
749 }
750 }
751
752 /// Finishes lexing and returns the final output with tokens and diagnostics.
753 ///
754 /// This method concludes the lexing process by converting the collected tokens
755 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
756 /// represents the overall success or failure of the lexing operation.
757 ///
758 /// If the result is `Ok`, the tokens are returned as the successful result.
759 /// If the result is `Err`, the error is returned as the failure result.
760 /// In both cases, any collected diagnostic errors are included in the output.
761 ///
762 /// # Arguments
763 ///
764 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
765 ///
766 /// # Returns
767 ///
768 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
769 ///
770 /// # Examples
771 ///
772 /// ```
773 /// #![feature(new_range_api)]
774 /// # use oak_core::lexer::{LexerState, Token};
775 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
776 /// #
777 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
778 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
779 /// # enum SimpleToken { Identifier, End }
780 /// #
781 /// # impl TokenType for SimpleToken {
782 /// # const END_OF_STREAM: Self = SimpleToken::End;
783 /// # type Role = UniversalTokenRole;
784 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
785 /// # }
786 /// #
787 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
788 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
789 /// # enum SimpleElement {}
790 /// #
791 /// # impl ElementType for SimpleElement {
792 /// # type Role = UniversalElementRole;
793 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
794 /// # }
795 /// #
796 /// # struct SimpleLanguage;
797 /// #
798 /// # impl Language for SimpleLanguage {
799 /// # const NAME: &'static str = "simple";
800 /// # type TokenType = SimpleToken;
801 /// # type ElementType = SimpleElement;
802 /// # type TypedRoot = ();
803 /// # }
804 /// #
805 /// let source = SourceText::new("test");
806 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
807 ///
808 /// state.add_token(SimpleToken::Identifier, 0, 4);
809 ///
810 /// let output = state.finish(Ok(()));
811 ///
812 /// assert!(output.result.is_ok());
813 /// assert_eq!(output.result.unwrap().len(), 1);
814 /// assert_eq!(output.diagnostics.len(), 0);
815 ///
816 /// let source2 = SourceText::new("test");
817 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
818 /// state2.add_error(OakError::custom_error("Test error"));
819 ///
820 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
821 ///
822 /// assert!(output2.result.is_err());
823 /// assert_eq!(output2.diagnostics.len(), 1);
824 /// ```
825 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
826 match result {
827 Ok(_) => {
828 let tokens: Tokens<L> = self.tokens.into();
829 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
830 }
831 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
832 }
833 }
834
835 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
836 ///
837 /// This method is similar to `finish` but additionally updates the incremental cache
838 /// with the new tokens. It's used for incremental lexing where the results need to
839 /// be cached for future reuse when the source text changes.
840 ///
841 /// The method first creates the output in the same way as `finish`, then updates
842 /// the cache's `last_lex` field with the new tokens. This enables the next call
843 /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
844 ///
845 /// # Arguments
846 ///
847 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
848 /// * `cache` - The incremental cache to update with the new tokens
849 ///
850 /// # Returns
851 ///
852 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
853 ///
854 /// # Examples
855 ///
856 /// ```ignore
857 /// #![feature(new_range_api)]
858 /// # use core::range::Range;
859 /// # use oak_core::lexer::{LexerState, Token};
860 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
861 /// # use oak_core::parser::session::ParseSession;
862 /// #
863 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
864 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
865 /// # enum SimpleToken { Identifier, End }
866 /// #
867 /// # impl TokenType for SimpleToken {
868 /// # const END_OF_STREAM: Self = SimpleToken::End;
869 /// # type Role = UniversalTokenRole;
870 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
871 /// # }
872 /// #
873 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
874 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
875 /// # enum SimpleElement {}
876 /// #
877 /// # impl ElementType for SimpleElement {
878 /// # type Role = UniversalElementRole;
879 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
880 /// # }
881 /// #
882 /// # struct SimpleLanguage;
883 /// #
884 /// # impl Language for SimpleLanguage {
885 /// # const NAME: &'static str = "simple";
886 /// # type TokenType = SimpleToken;
887 /// # type ElementType = SimpleElement;
888 /// # type TypedRoot = ();
889 /// # }
890 /// #
891 /// let source = SourceText::new("test");
892 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
893 ///
894 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
895 ///
896 /// state.add_token(SimpleToken::Identifier, 0, 4);
897 ///
898 /// let output = state.finish_with_cache(Ok(()), &mut cache);
899 ///
900 /// assert!(output.result.is_ok());
901 /// assert_eq!(output.result.unwrap().len(), 1);
902 /// ```
903 ///
904 /// # Incremental Lexing Workflow
905 ///
906 /// This method is typically used as part of an incremental lexing workflow:
907 ///
908 /// ```ignore
909 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
910 /// let output = state.finish_with_cache(Ok(()), cache);
911 ///
912 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
913 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
914 /// let output = state.finish_with_cache(Ok(()), cache);
915 /// ```
916 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
917 let out = self.finish(result);
918 cache.set_lex_output(out.clone());
919 out
920 }
921}
922
923use crate::OakDiagnostics;