oak_core/lexer/state.rs
1use crate::{
2 Language, TokenType,
3 errors::OakError,
4 lexer::{LexOutput, LexerCache, Token, Tokens},
5 source::{Source, SourceCursor},
6};
7pub use core::range::Range;
8use std::borrow::Cow;
9
/// Represents the state of the lexer during a tokenization session.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
/// It tracks the current position in the source text, collected tokens,
/// and any errors encountered.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    /// Cursor tracking the current byte position within the source.
    pub(crate) cursor: SourceCursor<'s, S>,
    /// Tokens collected so far during this session.
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    /// Non-fatal diagnostics accumulated while lexing.
    pub(crate) errors: Vec<OakError>,
    /// Exclusive upper byte bound for sub-range scanning (`None` = whole source);
    /// read by `get_length`.
    pub(crate) end_limit: Option<usize>,
}
23
24impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
25 /// Creates a new lexer state with the given source text.
26 ///
27 /// # Arguments
28 ///
29 /// * `source` - The source text to lex
30 ///
31 /// # Returns
32 ///
33 /// A new `LexerState` initialized at the beginning of the source
34 pub fn new(source: &'s S) -> Self {
35 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None }
36 }
37
    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        // Nothing cached: behave exactly like `new`.
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None };
        }

        let len = source.length();
        // Clamp so the offset never exceeds the (possibly shrunk) source.
        let relex_from = relex_from.min(len);

        // Full reuse: no edit touched any byte before the end of the source,
        // so every cached token can be kept and the cursor resumes after the
        // last one.
        if relex_from >= len {
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                // Fast path: the cache can hand us a contiguous slice.
                tokens.extend_from_slice(cached)
            }
            else {
                // Slow path: fetch tokens one by one.
                // NOTE(review): a `None` from `get_token(i)` is silently
                // skipped here, while the partial-reuse path below treats it
                // as a hard stop — confirm this asymmetry is intended.
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            // Resume after the last cached token, clamped to the source end.
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![], end_limit: None };
        }

        // Edit at the very start: nothing can be reused.
        if relex_from == 0 {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None };
        }

        let mut reused_tokens = Vec::new();
        // Drop the last token before the edit as well, in case the edit can
        // merge into it (e.g. an identifier extended by the inserted text).
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // `idx` = number of tokens that end at or before the edit point.
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Per-token fallback: collect tokens wholly before the edit point,
            // then back off by `BACKTRACK_TOKENS`.
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        // Relexing resumes at the end of the last reused token.
        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![], end_limit: None }
    }
111
112 /// Creates a sub-state for scanning a sub-range of the source.
113 pub fn sub_state(&mut self, start: usize, end: usize) -> Self {
114 Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![], end_limit: Some(end) }
115 }
116
    /// Returns the source text provider.
    ///
    /// The borrow carries the `'s` lifetime of the source, not of `self`.
    pub fn get_source(&self) -> &'s S {
        self.cursor.source()
    }
121
    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// Delegates to the underlying [`SourceCursor`].
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }
130
    /// Gets the remaining text as a byte slice.
    ///
    /// Useful for byte-oriented scanning operations; same span as [`Self::rest`].
    #[inline]
    pub fn rest_bytes(&mut self) -> &[u8] {
        self.cursor.rest().as_bytes()
    }
138
    /// Checks if the lexer has consumed all input from the source.
    ///
    /// Returns `true` if the current position is at or beyond the end of the source.
    ///
    /// NOTE(review): despite the name, this does not inspect the cache — it is
    /// a plain end-of-input check (the inverse of [`Self::not_at_end`]).
    pub fn fully_reused(&self) -> bool {
        self.cursor.position() >= self.cursor.source().length()
    }
145
    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text.
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }
155
    /// Checks if the lexer has NOT consumed all input from the source.
    ///
    /// Returns `true` if there are still bytes left to be scanned.
    /// This checks against the full source length, not `end_limit`.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.cursor.position() < self.cursor.source().length()
    }
163
    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
171
    /// Peeks at the character immediately following the current character.
    ///
    /// Returns `None` when no such character exists.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }
177
    /// Peeks at the character at the specified byte offset relative to the current position.
    ///
    /// Does not advance the cursor; returns `None` past the end of input.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }
183
    /// Advances the cursor by the specified number of bytes.
    ///
    /// Callers are responsible for keeping `len` on a UTF-8 character boundary.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }
189
190 /// Gets the total length of the source text in bytes.
191 #[inline]
192 pub fn get_length(&self) -> usize {
193 self.end_limit.unwrap_or_else(|| self.cursor.source().length())
194 }
195
    /// Gets a single character at the specified absolute byte offset.
    ///
    /// Returns `None` when the offset is out of range.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }
201
    /// Peeks at the next byte without advancing the cursor.
    ///
    /// Returns `None` at the end of the source.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }
207
    /// Advances the cursor by one byte and returns it.
    ///
    /// Returns `None` (without advancing) at the end of the source.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }
213
    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }
221
    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace (empty if none was skipped).
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }
230
    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits (empty if none was skipped).
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }
238
    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }
247
    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed; if it never occurs, the cursor
    /// stops at the end of input.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }
256
257 /// Skips all ASCII hex digits (0-9, a-f, A-F).
258 ///
259 /// Uses SIMD acceleration if available on the platform.
260 /// Returns the range of the skipped hex digits.
261 #[inline]
262 pub fn skip_ascii_hexdigits(&mut self) -> std::range::Range<usize> {
263 let start = self.get_position();
264 let rest = self.rest_bytes();
265 let skipped = crate::source::SimdScanner::skip_ascii_hexdigits(rest);
266 self.advance(skipped);
267 (start..self.get_position()).into()
268 }
269
270 /// Finds the first occurrence of the target byte in the remaining text.
271 ///
272 /// Uses SIMD acceleration if available on the platform.
273 /// Returns the byte offset relative to the current position, or None if not found.
274 #[inline]
275 pub fn find_byte(&mut self, target: u8) -> Option<usize> {
276 let rest = self.rest_bytes();
277 crate::source::SimdScanner::find_byte(rest, target)
278 }
279
280 /// Finds the first occurrence of any of the 4 bytes in the remaining text.
281 ///
282 /// Uses SIMD acceleration if available on the platform.
283 /// Returns the byte offset relative to the current position, or None if not found.
284 #[inline]
285 pub fn find_first_of_4(&mut self, a: u8, b: u8, c: u8, d: u8) -> Option<usize> {
286 let rest = self.rest_bytes();
287 crate::source::SimdScanner::find_first_of_4(rest, a, b, c, d)
288 }
289
290 /// Scans an ASCII identifier.
291 ///
292 /// An identifier must start with an alphabetic character or an underscore,
293 /// and can be followed by any number of alphanumeric characters or underscores.
294 ///
295 /// # Arguments
296 ///
297 /// * `kind` - The token type to assign if an identifier is found.
298 ///
299 /// # Returns
300 ///
301 /// `true` if an identifier was successfully scanned and added.
302 #[inline]
303 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
304 let start = self.get_position();
305 if let Some(b) = self.peek_byte() {
306 if b == b'_' || b.is_ascii_alphabetic() {
307 self.advance_byte();
308 self.skip_ascii_ident_continue();
309 self.add_token(kind, start, self.get_position());
310 return true;
311 }
312 }
313 false
314 }
315
316 /// Scans a line comment starting with the given prefix.
317 ///
318 /// Consumes the prefix and all characters until the next newline or EOF.
319 ///
320 /// # Arguments
321 ///
322 /// * `kind` - The token type for the line comment.
323 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
324 #[inline]
325 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
326 let start = self.get_position();
327 if self.consume_if_starts_with(prefix) {
328 self.skip_until(b'\n');
329 self.add_token(kind, start, self.get_position());
330 return true;
331 }
332 false
333 }
334
    /// Scans a block comment with given start and end sequences.
    ///
    /// Handles nested comments if the underlying implementation supports it,
    /// though this basic implementation is non-recursive.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
    ///
    /// # Notes
    ///
    /// An unterminated comment consumes everything up to EOF and still emits
    /// a token; no diagnostic is recorded for the missing terminator.
    /// NOTE(review): `end_seq` is indexed with `[0]`, so an empty `end_seq`
    /// would panic — confirm callers never pass one.
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                // Jump to the next byte that could begin the terminator.
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                // First byte matched but the full terminator did not (or EOF):
                // step one byte so the next skip_until makes progress.
                self.advance_byte();
            }
            // EOF without a terminator: emit the token covering the rest.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
362
    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing, in emission order.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }
372
    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }
386
    /// Returns a reference to the underlying source.
    ///
    /// NOTE(review): identical to [`Self::get_source`]; consider deprecating one.
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }
391
    /// Returns the text in the specified byte range.
    ///
    /// A `Cow` is returned because the source may have to materialize the span.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }
396
    /// Returns the text from the specified byte offset to the end of the source.
    ///
    /// A `Cow` is returned because the source may have to materialize the span.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }
401
    /// Checks if the source starts with the given pattern at the current position.
    ///
    /// Does not advance the cursor.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }
406
    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor;
    /// on `false` the cursor is left untouched.
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }
413
    /// Adds an error to the lexer state's diagnostics.
    ///
    /// Errors recorded here are surfaced later by `finish`.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }
423
424 /// Adds a token to the lexer state.
425 ///
426 /// # Arguments
427 ///
428 /// * `kind` - The kind/type of the token.
429 /// * `start` - The starting byte offset.
430 /// * `end` - The ending byte offset.
431 #[inline]
432 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
433 self.tokens.push(Token { kind, span: Range { start, end } });
434 }
435
436 /// Adds an end-of-file (EOF) token to the lexer state.
437 ///
438 /// This method creates and adds an `END_OF_STREAM` token at the current position.
439 /// It is typically called when the lexer reaches the end of the source text.
440 ///
441 /// # Examples
442 ///
443 /// ```ignore
444 /// #![feature(new_range_api)]
445 /// # use core::range::Range;
446 /// # use oak_core::lexer::{LexerState, Token};
447 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
448 /// #
449 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
450 /// # enum SimpleToken {
451 /// # End,
452 /// # }
453 /// #
454 /// # impl TokenType for SimpleToken {
455 /// # const END_OF_STREAM: Self = SimpleToken::End;
456 /// # type Role = UniversalTokenRole;
457 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
458 /// # }
459 /// #
460 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
461 /// # enum SimpleElement {}
462 /// #
463 /// # impl ElementType for SimpleElement {
464 /// # type Role = UniversalElementRole;
465 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
466 /// # }
467 /// #
468 /// # #[derive(Clone)]
469 /// # struct SimpleLanguage;
470 /// #
471 /// # impl Language for SimpleLanguage {
472 /// # const NAME: &'static str = "simple";
473 /// # type TokenType = SimpleToken;
474 /// # type ElementType = SimpleElement;
475 /// # type TypedRoot = ();
476 /// # }
477 /// #
478 /// let source = SourceText::new("test");
479 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
480 /// state.take_while(|_| true);
481 /// state.add_eof();
482 ///
483 /// assert_eq!(state.get_tokens().len(), 1);
484 /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
485 /// ```
486 #[inline]
487 pub fn add_eof(&mut self) {
488 let end = self.get_position();
489 self.add_token(L::TokenType::END_OF_STREAM, end, end)
490 }
491
    /// Gets the current character at the current position.
    ///
    /// NOTE(review): identical to [`Self::peek`]; consider deprecating one.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
501
502 /// Advances the position by the current character's length.
503 ///
504 /// # Returns
505 ///
506 /// The character that was skipped, or `None` if at the end of the source
507 #[inline]
508 pub fn bump(&mut self) -> Option<char> {
509 let ch = self.peek()?;
510 self.advance(ch.len_utf8());
511 Some(ch)
512 }
513
514 /// Advances the position by the token's length and adds the token to the lexer state.
515 ///
516 /// This method combines two common operations: advancing the lexer position
517 /// and adding a token to the token list. It calculates the advance distance
518 /// from the token's span, ensuring consistent positioning.
519 ///
520 /// # Arguments
521 ///
522 /// * `token` - The token to add to the lexer state
523 ///
524 /// # Returns
525 ///
526 /// The new byte offset position after advancing
527 ///
528 /// # Examples
529 ///
530 /// ```ignore
531 /// #![feature(new_range_api)]
532 /// # use core::range::Range;
533 /// # use oak_core::lexer::{LexerState, Token};
534 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
535 /// #
536 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
537 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
538 /// # enum SimpleToken { Identifier, End }
539 /// #
540 /// # impl TokenType for SimpleToken {
541 /// # const END_OF_STREAM: Self = SimpleToken::End;
542 /// # type Role = UniversalTokenRole;
543 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
544 /// # }
545 /// #
546 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
547 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
548 /// # enum SimpleElement {}
549 /// #
550 /// # impl ElementType for SimpleElement {
551 /// # type Role = UniversalElementRole;
552 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
553 /// # }
554 /// #
555 /// # #[derive(Clone)]
556 /// # struct SimpleLanguage;
557 /// #
558 /// # impl Language for SimpleLanguage {
559 /// # const NAME: &'static str = "simple";
560 /// # type TokenType = SimpleToken;
561 /// # type ElementType = SimpleElement;
562 /// # type TypedRoot = ();
563 /// # }
564 /// #
565 /// let source = SourceText::new("hello world");
566 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
567 ///
568 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } }
569 ///
570 /// assert_eq!(state.get_position(), 0);
571 ///
572 /// let new_pos = state.advance_with(token);
573 ///
574 /// assert_eq!(new_pos, 5);
575 /// assert_eq!(state.get_position(), 5);
576 /// assert_eq!(state.get_tokens().len(), 1);
577 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
578 /// ```
579 ///
580 /// # Note
581 ///
582 /// The caller must ensure that the token's span is valid and that the advance
583 /// does not split multi-byte UTF-8 characters. The token should be created
584 /// with proper character boundaries.
585 #[inline]
586 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
587 self.cursor.advance_bytes(token.length());
588 self.tokens.push(token);
589 self.cursor.position()
590 }
591
592 /// Consumes characters while the predicate returns true, returning the consumed range.
593 ///
594 /// This method iterates through the source text from the current position,
595 /// consuming characters as long as the predicate function returns true.
596 /// It's commonly used for recognizing patterns like identifiers, numbers,
597 /// or whitespace sequences.
598 ///
599 /// # Arguments
600 ///
601 /// * `pred` - A closure that takes a character and returns true if the character
602 /// should be consumed, false otherwise
603 ///
604 /// # Returns
605 ///
606 /// A byte range representing the span of consumed characters
607 ///
608 /// # Examples
609 ///
610 /// ```ignore
611 /// #![feature(new_range_api)]
612 /// # use core::range::Range;
613 /// # use oak_core::lexer::{LexerState, Token};
614 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
615 /// #
616 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
617 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
618 /// # enum SimpleToken { End }
619 /// #
620 /// # impl TokenType for SimpleToken {
621 /// # const END_OF_STREAM: Self = SimpleToken::End;
622 /// # type Role = UniversalTokenRole;
623 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
624 /// # }
625 /// #
626 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
627 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
628 /// # enum SimpleElement {}
629 /// #
630 /// # impl ElementType for SimpleElement {
631 /// # type Role = UniversalElementRole;
632 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
633 /// # }
634 /// #
635 /// # #[derive(Clone)]
636 /// # struct SimpleLanguage;
637 /// #
638 /// # impl Language for SimpleLanguage {
639 /// # const NAME: &'static str = "simple";
640 /// # type TokenType = SimpleToken;
641 /// # type ElementType = SimpleElement;
642 /// # type TypedRoot = ();
643 /// # }
644 /// #
645 /// let source = SourceText::new("hello123world");
646 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
647 ///
648 /// let range = state.take_while(|c| c.is_alphabetic());
649 ///
650 /// assert_eq!(range, Range { start: 0, end: 5 });
651 /// assert_eq!(state.get_position(), 5);
652 ///
653 /// let range = state.take_while(|c| c.is_numeric());
654 ///
655 /// assert_eq!(range, Range { start: 5, end: 8 });
656 /// assert_eq!(state.get_position(), 8);
657 /// ```
658 ///
659 /// # Performance Note
660 ///
661 /// This method operates on a character-by-character basis, which means it
662 /// correctly handles multi-byte UTF-8 characters. For performance-critical
663 /// code, consider using byte-based methods when working with ASCII-only text.
664 pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
665 self.cursor.take_while(pred)
666 }
667
668 /// Performs a safety check to prevent infinite loops during lexing.
669 ///
670 /// This method ensures that the lexer always makes progress by forcing
671 /// advancement when stuck at the same position. It's used as a safeguard
672 /// against infinite loops in lexer implementations.
673 ///
674 /// The method compares the current position with a previously saved "safe point"
675 /// position. If they're the same, it means the lexer hasn't made progress since
676 /// that safe point, potentially indicating an infinite loop. In this case, the
677 /// method forces advancement by at least one character.
678 ///
679 /// # Arguments
680 ///
681 /// * `safe_point` - The position to check against for potential deadlock
682 ///
683 /// # Examples
684 ///
685 /// ```ignore
686 /// #![feature(new_range_api)]
687 /// # use oak_core::lexer::{LexerState, Token};
688 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
689 /// #
690 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
691 /// # enum SimpleToken { End }
692 /// #
693 /// # impl TokenType for SimpleToken {
694 /// # const END_OF_STREAM: Self = SimpleToken::End;
695 /// # type Role = UniversalTokenRole;
696 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
697 /// # }
698 /// #
699 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
700 /// # enum SimpleElement {}
701 /// #
702 /// # impl ElementType for SimpleElement {
703 /// # type Role = UniversalElementRole;
704 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
705 /// # }
706 /// #
707 /// # struct SimpleLanguage;
708 /// #
709 /// # impl Language for SimpleLanguage {
710 /// # const NAME: &'static str = "simple";
711 /// # type TokenType = SimpleToken;
712 /// # type ElementType = SimpleElement;
713 /// # type TypedRoot = ();
714 /// # }
715 /// #
716 /// let source = SourceText::new("test");
717 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
718 ///
719 /// let safe_point = state.get_position();
720 ///
721 /// state.advance_if_dead_lock(safe_point);
722 ///
723 /// assert!(state.get_position() >= safe_point);
724 /// ```
725 ///
726 /// # Usage in Lexer Implementations
727 ///
728 /// This method is typically used at the beginning or end of lexing loops:
729 ///
730 /// ```ignore
731 /// loop {
732 /// let safe_point = state.get_position();
733 ///
734 /// if let Some(token) = try_recognize_token(&mut state) {
735 /// continue;
736 /// }
737 ///
738 /// state.advance_if_dead_lock(safe_point);
739 ///
740 /// if state.not_at_end() {
741 /// continue;
742 /// } else {
743 /// break;
744 /// }
745 /// }
746 /// ```
747 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
748 if self.cursor.position() == safe_point {
749 if let Some(ch) = self.peek() { self.advance(ch.len_utf8()) } else { self.advance(1) }
750 }
751 }
752
    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// This method concludes the lexing process by converting the collected tokens
    /// and errors into a `LexOutput` result. It takes a `Result` parameter that
    /// represents the overall success or failure of the lexing operation.
    ///
    /// If the result is `Ok`, the tokens are returned as the successful result.
    /// If the result is `Err`, the error is returned as the failure result.
    /// In both cases, any collected diagnostic errors are included in the output.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// let output = state.finish(Ok(()));
    ///
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// assert_eq!(output.diagnostics.len(), 0);
    ///
    /// let source2 = SourceText::new("test");
    /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
    /// state2.add_error(OakError::custom_error("Test error"));
    ///
    /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
    ///
    /// assert!(output2.result.is_err());
    /// assert_eq!(output2.diagnostics.len(), 1);
    /// ```
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        // Accumulated diagnostics travel with the output in both arms; only
        // the result payload differs.
        match result {
            Ok(_) => {
                let tokens: Tokens<L> = self.tokens.into();
                OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
            }
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }
835
836 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
837 ///
838 /// This method is similar to `finish` but additionally updates the incremental cache
839 /// with the new tokens. It's used for incremental lexing where the results need to
840 /// be cached for future reuse when the source text changes.
841 ///
842 /// The method first creates the output in the same way as `finish`, then updates
843 /// the cache's `last_lex` field with the new tokens. This enables the next call
844 /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
845 ///
846 /// # Arguments
847 ///
848 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
849 /// * `cache` - The incremental cache to update with the new tokens
850 ///
851 /// # Returns
852 ///
853 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
854 ///
855 /// # Examples
856 ///
857 /// ```ignore
858 /// #![feature(new_range_api)]
859 /// # use core::range::Range;
860 /// # use oak_core::lexer::{LexerState, Token};
861 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
862 /// # use oak_core::parser::session::ParseSession;
863 /// #
864 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
865 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
866 /// # enum SimpleToken { Identifier, End }
867 /// #
868 /// # impl TokenType for SimpleToken {
869 /// # const END_OF_STREAM: Self = SimpleToken::End;
870 /// # type Role = UniversalTokenRole;
871 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
872 /// # }
873 /// #
874 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
875 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
876 /// # enum SimpleElement {}
877 /// #
878 /// # impl ElementType for SimpleElement {
879 /// # type Role = UniversalElementRole;
880 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
881 /// # }
882 /// #
883 /// # struct SimpleLanguage;
884 /// #
885 /// # impl Language for SimpleLanguage {
886 /// # const NAME: &'static str = "simple";
887 /// # type TokenType = SimpleToken;
888 /// # type ElementType = SimpleElement;
889 /// # type TypedRoot = ();
890 /// # }
891 /// #
892 /// let source = SourceText::new("test");
893 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
894 ///
895 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
896 ///
897 /// state.add_token(SimpleToken::Identifier, 0, 4);
898 ///
899 /// let output = state.finish_with_cache(Ok(()), &mut cache);
900 ///
901 /// assert!(output.result.is_ok());
902 /// assert_eq!(output.result.unwrap().len(), 1);
903 /// ```
904 ///
905 /// # Incremental Lexing Workflow
906 ///
907 /// This method is typically used as part of an incremental lexing workflow:
908 ///
909 /// ```ignore
910 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
911 /// let output = state.finish_with_cache(Ok(()), cache);
912 ///
913 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
914 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
915 /// let output = state.finish_with_cache(Ok(()), cache);
916 /// ```
917 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
918 let out = self.finish(result);
919 cache.set_lex_output(out.clone());
920 out
921 }
922}
923
924use crate::OakDiagnostics;