oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9use std::borrow::Cow;
10use triomphe::Arc;
11
12/// Utilities for scanning comments.
13pub mod scan_comment;
14/// Utilities for scanning identifiers.
15pub mod scan_identifier;
16/// Utilities for scanning numbers.
17pub mod scan_number;
18/// Utilities for scanning string literals.
19pub mod scan_string;
20/// Utilities for scanning whitespace.
21pub mod scan_white_space;
22
23pub use scan_comment::CommentConfig;
24pub use scan_string::StringConfig;
25pub use scan_white_space::WhitespaceConfig;
26
/// Token sequence produced by lexical analysis.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens. Any diagnostics produced while lexing are
/// carried separately by [`LexOutput`].
32pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;
33
34/// Output type for lexical analysis operations, including diagnostics.
35pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
36
37/// Trait for tokenizing source code into sequences of tokens.
38///
39/// This trait defines the interface for converting source text into a sequence of
40/// tokens that can be consumed by the parser. Implementations should handle
41/// the specific lexical rules of their target language.
42///
43/// # Examples
44///
45/// ```ignore
46/// struct MyLexer;
47///
48/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
49/// enum MyToken {
50/// Number,
51/// Identifier,
52/// End,
53/// }
54///
55/// impl TokenType for MyToken {
56/// const END_OF_STREAM: Self = MyToken::End;
57/// type Role = UniversalTokenRole;
58/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
59/// }
60///
61/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
62/// enum MyElement {}
63///
64/// impl ElementType for MyElement {
65/// type Role = UniversalElementRole;
66/// fn role(&self) -> Self::Role { UniversalElementRole::None }
67/// }
68///
69/// struct MyLanguage;
70///
71/// impl Language for MyLanguage {
72/// const NAME: &'static str = "my-language";
73/// type TokenType = MyToken;
74/// type ElementType = MyElement;
75/// type TypedRoot = ();
76/// }
77///
78/// impl Lexer<MyLanguage> for MyLexer {
79/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
80/// // Tokenization logic here
81/// todo!()
82/// }
83/// }
84/// ```
85pub trait Lexer<L: Language + Send + Sync + 'static> {
86 /// Tokenizes the given source text into a sequence of tokens.
87 ///
    /// Implementations may perform a full lexical analysis from scratch or
    /// reuse tokens from the provided cache when the supplied edits allow it.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the cached result was produced (empty for a full lex)
    /// * `cache` - The cache holding results from a previous lexing pass
95 ///
96 /// # Returns
97 ///
98 /// A [`LexOutput`] containing the tokens and any diagnostic messages
99 fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
100}
101
102/// Cache trait for lexical results.
103///
104/// This trait defines the interface for caching and accessing lexical analysis results.
105/// It provides methods for storing and retrieving token information from previous
106/// lexical analysis operations.
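///
/// # Examples
///
/// A minimal in-memory implementation sketch; `SimpleCache` is a hypothetical
/// type, and the token type is assumed to be `Copy` as in the examples further
/// down this module:
///
/// ```ignore
/// struct SimpleCache<L: Language> {
///     last_lex: Option<LexOutput<L>>,
/// }
///
/// impl<L: Language> LexerCache<L> for SimpleCache<L> {
///     fn set_lex_output(&mut self, output: LexOutput<L>) {
///         // Keep only the most recent lexing result.
///         self.last_lex = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
///         // Expose the cached tokens only when the previous lex succeeded.
///         self.last_lex.as_ref()?.result.as_deref().ok()
///     }
/// }
/// ```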
107#[allow(unused_variables)]
108pub trait LexerCache<L: Language> {
109 /// Sets the lexed output in the cache.
110 ///
111 /// # Arguments
112 ///
113 /// * `output` - The output from lexical analysis, including tokens and diagnostics
114 fn set_lex_output(&mut self, output: LexOutput<L>);
115
116 /// Gets a token from the cache by index.
117 ///
118 /// # Arguments
119 ///
120 /// * `index` - The index of the token to retrieve
121 ///
122 /// # Returns
123 ///
124 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
125 /// or `None` if the index is out of bounds or no tokens are cached
126 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
127
128 /// Gets the total number of tokens in the cache.
129 ///
130 /// # Returns
131 ///
132 /// The number of cached tokens, or 0 if no tokens are cached
133 fn count_tokens(&self) -> usize;
134
135 /// Checks if the cache contains any tokens.
136 ///
137 /// # Returns
138 ///
139 /// `true` if the cache contains tokens, `false` otherwise
140 fn has_tokens(&self) -> bool;
141
142 /// Gets all cached tokens as a slice.
143 ///
144 /// # Returns
145 ///
146 /// An optional slice of tokens if available.
147 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
148 None
149 }
150}
151
152impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
153 fn set_lex_output(&mut self, output: LexOutput<L>) {
154 (**self).set_lex_output(output);
155 }
156
157 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
158 (**self).get_token(index)
159 }
160
161 fn count_tokens(&self) -> usize {
162 (**self).count_tokens()
163 }
164
165 fn has_tokens(&self) -> bool {
166 (**self).has_tokens()
167 }
168
169 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
170 (**self).get_tokens()
171 }
172}
173
/// Represents a single token in the source code.
175///
176/// Tokens are the fundamental units of lexical analysis, representing
177/// categorized pieces of source text with their position information.
178#[derive(Debug, Clone, PartialEq, Eq, Copy)]
179pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    pub span: Range<usize>,
184}
185
186impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
202 #[inline]
203 pub fn length(&self) -> usize {
204 self.span.end - self.span.start
205 }
206}
207
208/// State information for incremental lexical analysis.
209///
210/// This struct maintains the current position and context during
211/// tokenization, enabling incremental and resumable lexing operations.
212#[derive(Debug)]
213pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
214 pub(crate) cursor: SourceCursor<'s, S>,
215 pub(crate) tokens: Vec<Token<L::TokenType>>,
216 pub(crate) errors: Vec<OakError>,
217}
218
219impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
220 /// Creates a new lexer state with the given source text.
221 ///
222 /// # Arguments
223 ///
224 /// * `source` - The source text to lex
225 ///
226 /// # Returns
227 ///
228 /// A new `LexerState` initialized at the beginning of the source
229 pub fn new(source: &'s S) -> Self {
230 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
231 }
232
233 /// Creates a new lexer state with the given source text and incremental cache.
234 ///
235 /// # Arguments
236 ///
237 /// * `source` - The source text to lex
238 /// * `relex_from` - The minimum byte offset that may have been affected by edits
239 /// (use `source.length()` to indicate no edits)
240 /// * `cache` - The incremental cache containing previous lexing results
241 ///
242 /// # Returns
243 ///
244 /// A new `LexerState` initialized at the beginning of the source with cache support
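    ///
    /// # Examples
    ///
    /// A sketch of incremental reuse, assuming a `SimpleLanguage` and a populated
    /// cache as in the other examples in this module:
    ///
    /// ```ignore
    /// // `cache` holds the tokens from a previous pass over the old text, and only
    /// // bytes at or after `relex_from` may have been affected by edits.
    /// let relex_from = 42;
    /// let mut state = LexerState::<_, SimpleLanguage>::new_with_cache(&source, relex_from, &cache);
    ///
    /// // Tokens ending before `relex_from` (minus a small backtrack) are already in
    /// // `state.tokens()`; lexing resumes from the end of the last reused token.
    /// assert!(state.get_position() <= relex_from);
    /// ```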
245 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
246 if !cache.has_tokens() {
247 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
248 }
249
250 let len = source.length();
251 let relex_from = relex_from.min(len);
252
253 // Fast path: fully re-used
254 if relex_from >= len {
255 let mut tokens = Vec::new();
256 if let Some(cached) = cache.get_tokens() {
257 tokens.extend_from_slice(cached);
258 }
259 else {
260 let count = cache.count_tokens();
261 tokens.reserve(count);
262 for i in 0..count {
263 if let Some(t) = cache.get_token(i) {
264 tokens.push(t);
265 }
266 }
267 }
268 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
269 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
270 }
271
272 if relex_from == 0 {
273 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
274 }
275
276 let mut reused_tokens = Vec::new();
277 const BACKTRACK_TOKENS: usize = 1;
278
279 if let Some(cached) = cache.get_tokens() {
280 // Binary search for the cut-off point since tokens are sorted by position
281 let idx = cached.partition_point(|t| t.span.end <= relex_from);
282 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
283 if keep > 0 {
284 reused_tokens.extend_from_slice(&cached[..keep]);
285 }
286 }
287 else {
288 // Fallback for caches that don't support slice access
289 let count = cache.count_tokens();
290 for i in 0..count {
291 let Some(token) = cache.get_token(i)
292 else {
293 break;
294 };
295 if token.span.end <= relex_from {
296 reused_tokens.push(token);
297 }
298 else {
299 break;
300 }
301 }
302 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
303 reused_tokens.truncate(keep);
304 }
305
306 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
307 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
308 }
309
    /// Gets the remaining text from the current position (for chunked sources this
    /// may cover only the current chunk rather than the full remainder).
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
315 pub fn rest(&mut self) -> &str {
316 self.cursor.rest()
317 }
318
319 /// Gets the remaining text as a byte slice.
320 #[inline]
321 pub fn rest_bytes(&mut self) -> &[u8] {
322 self.cursor.rest().as_bytes()
323 }
324
    /// Checks whether the cursor has reached the end of the source, i.e. the
    /// tokens reused (or already produced) cover the entire input.
326 pub fn fully_reused(&self) -> bool {
327 self.cursor.position() >= self.cursor.source().length()
328 }
329
330 /// Gets the current byte offset position in the source text.
331 ///
332 /// # Returns
333 ///
334 /// The current byte offset from the start of the source text
335 #[inline]
336 pub fn get_position(&self) -> usize {
337 self.cursor.position()
338 }
339
340 /// Gets the total length of the source text in bytes.
341 #[inline]
342 pub fn get_length(&self) -> usize {
343 self.cursor.source().length()
344 }
345
346 /// Gets a single character at the specified byte offset.
347 #[inline]
348 pub fn get_char_at(&self, offset: usize) -> Option<char> {
349 self.cursor.source().get_char_at(offset)
350 }
351
352 /// Peeks at the next byte without advancing.
353 #[inline]
354 pub fn peek_byte(&mut self) -> Option<u8> {
355 self.cursor.peek_byte()
356 }
357
358 /// Advances the cursor by one byte and returns it.
359 #[inline]
360 pub fn advance_byte(&mut self) -> Option<u8> {
361 self.cursor.advance_byte()
362 }
363
364 /// Advances the cursor while the byte predicate is true.
365 #[inline]
366 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
367 self.cursor.take_while_byte(pred)
368 }
369
370 /// Skips common ASCII whitespace using SIMD if possible.
371 #[inline]
372 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
373 self.cursor.skip_ascii_whitespace()
374 }
375
376 /// Skips all ASCII digits at the current position.
377 #[inline]
378 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
379 self.cursor.skip_ascii_digits()
380 }
381
382 /// Skips all characters that can continue an ASCII identifier.
383 #[inline]
384 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
385 self.cursor.skip_ascii_ident_continue()
386 }
387
388 /// Skips all characters until the target byte is encountered.
389 #[inline]
390 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
391 self.cursor.skip_until(target)
392 }
393
394 /// Scans an ASCII identifier (starts with alpha/_, continues with alphanumeric/_).
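    ///
    /// A usage sketch, assuming the `SimpleLanguage` setup from the other examples
    /// in this module (with its `Identifier` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("hello world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_ascii_identifier(SimpleToken::Identifier));
    /// assert_eq!(state.get_tokens()[0].span, Range { start: 0, end: 5 });
    /// ```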
395 #[inline]
396 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
397 let start = self.get_position();
398 if let Some(b) = self.peek_byte() {
399 if b == b'_' || b.is_ascii_alphabetic() {
400 self.advance_byte();
401 self.skip_ascii_ident_continue();
402 self.add_token(kind, start, self.get_position());
403 return true;
404 }
405 }
406 false
407 }
408
409 /// Scans a line comment starting with the given prefix.
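    ///
    /// A usage sketch, assuming the `SimpleLanguage` setup from the other examples
    /// (with a hypothetical `Comment` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("// note\nnext");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_line_comment(SimpleToken::Comment, "//"));
    /// // The comment token ends at the newline, which is not consumed.
    /// assert_eq!(state.get_position(), 7);
    /// ```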
410 #[inline]
411 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
412 let start = self.get_position();
413 if self.consume_if_starts_with(prefix) {
414 self.skip_until(b'\n');
415 self.add_token(kind, start, self.get_position());
416 return true;
417 }
418 false
419 }
420
421 /// Scans a block comment with given start and end sequences.
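    ///
    /// A usage sketch with C-style delimiters, assuming the `SimpleLanguage` setup
    /// from the other examples (with a hypothetical `Comment` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("/* note */ rest");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_block_comment(SimpleToken::Comment, "/*", "*/"));
    /// // The token spans the whole comment, including the closing delimiter.
    /// assert_eq!(state.get_position(), 10);
    /// ```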
422 #[inline]
423 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
424 let start = self.get_position();
425 if self.consume_if_starts_with(start_seq) {
426 while let Some(_b) = self.peek_byte() {
427 self.skip_until(end_seq.as_bytes()[0]);
428 if self.consume_if_starts_with(end_seq) {
429 self.add_token(kind, start, self.get_position());
430 return true;
431 }
432 self.advance_byte();
433 }
434 // Unclosed block comment is still a comment in many languages,
435 // but we might want to add an error here in the future.
436 self.add_token(kind, start, self.get_position());
437 return true;
438 }
439 false
440 }
441
442 /// Gets a reference to the tokens collected so far.
443 ///
444 /// # Returns
445 ///
446 /// A slice of tokens collected during the lexing process
447 #[inline]
448 pub fn tokens(&self) -> &[Token<L::TokenType>] {
449 &self.tokens
450 }
451
452 /// Sets the current position to the specified byte offset.
453 ///
454 /// # Arguments
455 ///
456 /// * `offset` - The new byte offset position
457 ///
458 /// # Returns
459 ///
460 /// The previous byte offset position
461 #[inline]
462 pub fn set_position(&mut self, offset: usize) -> usize {
463 self.cursor.set_position(offset)
464 }
465
466 /// Returns a reference to the underlying source.
467 pub fn source(&self) -> &'s S {
468 self.cursor.source()
469 }
470
471 /// Returns the text in the specified range.
472 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
473 self.cursor.source().get_text_in(range)
474 }
475
476 /// Returns the text from the specified offset to the end.
477 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
478 self.cursor.source().get_text_from(offset)
479 }
480
481 /// Checks if the source starts with the given pattern at the current position.
482 pub fn starts_with(&mut self, pattern: &str) -> bool {
483 self.cursor.starts_with(pattern)
484 }
485
486 /// Consumes the pattern if it exists at the current position.
487 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
488 self.cursor.consume_if_starts_with(pattern)
489 }
490
491 /// Gets the tokens collected so far in the lexer state.
492 ///
493 /// # Returns
494 ///
495 /// A slice of tokens collected during lexing
496 #[inline]
497 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
498 &self.tokens
499 }
500
501 /// Adds an error to the lexer state.
502 ///
503 /// # Arguments
504 ///
505 /// * `error` - The error to add to the diagnostics
506 #[inline]
507 pub fn add_error(&mut self, error: impl Into<OakError>) {
508 self.errors.push(error.into());
509 }
510
511 /// Adds a token to the lexer state.
512 ///
513 /// # Arguments
514 ///
515 /// * `kind` - The kind of the token
516 /// * `start` - The starting byte offset of the token
517 /// * `end` - The ending byte offset of the token
518 #[inline]
519 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
520 self.tokens.push(Token { kind, span: Range { start, end } });
521 }
522
523 /// Adds an end-of-file token to the lexer state.
524 ///
525 /// This method creates and adds an END_OF_STREAM token at the current position.
526 /// It's typically called when the lexer reaches the end of the source text
527 /// to mark the termination of the token stream.
528 ///
529 /// # Examples
530 ///
531 /// ```ignore
532 /// #![feature(new_range_api)]
533 /// # use core::range::Range;
534 /// # use oak_core::lexer::{LexerState, Token};
535 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
536 /// #
537 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
538 /// # enum SimpleToken {
539 /// # End,
540 /// # }
541 /// #
542 /// # impl TokenType for SimpleToken {
543 /// # const END_OF_STREAM: Self = SimpleToken::End;
544 /// # type Role = UniversalTokenRole;
545 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
546 /// # }
547 /// #
548 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
549 /// # enum SimpleElement {}
550 /// #
551 /// # impl ElementType for SimpleElement {
552 /// # type Role = UniversalElementRole;
553 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
554 /// # }
555 /// #
556 /// # #[derive(Clone)]
557 /// # struct SimpleLanguage;
558 /// #
559 /// # impl Language for SimpleLanguage {
560 /// # const NAME: &'static str = "simple";
561 /// # type TokenType = SimpleToken;
562 /// # type ElementType = SimpleElement;
563 /// # type TypedRoot = ();
564 /// # }
565 /// #
566 /// let source = SourceText::new("test");
567 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
568 /// state.take_while(|_| true); // Advance to end
569 /// state.add_eof();
570 ///
571 /// assert_eq!(state.tokens().len(), 1);
572 /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
573 /// ```
574 #[inline]
575 pub fn add_eof(&mut self) {
576 let end = self.get_position();
577 self.add_token(L::TokenType::END_OF_STREAM, end, end);
578 }
579
580 /// Gets the current character at the current position.
581 ///
582 /// # Returns
583 ///
584 /// The current character, or `None` if at the end of the source
585 #[inline]
586 pub fn current(&mut self) -> Option<char> {
587 self.cursor.peek_char()
588 }
589
590 /// Peeks at the next character without advancing the position.
591 ///
592 /// # Returns
593 ///
594 /// The next character, or `None` if at the end of the source
595 #[inline]
596 pub fn peek(&mut self) -> Option<char> {
597 self.cursor.peek_char()
598 }
599
600 /// Peeks at the character n positions ahead without advancing the position.
601 ///
602 /// # Arguments
603 ///
604 /// * `n` - The number of characters to peek ahead
605 ///
606 /// # Returns
607 ///
608 /// The character n positions ahead, or `None` if beyond the end of the source
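    ///
    /// # Examples
    ///
    /// A sketch, assuming the `SimpleLanguage` setup from the other examples:
    ///
    /// ```ignore
    /// let source = SourceText::new("abc");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert_eq!(state.peek(), Some('a'));
    /// assert_eq!(state.peek_next_n(1), Some('b'));
    /// assert_eq!(state.peek_next_n(5), None);
    /// ```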
609 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
610 if n == 0 {
611 return self.peek();
612 }
613
614 // Fast path: check current chunk
615 let rest = self.cursor.rest();
616 if let Some(ch) = rest.chars().nth(n) {
617 return Some(ch);
618 }
619
620 // Slow path: cross chunk
621 let mut count = 0;
622 let mut offset = self.cursor.position();
623 let end = self.get_length();
624
625 while offset < end {
626 let chunk = self.source().chunk_at(offset);
627 let text = chunk.slice_from(offset);
628 for ch in text.chars() {
629 if count == n {
630 return Some(ch);
631 }
632 count += 1;
633 }
634 offset = chunk.end();
635 }
636
637 None
638 }
639
640 /// Advances the position by the specified number of bytes.
641 ///
642 /// This method moves the lexer's current position forward by the specified
643 /// number of bytes. It's commonly used after recognizing a token to move
644 /// past the token's characters.
645 ///
646 /// # Arguments
647 ///
648 /// * `length` - The number of bytes to advance
649 ///
650 /// # Returns
651 ///
652 /// The new byte offset position after advancing
653 #[inline]
654 pub fn advance(&mut self, length: usize) -> usize {
655 self.cursor.advance_bytes(length)
656 }
657
658 /// Advances the position by the current character's length.
659 ///
660 /// # Returns
661 ///
662 /// The character that was skipped, or `None` if at the end of the source
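    ///
    /// A sketch, assuming the `SimpleLanguage` setup from the other examples:
    ///
    /// ```ignore
    /// let source = SourceText::new("héllo");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert_eq!(state.bump(), Some('h'));
    /// assert_eq!(state.bump(), Some('é')); // advances the position by two bytes
    /// assert_eq!(state.get_position(), 3);
    /// ```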
663 #[inline]
664 pub fn bump(&mut self) -> Option<char> {
665 let ch = self.peek()?;
666 self.advance(ch.len_utf8());
667 Some(ch)
668 }
669
670 /// Advances the position by the token's length and adds the token to the lexer state.
671 ///
672 /// This method combines two common operations: advancing the lexer position
673 /// and adding a token to the token list. It calculates the advance distance
674 /// from the token's span, ensuring consistent positioning.
675 ///
676 /// # Arguments
677 ///
678 /// * `token` - The token to add to the lexer state
679 ///
680 /// # Returns
681 ///
682 /// The new byte offset position after advancing
683 ///
684 /// # Examples
685 ///
686 /// ```ignore
687 /// #![feature(new_range_api)]
688 /// # use core::range::Range;
689 /// # use oak_core::lexer::{LexerState, Token};
690 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
692 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
693 /// # enum SimpleToken { Identifier, End }
694 /// #
695 /// # impl TokenType for SimpleToken {
696 /// # const END_OF_STREAM: Self = SimpleToken::End;
697 /// # type Role = UniversalTokenRole;
698 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
699 /// # }
700 /// #
701 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
702 /// # enum SimpleElement {}
703 /// #
704 /// # impl ElementType for SimpleElement {
705 /// # type Role = UniversalElementRole;
706 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
707 /// # }
708 /// #
709 /// # #[derive(Clone)]
710 /// # struct SimpleLanguage;
711 /// #
712 /// # impl Language for SimpleLanguage {
713 /// # const NAME: &'static str = "simple";
714 /// # type TokenType = SimpleToken;
715 /// # type ElementType = SimpleElement;
716 /// # type TypedRoot = ();
717 /// # }
718 /// #
719 /// let source = SourceText::new("hello world");
720 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
721 ///
722 /// // Create a token for "hello"
723 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
724 ///
725 /// // Initially at position 0
726 /// assert_eq!(state.get_position(), 0);
727 ///
728 /// // Advance and add the token
729 /// let new_pos = state.advance_with(token);
730 ///
731 /// // Now at position 5 and token is added
732 /// assert_eq!(new_pos, 5);
733 /// assert_eq!(state.get_position(), 5);
734 /// assert_eq!(state.get_tokens().len(), 1);
735 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
736 /// ```
737 ///
738 /// # Note
739 ///
740 /// The caller must ensure that the token's span is valid and that the advance
741 /// does not split multi-byte UTF-8 characters. The token should be created
742 /// with proper character boundaries.
743 #[inline]
744 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
745 self.cursor.advance_bytes(token.length());
746 self.tokens.push(token);
747 self.cursor.position()
748 }
749
750 /// Consumes characters while the predicate returns true, returning the consumed range.
751 ///
752 /// This method iterates through the source text from the current position,
753 /// consuming characters as long as the predicate function returns true.
754 /// It's commonly used for recognizing patterns like identifiers, numbers,
755 /// or whitespace sequences.
756 ///
757 /// # Arguments
758 ///
759 /// * `pred` - A closure that takes a character and returns true if the character
760 /// should be consumed, false otherwise
761 ///
762 /// # Returns
763 ///
764 /// A byte range representing the span of consumed characters
765 ///
766 /// # Examples
767 ///
768 /// ```ignore
769 /// #![feature(new_range_api)]
770 /// # use core::range::Range;
771 /// # use oak_core::lexer::{LexerState, Token};
772 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
774 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
775 /// # enum SimpleToken { End }
776 /// #
777 /// # impl TokenType for SimpleToken {
778 /// # const END_OF_STREAM: Self = SimpleToken::End;
779 /// # type Role = UniversalTokenRole;
780 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
781 /// # }
782 /// #
783 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
784 /// # enum SimpleElement {}
785 /// #
786 /// # impl ElementType for SimpleElement {
787 /// # type Role = UniversalElementRole;
788 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
789 /// # }
790 /// #
791 /// # #[derive(Clone)]
792 /// # struct SimpleLanguage;
793 /// #
794 /// # impl Language for SimpleLanguage {
795 /// # const NAME: &'static str = "simple";
796 /// # type TokenType = SimpleToken;
797 /// # type ElementType = SimpleElement;
798 /// # type TypedRoot = ();
799 /// # }
800 /// #
801 /// let source = SourceText::new("hello123world");
802 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
803 ///
804 /// // Consume alphabetic characters
805 /// let range = state.take_while(|c| c.is_alphabetic());
806 ///
807 /// // Should have consumed "hello"
808 /// assert_eq!(range, Range { start: 0, end: 5 });
809 /// assert_eq!(state.get_position(), 5);
810 ///
811 /// // Consume numeric characters
812 /// let range = state.take_while(|c| c.is_numeric());
813 ///
814 /// // Should have consumed "123"
815 /// assert_eq!(range, Range { start: 5, end: 8 });
816 /// assert_eq!(state.get_position(), 8);
817 /// ```
818 ///
819 /// # Performance Note
820 ///
821 /// This method operates on a character-by-character basis, which means it
822 /// correctly handles multi-byte UTF-8 characters. For performance-critical
823 /// code, consider using byte-based methods when working with ASCII-only text.
824 pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
825 let start = self.cursor.position();
826 while let Some(ch) = self.peek() {
827 if pred(ch) {
828 self.advance(ch.len_utf8());
829 }
830 else {
831 break;
832 }
833 }
834 Range { start, end: self.cursor.position() }
835 }
836
837 /// Checks if the lexer has not reached the end of the source text.
838 ///
839 /// # Returns
840 ///
841 /// `true` if not at the end of the source, `false` otherwise
842 #[inline]
843 pub fn not_at_end(&self) -> bool {
844 self.cursor.position() < self.cursor.source().length()
845 }
846
847 /// Performs a safety check to prevent infinite loops during lexing.
848 ///
849 /// This method ensures that the lexer always makes progress by forcing
850 /// advancement when stuck at the same position. It's used as a safeguard
851 /// against infinite loops in lexer implementations.
852 ///
853 /// The method compares the current position with a previously saved "safe point"
854 /// position. If they're the same, it means the lexer hasn't made progress since
855 /// that safe point, potentially indicating an infinite loop. In this case, the
856 /// method forces advancement by at least one character.
857 ///
858 /// # Arguments
859 ///
860 /// * `safe_point` - The position to check against for potential deadlock
861 ///
862 /// # Examples
863 ///
864 /// ```ignore
865 /// #![feature(new_range_api)]
866 /// # use oak_core::lexer::{LexerState, Token};
867 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
869 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
870 /// # enum SimpleToken { End }
871 /// #
872 /// # impl TokenType for SimpleToken {
873 /// # const END_OF_STREAM: Self = SimpleToken::End;
874 /// # type Role = UniversalTokenRole;
875 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
876 /// # }
877 /// #
878 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
879 /// # enum SimpleElement {}
880 /// #
881 /// # impl ElementType for SimpleElement {
882 /// # type Role = UniversalElementRole;
883 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
884 /// # }
885 /// #
886 /// # struct SimpleLanguage;
887 /// #
888 /// # impl Language for SimpleLanguage {
889 /// # const NAME: &'static str = "simple";
890 /// # type TokenType = SimpleToken;
891 /// # type ElementType = SimpleElement;
892 /// # type TypedRoot = ();
893 /// # }
894 /// #
895 /// let source = SourceText::new("test");
896 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
897 ///
898 /// // Save the current position as a safe point
899 /// let safe_point = state.get_position();
900 ///
901 /// // In a real lexer, you would do some processing here
902 /// // If something went wrong and we didn't advance, this would prevent infinite loop
903 /// state.advance_if_dead_lock(safe_point);
904 ///
905 /// // If we were stuck, we would have advanced by at least 1
906 /// assert!(state.get_position() >= safe_point);
907 /// ```
908 ///
909 /// # Usage in Lexer Implementations
910 ///
911 /// This method is typically used at the beginning or end of lexing loops:
912 ///
913 /// ```ignore
914 /// loop {
915 /// let safe_point = state.get_position();
916 ///
917 /// // Try to recognize a token
918 /// if let Some(token) = try_recognize_token(&mut state) {
919 /// // Success, continue loop
920 /// continue;
921 /// }
922 ///
923 /// // If we get here, we didn't recognize anything
924 /// // This prevents infinite loops if recognition fails
925 /// state.advance_if_dead_lock(safe_point);
926 ///
927 /// if state.not_at_end() {
928 /// // Continue trying to recognize tokens
929 /// continue;
930 /// } else {
931 /// // Reached end of source
932 /// break;
933 /// }
934 /// }
935 /// ```
936 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
937 // Force advance if no progress was made
938 if self.cursor.position() == safe_point {
939 if let Some(ch) = self.current() {
940 // Skip current character
941 self.advance(ch.len_utf8());
942 }
943 else {
944 // Advance anyway to prevent infinite loop
945 self.advance(1);
946 }
947 // tracing::warn!("deadlock");
948 }
949 }
950
951 /// Finishes lexing and returns the final output with tokens and diagnostics.
952 ///
953 /// This method concludes the lexing process by converting the collected tokens
954 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
955 /// represents the overall success or failure of the lexing operation.
956 ///
957 /// If the result is `Ok`, the tokens are returned as the successful result.
958 /// If the result is `Err`, the error is returned as the failure result.
959 /// In both cases, any collected diagnostic errors are included in the output.
960 ///
961 /// # Arguments
962 ///
963 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
964 ///
965 /// # Returns
966 ///
967 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
968 ///
969 /// # Examples
970 ///
971 /// ```
972 /// #![feature(new_range_api)]
973 /// # use oak_core::lexer::{LexerState, Token};
974 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
976 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
977 /// # enum SimpleToken { Identifier, End }
978 /// #
979 /// # impl TokenType for SimpleToken {
980 /// # const END_OF_STREAM: Self = SimpleToken::End;
981 /// # type Role = UniversalTokenRole;
982 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
983 /// # }
984 /// #
985 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
986 /// # enum SimpleElement {}
987 /// #
988 /// # impl ElementType for SimpleElement {
989 /// # type Role = UniversalElementRole;
990 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
991 /// # }
992 /// #
993 /// # struct SimpleLanguage;
994 /// #
995 /// # impl Language for SimpleLanguage {
996 /// # const NAME: &'static str = "simple";
997 /// # type TokenType = SimpleToken;
998 /// # type ElementType = SimpleElement;
999 /// # type TypedRoot = ();
1000 /// # }
1001 /// #
1002 /// let source = SourceText::new("test");
1003 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1004 ///
1005 /// // Add some tokens during lexing
1006 /// state.add_token(SimpleToken::Identifier, 0, 4);
1007 ///
1008 /// // Finish with successful result
1009 /// let output = state.finish(Ok(()));
1010 ///
1011 /// // Check the results
1012 /// assert!(output.result.is_ok());
1013 /// assert_eq!(output.result.unwrap().len(), 1);
1014 /// assert_eq!(output.diagnostics.len(), 0);
1015 ///
1016 /// // Example with error
1017 /// let source2 = SourceText::new("test");
1018 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1019 /// state2.add_error(OakError::custom_error("Test error"));
1020 ///
1021 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1022 ///
1023 /// // Check the results
1024 /// assert!(output2.result.is_err());
1025 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1026 /// ```
1027 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1028 match result {
1029 Ok(_) => {
1030 let tokens: Tokens<L> = self.tokens.into();
1031 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1032 }
1033 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1034 }
1035 }
1036
1037 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1038 ///
1039 /// This method is similar to `finish` but additionally updates the incremental cache
1040 /// with the new tokens. It's used for incremental lexing where the results need to
1041 /// be cached for future reuse when the source text changes.
1042 ///
    /// The method first creates the output in the same way as `finish`, then stores
    /// it in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
1046 ///
1047 /// # Arguments
1048 ///
1049 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1050 /// * `cache` - The incremental cache to update with the new tokens
1051 ///
1052 /// # Returns
1053 ///
1054 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1055 ///
1056 /// # Examples
1057 ///
1058 /// ```ignore
1059 /// #![feature(new_range_api)]
1060 /// # use core::range::Range;
1061 /// # use oak_core::lexer::{LexerState, Token};
1062 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1063 /// # use oak_core::parser::session::ParseSession;
1064 /// #
1065 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1066 /// # enum SimpleToken { Identifier, End }
1067 /// #
1068 /// # impl TokenType for SimpleToken {
1069 /// # const END_OF_STREAM: Self = SimpleToken::End;
1070 /// # type Role = UniversalTokenRole;
1071 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1072 /// # }
1073 /// #
1074 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1075 /// # enum SimpleElement {}
1076 /// #
1077 /// # impl ElementType for SimpleElement {
1078 /// # type Role = UniversalElementRole;
1079 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1080 /// # }
1081 /// #
1082 /// # struct SimpleLanguage;
1083 /// #
1084 /// # impl Language for SimpleLanguage {
1085 /// # const NAME: &'static str = "simple";
1086 /// # type TokenType = SimpleToken;
1087 /// # type ElementType = SimpleElement;
1088 /// # type TypedRoot = ();
1089 /// # }
1090 /// #
1091 /// let source = SourceText::new("test");
1092 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1093 ///
1094 /// // Create a cache for incremental lexing
1095 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1096 ///
1097 /// // Add some tokens during lexing
1098 /// state.add_token(SimpleToken::Identifier, 0, 4);
1099 ///
1100 /// // Finish with cache update
1101 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1102 ///
1103 /// // Check the results
1104 /// assert!(output.result.is_ok());
1105 /// assert_eq!(output.result.unwrap().len(), 1);
1106 /// ```
1107 ///
1108 /// # Incremental Lexing Workflow
1109 ///
1110 /// This method is typically used as part of an incremental lexing workflow:
1111 ///
1112 /// ```ignore
1113 /// // First lexing
1114 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1115 /// // ... lexing logic ...
1116 /// let output = state.finish_with_cache(Ok(()), cache);
1117 ///
1118 /// // Later, when source changes
1119 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1120 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1121 /// // ... lexing logic (reusing unchanged tokens) ...
1122 /// let output = state.finish_with_cache(Ok(()), cache);
1123 /// ```
1124 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1125 let out = self.finish(result);
1126 cache.set_lex_output(out.clone());
1127 out
1128 }
1129}