oak_core/lexer/mod.rs
#![doc = include_str!("readme.md")]

use crate::{
    Language, TextEdit, TokenType,
    errors::{OakDiagnostics, OakError},
    source::{Source, SourceCursor},
};
pub use core::range::Range;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use triomphe::Arc;

/// Utilities for scanning comments.
pub mod scan_comment;
/// Utilities for scanning identifiers.
pub mod scan_identifier;
/// Utilities for scanning numbers.
pub mod scan_number;
/// Utilities for scanning string literals.
pub mod scan_string;
/// Utilities for scanning whitespace.
pub mod scan_white_space;

pub use scan_comment::CommentConfig;
pub use scan_string::StringConfig;
pub use scan_white_space::WhitespaceConfig;

/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens. Diagnostic messages produced during
/// lexing are carried separately by [`LexOutput`].
pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;

/// Output type for lexical analysis operations, including diagnostics.
pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;

/// Trait for tokenizing source code into sequences of tokens.
///
/// This trait defines the interface for converting source text into a sequence of
/// tokens that can be consumed by the parser. Implementations should handle
/// the specific lexical rules of their target language.
///
/// # Examples
///
/// ```ignore
/// struct MyLexer;
///
/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
/// enum MyToken {
///     Number,
///     Identifier,
///     End,
/// }
///
/// impl TokenType for MyToken {
///     const END_OF_STREAM: Self = MyToken::End;
///     type Role = UniversalTokenRole;
///     fn role(&self) -> Self::Role { UniversalTokenRole::None }
/// }
///
/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
/// enum MyElement {}
///
/// impl ElementType for MyElement {
///     type Role = UniversalElementRole;
///     fn role(&self) -> Self::Role { UniversalElementRole::None }
/// }
///
/// struct MyLanguage;
///
/// impl Language for MyLanguage {
///     const NAME: &'static str = "my-language";
///     type TokenType = MyToken;
///     type ElementType = MyElement;
///     type TypedRoot = ();
/// }
///
/// impl Lexer<MyLanguage> for MyLexer {
///     fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
///         // Tokenization logic here
///         todo!()
///     }
/// }
/// ```
pub trait Lexer<L: Language + Send + Sync> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs lexical analysis of the source text. When the
    /// cache is empty, it lexes from scratch; otherwise implementations may
    /// reuse cached tokens that are unaffected by the given edits.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the previous lex (empty for a full lex)
    /// * `cache` - The cache holding results from previous lexical analysis
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
}

/// Cache trait for lexical results.
///
/// This trait defines the interface for caching and accessing lexical analysis results.
/// It provides methods for storing and retrieving token information from previous
/// lexical analysis operations.
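///
/// # Examples
///
/// A minimal sketch of a single-slot cache. `MyLanguage` and `MyToken` are
/// placeholder types for illustration, not part of this crate:
///
/// ```ignore
/// #[derive(Default)]
/// struct SimpleCache {
///     last: Option<LexOutput<MyLanguage>>,
/// }
///
/// impl LexerCache<MyLanguage> for SimpleCache {
///     fn set_lex_output(&mut self, output: LexOutput<MyLanguage>) {
///         self.last = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<MyToken>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<MyToken>]> {
///         // Only a successful previous lex yields reusable tokens.
///         self.last.as_ref()?.result.as_ref().ok().map(|arc| &**arc)
///     }
/// }
/// ```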
#[allow(unused_variables)]
pub trait LexerCache<L: Language> {
    /// Sets the lexed output in the cache.
    ///
    /// # Arguments
    ///
    /// * `output` - The output from lexical analysis, including tokens and diagnostics
    fn set_lex_output(&mut self, output: LexOutput<L>);

    /// Gets a token from the cache by index.
    ///
    /// # Arguments
    ///
    /// * `index` - The index of the token to retrieve
    ///
    /// # Returns
    ///
    /// An `Option<Token<L::TokenType>>` containing the token if it exists,
    /// or `None` if the index is out of bounds or no tokens are cached
    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;

    /// Gets the total number of tokens in the cache.
    ///
    /// # Returns
    ///
    /// The number of cached tokens, or 0 if no tokens are cached
    fn count_tokens(&self) -> usize;

    /// Checks if the cache contains any tokens.
    ///
    /// # Returns
    ///
    /// `true` if the cache contains tokens, `false` otherwise
    fn has_tokens(&self) -> bool;

    /// Gets all cached tokens as a slice.
    ///
    /// # Returns
    ///
    /// An optional slice of tokens if available.
    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        None
    }
}

impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
    fn set_lex_output(&mut self, output: LexOutput<L>) {
        (**self).set_lex_output(output)
    }

    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
        (**self).get_token(index)
    }

    fn count_tokens(&self) -> usize {
        (**self).count_tokens()
    }

    fn has_tokens(&self) -> bool {
        (**self).has_tokens()
    }

    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        (**self).get_tokens()
    }
}

/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text with their position information.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
    pub span: Range<usize>,
}

impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
    #[inline]
    pub fn length(&self) -> usize {
        self.span.end - self.span.start
    }
}

/// A stream of tokens with associated source text.
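///
/// # Examples
///
/// A minimal construction sketch; the `&'static str` token kind stands in
/// for a real `TokenType` implementation:
///
/// ```ignore
/// let tokens: Arc<[Token<&str>]> = Arc::from_iter([Token { kind: "ident", span: Range { start: 0, end: 5 } }]);
/// let stream = TokenStream { raw: String::from("hello"), tokens };
/// assert_eq!(stream.tokens.len(), 1);
/// assert_eq!(&stream.raw[stream.tokens[0].span.start..stream.tokens[0].span.end], "hello");
/// ```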
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
pub struct TokenStream<K: Copy> {
    /// The raw source text.
    pub raw: String,
    /// The tokens extracted from the source text.
    #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
    pub tokens: Arc<[Token<K>]>,
}

#[cfg(feature = "serde")]
mod arc_slice_serde {
    use super::*;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
    where
        K: Serialize,
        S: Serializer,
    {
        arc.as_ref().serialize(serializer)
    }

    pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
    where
        K: Deserialize<'de>,
        D: Deserializer<'de>,
    {
        let vec = Vec::<Token<K>>::deserialize(deserializer)?;
        Ok(Arc::from_iter(vec))
    }
}

/// State information for incremental lexical analysis.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    pub(crate) cursor: SourceCursor<'s, S>,
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    pub(crate) errors: Vec<OakError>,
}

impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
    /// Creates a new lexer state with the given source text.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source
    pub fn new(source: &'s S) -> Self {
        Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
    }

    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized with any cached tokens that can be safely reused
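    ///
    /// # Examples
    ///
    /// A usage sketch, assuming `cache` was populated by a previous call to
    /// [`LexerState::finish_with_cache`]:
    ///
    /// ```ignore
    /// // An edit touched nothing before byte offset 10, so tokens that end
    /// // at or before that offset may be reused.
    /// let mut state = LexerState::<_, MyLanguage>::new_with_cache(&source, 10, &cache);
    /// // The cursor resumes at the end of the last reused token.
    /// assert!(state.get_position() <= 10);
    /// ```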
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let len = source.length();
        let relex_from = relex_from.min(len);

        // Fast path: fully re-used
        if relex_from >= len {
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                tokens.extend_from_slice(cached)
            }
            else {
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
        }

        if relex_from == 0 {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let mut reused_tokens = Vec::new();
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // Binary search for the cut-off point since tokens are sorted by position
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Fallback for caches that don't support slice access
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
    }

    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }

    /// Gets the remaining text as a byte slice.
    ///
    /// Useful for byte-oriented scanning operations.
    #[inline]
    pub fn rest_bytes(&mut self) -> &[u8] {
        self.cursor.rest().as_bytes()
    }

    /// Checks if the lexer has consumed all input from the source.
    ///
    /// Returns `true` if the current position is at or beyond the end of the source.
    pub fn fully_reused(&self) -> bool {
        self.cursor.position() >= self.cursor.source().length()
    }

    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text.
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }

    /// Checks if the lexer has NOT consumed all input from the source.
    ///
    /// Returns `true` if there are still bytes left to be scanned.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.cursor.position() < self.cursor.source().length()
    }

    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Peeks at the character immediately following the current character.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }

    /// Peeks at the character at the specified byte offset relative to the current position.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }

    /// Advances the cursor by the specified number of bytes.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }

    /// Gets the total length of the source text in bytes.
    #[inline]
    pub fn get_length(&self) -> usize {
        self.cursor.source().length()
    }

    /// Gets a single character at the specified absolute byte offset.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }

    /// Peeks at the next byte without advancing the cursor.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }

    /// Advances the cursor by one byte and returns it.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }

    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
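    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` is a `LexerState` positioned at the start
    /// of the text `"42abc"`:
    ///
    /// ```ignore
    /// let range = state.take_while_byte(|b| b.is_ascii_digit());
    /// assert_eq!(range, Range { start: 0, end: 2 }); // consumed "42"
    /// ```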
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }

    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace.
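    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"  \t x"`:
    ///
    /// ```ignore
    /// let range = state.skip_ascii_whitespace();
    /// assert_eq!(range, Range { start: 0, end: 4 });
    /// assert_eq!(state.peek(), Some('x'));
    /// ```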
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }

    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits.
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }

    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }

    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed.
    /// Returns the range of the skipped characters.
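    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"abc\ndef"`:
    ///
    /// ```ignore
    /// let range = state.skip_until(b'\n');
    /// assert_eq!(range, Range { start: 0, end: 3 });
    /// assert_eq!(state.peek(), Some('\n')); // the target byte is not consumed
    /// ```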
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }

    /// Scans an ASCII identifier.
    ///
    /// An identifier must start with an alphabetic character or an underscore,
    /// and can be followed by any number of alphanumeric characters or underscores.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type to assign if an identifier is found.
    ///
    /// # Returns
    ///
    /// `true` if an identifier was successfully scanned and added.
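    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"foo_1 bar"`
    /// and `MyToken::Identifier` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_ascii_identifier(MyToken::Identifier));
    /// // The identifier token covers "foo_1".
    /// assert_eq!(state.get_tokens().last().unwrap().span, Range { start: 0, end: 5 });
    /// ```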
    #[inline]
    pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
        let start = self.get_position();
        if let Some(b) = self.peek_byte() {
            if b == b'_' || b.is_ascii_alphabetic() {
                self.advance_byte();
                self.skip_ascii_ident_continue();
                self.add_token(kind, start, self.get_position());
                return true;
            }
        }
        false
    }

    /// Scans a line comment starting with the given prefix.
    ///
    /// Consumes the prefix and all characters until the next newline or EOF.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the line comment.
    /// * `prefix` - The string sequence that starts the comment (e.g., "//").
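    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"// hi\nlet"`
    /// and `MyToken::Comment` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_line_comment(MyToken::Comment, "//"));
    /// // The comment token covers "// hi"; the newline is left unconsumed.
    /// assert_eq!(state.get_position(), 5);
    /// ```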
    #[inline]
    pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(prefix) {
            self.skip_until(b'\n');
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }

    /// Scans a block comment with given start and end sequences.
    ///
    /// This implementation is non-recursive and does not track nesting depth;
    /// scanning ends at the first occurrence of `end_seq`.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
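    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"/* hi */ let"`
    /// and `MyToken::Comment` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_block_comment(MyToken::Comment, "/*", "*/"));
    /// assert_eq!(state.get_position(), 8); // cursor is past the closing "*/"
    /// ```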
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                self.advance_byte();
            }
            // Unclosed block comment is still a comment in many languages,
            // but we might want to add an error here in the future.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }

    /// Gets a reference to the tokens collected so far.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during the lexing process.
    #[inline]
    pub fn tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }

    /// Returns a reference to the underlying source.
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }

    /// Returns the text in the specified byte range.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }

    /// Returns the text from the specified byte offset to the end of the source.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }

    /// Checks if the source starts with the given pattern at the current position.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }

    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor.
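    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"->x"`:
    ///
    /// ```ignore
    /// assert!(state.consume_if_starts_with("->"));
    /// assert_eq!(state.get_position(), 2);
    /// assert!(!state.consume_if_starts_with("->")); // no match, cursor unchanged
    /// ```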
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }

    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Adds an error to the lexer state's diagnostics.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }

    /// Adds a token to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `kind` - The kind/type of the token.
    /// * `start` - The starting byte offset.
    /// * `end` - The ending byte offset.
    #[inline]
    pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
        self.tokens.push(Token { kind, span: Range { start, end } });
    }

    /// Adds an end-of-file (EOF) token to the lexer state.
    ///
    /// This method creates and adds an `END_OF_STREAM` token at the current position.
    /// It is typically called when the lexer reaches the end of the source text.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken {
    /// #     End,
    /// # }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// state.take_while(|_| true); // Advance to end
    /// state.add_eof();
    ///
    /// assert_eq!(state.tokens().len(), 1);
    /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
    /// ```
    #[inline]
    pub fn add_eof(&mut self) {
        let end = self.get_position();
        self.add_token(L::TokenType::END_OF_STREAM, end, end)
    }

    /// Gets the current character at the current position.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Advances the position by the current character's length.
    ///
    /// # Returns
    ///
    /// The character that was skipped, or `None` if at the end of the source
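    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"héllo"`:
    ///
    /// ```ignore
    /// assert_eq!(state.bump(), Some('h'));
    /// assert_eq!(state.bump(), Some('é')); // advances by the char's UTF-8 length (2 bytes)
    /// assert_eq!(state.get_position(), 3);
    /// ```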
    #[inline]
    pub fn bump(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.advance(ch.len_utf8());
        Some(ch)
    }

    /// Advances the position by the token's length and adds the token to the lexer state.
    ///
    /// This method combines two common operations: advancing the lexer position
    /// and adding a token to the token list. It calculates the advance distance
    /// from the token's span, ensuring consistent positioning.
    ///
    /// # Arguments
    ///
    /// * `token` - The token to add to the lexer state
    ///
    /// # Returns
    ///
    /// The new byte offset position after advancing
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("hello world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Create a token for "hello"
    /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
    ///
    /// // Initially at position 0
    /// assert_eq!(state.get_position(), 0);
    ///
    /// // Advance and add the token
    /// let new_pos = state.advance_with(token);
    ///
    /// // Now at position 5 and token is added
    /// assert_eq!(new_pos, 5);
    /// assert_eq!(state.get_position(), 5);
    /// assert_eq!(state.get_tokens().len(), 1);
    /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
    /// ```
    ///
    /// # Note
    ///
    /// The caller must ensure that the token's span is valid and that the advance
    /// does not split multi-byte UTF-8 characters. The token should be created
    /// with proper character boundaries.
    #[inline]
    pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
        self.cursor.advance_bytes(token.length());
        self.tokens.push(token);
        self.cursor.position()
    }

    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// This method iterates through the source text from the current position,
    /// consuming characters as long as the predicate function returns true.
    /// It's commonly used for recognizing patterns like identifiers, numbers,
    /// or whitespace sequences.
    ///
    /// # Arguments
    ///
    /// * `pred` - A closure that takes a character and returns true if the character
    ///   should be consumed, false otherwise
    ///
    /// # Returns
    ///
    /// A byte range representing the span of consumed characters
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("hello123world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Consume alphabetic characters
    /// let range = state.take_while(|c| c.is_alphabetic());
    ///
    /// // Should have consumed "hello"
    /// assert_eq!(range, Range { start: 0, end: 5 });
    /// assert_eq!(state.get_position(), 5);
    ///
    /// // Consume numeric characters
    /// let range = state.take_while(|c| c.is_numeric());
    ///
    /// // Should have consumed "123"
    /// assert_eq!(range, Range { start: 5, end: 8 });
    /// assert_eq!(state.get_position(), 8);
    /// ```
    ///
    /// # Performance Note
    ///
    /// This method operates on a character-by-character basis, which means it
    /// correctly handles multi-byte UTF-8 characters. For performance-critical
    /// code, consider using byte-based methods when working with ASCII-only text.
    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
        let start = self.cursor.position();
        while let Some(ch) = self.peek() {
            if pred(ch) { self.advance(ch.len_utf8()) } else { break }
        }
        Range { start, end: self.cursor.position() }
    }

    /// Performs a safety check to prevent infinite loops during lexing.
    ///
    /// This method ensures that the lexer always makes progress by forcing
    /// advancement when stuck at the same position. It's used as a safeguard
    /// against infinite loops in lexer implementations.
    ///
    /// The method compares the current position with a previously saved "safe point"
    /// position. If they're the same, it means the lexer hasn't made progress since
    /// that safe point, potentially indicating an infinite loop. In this case, the
    /// method forces advancement by at least one character.
    ///
    /// # Arguments
    ///
    /// * `safe_point` - The position to check against for potential deadlock
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Save the current position as a safe point
    /// let safe_point = state.get_position();
    ///
    /// // In a real lexer, you would do some processing here
    /// // If something went wrong and we didn't advance, this would prevent infinite loop
    /// state.advance_if_dead_lock(safe_point);
    ///
    /// // If we were stuck, we would have advanced by at least 1
    /// assert!(state.get_position() >= safe_point);
    /// ```
    ///
    /// # Usage in Lexer Implementations
    ///
    /// This method is typically used at the beginning or end of lexing loops:
    ///
    /// ```ignore
    /// loop {
    ///     let safe_point = state.get_position();
    ///
    ///     // Try to recognize a token
    ///     if let Some(token) = try_recognize_token(&mut state) {
    ///         // Success, continue loop
    ///         continue;
    ///     }
    ///
    ///     // If we get here, we didn't recognize anything
    ///     // This prevents infinite loops if recognition fails
    ///     state.advance_if_dead_lock(safe_point);
    ///
    ///     if state.not_at_end() {
    ///         // Continue trying to recognize tokens
    ///         continue;
    ///     } else {
    ///         // Reached end of source
    ///         break;
    ///     }
    /// }
    /// ```
    pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
        // Force advance if no progress was made
        if self.cursor.position() == safe_point {
            if let Some(ch) = self.current() {
                // Skip current character
                self.advance(ch.len_utf8())
            }
            else {
                // Advance anyway to prevent infinite loop
                self.advance(1)
            }
            // tracing::warn!("deadlock")
        }
    }

    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// This method concludes the lexing process by converting the collected tokens
    /// and errors into a `LexOutput` result. It takes a `Result` parameter that
    /// represents the overall success or failure of the lexing operation.
    ///
    /// If the result is `Ok`, the tokens are returned as the successful result.
    /// If the result is `Err`, the error is returned as the failure result.
    /// In both cases, any collected diagnostic errors are included in the output.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Add some tokens during lexing
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// // Finish with successful result
    /// let output = state.finish(Ok(()));
    ///
    /// // Check the results
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// assert_eq!(output.diagnostics.len(), 0);
    ///
    /// // Example with error
    /// let source2 = SourceText::new("test");
    /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
    /// state2.add_error(OakError::custom_error("Test error"));
    ///
    /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
    ///
    /// // Check the results
    /// assert!(output2.result.is_err());
    /// assert_eq!(output2.diagnostics.len(), 1); // The added error
    /// ```
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        match result {
            Ok(_) => {
                let tokens: Tokens<L> = self.tokens.into();
                OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
            }
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }

    /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
    ///
    /// This method is similar to `finish` but additionally updates the incremental cache
    /// with the new tokens. It's used for incremental lexing where the results need to
    /// be cached for future reuse when the source text changes.
    ///
    /// The method first creates the output in the same way as `finish`, then stores it
    /// in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    /// * `cache` - The incremental cache to update with the new tokens
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
    /// # use oak_core::parser::session::ParseSession;
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Create a cache for incremental lexing
    /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
    ///
    /// // Add some tokens during lexing
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// // Finish with cache update
    /// let output = state.finish_with_cache(Ok(()), &mut cache);
    ///
    /// // Check the results
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// ```
    ///
    /// # Incremental Lexing Workflow
    ///
    /// This method is typically used as part of an incremental lexing workflow:
    ///
    /// ```ignore
    /// // First lexing
    /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
    /// // ... lexing logic ...
    /// let output = state.finish_with_cache(Ok(()), cache);
    ///
    /// // Later, when source changes
    /// let relex_from = calculate_min_affected_offset(old_source, new_source);
    /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
    /// // ... lexing logic (reusing unchanged tokens) ...
    /// let output = state.finish_with_cache(Ok(()), cache);
    /// ```
    pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
        let out = self.finish(result);
        cache.set_lex_output(out.clone());
        out
    }
}