oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9#[cfg(feature = "serde")]
10use serde::{Deserialize, Serialize};
11use std::borrow::Cow;
12use triomphe::Arc;
13
14/// Utilities for scanning comments.
15pub mod scan_comment;
16/// Utilities for scanning identifiers.
17pub mod scan_identifier;
18/// Utilities for scanning numbers.
19pub mod scan_number;
20/// Utilities for scanning string literals.
21pub mod scan_string;
22/// Utilities for scanning whitespace.
23pub mod scan_white_space;
24
25pub use scan_comment::CommentConfig;
26pub use scan_string::StringConfig;
27pub use scan_white_space::WhitespaceConfig;
28
/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens produced by the lexer. Diagnostics are
/// carried separately in [`LexOutput`].
pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;
35
36/// Output type for lexical analysis operations, including diagnostics.
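///
/// # Examples
///
/// A sketch of consuming a [`LexOutput`]; the `result` and `diagnostics` field
/// names follow [`OakDiagnostics`], while the `lexer`, `source`, and `cache`
/// values are assumed to exist and errors are assumed to implement `Debug`:
///
/// ```ignore
/// let output: LexOutput<MyLanguage> = lexer.lex(&source, &[], &mut cache);
/// match output.result {
///     Ok(tokens) => println!("lexed {} tokens", tokens.len()),
///     Err(fatal) => eprintln!("lexing failed: {fatal:?}"),
/// }
/// for diagnostic in &output.diagnostics {
///     eprintln!("diagnostic: {diagnostic:?}");
/// }
/// ```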
37pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
38
39/// Trait for tokenizing source code into sequences of tokens.
40///
41/// This trait defines the interface for converting source text into a sequence of
42/// tokens that can be consumed by the parser. Implementations should handle
43/// the specific lexical rules of their target language.
44///
45/// # Examples
46///
47/// ```ignore
48/// struct MyLexer;
49///
50/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
51/// enum MyToken {
52/// Number,
53/// Identifier,
54/// End,
55/// }
56///
57/// impl TokenType for MyToken {
58/// const END_OF_STREAM: Self = MyToken::End;
59/// type Role = UniversalTokenRole;
60/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
61/// }
62///
63/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
64/// enum MyElement {}
65///
66/// impl ElementType for MyElement {
67/// type Role = UniversalElementRole;
68/// fn role(&self) -> Self::Role { UniversalElementRole::None }
69/// }
70///
71/// struct MyLanguage;
72///
73/// impl Language for MyLanguage {
74/// const NAME: &'static str = "my-language";
75/// type TokenType = MyToken;
76/// type ElementType = MyElement;
77/// type TypedRoot = ();
78/// }
79///
80/// impl Lexer<MyLanguage> for MyLexer {
81/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
82/// // Tokenization logic here
83/// todo!()
84/// }
85/// }
86/// ```
87pub trait Lexer<L: Language + Send + Sync + 'static> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs lexical analysis of the source text. Implementations
    /// may reuse tokens from `cache` and re-lex only the regions affected by
    /// `edits`, or simply tokenize from scratch.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the cached result was produced
    ///   (empty for a full lex)
    /// * `cache` - The cache holding results from a previous lexical analysis
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
102}
103
104/// Cache trait for lexical results.
105///
106/// This trait defines the interface for caching and accessing lexical analysis results.
107/// It provides methods for storing and retrieving token information from previous
108/// lexical analysis operations.
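///
/// # Examples
///
/// A minimal sketch of an implementation backed by the last lex output, assuming
/// the `MyLanguage`/`MyToken` types from the [`Lexer`] example:
///
/// ```ignore
/// #[derive(Default)]
/// struct SimpleCache {
///     last: Option<LexOutput<MyLanguage>>,
/// }
///
/// impl LexerCache<MyLanguage> for SimpleCache {
///     fn set_lex_output(&mut self, output: LexOutput<MyLanguage>) {
///         self.last = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<MyToken>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<MyToken>]> {
///         // Expose the tokens of the last successful lex, if any.
///         self.last.as_ref()?.result.as_deref().ok()
///     }
/// }
/// ```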
109#[allow(unused_variables)]
110pub trait LexerCache<L: Language> {
111 /// Sets the lexed output in the cache.
112 ///
113 /// # Arguments
114 ///
115 /// * `output` - The output from lexical analysis, including tokens and diagnostics
116 fn set_lex_output(&mut self, output: LexOutput<L>);
117
118 /// Gets a token from the cache by index.
119 ///
120 /// # Arguments
121 ///
122 /// * `index` - The index of the token to retrieve
123 ///
124 /// # Returns
125 ///
126 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
127 /// or `None` if the index is out of bounds or no tokens are cached
128 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
129
130 /// Gets the total number of tokens in the cache.
131 ///
132 /// # Returns
133 ///
134 /// The number of cached tokens, or 0 if no tokens are cached
135 fn count_tokens(&self) -> usize;
136
137 /// Checks if the cache contains any tokens.
138 ///
139 /// # Returns
140 ///
141 /// `true` if the cache contains tokens, `false` otherwise
142 fn has_tokens(&self) -> bool;
143
144 /// Gets all cached tokens as a slice.
145 ///
146 /// # Returns
147 ///
148 /// An optional slice of tokens if available.
149 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
150 None
151 }
152}
153
154impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
155 fn set_lex_output(&mut self, output: LexOutput<L>) {
156 (**self).set_lex_output(output);
157 }
158
159 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
160 (**self).get_token(index)
161 }
162
163 fn count_tokens(&self) -> usize {
164 (**self).count_tokens()
165 }
166
167 fn has_tokens(&self) -> bool {
168 (**self).has_tokens()
169 }
170
171 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
172 (**self).get_tokens()
173 }
174}
175
/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text together with their position information.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
    pub span: Range<usize>,
}
189
190impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
205 /// ```
206 #[inline]
207 pub fn length(&self) -> usize {
208 self.span.end - self.span.start
209 }
210}
211
212/// A stream of tokens with associated source text.
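///
/// # Examples
///
/// A small sketch of bundling tokens with their raw text (the string token kinds
/// are purely illustrative):
///
/// ```ignore
/// let stream = TokenStream {
///     raw: "let x".to_string(),
///     tokens: Arc::from_iter([
///         Token { kind: "keyword", span: Range { start: 0, end: 3 } },
///         Token { kind: "ident", span: Range { start: 4, end: 5 } },
///     ]),
/// };
/// assert_eq!(stream.tokens.len(), 2);
/// let span = stream.tokens[1].span;
/// assert_eq!(&stream.raw[span.start..span.end], "x");
/// ```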
213#[derive(Debug, Clone)]
214#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
215#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
216pub struct TokenStream<K: Copy> {
217 /// The raw source text.
218 pub raw: String,
219 /// The tokens extracted from the source text.
220 #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
221 pub tokens: Arc<[Token<K>]>,
222}
223
224#[cfg(feature = "serde")]
225mod arc_slice_serde {
226 use super::*;
227 use serde::{Deserialize, Deserializer, Serialize, Serializer};
228
229 pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
230 where
231 K: Serialize,
232 S: Serializer,
233 {
234 arc.as_ref().serialize(serializer)
235 }
236
237 pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
238 where
239 K: Deserialize<'de>,
240 D: Deserializer<'de>,
241 {
242 let vec = Vec::<Token<K>>::deserialize(deserializer)?;
243 Ok(Arc::from_iter(vec))
244 }
245}
246
247/// State information for incremental lexical analysis.
248///
249/// This struct maintains the current position and context during
250/// tokenization, enabling incremental and resumable lexing operations.
251#[derive(Debug)]
252pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
253 pub(crate) cursor: SourceCursor<'s, S>,
254 pub(crate) tokens: Vec<Token<L::TokenType>>,
255 pub(crate) errors: Vec<OakError>,
256}
257
258impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
259 /// Creates a new lexer state with the given source text.
260 ///
261 /// # Arguments
262 ///
263 /// * `source` - The source text to lex
264 ///
265 /// # Returns
266 ///
267 /// A new `LexerState` initialized at the beginning of the source
268 pub fn new(source: &'s S) -> Self {
269 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
270 }
271
272 /// Creates a new lexer state with the given source text and incremental cache.
273 ///
274 /// # Arguments
275 ///
276 /// * `source` - The source text to lex
277 /// * `relex_from` - The minimum byte offset that may have been affected by edits
278 /// (use `source.length()` to indicate no edits)
279 /// * `cache` - The incremental cache containing previous lexing results
280 ///
281 /// # Returns
282 ///
283 /// A new `LexerState` initialized at the beginning of the source with cache support
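    ///
    /// # Examples
    ///
    /// A hedged sketch, assuming a `SimpleLanguage` like the one used in the other
    /// examples and a `cache` filled by a previous [`LexerState::finish_with_cache`]:
    ///
    /// ```ignore
    /// // Only bytes at or after `relex_from` may have changed.
    /// let relex_from = 10;
    /// let mut state = LexerState::<_, SimpleLanguage>::new_with_cache(&source, relex_from, &cache);
    ///
    /// // Cached tokens ending before `relex_from` (minus a small backtrack window)
    /// // are reused, and lexing resumes after the last reused token.
    /// assert!(state.get_position() <= relex_from);
    /// ```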
284 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
285 if !cache.has_tokens() {
286 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
287 }
288
289 let len = source.length();
290 let relex_from = relex_from.min(len);
291
292 // Fast path: fully re-used
293 if relex_from >= len {
294 let mut tokens = Vec::new();
295 if let Some(cached) = cache.get_tokens() {
296 tokens.extend_from_slice(cached);
297 }
298 else {
299 let count = cache.count_tokens();
300 tokens.reserve(count);
301 for i in 0..count {
302 if let Some(t) = cache.get_token(i) {
303 tokens.push(t);
304 }
305 }
306 }
307 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
308 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
309 }
310
311 if relex_from == 0 {
312 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
313 }
314
315 let mut reused_tokens = Vec::new();
316 const BACKTRACK_TOKENS: usize = 1;
317
318 if let Some(cached) = cache.get_tokens() {
319 // Binary search for the cut-off point since tokens are sorted by position
320 let idx = cached.partition_point(|t| t.span.end <= relex_from);
321 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
322 if keep > 0 {
323 reused_tokens.extend_from_slice(&cached[..keep]);
324 }
325 }
326 else {
327 // Fallback for caches that don't support slice access
328 let count = cache.count_tokens();
329 for i in 0..count {
330 let Some(token) = cache.get_token(i)
331 else {
332 break;
333 };
334 if token.span.end <= relex_from {
335 reused_tokens.push(token);
336 }
337 else {
338 break;
339 }
340 }
341 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
342 reused_tokens.truncate(keep);
343 }
344
345 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
346 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
347 }
348
349 /// Gets the remaining text from the current position to the end of the source.
350 ///
351 /// # Returns
352 ///
353 /// A string slice containing the remaining text
354 pub fn rest(&mut self) -> &str {
355 self.cursor.rest()
356 }
357
358 /// Gets the remaining text as a byte slice.
359 #[inline]
360 pub fn rest_bytes(&mut self) -> &[u8] {
361 self.cursor.rest().as_bytes()
362 }
363
364 /// Checks if the lexer has consumed all input from the source.
365 pub fn fully_reused(&self) -> bool {
366 self.cursor.position() >= self.cursor.source().length()
367 }
368
369 /// Gets the current byte offset position in the source text.
370 ///
371 /// # Returns
372 ///
373 /// The current byte offset from the start of the source text
374 #[inline]
375 pub fn get_position(&self) -> usize {
376 self.cursor.position()
377 }
378
379 /// Checks if the lexer has NOT consumed all input from the source.
380 #[inline]
381 pub fn not_at_end(&self) -> bool {
382 self.cursor.position() < self.cursor.source().length()
383 }
384
385 /// Peeks at the next character without advancing.
386 #[inline]
387 pub fn peek(&mut self) -> Option<char> {
388 self.cursor.peek_char()
389 }
390
391 /// Peeks at the character at the specified byte offset relative to the current position.
392 #[inline]
393 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
394 self.cursor.peek_next_n(n)
395 }
396
397 /// Advances the cursor by the specified number of bytes.
398 #[inline]
399 pub fn advance(&mut self, len: usize) {
400 self.cursor.advance_bytes(len);
401 }
402
403 /// Gets the total length of the source text in bytes.
404 #[inline]
405 pub fn get_length(&self) -> usize {
406 self.cursor.source().length()
407 }
408
409 /// Gets a single character at the specified byte offset.
410 #[inline]
411 pub fn get_char_at(&self, offset: usize) -> Option<char> {
412 self.cursor.source().get_char_at(offset)
413 }
414
415 /// Peeks at the next byte without advancing.
416 #[inline]
417 pub fn peek_byte(&mut self) -> Option<u8> {
418 self.cursor.peek_byte()
419 }
420
421 /// Advances the cursor by one byte and returns it.
422 #[inline]
423 pub fn advance_byte(&mut self) -> Option<u8> {
424 self.cursor.advance_byte()
425 }
426
427 /// Advances the cursor while the byte predicate is true.
428 #[inline]
429 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
430 self.cursor.take_while_byte(pred)
431 }
432
433 /// Skips common ASCII whitespace using SIMD if possible.
434 #[inline]
435 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
436 self.cursor.skip_ascii_whitespace()
437 }
438
439 /// Skips all ASCII digits at the current position.
440 #[inline]
441 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
442 self.cursor.skip_ascii_digits()
443 }
444
445 /// Skips all characters that can continue an ASCII identifier.
446 #[inline]
447 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
448 self.cursor.skip_ascii_ident_continue()
449 }
450
451 /// Skips all characters until the target byte is encountered.
452 #[inline]
453 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
454 self.cursor.skip_until(target)
455 }
456
457 /// Scans an ASCII identifier (starts with alpha/_, continues with alphanumeric/_).
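    ///
    /// # Examples
    ///
    /// A sketch assuming a `SimpleLanguage` whose token type has an `Identifier` variant:
    ///
    /// ```ignore
    /// let source = SourceText::new("foo_1 bar");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_ascii_identifier(SimpleToken::Identifier));
    /// // "foo_1" was consumed and recorded as a single token.
    /// assert_eq!(state.get_position(), 5);
    /// assert_eq!(state.get_tokens().len(), 1);
    /// ```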
458 #[inline]
459 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
460 let start = self.get_position();
461 if let Some(b) = self.peek_byte() {
462 if b == b'_' || b.is_ascii_alphabetic() {
463 self.advance_byte();
464 self.skip_ascii_ident_continue();
465 self.add_token(kind, start, self.get_position());
466 return true;
467 }
468 }
469 false
470 }
471
472 /// Scans a line comment starting with the given prefix.
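    ///
    /// # Examples
    ///
    /// A sketch assuming a `SimpleLanguage` whose token type has a `Comment` variant:
    ///
    /// ```ignore
    /// let source = SourceText::new("// hello\nnext");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_line_comment(SimpleToken::Comment, "//"));
    /// // The comment token covers "// hello"; the newline itself is not consumed.
    /// assert_eq!(state.get_position(), 8);
    /// ```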
473 #[inline]
474 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
475 let start = self.get_position();
476 if self.consume_if_starts_with(prefix) {
477 self.skip_until(b'\n');
478 self.add_token(kind, start, self.get_position());
479 return true;
480 }
481 false
482 }
483
484 /// Scans a block comment with given start and end sequences.
485 #[inline]
486 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
487 let start = self.get_position();
488 if self.consume_if_starts_with(start_seq) {
489 while let Some(_b) = self.peek_byte() {
490 self.skip_until(end_seq.as_bytes()[0]);
491 if self.consume_if_starts_with(end_seq) {
492 self.add_token(kind, start, self.get_position());
493 return true;
494 }
495 self.advance_byte();
496 }
497 // Unclosed block comment is still a comment in many languages,
498 // but we might want to add an error here in the future.
499 self.add_token(kind, start, self.get_position());
500 return true;
501 }
502 false
503 }
504
505 /// Gets a reference to the tokens collected so far.
506 ///
507 /// # Returns
508 ///
509 /// A slice of tokens collected during the lexing process
510 #[inline]
511 pub fn tokens(&self) -> &[Token<L::TokenType>] {
512 &self.tokens
513 }
514
515 /// Sets the current position to the specified byte offset.
516 ///
517 /// # Arguments
518 ///
519 /// * `offset` - The new byte offset position
520 ///
521 /// # Returns
522 ///
523 /// The previous byte offset position
524 #[inline]
525 pub fn set_position(&mut self, offset: usize) -> usize {
526 self.cursor.set_position(offset)
527 }
528
529 /// Returns a reference to the underlying source.
530 pub fn source(&self) -> &'s S {
531 self.cursor.source()
532 }
533
534 /// Returns the text in the specified range.
535 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
536 self.cursor.source().get_text_in(range)
537 }
538
539 /// Returns the text from the specified offset to the end.
540 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
541 self.cursor.source().get_text_from(offset)
542 }
543
544 /// Checks if the source starts with the given pattern at the current position.
545 pub fn starts_with(&mut self, pattern: &str) -> bool {
546 self.cursor.starts_with(pattern)
547 }
548
549 /// Consumes the pattern if it exists at the current position.
550 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
551 self.cursor.consume_if_starts_with(pattern)
552 }
553
554 /// Gets the tokens collected so far in the lexer state.
555 ///
556 /// # Returns
557 ///
558 /// A slice of tokens collected during lexing
559 #[inline]
560 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
561 &self.tokens
562 }
563
564 /// Adds an error to the lexer state.
565 ///
566 /// # Arguments
567 ///
568 /// * `error` - The error to add to the diagnostics
569 #[inline]
570 pub fn add_error(&mut self, error: impl Into<OakError>) {
571 self.errors.push(error.into());
572 }
573
574 /// Adds a token to the lexer state.
575 ///
576 /// # Arguments
577 ///
578 /// * `kind` - The kind of the token
579 /// * `start` - The starting byte offset of the token
580 /// * `end` - The ending byte offset of the token
581 #[inline]
582 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
583 self.tokens.push(Token { kind, span: Range { start, end } });
584 }
585
586 /// Adds an end-of-file token to the lexer state.
587 ///
588 /// This method creates and adds an END_OF_STREAM token at the current position.
589 /// It's typically called when the lexer reaches the end of the source text
590 /// to mark the termination of the token stream.
591 ///
592 /// # Examples
593 ///
594 /// ```ignore
595 /// #![feature(new_range_api)]
596 /// # use core::range::Range;
597 /// # use oak_core::lexer::{LexerState, Token};
598 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
599 /// #
600 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
601 /// # enum SimpleToken {
602 /// # End,
603 /// # }
604 /// #
605 /// # impl TokenType for SimpleToken {
606 /// # const END_OF_STREAM: Self = SimpleToken::End;
607 /// # type Role = UniversalTokenRole;
608 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
609 /// # }
610 /// #
611 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
612 /// # enum SimpleElement {}
613 /// #
614 /// # impl ElementType for SimpleElement {
615 /// # type Role = UniversalElementRole;
616 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
617 /// # }
618 /// #
619 /// # #[derive(Clone)]
620 /// # struct SimpleLanguage;
621 /// #
622 /// # impl Language for SimpleLanguage {
623 /// # const NAME: &'static str = "simple";
624 /// # type TokenType = SimpleToken;
625 /// # type ElementType = SimpleElement;
626 /// # type TypedRoot = ();
627 /// # }
628 /// #
629 /// let source = SourceText::new("test");
630 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
631 /// state.take_while(|_| true); // Advance to end
632 /// state.add_eof();
633 ///
634 /// assert_eq!(state.tokens().len(), 1);
635 /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
636 /// ```
637 #[inline]
638 pub fn add_eof(&mut self) {
639 let end = self.get_position();
640 self.add_token(L::TokenType::END_OF_STREAM, end, end);
641 }
642
    /// Gets the character at the current position without advancing.
644 ///
645 /// # Returns
646 ///
647 /// The current character, or `None` if at the end of the source
648 #[inline]
649 pub fn current(&mut self) -> Option<char> {
650 self.cursor.peek_char()
651 }
652
653 /// Advances the position by the current character's length.
654 ///
655 /// # Returns
656 ///
657 /// The character that was skipped, or `None` if at the end of the source
658 #[inline]
659 pub fn bump(&mut self) -> Option<char> {
660 let ch = self.peek()?;
661 self.advance(ch.len_utf8());
662 Some(ch)
663 }
664
665 /// Advances the position by the token's length and adds the token to the lexer state.
666 ///
667 /// This method combines two common operations: advancing the lexer position
668 /// and adding a token to the token list. It calculates the advance distance
669 /// from the token's span, ensuring consistent positioning.
670 ///
671 /// # Arguments
672 ///
673 /// * `token` - The token to add to the lexer state
674 ///
675 /// # Returns
676 ///
677 /// The new byte offset position after advancing
678 ///
679 /// # Examples
680 ///
681 /// ```ignore
682 /// #![feature(new_range_api)]
683 /// # use core::range::Range;
684 /// # use oak_core::lexer::{LexerState, Token};
685 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
687 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
688 /// # enum SimpleToken { Identifier, End }
689 /// #
690 /// # impl TokenType for SimpleToken {
691 /// # const END_OF_STREAM: Self = SimpleToken::End;
692 /// # type Role = UniversalTokenRole;
693 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
694 /// # }
695 /// #
696 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
697 /// # enum SimpleElement {}
698 /// #
699 /// # impl ElementType for SimpleElement {
700 /// # type Role = UniversalElementRole;
701 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
702 /// # }
703 /// #
704 /// # #[derive(Clone)]
705 /// # struct SimpleLanguage;
706 /// #
707 /// # impl Language for SimpleLanguage {
708 /// # const NAME: &'static str = "simple";
709 /// # type TokenType = SimpleToken;
710 /// # type ElementType = SimpleElement;
711 /// # type TypedRoot = ();
712 /// # }
713 /// #
714 /// let source = SourceText::new("hello world");
715 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
716 ///
717 /// // Create a token for "hello"
718 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
719 ///
720 /// // Initially at position 0
721 /// assert_eq!(state.get_position(), 0);
722 ///
723 /// // Advance and add the token
724 /// let new_pos = state.advance_with(token);
725 ///
726 /// // Now at position 5 and token is added
727 /// assert_eq!(new_pos, 5);
728 /// assert_eq!(state.get_position(), 5);
729 /// assert_eq!(state.get_tokens().len(), 1);
730 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
731 /// ```
732 ///
733 /// # Note
734 ///
735 /// The caller must ensure that the token's span is valid and that the advance
736 /// does not split multi-byte UTF-8 characters. The token should be created
737 /// with proper character boundaries.
738 #[inline]
739 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
740 self.cursor.advance_bytes(token.length());
741 self.tokens.push(token);
742 self.cursor.position()
743 }
744
745 /// Consumes characters while the predicate returns true, returning the consumed range.
746 ///
747 /// This method iterates through the source text from the current position,
748 /// consuming characters as long as the predicate function returns true.
749 /// It's commonly used for recognizing patterns like identifiers, numbers,
750 /// or whitespace sequences.
751 ///
752 /// # Arguments
753 ///
754 /// * `pred` - A closure that takes a character and returns true if the character
755 /// should be consumed, false otherwise
756 ///
757 /// # Returns
758 ///
759 /// A byte range representing the span of consumed characters
760 ///
761 /// # Examples
762 ///
763 /// ```ignore
764 /// #![feature(new_range_api)]
765 /// # use core::range::Range;
766 /// # use oak_core::lexer::{LexerState, Token};
767 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
769 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
770 /// # enum SimpleToken { End }
771 /// #
772 /// # impl TokenType for SimpleToken {
773 /// # const END_OF_STREAM: Self = SimpleToken::End;
774 /// # type Role = UniversalTokenRole;
775 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
776 /// # }
777 /// #
778 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
779 /// # enum SimpleElement {}
780 /// #
781 /// # impl ElementType for SimpleElement {
782 /// # type Role = UniversalElementRole;
783 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
784 /// # }
785 /// #
786 /// # #[derive(Clone)]
787 /// # struct SimpleLanguage;
788 /// #
789 /// # impl Language for SimpleLanguage {
790 /// # const NAME: &'static str = "simple";
791 /// # type TokenType = SimpleToken;
792 /// # type ElementType = SimpleElement;
793 /// # type TypedRoot = ();
794 /// # }
795 /// #
796 /// let source = SourceText::new("hello123world");
797 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
798 ///
799 /// // Consume alphabetic characters
800 /// let range = state.take_while(|c| c.is_alphabetic());
801 ///
802 /// // Should have consumed "hello"
803 /// assert_eq!(range, Range { start: 0, end: 5 });
804 /// assert_eq!(state.get_position(), 5);
805 ///
806 /// // Consume numeric characters
807 /// let range = state.take_while(|c| c.is_numeric());
808 ///
809 /// // Should have consumed "123"
810 /// assert_eq!(range, Range { start: 5, end: 8 });
811 /// assert_eq!(state.get_position(), 8);
812 /// ```
813 ///
814 /// # Performance Note
815 ///
816 /// This method operates on a character-by-character basis, which means it
817 /// correctly handles multi-byte UTF-8 characters. For performance-critical
818 /// code, consider using byte-based methods when working with ASCII-only text.
819 pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
820 let start = self.cursor.position();
821 while let Some(ch) = self.peek() {
822 if pred(ch) {
823 self.advance(ch.len_utf8());
824 }
825 else {
826 break;
827 }
828 }
829 Range { start, end: self.cursor.position() }
830 }
831
832 /// Performs a safety check to prevent infinite loops during lexing.
833 ///
834 /// This method ensures that the lexer always makes progress by forcing
835 /// advancement when stuck at the same position. It's used as a safeguard
836 /// against infinite loops in lexer implementations.
837 ///
838 /// The method compares the current position with a previously saved "safe point"
839 /// position. If they're the same, it means the lexer hasn't made progress since
840 /// that safe point, potentially indicating an infinite loop. In this case, the
841 /// method forces advancement by at least one character.
842 ///
843 /// # Arguments
844 ///
845 /// * `safe_point` - The position to check against for potential deadlock
846 ///
847 /// # Examples
848 ///
849 /// ```ignore
850 /// #![feature(new_range_api)]
851 /// # use oak_core::lexer::{LexerState, Token};
852 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
854 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
855 /// # enum SimpleToken { End }
856 /// #
857 /// # impl TokenType for SimpleToken {
858 /// # const END_OF_STREAM: Self = SimpleToken::End;
859 /// # type Role = UniversalTokenRole;
860 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
861 /// # }
862 /// #
863 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
864 /// # enum SimpleElement {}
865 /// #
866 /// # impl ElementType for SimpleElement {
867 /// # type Role = UniversalElementRole;
868 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
869 /// # }
870 /// #
871 /// # struct SimpleLanguage;
872 /// #
873 /// # impl Language for SimpleLanguage {
874 /// # const NAME: &'static str = "simple";
875 /// # type TokenType = SimpleToken;
876 /// # type ElementType = SimpleElement;
877 /// # type TypedRoot = ();
878 /// # }
879 /// #
880 /// let source = SourceText::new("test");
881 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
882 ///
883 /// // Save the current position as a safe point
884 /// let safe_point = state.get_position();
885 ///
886 /// // In a real lexer, you would do some processing here
887 /// // If something went wrong and we didn't advance, this would prevent infinite loop
888 /// state.advance_if_dead_lock(safe_point);
889 ///
890 /// // If we were stuck, we would have advanced by at least 1
891 /// assert!(state.get_position() >= safe_point);
892 /// ```
893 ///
894 /// # Usage in Lexer Implementations
895 ///
896 /// This method is typically used at the beginning or end of lexing loops:
897 ///
898 /// ```ignore
899 /// loop {
900 /// let safe_point = state.get_position();
901 ///
902 /// // Try to recognize a token
903 /// if let Some(token) = try_recognize_token(&mut state) {
904 /// // Success, continue loop
905 /// continue;
906 /// }
907 ///
908 /// // If we get here, we didn't recognize anything
909 /// // This prevents infinite loops if recognition fails
910 /// state.advance_if_dead_lock(safe_point);
911 ///
912 /// if state.not_at_end() {
913 /// // Continue trying to recognize tokens
914 /// continue;
915 /// } else {
916 /// // Reached end of source
917 /// break;
918 /// }
919 /// }
920 /// ```
921 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
922 // Force advance if no progress was made
923 if self.cursor.position() == safe_point {
924 if let Some(ch) = self.current() {
925 // Skip current character
926 self.advance(ch.len_utf8());
927 }
928 else {
929 // Advance anyway to prevent infinite loop
930 self.advance(1);
931 }
932 // tracing::warn!("deadlock");
933 }
934 }
935
936 /// Finishes lexing and returns the final output with tokens and diagnostics.
937 ///
938 /// This method concludes the lexing process by converting the collected tokens
939 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
940 /// represents the overall success or failure of the lexing operation.
941 ///
942 /// If the result is `Ok`, the tokens are returned as the successful result.
943 /// If the result is `Err`, the error is returned as the failure result.
944 /// In both cases, any collected diagnostic errors are included in the output.
945 ///
946 /// # Arguments
947 ///
948 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
949 ///
950 /// # Returns
951 ///
952 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
953 ///
954 /// # Examples
955 ///
956 /// ```
957 /// #![feature(new_range_api)]
958 /// # use oak_core::lexer::{LexerState, Token};
959 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
961 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
962 /// # enum SimpleToken { Identifier, End }
963 /// #
964 /// # impl TokenType for SimpleToken {
965 /// # const END_OF_STREAM: Self = SimpleToken::End;
966 /// # type Role = UniversalTokenRole;
967 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
968 /// # }
969 /// #
970 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
971 /// # enum SimpleElement {}
972 /// #
973 /// # impl ElementType for SimpleElement {
974 /// # type Role = UniversalElementRole;
975 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
976 /// # }
977 /// #
978 /// # struct SimpleLanguage;
979 /// #
980 /// # impl Language for SimpleLanguage {
981 /// # const NAME: &'static str = "simple";
982 /// # type TokenType = SimpleToken;
983 /// # type ElementType = SimpleElement;
984 /// # type TypedRoot = ();
985 /// # }
986 /// #
987 /// let source = SourceText::new("test");
988 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
989 ///
990 /// // Add some tokens during lexing
991 /// state.add_token(SimpleToken::Identifier, 0, 4);
992 ///
993 /// // Finish with successful result
994 /// let output = state.finish(Ok(()));
995 ///
996 /// // Check the results
997 /// assert!(output.result.is_ok());
998 /// assert_eq!(output.result.unwrap().len(), 1);
999 /// assert_eq!(output.diagnostics.len(), 0);
1000 ///
1001 /// // Example with error
1002 /// let source2 = SourceText::new("test");
1003 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1004 /// state2.add_error(OakError::custom_error("Test error"));
1005 ///
1006 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1007 ///
1008 /// // Check the results
1009 /// assert!(output2.result.is_err());
1010 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1011 /// ```
1012 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1013 match result {
1014 Ok(_) => {
1015 let tokens: Tokens<L> = self.tokens.into();
1016 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1017 }
1018 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1019 }
1020 }
1021
1022 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1023 ///
1024 /// This method is similar to `finish` but additionally updates the incremental cache
1025 /// with the new tokens. It's used for incremental lexing where the results need to
1026 /// be cached for future reuse when the source text changes.
1027 ///
    /// The method first creates the output in the same way as `finish`, then stores
    /// it in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens where the source text hasn't changed.
1031 ///
1032 /// # Arguments
1033 ///
1034 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1035 /// * `cache` - The incremental cache to update with the new tokens
1036 ///
1037 /// # Returns
1038 ///
1039 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1040 ///
1041 /// # Examples
1042 ///
1043 /// ```ignore
1044 /// #![feature(new_range_api)]
1045 /// # use core::range::Range;
1046 /// # use oak_core::lexer::{LexerState, Token};
1047 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1048 /// # use oak_core::parser::session::ParseSession;
1049 /// #
1050 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1051 /// # enum SimpleToken { Identifier, End }
1052 /// #
1053 /// # impl TokenType for SimpleToken {
1054 /// # const END_OF_STREAM: Self = SimpleToken::End;
1055 /// # type Role = UniversalTokenRole;
1056 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1057 /// # }
1058 /// #
1059 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1060 /// # enum SimpleElement {}
1061 /// #
1062 /// # impl ElementType for SimpleElement {
1063 /// # type Role = UniversalElementRole;
1064 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1065 /// # }
1066 /// #
1067 /// # struct SimpleLanguage;
1068 /// #
1069 /// # impl Language for SimpleLanguage {
1070 /// # const NAME: &'static str = "simple";
1071 /// # type TokenType = SimpleToken;
1072 /// # type ElementType = SimpleElement;
1073 /// # type TypedRoot = ();
1074 /// # }
1075 /// #
1076 /// let source = SourceText::new("test");
1077 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1078 ///
1079 /// // Create a cache for incremental lexing
1080 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1081 ///
1082 /// // Add some tokens during lexing
1083 /// state.add_token(SimpleToken::Identifier, 0, 4);
1084 ///
1085 /// // Finish with cache update
1086 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1087 ///
1088 /// // Check the results
1089 /// assert!(output.result.is_ok());
1090 /// assert_eq!(output.result.unwrap().len(), 1);
1091 /// ```
1092 ///
1093 /// # Incremental Lexing Workflow
1094 ///
1095 /// This method is typically used as part of an incremental lexing workflow:
1096 ///
1097 /// ```ignore
1098 /// // First lexing
1099 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1100 /// // ... lexing logic ...
1101 /// let output = state.finish_with_cache(Ok(()), cache);
1102 ///
1103 /// // Later, when source changes
1104 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1105 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1106 /// // ... lexing logic (reusing unchanged tokens) ...
1107 /// let output = state.finish_with_cache(Ok(()), cache);
1108 /// ```
1109 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1110 let out = self.finish(result);
1111 cache.set_lex_output(out.clone());
1112 out
1113 }
1114}