oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9#[cfg(feature = "serde")]
10use serde::{Deserialize, Serialize};
11use std::borrow::Cow;
12use triomphe::Arc;
13
14/// Utilities for scanning comments.
15mod scan_comment;
16/// Utilities for scanning identifiers.
17mod scan_identifier;
18/// Utilities for scanning numbers.
19mod scan_number;
20/// Utilities for scanning string literals.
21mod scan_string;
22/// Utilities for scanning whitespace.
23mod scan_white_space;
24
25pub use self::{scan_comment::CommentConfig, scan_string::StringConfig, scan_white_space::WhitespaceConfig};
26
/// A shared, immutable sequence of tokens produced by lexical analysis.
///
/// This type wraps an `Arc<[Token]>`, so cloning is cheap and the token
/// sequence can be shared between the lexer, the parser, and any caches.
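///
/// # Examples
///
/// A minimal sketch of building a `Tokens` value from a `Vec`; the language and
/// token types here are illustrative placeholders:
///
/// ```ignore
/// let tokens: Tokens<MyLanguage> = vec![
///     Token { kind: MyToken::Identifier, span: Range { start: 0, end: 5 } },
///     Token { kind: MyToken::End, span: Range { start: 5, end: 5 } },
/// ]
/// .into();
///
/// // `Tokens` derefs to `[Token<_>]`, so slice methods are available directly.
/// assert_eq!(tokens.len(), 2);
/// ```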
32#[derive(Debug, PartialEq, Eq)]
33#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
34#[cfg_attr(feature = "serde", serde(transparent, bound(serialize = "L::TokenType: Serialize", deserialize = "L::TokenType: Deserialize<'de>")))]
35pub struct Tokens<L: Language>(#[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))] pub Arc<[Token<L::TokenType>]>);
36
37impl<L: Language> Clone for Tokens<L> {
38 fn clone(&self) -> Self {
39 Self(self.0.clone())
40 }
41}
42
43impl<L: Language> Default for Tokens<L> {
44 fn default() -> Self {
45 Self(Arc::from_iter(std::iter::empty()))
46 }
47}
48
49impl<L: Language> core::ops::Deref for Tokens<L> {
50 type Target = [Token<L::TokenType>];
51
52 fn deref(&self) -> &Self::Target {
53 &self.0
54 }
55}
56
57impl<L: Language> From<Arc<[Token<L::TokenType>]>> for Tokens<L> {
58 fn from(arc: Arc<[Token<L::TokenType>]>) -> Self {
59 Self(arc)
60 }
61}
62
63impl<L: Language> From<Vec<Token<L::TokenType>>> for Tokens<L> {
64 fn from(vec: Vec<Token<L::TokenType>>) -> Self {
65 Self(Arc::from_iter(vec))
66 }
67}
68
69/// Output type for lexical analysis operations, including diagnostics.
70pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
71
72/// Trait for tokenizing source code into sequences of tokens.
73///
74/// This trait defines the interface for converting source text into a sequence of
75/// tokens that can be consumed by the parser. Implementations should handle
76/// the specific lexical rules of their target language.
77///
78/// # Examples
79///
80/// ```ignore
81/// struct MyLexer;
82///
83/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
84/// enum MyToken {
85/// Number,
86/// Identifier,
87/// End,
88/// }
89///
90/// impl TokenType for MyToken {
91/// const END_OF_STREAM: Self = MyToken::End;
92/// type Role = UniversalTokenRole;
93/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
94/// }
95///
96/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
97/// enum MyElement {}
98///
99/// impl ElementType for MyElement {
100/// type Role = UniversalElementRole;
101/// fn role(&self) -> Self::Role { UniversalElementRole::None }
102/// }
103///
104/// struct MyLanguage;
105///
106/// impl Language for MyLanguage {
107/// const NAME: &'static str = "my-language";
108/// type TokenType = MyToken;
109/// type ElementType = MyElement;
110/// type TypedRoot = ();
111/// }
112///
113/// impl Lexer<MyLanguage> for MyLexer {
114/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
115/// // Tokenization logic here
116/// todo!()
117/// }
118/// }
119/// ```
120pub trait Lexer<L: Language + Send + Sync> {
121 /// Tokenizes the given source text into a sequence of tokens.
122 ///
    /// This method performs lexical analysis of the source text, producing a
    /// new token sequence. Implementations may consult `edits` and `cache` to
    /// reuse tokens from a previous run instead of lexing from scratch.
126 ///
127 /// # Arguments
128 ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The text edits applied since the previous lex (empty for a full lex)
    /// * `cache` - Cache of tokens from a previous lex, used for incremental reuse
130 ///
131 /// # Returns
132 ///
133 /// A [`LexOutput`] containing the tokens and any diagnostic messages
134 fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
135}
136
137/// Cache trait for lexical results.
138///
139/// This trait defines the interface for caching and accessing lexical analysis results.
140/// It provides methods for storing and retrieving token information from previous
141/// lexical analysis operations.
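///
/// # Examples
///
/// A minimal sketch of a cache that keeps only the most recent successful lex.
/// The struct name is illustrative, and it assumes `L::TokenType: Copy` (as the
/// token types in this crate's examples are):
///
/// ```ignore
/// struct LastLexCache<L: Language> {
///     tokens: Option<Tokens<L>>,
/// }
///
/// impl<L: Language> LexerCache<L> for LastLexCache<L> {
///     fn set_lex_output(&mut self, output: LexOutput<L>) {
///         // Keep the token list only if lexing succeeded.
///         self.tokens = output.result.ok();
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
///         // `Tokens` derefs to a token slice.
///         self.tokens.as_deref()
///     }
/// }
/// ```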
142#[allow(unused_variables)]
143pub trait LexerCache<L: Language> {
144 /// Sets the lexed output in the cache.
145 ///
146 /// # Arguments
147 ///
148 /// * `output` - The output from lexical analysis, including tokens and diagnostics
149 fn set_lex_output(&mut self, output: LexOutput<L>);
150
151 /// Gets a token from the cache by index.
152 ///
153 /// # Arguments
154 ///
155 /// * `index` - The index of the token to retrieve
156 ///
157 /// # Returns
158 ///
159 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
160 /// or `None` if the index is out of bounds or no tokens are cached
161 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
162
163 /// Gets the total number of tokens in the cache.
164 ///
165 /// # Returns
166 ///
167 /// The number of cached tokens, or 0 if no tokens are cached
168 fn count_tokens(&self) -> usize;
169
170 /// Checks if the cache contains any tokens.
171 ///
172 /// # Returns
173 ///
174 /// `true` if the cache contains tokens, `false` otherwise
175 fn has_tokens(&self) -> bool;
176
177 /// Gets all cached tokens as a slice.
178 ///
179 /// # Returns
180 ///
181 /// An optional slice of tokens if available.
182 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
183 None
184 }
185}
186
187impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
188 fn set_lex_output(&mut self, output: LexOutput<L>) {
189 (**self).set_lex_output(output)
190 }
191
192 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
193 (**self).get_token(index)
194 }
195
196 fn count_tokens(&self) -> usize {
197 (**self).count_tokens()
198 }
199
200 fn has_tokens(&self) -> bool {
201 (**self).has_tokens()
202 }
203
204 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
205 (**self).get_tokens()
206 }
207}
208
209/// A no-op implementation of `LexerCache`.
210#[derive(Debug, Clone, Copy, Default)]
211pub struct NoLexerCache;
212
213impl<L: Language> LexerCache<L> for NoLexerCache {
214 fn set_lex_output(&mut self, _output: LexOutput<L>) {}
215
216 fn get_token(&self, _index: usize) -> Option<Token<L::TokenType>> {
217 None
218 }
219
220 fn count_tokens(&self) -> usize {
221 0
222 }
223
224 fn has_tokens(&self) -> bool {
225 false
226 }
227}
228
/// Represents a single token in the source code.
230///
231/// Tokens are the fundamental units of lexical analysis, representing
232/// categorized pieces of source text with their position information.
233#[derive(Debug, Clone, PartialEq, Eq, Copy)]
234#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
235pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
237 pub kind: K,
    /// The byte range in the source text that this token occupies
239 #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
240 pub span: Range<usize>,
241}
242
243impl<K> Token<K> {
    /// Returns the length of this token in bytes.
245 ///
246 /// # Returns
247 ///
    /// The number of bytes between the start and end of the token's span
249 ///
250 /// # Examples
251 ///
252 /// ```ignore
253 /// #![feature(new_range_api)]
254 /// # use oak_core::lexer::Token;
255 /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
258 /// ```
259 #[inline]
260 pub fn length(&self) -> usize {
261 self.span.end - self.span.start
262 }
263}
264
265/// A stream of tokens with associated source text.
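///
/// A minimal construction sketch; the string kinds here are illustrative:
///
/// ```ignore
/// let stream = TokenStream {
///     raw: "let x".to_string(),
///     tokens: Arc::from_iter([
///         Token { kind: "keyword", span: Range { start: 0, end: 3 } },
///         Token { kind: "ident", span: Range { start: 4, end: 5 } },
///     ]),
/// };
/// assert_eq!(stream.tokens.len(), 2);
/// ```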
266#[derive(Debug, Clone)]
267#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
268#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
269pub struct TokenStream<K: Copy> {
270 /// The raw source text.
271 pub raw: String,
272 /// The tokens extracted from the source text.
273 #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
274 pub tokens: Arc<[Token<K>]>,
275}
276
277#[cfg(feature = "serde")]
278mod arc_slice_serde {
279 use super::*;
280 use serde::{Deserialize, Deserializer, Serialize, Serializer};
281
282 pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
283 where
284 K: Serialize,
285 S: Serializer,
286 {
287 arc.as_ref().serialize(serializer)
288 }
289
290 pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
291 where
292 K: Deserialize<'de>,
293 D: Deserializer<'de>,
294 {
295 let vec = Vec::<Token<K>>::deserialize(deserializer)?;
296 Ok(Arc::from_iter(vec))
297 }
298}
299
300/// Represents the state of the lexer during a tokenization session.
301///
302/// This struct maintains the current position and context during
303/// tokenization, enabling incremental and resumable lexing operations.
304/// It tracks the current position in the source text, collected tokens,
305/// and any errors encountered.
306#[derive(Debug)]
307pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
308 pub(crate) cursor: SourceCursor<'s, S>,
309 pub(crate) tokens: Vec<Token<L::TokenType>>,
310 pub(crate) errors: Vec<OakError>,
311}
312
313impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
314 /// Creates a new lexer state with the given source text.
315 ///
316 /// # Arguments
317 ///
318 /// * `source` - The source text to lex
319 ///
320 /// # Returns
321 ///
322 /// A new `LexerState` initialized at the beginning of the source
323 pub fn new(source: &'s S) -> Self {
324 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
325 }
326
327 /// Creates a new lexer state with the given source text and incremental cache.
328 ///
329 /// # Arguments
330 ///
331 /// * `source` - The source text to lex
332 /// * `relex_from` - The minimum byte offset that may have been affected by edits
333 /// (use `source.length()` to indicate no edits)
334 /// * `cache` - The incremental cache containing previous lexing results
335 ///
336 /// # Returns
337 ///
338 /// A new `LexerState` initialized at the beginning of the source with cache support
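    ///
    /// # Examples
    ///
    /// A sketch of the intended incremental workflow; `MyLanguage` and `cache`
    /// are illustrative placeholders:
    ///
    /// ```ignore
    /// // No edits yet: passing `source.length()` lets all cached tokens be reused.
    /// let state = LexerState::<_, MyLanguage>::new_with_cache(&source, source.length(), &cache);
    ///
    /// // After an edit touching byte offset 10, only tokens that end before the
    /// // edit (minus a small backtrack margin) are reused; lexing resumes there.
    /// let state = LexerState::<_, MyLanguage>::new_with_cache(&edited_source, 10, &cache);
    /// ```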
339 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
340 if !cache.has_tokens() {
341 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
342 }
343
344 let len = source.length();
345 let relex_from = relex_from.min(len);
346
347 // Fast path: fully re-used
348 if relex_from >= len {
349 let mut tokens = Vec::new();
350 if let Some(cached) = cache.get_tokens() {
351 tokens.extend_from_slice(cached)
352 }
353 else {
354 let count = cache.count_tokens();
355 tokens.reserve(count);
356 for i in 0..count {
357 if let Some(t) = cache.get_token(i) {
358 tokens.push(t)
359 }
360 }
361 }
362 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
363 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
364 }
365
366 if relex_from == 0 {
367 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
368 }
369
370 let mut reused_tokens = Vec::new();
371 const BACKTRACK_TOKENS: usize = 1;
372
373 if let Some(cached) = cache.get_tokens() {
374 // Binary search for the cut-off point since tokens are sorted by position
375 let idx = cached.partition_point(|t| t.span.end <= relex_from);
376 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
377 if keep > 0 {
378 reused_tokens.extend_from_slice(&cached[..keep])
379 }
380 }
381 else {
382 // Fallback for caches that don't support slice access
383 let count = cache.count_tokens();
384 for i in 0..count {
385 let Some(token) = cache.get_token(i)
386 else {
387 break;
388 };
389 if token.span.end <= relex_from {
390 reused_tokens.push(token);
391 }
392 else {
393 break;
394 }
395 }
396 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
397 reused_tokens.truncate(keep);
398 }
399
400 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
401 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
402 }
403
    /// Creates a sub-state for scanning a sub-range of the source.
    ///
    /// The sub-state starts at `start`; the `_end` bound is currently not enforced by the cursor.
405 pub fn sub_state(&mut self, start: usize, _end: usize) -> Self {
406 Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![] }
407 }
408
409 /// Returns the source text provider.
410 pub fn get_source(&self) -> &'s S {
411 self.cursor.source()
412 }
413
414 /// Gets the remaining text from the current position to the end of the source.
415 ///
416 /// # Returns
417 ///
418 /// A string slice containing the remaining text
419 pub fn rest(&mut self) -> &str {
420 self.cursor.rest()
421 }
422
423 /// Gets the remaining text as a byte slice.
424 ///
425 /// Useful for byte-oriented scanning operations.
426 #[inline]
427 pub fn rest_bytes(&mut self) -> &[u8] {
428 self.cursor.rest().as_bytes()
429 }
430
431 /// Checks if the lexer has consumed all input from the source.
432 ///
433 /// Returns `true` if the current position is at or beyond the end of the source.
434 pub fn fully_reused(&self) -> bool {
435 self.cursor.position() >= self.cursor.source().length()
436 }
437
438 /// Gets the current byte offset position in the source text.
439 ///
440 /// # Returns
441 ///
442 /// The current byte offset from the start of the source text.
443 #[inline]
444 pub fn get_position(&self) -> usize {
445 self.cursor.position()
446 }
447
448 /// Checks if the lexer has NOT consumed all input from the source.
449 ///
450 /// Returns `true` if there are still bytes left to be scanned.
451 #[inline]
452 pub fn not_at_end(&self) -> bool {
453 self.cursor.position() < self.cursor.source().length()
454 }
455
456 /// Peeks at the next character without advancing the cursor.
457 ///
458 /// Returns `None` if at the end of the source.
459 #[inline]
460 pub fn peek(&mut self) -> Option<char> {
461 self.cursor.peek_char()
462 }
463
464 /// Peeks at the character immediately following the current character.
465 #[inline]
466 pub fn peek_next(&mut self) -> Option<char> {
467 self.cursor.peek_next_char()
468 }
469
470 /// Peeks at the character at the specified byte offset relative to the current position.
471 #[inline]
472 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
473 self.cursor.peek_next_n(n)
474 }
475
476 /// Advances the cursor by the specified number of bytes.
477 #[inline]
478 pub fn advance(&mut self, len: usize) {
479 self.cursor.advance_bytes(len);
480 }
481
482 /// Gets the total length of the source text in bytes.
483 #[inline]
484 pub fn get_length(&self) -> usize {
485 self.cursor.source().length()
486 }
487
488 /// Gets a single character at the specified absolute byte offset.
489 #[inline]
490 pub fn get_char_at(&self, offset: usize) -> Option<char> {
491 self.cursor.source().get_char_at(offset)
492 }
493
494 /// Peeks at the next byte without advancing the cursor.
495 #[inline]
496 pub fn peek_byte(&mut self) -> Option<u8> {
497 self.cursor.peek_byte()
498 }
499
500 /// Advances the cursor by one byte and returns it.
501 #[inline]
502 pub fn advance_byte(&mut self) -> Option<u8> {
503 self.cursor.advance_byte()
504 }
505
506 /// Advances the cursor while the byte predicate is true.
507 ///
508 /// Returns the byte range covered by the matched bytes.
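    ///
    /// # Examples
    ///
    /// A small sketch, assuming `state` is a `LexerState` positioned on a run of digits:
    ///
    /// ```ignore
    /// // Consume a run of ASCII digits starting at the current position.
    /// let digits = state.take_while_byte(|b| b.is_ascii_digit());
    /// let text = state.get_text_in(digits);
    /// ```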
509 #[inline]
510 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
511 self.cursor.take_while_byte(pred)
512 }
513
514 /// Skips common ASCII whitespace (space, tab, newline, carriage return).
515 ///
516 /// Uses SIMD acceleration if available on the platform.
517 /// Returns the range of the skipped whitespace.
518 #[inline]
519 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
520 self.cursor.skip_ascii_whitespace()
521 }
522
523 /// Skips all consecutive ASCII digits at the current position.
524 ///
525 /// Returns the range of the skipped digits.
526 #[inline]
527 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
528 self.cursor.skip_ascii_digits()
529 }
530
531 /// Skips all characters that can continue an ASCII identifier.
532 ///
533 /// This includes alphanumeric characters and underscores.
534 /// Returns the range of the skipped characters.
535 #[inline]
536 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
537 self.cursor.skip_ascii_ident_continue()
538 }
539
540 /// Skips all characters until the target byte is encountered.
541 ///
542 /// The target byte itself is NOT consumed.
543 /// Returns the range of the skipped characters.
544 #[inline]
545 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
546 self.cursor.skip_until(target)
547 }
548
549 /// Scans an ASCII identifier.
550 ///
551 /// An identifier must start with an alphabetic character or an underscore,
552 /// and can be followed by any number of alphanumeric characters or underscores.
553 ///
554 /// # Arguments
555 ///
556 /// * `kind` - The token type to assign if an identifier is found.
557 ///
558 /// # Returns
559 ///
560 /// `true` if an identifier was successfully scanned and added.
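    ///
    /// # Examples
    ///
    /// A sketch of typical use inside a lexer loop; `MyToken::Identifier` is an
    /// illustrative token type:
    ///
    /// ```ignore
    /// // On input "foo1 bar", this scans `foo1` as a single identifier token.
    /// if state.scan_ascii_identifier(MyToken::Identifier) {
    ///     continue;
    /// }
    /// ```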
561 #[inline]
562 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
563 let start = self.get_position();
564 if let Some(b) = self.peek_byte() {
565 if b == b'_' || b.is_ascii_alphabetic() {
566 self.advance_byte();
567 self.skip_ascii_ident_continue();
568 self.add_token(kind, start, self.get_position());
569 return true;
570 }
571 }
572 false
573 }
574
575 /// Scans a line comment starting with the given prefix.
576 ///
577 /// Consumes the prefix and all characters until the next newline or EOF.
578 ///
579 /// # Arguments
580 ///
581 /// * `kind` - The token type for the line comment.
582 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
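    ///
    /// # Examples
    ///
    /// A sketch with an illustrative token type:
    ///
    /// ```ignore
    /// // Consumes "// trailing text" up to (but not including) the newline.
    /// if state.scan_line_comment(MyToken::Comment, "//") {
    ///     continue;
    /// }
    /// ```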
583 #[inline]
584 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
585 let start = self.get_position();
586 if self.consume_if_starts_with(prefix) {
587 self.skip_until(b'\n');
588 self.add_token(kind, start, self.get_position());
589 return true;
590 }
591 false
592 }
593
594 /// Scans a block comment with given start and end sequences.
595 ///
    /// This implementation does not handle nesting: the comment ends at the first
    /// occurrence of `end_seq`, and an unterminated comment extends to the end of input.
598 ///
599 /// # Arguments
600 ///
601 /// * `kind` - The token type for the block comment.
602 /// * `start_seq` - The sequence that starts the block (e.g., "/*").
603 /// * `end_seq` - The sequence that ends the block (e.g., "*/").
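    ///
    /// # Examples
    ///
    /// A sketch with an illustrative token type:
    ///
    /// ```ignore
    /// // Consumes "/* ... */" as a single comment token; an unterminated
    /// // comment is consumed through to the end of input.
    /// if state.scan_block_comment(MyToken::Comment, "/*", "*/") {
    ///     continue;
    /// }
    /// ```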
604 #[inline]
605 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
606 let start = self.get_position();
607 if self.consume_if_starts_with(start_seq) {
608 while let Some(_b) = self.peek_byte() {
609 self.skip_until(end_seq.as_bytes()[0]);
610 if self.consume_if_starts_with(end_seq) {
611 self.add_token(kind, start, self.get_position());
612 return true;
613 }
614 self.advance_byte();
615 }
616 // Unclosed block comment is still a comment in many languages,
617 // but we might want to add an error here in the future.
618 self.add_token(kind, start, self.get_position());
619 return true;
620 }
621 false
622 }
623
624 /// Gets the tokens collected so far in the lexer state.
625 ///
626 /// # Returns
627 ///
628 /// A slice of tokens collected during lexing.
629 #[inline]
630 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
631 &self.tokens
632 }
633
634 /// Sets the current position to the specified byte offset.
635 ///
636 /// # Arguments
637 ///
638 /// * `offset` - The new byte offset position.
639 ///
640 /// # Returns
641 ///
642 /// The previous byte offset position.
643 #[inline]
644 pub fn set_position(&mut self, offset: usize) -> usize {
645 self.cursor.set_position(offset)
646 }
647
648 /// Returns a reference to the underlying source.
649 pub fn source(&self) -> &'s S {
650 self.cursor.source()
651 }
652
653 /// Returns the text in the specified byte range.
654 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
655 self.cursor.source().get_text_in(range)
656 }
657
658 /// Returns the text from the specified byte offset to the end of the source.
659 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
660 self.cursor.source().get_text_from(offset)
661 }
662
663 /// Checks if the source starts with the given pattern at the current position.
664 pub fn starts_with(&mut self, pattern: &str) -> bool {
665 self.cursor.starts_with(pattern)
666 }
667
668 /// Consumes the pattern if it exists at the current position.
669 ///
670 /// Returns `true` if the pattern was found and consumed, advancing the cursor.
671 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
672 self.cursor.consume_if_starts_with(pattern)
673 }
674
675 /// Adds an error to the lexer state's diagnostics.
676 ///
677 /// # Arguments
678 ///
679 /// * `error` - The error to add.
680 #[inline]
681 pub fn add_error(&mut self, error: impl Into<OakError>) {
682 self.errors.push(error.into());
683 }
684
685 /// Adds a token to the lexer state.
686 ///
687 /// # Arguments
688 ///
689 /// * `kind` - The kind/type of the token.
690 /// * `start` - The starting byte offset.
691 /// * `end` - The ending byte offset.
692 #[inline]
693 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
694 self.tokens.push(Token { kind, span: Range { start, end } });
695 }
696
697 /// Adds an end-of-file (EOF) token to the lexer state.
698 ///
699 /// This method creates and adds an `END_OF_STREAM` token at the current position.
700 /// It is typically called when the lexer reaches the end of the source text.
701 ///
702 /// # Examples
703 ///
704 /// ```ignore
705 /// #![feature(new_range_api)]
706 /// # use core::range::Range;
707 /// # use oak_core::lexer::{LexerState, Token};
708 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
709 /// #
710 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
711 /// # enum SimpleToken {
712 /// # End,
713 /// # }
714 /// #
715 /// # impl TokenType for SimpleToken {
716 /// # const END_OF_STREAM: Self = SimpleToken::End;
717 /// # type Role = UniversalTokenRole;
718 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
719 /// # }
720 /// #
721 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
722 /// # enum SimpleElement {}
723 /// #
724 /// # impl ElementType for SimpleElement {
725 /// # type Role = UniversalElementRole;
726 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
727 /// # }
728 /// #
729 /// # #[derive(Clone)]
730 /// # struct SimpleLanguage;
731 /// #
732 /// # impl Language for SimpleLanguage {
733 /// # const NAME: &'static str = "simple";
734 /// # type TokenType = SimpleToken;
735 /// # type ElementType = SimpleElement;
736 /// # type TypedRoot = ();
737 /// # }
738 /// #
739 /// let source = SourceText::new("test");
740 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
741 /// state.take_while(|_| true); // Advance to end
742 /// state.add_eof();
743 ///
    /// assert_eq!(state.get_tokens().len(), 1);
    /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
746 /// ```
747 #[inline]
748 pub fn add_eof(&mut self) {
749 let end = self.get_position();
750 self.add_token(L::TokenType::END_OF_STREAM, end, end)
751 }
752
753 /// Gets the current character at the current position.
754 ///
755 /// # Returns
756 ///
757 /// The current character, or `None` if at the end of the source
758 #[inline]
759 pub fn current(&mut self) -> Option<char> {
760 self.cursor.peek_char()
761 }
762
763 /// Advances the position by the current character's length.
764 ///
765 /// # Returns
766 ///
767 /// The character that was skipped, or `None` if at the end of the source
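    ///
    /// # Examples
    ///
    /// A sketch of a character-by-character scanning loop:
    ///
    /// ```ignore
    /// while let Some(ch) = state.bump() {
    ///     // Inspect `ch` and emit tokens as needed...
    /// }
    /// ```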
768 #[inline]
769 pub fn bump(&mut self) -> Option<char> {
770 let ch = self.peek()?;
771 self.advance(ch.len_utf8());
772 Some(ch)
773 }
774
775 /// Advances the position by the token's length and adds the token to the lexer state.
776 ///
777 /// This method combines two common operations: advancing the lexer position
778 /// and adding a token to the token list. It calculates the advance distance
779 /// from the token's span, ensuring consistent positioning.
780 ///
781 /// # Arguments
782 ///
783 /// * `token` - The token to add to the lexer state
784 ///
785 /// # Returns
786 ///
787 /// The new byte offset position after advancing
788 ///
789 /// # Examples
790 ///
791 /// ```ignore
792 /// #![feature(new_range_api)]
793 /// # use core::range::Range;
794 /// # use oak_core::lexer::{LexerState, Token};
795 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
797 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
798 /// # enum SimpleToken { Identifier, End }
799 /// #
800 /// # impl TokenType for SimpleToken {
801 /// # const END_OF_STREAM: Self = SimpleToken::End;
802 /// # type Role = UniversalTokenRole;
803 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
804 /// # }
805 /// #
806 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
807 /// # enum SimpleElement {}
808 /// #
809 /// # impl ElementType for SimpleElement {
810 /// # type Role = UniversalElementRole;
811 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
812 /// # }
813 /// #
814 /// # #[derive(Clone)]
815 /// # struct SimpleLanguage;
816 /// #
817 /// # impl Language for SimpleLanguage {
818 /// # const NAME: &'static str = "simple";
819 /// # type TokenType = SimpleToken;
820 /// # type ElementType = SimpleElement;
821 /// # type TypedRoot = ();
822 /// # }
823 /// #
824 /// let source = SourceText::new("hello world");
825 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
826 ///
827 /// // Create a token for "hello"
    /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
829 ///
830 /// // Initially at position 0
831 /// assert_eq!(state.get_position(), 0);
832 ///
833 /// // Advance and add the token
834 /// let new_pos = state.advance_with(token);
835 ///
836 /// // Now at position 5 and token is added
837 /// assert_eq!(new_pos, 5);
838 /// assert_eq!(state.get_position(), 5);
839 /// assert_eq!(state.get_tokens().len(), 1);
840 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
841 /// ```
842 ///
843 /// # Note
844 ///
845 /// The caller must ensure that the token's span is valid and that the advance
846 /// does not split multi-byte UTF-8 characters. The token should be created
847 /// with proper character boundaries.
848 #[inline]
849 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
850 self.cursor.advance_bytes(token.length());
851 self.tokens.push(token);
852 self.cursor.position()
853 }
854
855 /// Consumes characters while the predicate returns true, returning the consumed range.
856 ///
857 /// This method iterates through the source text from the current position,
858 /// consuming characters as long as the predicate function returns true.
859 /// It's commonly used for recognizing patterns like identifiers, numbers,
860 /// or whitespace sequences.
861 ///
862 /// # Arguments
863 ///
864 /// * `pred` - A closure that takes a character and returns true if the character
865 /// should be consumed, false otherwise
866 ///
867 /// # Returns
868 ///
869 /// A byte range representing the span of consumed characters
870 ///
871 /// # Examples
872 ///
873 /// ```ignore
874 /// #![feature(new_range_api)]
875 /// # use core::range::Range;
876 /// # use oak_core::lexer::{LexerState, Token};
877 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
879 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
880 /// # enum SimpleToken { End }
881 /// #
882 /// # impl TokenType for SimpleToken {
883 /// # const END_OF_STREAM: Self = SimpleToken::End;
884 /// # type Role = UniversalTokenRole;
885 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
886 /// # }
887 /// #
888 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
889 /// # enum SimpleElement {}
890 /// #
891 /// # impl ElementType for SimpleElement {
892 /// # type Role = UniversalElementRole;
893 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
894 /// # }
895 /// #
896 /// # #[derive(Clone)]
897 /// # struct SimpleLanguage;
898 /// #
899 /// # impl Language for SimpleLanguage {
900 /// # const NAME: &'static str = "simple";
901 /// # type TokenType = SimpleToken;
902 /// # type ElementType = SimpleElement;
903 /// # type TypedRoot = ();
904 /// # }
905 /// #
906 /// let source = SourceText::new("hello123world");
907 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
908 ///
909 /// // Consume alphabetic characters
910 /// let range = state.take_while(|c| c.is_alphabetic());
911 ///
912 /// // Should have consumed "hello"
913 /// assert_eq!(range, Range { start: 0, end: 5 });
914 /// assert_eq!(state.get_position(), 5);
915 ///
916 /// // Consume numeric characters
917 /// let range = state.take_while(|c| c.is_numeric());
918 ///
919 /// // Should have consumed "123"
920 /// assert_eq!(range, Range { start: 5, end: 8 });
921 /// assert_eq!(state.get_position(), 8);
922 /// ```
923 ///
924 /// # Performance Note
925 ///
926 /// This method operates on a character-by-character basis, which means it
927 /// correctly handles multi-byte UTF-8 characters. For performance-critical
928 /// code, consider using byte-based methods when working with ASCII-only text.
929 pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
930 self.cursor.take_while(pred)
931 }
932
933 /// Performs a safety check to prevent infinite loops during lexing.
934 ///
935 /// This method ensures that the lexer always makes progress by forcing
936 /// advancement when stuck at the same position. It's used as a safeguard
937 /// against infinite loops in lexer implementations.
938 ///
939 /// The method compares the current position with a previously saved "safe point"
940 /// position. If they're the same, it means the lexer hasn't made progress since
941 /// that safe point, potentially indicating an infinite loop. In this case, the
942 /// method forces advancement by at least one character.
943 ///
944 /// # Arguments
945 ///
946 /// * `safe_point` - The position to check against for potential deadlock
947 ///
948 /// # Examples
949 ///
950 /// ```ignore
951 /// #![feature(new_range_api)]
952 /// # use oak_core::lexer::{LexerState, Token};
953 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
955 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
956 /// # enum SimpleToken { End }
957 /// #
958 /// # impl TokenType for SimpleToken {
959 /// # const END_OF_STREAM: Self = SimpleToken::End;
960 /// # type Role = UniversalTokenRole;
961 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
962 /// # }
963 /// #
964 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
965 /// # enum SimpleElement {}
966 /// #
967 /// # impl ElementType for SimpleElement {
968 /// # type Role = UniversalElementRole;
969 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
970 /// # }
971 /// #
972 /// # struct SimpleLanguage;
973 /// #
974 /// # impl Language for SimpleLanguage {
975 /// # const NAME: &'static str = "simple";
976 /// # type TokenType = SimpleToken;
977 /// # type ElementType = SimpleElement;
978 /// # type TypedRoot = ();
979 /// # }
980 /// #
981 /// let source = SourceText::new("test");
982 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
983 ///
984 /// // Save the current position as a safe point
985 /// let safe_point = state.get_position();
986 ///
987 /// // In a real lexer, you would do some processing here
988 /// // If something went wrong and we didn't advance, this would prevent infinite loop
989 /// state.advance_if_dead_lock(safe_point);
990 ///
991 /// // If we were stuck, we would have advanced by at least 1
992 /// assert!(state.get_position() >= safe_point);
993 /// ```
994 ///
995 /// # Usage in Lexer Implementations
996 ///
997 /// This method is typically used at the beginning or end of lexing loops:
998 ///
999 /// ```ignore
1000 /// loop {
1001 /// let safe_point = state.get_position();
1002 ///
1003 /// // Try to recognize a token
1004 /// if let Some(token) = try_recognize_token(&mut state) {
1005 /// // Success, continue loop
1006 /// continue;
1007 /// }
1008 ///
1009 /// // If we get here, we didn't recognize anything
1010 /// // This prevents infinite loops if recognition fails
1011 /// state.advance_if_dead_lock(safe_point);
1012 ///
1013 /// if state.not_at_end() {
1014 /// // Continue trying to recognize tokens
1015 /// continue;
1016 /// } else {
1017 /// // Reached end of source
1018 /// break;
1019 /// }
1020 /// }
1021 /// ```
1022 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
1023 // Force advance if no progress was made
1024 if self.cursor.position() == safe_point {
1025 if let Some(ch) = self.peek() {
1026 // Skip current character
1027 self.advance(ch.len_utf8())
1028 }
1029 else {
1030 // Advance anyway to prevent infinite loop
1031 self.advance(1)
1032 }
1033 // tracing::warn!("deadlock")
1034 }
1035 }
1036
1037 /// Finishes lexing and returns the final output with tokens and diagnostics.
1038 ///
1039 /// This method concludes the lexing process by converting the collected tokens
1040 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
1041 /// represents the overall success or failure of the lexing operation.
1042 ///
1043 /// If the result is `Ok`, the tokens are returned as the successful result.
1044 /// If the result is `Err`, the error is returned as the failure result.
1045 /// In both cases, any collected diagnostic errors are included in the output.
1046 ///
1047 /// # Arguments
1048 ///
1049 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1050 ///
1051 /// # Returns
1052 ///
1053 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1054 ///
1055 /// # Examples
1056 ///
1057 /// ```
1058 /// #![feature(new_range_api)]
1059 /// # use oak_core::lexer::{LexerState, Token};
1060 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
1062 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1063 /// # enum SimpleToken { Identifier, End }
1064 /// #
1065 /// # impl TokenType for SimpleToken {
1066 /// # const END_OF_STREAM: Self = SimpleToken::End;
1067 /// # type Role = UniversalTokenRole;
1068 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1069 /// # }
1070 /// #
1071 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1072 /// # enum SimpleElement {}
1073 /// #
1074 /// # impl ElementType for SimpleElement {
1075 /// # type Role = UniversalElementRole;
1076 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1077 /// # }
1078 /// #
1079 /// # struct SimpleLanguage;
1080 /// #
1081 /// # impl Language for SimpleLanguage {
1082 /// # const NAME: &'static str = "simple";
1083 /// # type TokenType = SimpleToken;
1084 /// # type ElementType = SimpleElement;
1085 /// # type TypedRoot = ();
1086 /// # }
1087 /// #
1088 /// let source = SourceText::new("test");
1089 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1090 ///
1091 /// // Add some tokens during lexing
1092 /// state.add_token(SimpleToken::Identifier, 0, 4);
1093 ///
1094 /// // Finish with successful result
1095 /// let output = state.finish(Ok(()));
1096 ///
1097 /// // Check the results
1098 /// assert!(output.result.is_ok());
1099 /// assert_eq!(output.result.unwrap().len(), 1);
1100 /// assert_eq!(output.diagnostics.len(), 0);
1101 ///
1102 /// // Example with error
1103 /// let source2 = SourceText::new("test");
1104 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1105 /// state2.add_error(OakError::custom_error("Test error"));
1106 ///
1107 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1108 ///
1109 /// // Check the results
1110 /// assert!(output2.result.is_err());
1111 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1112 /// ```
1113 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1114 match result {
1115 Ok(_) => {
1116 let tokens: Tokens<L> = self.tokens.into();
1117 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1118 }
1119 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1120 }
1121 }
1122
1123 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1124 ///
1125 /// This method is similar to `finish` but additionally updates the incremental cache
1126 /// with the new tokens. It's used for incremental lexing where the results need to
1127 /// be cached for future reuse when the source text changes.
1128 ///
    /// The method first creates the output in the same way as `finish`, then stores it
    /// in the cache via [`LexerCache::set_lex_output`]. This enables the next call to
    /// `new_with_cache` to reuse these tokens if the source text hasn't changed.
1132 ///
1133 /// # Arguments
1134 ///
1135 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1136 /// * `cache` - The incremental cache to update with the new tokens
1137 ///
1138 /// # Returns
1139 ///
1140 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1141 ///
1142 /// # Examples
1143 ///
1144 /// ```ignore
1145 /// #![feature(new_range_api)]
1146 /// # use core::range::Range;
1147 /// # use oak_core::lexer::{LexerState, Token};
1148 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1149 /// # use oak_core::parser::session::ParseSession;
1150 /// #
1151 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1152 /// # enum SimpleToken { Identifier, End }
1153 /// #
1154 /// # impl TokenType for SimpleToken {
1155 /// # const END_OF_STREAM: Self = SimpleToken::End;
1156 /// # type Role = UniversalTokenRole;
1157 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1158 /// # }
1159 /// #
1160 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1161 /// # enum SimpleElement {}
1162 /// #
1163 /// # impl ElementType for SimpleElement {
1164 /// # type Role = UniversalElementRole;
1165 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1166 /// # }
1167 /// #
1168 /// # struct SimpleLanguage;
1169 /// #
1170 /// # impl Language for SimpleLanguage {
1171 /// # const NAME: &'static str = "simple";
1172 /// # type TokenType = SimpleToken;
1173 /// # type ElementType = SimpleElement;
1174 /// # type TypedRoot = ();
1175 /// # }
1176 /// #
1177 /// let source = SourceText::new("test");
1178 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1179 ///
1180 /// // Create a cache for incremental lexing
1181 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1182 ///
1183 /// // Add some tokens during lexing
1184 /// state.add_token(SimpleToken::Identifier, 0, 4);
1185 ///
1186 /// // Finish with cache update
1187 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1188 ///
1189 /// // Check the results
1190 /// assert!(output.result.is_ok());
1191 /// assert_eq!(output.result.unwrap().len(), 1);
1192 /// ```
1193 ///
1194 /// # Incremental Lexing Workflow
1195 ///
1196 /// This method is typically used as part of an incremental lexing workflow:
1197 ///
1198 /// ```ignore
1199 /// // First lexing
1200 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1201 /// // ... lexing logic ...
1202 /// let output = state.finish_with_cache(Ok(()), cache);
1203 ///
1204 /// // Later, when source changes
1205 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1206 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1207 /// // ... lexing logic (reusing unchanged tokens) ...
1208 /// let output = state.finish_with_cache(Ok(()), cache);
1209 /// ```
1210 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1211 let out = self.finish(result);
1212 cache.set_lex_output(out.clone());
1213 out
1214 }
1215}