oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9use std::borrow::Cow;
10use triomphe::Arc;
11
12/// Utilities for scanning comments.
13pub mod scan_comment;
14/// Utilities for scanning identifiers.
15pub mod scan_identifier;
16/// Utilities for scanning numbers.
17pub mod scan_number;
18/// Utilities for scanning string literals.
19pub mod scan_string;
20/// Utilities for scanning whitespace.
21pub mod scan_white_space;
22
23pub use scan_comment::CommentConfig;
24pub use scan_string::StringConfig;
25pub use scan_white_space::WhitespaceConfig;
26
/// Token sequence produced by lexical analysis.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens. Any diagnostics produced while lexing are
/// carried separately by [`LexOutput`].
32pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;
33
34/// Output type for lexical analysis operations, including diagnostics.
35pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
36
37/// Trait for tokenizing source code into sequences of tokens.
38///
39/// This trait defines the interface for converting source text into a sequence of
40/// tokens that can be consumed by the parser. Implementations should handle
41/// the specific lexical rules of their target language.
42///
43/// # Examples
44///
45/// ```ignore
46/// struct MyLexer;
47///
48/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
49/// enum MyToken {
50/// Number,
51/// Identifier,
52/// End,
53/// }
54///
55/// impl TokenType for MyToken {
56/// const END_OF_STREAM: Self = MyToken::End;
57/// type Role = UniversalTokenRole;
58/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
59/// }
60///
61/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
62/// enum MyElement {}
63///
64/// impl ElementType for MyElement {
65/// type Role = UniversalElementRole;
66/// fn role(&self) -> Self::Role { UniversalElementRole::None }
67/// }
68///
69/// struct MyLanguage;
70///
71/// impl Language for MyLanguage {
72/// const NAME: &'static str = "my-language";
73/// type TokenType = MyToken;
74/// type ElementType = MyElement;
75/// type TypedRoot = ();
76/// }
77///
78/// impl Lexer<MyLanguage> for MyLexer {
79/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
80/// // Tokenization logic here
81/// todo!()
82/// }
83/// }
84/// ```
85pub trait Lexer<L: Language + Send + Sync + 'static> {
86 /// Tokenizes the given source text into a sequence of tokens.
87 ///
    /// Implementations may perform a full lexical analysis from scratch or
    /// reuse tokens from the provided cache when the supplied edits allow it.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the cached result was produced (empty for a full lex)
    /// * `cache` - The cache holding results from a previous lexing pass
95 ///
96 /// # Returns
97 ///
98 /// A [`LexOutput`] containing the tokens and any diagnostic messages
99 fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
100}
101
102/// Cache trait for lexical results.
103///
104/// This trait defines the interface for caching and accessing lexical analysis results.
105/// It provides methods for storing and retrieving token information from previous
106/// lexical analysis operations.
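///
/// # Examples
///
/// A minimal in-memory implementation sketch; `SimpleCache` is a hypothetical
/// type, and the token type is assumed to be `Copy` as in the examples further
/// down this module:
///
/// ```ignore
/// struct SimpleCache<L: Language> {
///     last_lex: Option<LexOutput<L>>,
/// }
///
/// impl<L: Language> LexerCache<L> for SimpleCache<L> {
///     fn set_lex_output(&mut self, output: LexOutput<L>) {
///         // Keep only the most recent lexing result.
///         self.last_lex = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
///         // Expose the cached tokens only when the previous lex succeeded.
///         self.last_lex.as_ref()?.result.as_deref().ok()
///     }
/// }
/// ```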
107#[allow(unused_variables)]
108pub trait LexerCache<L: Language> {
109 /// Sets the lexed output in the cache.
110 ///
111 /// # Arguments
112 ///
113 /// * `output` - The output from lexical analysis, including tokens and diagnostics
114 fn set_lex_output(&mut self, output: LexOutput<L>);
115
116 /// Gets a token from the cache by index.
117 ///
118 /// # Arguments
119 ///
120 /// * `index` - The index of the token to retrieve
121 ///
122 /// # Returns
123 ///
124 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
125 /// or `None` if the index is out of bounds or no tokens are cached
126 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
127
128 /// Gets the total number of tokens in the cache.
129 ///
130 /// # Returns
131 ///
132 /// The number of cached tokens, or 0 if no tokens are cached
133 fn count_tokens(&self) -> usize;
134
135 /// Checks if the cache contains any tokens.
136 ///
137 /// # Returns
138 ///
139 /// `true` if the cache contains tokens, `false` otherwise
140 fn has_tokens(&self) -> bool;
141
142 /// Gets all cached tokens as a slice.
143 ///
144 /// # Returns
145 ///
146 /// An optional slice of tokens if available.
147 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
148 None
149 }
150}
151
152impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
153 fn set_lex_output(&mut self, output: LexOutput<L>) {
154 (**self).set_lex_output(output);
155 }
156
157 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
158 (**self).get_token(index)
159 }
160
161 fn count_tokens(&self) -> usize {
162 (**self).count_tokens()
163 }
164
165 fn has_tokens(&self) -> bool {
166 (**self).has_tokens()
167 }
168
169 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
170 (**self).get_tokens()
171 }
172}
173
/// Represents a single token in the source code.
175///
176/// Tokens are the fundamental units of lexical analysis, representing
177/// categorized pieces of source text with their position information.
178#[derive(Debug, Clone, PartialEq, Eq, Copy)]
179pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    pub span: Range<usize>,
184}
185
186impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
202 #[inline]
203 pub fn length(&self) -> usize {
204 self.span.end - self.span.start
205 }
206}
207
208/// State information for incremental lexical analysis.
209///
210/// This struct maintains the current position and context during
211/// tokenization, enabling incremental and resumable lexing operations.
212#[derive(Debug)]
213pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
214 pub(crate) cursor: SourceCursor<'s, S>,
215 pub(crate) tokens: Vec<Token<L::TokenType>>,
216 pub(crate) errors: Vec<OakError>,
217}
218
219impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
220 /// Creates a new lexer state with the given source text.
221 ///
222 /// # Arguments
223 ///
224 /// * `source` - The source text to lex
225 ///
226 /// # Returns
227 ///
228 /// A new `LexerState` initialized at the beginning of the source
229 pub fn new(source: &'s S) -> Self {
230 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
231 }
232
233 /// Creates a new lexer state with the given source text and incremental cache.
234 ///
235 /// # Arguments
236 ///
237 /// * `source` - The source text to lex
238 /// * `relex_from` - The minimum byte offset that may have been affected by edits
239 /// (use `source.length()` to indicate no edits)
240 /// * `cache` - The incremental cache containing previous lexing results
241 ///
242 /// # Returns
243 ///
244 /// A new `LexerState` initialized at the beginning of the source with cache support
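    ///
    /// # Examples
    ///
    /// A sketch of incremental reuse, assuming a `SimpleLanguage` and a populated
    /// cache as in the other examples in this module:
    ///
    /// ```ignore
    /// // `cache` holds the tokens from a previous pass over the old text, and only
    /// // bytes at or after `relex_from` may have been affected by edits.
    /// let relex_from = 42;
    /// let mut state = LexerState::<_, SimpleLanguage>::new_with_cache(&source, relex_from, &cache);
    ///
    /// // Tokens ending before `relex_from` (minus a small backtrack) are already in
    /// // `state.tokens()`; lexing resumes from the end of the last reused token.
    /// assert!(state.get_position() <= relex_from);
    /// ```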
245 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
246 if !cache.has_tokens() {
247 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
248 }
249
250 let len = source.length();
251 let relex_from = relex_from.min(len);
252
253 // Fast path: fully re-used
254 if relex_from >= len {
255 let mut tokens = Vec::new();
256 if let Some(cached) = cache.get_tokens() {
257 tokens.extend_from_slice(cached);
258 }
259 else {
260 let count = cache.count_tokens();
261 tokens.reserve(count);
262 for i in 0..count {
263 if let Some(t) = cache.get_token(i) {
264 tokens.push(t);
265 }
266 }
267 }
268 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
269 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
270 }
271
272 if relex_from == 0 {
273 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
274 }
275
276 let mut reused_tokens = Vec::new();
277 const BACKTRACK_TOKENS: usize = 1;
278
279 if let Some(cached) = cache.get_tokens() {
280 // Binary search for the cut-off point since tokens are sorted by position
281 let idx = cached.partition_point(|t| t.span.end <= relex_from);
282 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
283 if keep > 0 {
284 reused_tokens.extend_from_slice(&cached[..keep]);
285 }
286 }
287 else {
288 // Fallback for caches that don't support slice access
289 let count = cache.count_tokens();
290 for i in 0..count {
291 let Some(token) = cache.get_token(i)
292 else {
293 break;
294 };
295 if token.span.end <= relex_from {
296 reused_tokens.push(token);
297 }
298 else {
299 break;
300 }
301 }
302 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
303 reused_tokens.truncate(keep);
304 }
305
306 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
307 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
308 }
309
    /// Gets the remaining text from the current position (for chunked sources this
    /// may cover only the current chunk rather than the full remainder).
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
315 pub fn rest(&mut self) -> &str {
316 self.cursor.rest()
317 }
318
319 /// Gets the remaining text as a byte slice.
320 #[inline]
321 pub fn rest_bytes(&mut self) -> &[u8] {
322 self.cursor.rest().as_bytes()
323 }
324
    /// Checks whether the cursor has reached the end of the source, i.e. the
    /// tokens reused (or already produced) cover the entire input.
326 pub fn fully_reused(&self) -> bool {
327 self.cursor.position() >= self.cursor.source().length()
328 }
329
330 /// Gets the current byte offset position in the source text.
331 ///
332 /// # Returns
333 ///
334 /// The current byte offset from the start of the source text
335 #[inline]
336 pub fn get_position(&self) -> usize {
337 self.cursor.position()
338 }
339
340 /// Gets the total length of the source text in bytes.
341 #[inline]
342 pub fn get_length(&self) -> usize {
343 self.cursor.source().length()
344 }
345
346 /// Gets a single character at the specified byte offset.
347 #[inline]
348 pub fn get_char_at(&self, offset: usize) -> Option<char> {
349 self.cursor.source().get_char_at(offset)
350 }
351
352 /// Peeks at the next byte without advancing.
353 #[inline]
354 pub fn peek_byte(&mut self) -> Option<u8> {
355 self.cursor.peek_byte()
356 }
357
358 /// Advances the cursor by one byte and returns it.
359 #[inline]
360 pub fn advance_byte(&mut self) -> Option<u8> {
361 self.cursor.advance_byte()
362 }
363
364 /// Advances the cursor while the byte predicate is true.
365 #[inline]
366 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
367 self.cursor.take_while_byte(pred)
368 }
369
370 /// Skips common ASCII whitespace using SIMD if possible.
371 #[inline]
372 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
373 self.cursor.skip_ascii_whitespace()
374 }
375
376 /// Skips all ASCII digits at the current position.
377 #[inline]
378 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
379 self.cursor.skip_ascii_digits()
380 }
381
382 /// Skips all characters that can continue an ASCII identifier.
383 #[inline]
384 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
385 self.cursor.skip_ascii_ident_continue()
386 }
387
388 /// Skips all characters until the target byte is encountered.
389 #[inline]
390 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
391 self.cursor.skip_until(target)
392 }
393
394 /// Scans an ASCII identifier (starts with alpha/_, continues with alphanumeric/_).
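    ///
    /// A usage sketch, assuming the `SimpleLanguage` setup from the other examples
    /// in this module (with its `Identifier` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("hello world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_ascii_identifier(SimpleToken::Identifier));
    /// assert_eq!(state.get_tokens()[0].span, Range { start: 0, end: 5 });
    /// ```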
395 #[inline]
396 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
397 let start = self.get_position();
398 if let Some(b) = self.peek_byte() {
399 if b == b'_' || b.is_ascii_alphabetic() {
400 self.advance_byte();
401 self.skip_ascii_ident_continue();
402 self.add_token(kind, start, self.get_position());
403 return true;
404 }
405 }
406 false
407 }
408
409 /// Scans a line comment starting with the given prefix.
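    ///
    /// A usage sketch, assuming the `SimpleLanguage` setup from the other examples
    /// (with a hypothetical `Comment` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("// note\nnext");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_line_comment(SimpleToken::Comment, "//"));
    /// // The comment token ends at the newline, which is not consumed.
    /// assert_eq!(state.get_position(), 7);
    /// ```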
410 #[inline]
411 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
412 let start = self.get_position();
413 if self.consume_if_starts_with(prefix) {
414 self.skip_until(b'\n');
415 self.add_token(kind, start, self.get_position());
416 return true;
417 }
418 false
419 }
420
421 /// Scans a block comment with given start and end sequences.
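    ///
    /// A usage sketch with C-style delimiters, assuming the `SimpleLanguage` setup
    /// from the other examples (with a hypothetical `Comment` token variant):
    ///
    /// ```ignore
    /// let source = SourceText::new("/* note */ rest");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_block_comment(SimpleToken::Comment, "/*", "*/"));
    /// // The token spans the whole comment, including the closing delimiter.
    /// assert_eq!(state.get_position(), 10);
    /// ```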
422 #[inline]
423 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
424 let start = self.get_position();
425 if self.consume_if_starts_with(start_seq) {
426 while let Some(_b) = self.peek_byte() {
427 self.skip_until(end_seq.as_bytes()[0]);
428 if self.consume_if_starts_with(end_seq) {
429 self.add_token(kind, start, self.get_position());
430 return true;
431 }
432 self.advance_byte();
433 }
434 // Unclosed block comment is still a comment in many languages,
435 // but we might want to add an error here in the future.
436 self.add_token(kind, start, self.get_position());
437 return true;
438 }
439 false
440 }
441
442 /// Gets a reference to the tokens collected so far.
443 ///
444 /// # Returns
445 ///
446 /// A slice of tokens collected during the lexing process
447 #[inline]
448 pub fn tokens(&self) -> &[Token<L::TokenType>] {
449 &self.tokens
450 }
451
452 /// Sets the current position to the specified byte offset.
453 ///
454 /// # Arguments
455 ///
456 /// * `offset` - The new byte offset position
457 ///
458 /// # Returns
459 ///
460 /// The previous byte offset position
461 #[inline]
462 pub fn set_position(&mut self, offset: usize) -> usize {
463 self.cursor.set_position(offset)
464 }
465
466 /// Returns a reference to the underlying source.
467 pub fn source(&self) -> &'s S {
468 self.cursor.source()
469 }
470
471 /// Returns the text in the specified range.
472 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
473 self.cursor.source().get_text_in(range)
474 }
475
476 /// Returns the text from the specified offset to the end.
477 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
478 self.cursor.source().get_text_from(offset)
479 }
480
481 /// Checks if the source starts with the given pattern at the current position.
482 pub fn starts_with(&mut self, pattern: &str) -> bool {
483 self.cursor.starts_with(pattern)
484 }
485
486 /// Consumes the pattern if it exists at the current position.
487 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
488 self.cursor.consume_if_starts_with(pattern)
489 }
490
491 /// Gets the tokens collected so far in the lexer state.
492 ///
493 /// # Returns
494 ///
495 /// A slice of tokens collected during lexing
496 #[inline]
497 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
498 &self.tokens
499 }
500
501 /// Adds an error to the lexer state.
502 ///
503 /// # Arguments
504 ///
505 /// * `error` - The error to add to the diagnostics
506 #[inline]
507 pub fn add_error(&mut self, error: impl Into<OakError>) {
508 self.errors.push(error.into());
509 }
510
511 /// Adds a token to the lexer state.
512 ///
513 /// # Arguments
514 ///
515 /// * `kind` - The kind of the token
516 /// * `start` - The starting byte offset of the token
517 /// * `end` - The ending byte offset of the token
518 #[inline]
519 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
520 self.tokens.push(Token { kind, span: Range { start, end } });
521 }
522
523 /// Adds an end-of-file token to the lexer state.
524 ///
525 /// This method creates and adds an END_OF_STREAM token at the current position.
526 /// It's typically called when the lexer reaches the end of the source text
527 /// to mark the termination of the token stream.
528 ///
529 /// # Examples
530 ///
531 /// ```ignore
532 /// #![feature(new_range_api)]
533 /// # use core::range::Range;
534 /// # use oak_core::lexer::{LexerState, Token};
535 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
536 /// #
537 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
538 /// # enum SimpleToken {
539 /// # End,
540 /// # }
541 /// #
542 /// # impl TokenType for SimpleToken {
543 /// # const END_OF_STREAM: Self = SimpleToken::End;
544 /// # type Role = UniversalTokenRole;
545 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
546 /// # }
547 /// #
548 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
549 /// # enum SimpleElement {}
550 /// #
551 /// # impl ElementType for SimpleElement {
552 /// # type Role = UniversalElementRole;
553 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
554 /// # }
555 /// #
556 /// # #[derive(Clone)]
557 /// # struct SimpleLanguage;
558 /// #
559 /// # impl Language for SimpleLanguage {
560 /// # const NAME: &'static str = "simple";
561 /// # type TokenType = SimpleToken;
562 /// # type ElementType = SimpleElement;
563 /// # type TypedRoot = ();
564 /// # }
565 /// #
566 /// let source = SourceText::new("test");
567 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
568 /// state.take_while(|_| true); // Advance to end
569 /// state.add_eof();
570 ///
571 /// assert_eq!(state.tokens().len(), 1);
572 /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
573 /// ```
574 #[inline]
575 pub fn add_eof(&mut self) {
576 let end = self.get_position();
577 self.add_token(L::TokenType::END_OF_STREAM, end, end);
578 }
579
580 /// Gets the current character at the current position.
581 ///
582 /// # Returns
583 ///
584 /// The current character, or `None` if at the end of the source
585 #[inline]
586 pub fn current(&mut self) -> Option<char> {
587 self.cursor.peek_char()
588 }
589
590 /// Peeks at the next character without advancing the position.
591 ///
592 /// # Returns
593 ///
594 /// The next character, or `None` if at the end of the source
595 #[inline]
596 pub fn peek(&mut self) -> Option<char> {
597 self.cursor.peek_char()
598 }
599
600 /// Peeks at the character n positions ahead without advancing the position.
601 ///
602 /// # Arguments
603 ///
604 /// * `n` - The number of characters to peek ahead
605 ///
606 /// # Returns
607 ///
608 /// The character n positions ahead, or `None` if beyond the end of the source
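    ///
    /// # Examples
    ///
    /// A sketch, assuming the `SimpleLanguage` setup from the other examples:
    ///
    /// ```ignore
    /// let source = SourceText::new("abc");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert_eq!(state.peek(), Some('a'));
    /// assert_eq!(state.peek_next_n(1), Some('b'));
    /// assert_eq!(state.peek_next_n(5), None);
    /// ```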
609 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
610 if n == 0 {
611 return self.peek();
612 }
613
614 // Fast path: check current chunk
615 let rest = self.cursor.rest();
616 if let Some(ch) = rest.chars().nth(n) {
617 return Some(ch);
618 }
619
620 // Slow path: cross chunk
621 let mut count = 0;
622 let mut offset = self.cursor.position();
623 let end = self.get_length();
624
625 while offset < end {
626 let chunk = self.source().chunk_at(offset);
627 let text = chunk.slice_from(offset);
628 for ch in text.chars() {
629 if count == n {
630 return Some(ch);
631 }
632 count += 1;
633 }
634 offset = chunk.end();
635 }
636
637 None
638 }
639
640 /// Advances the position by the specified number of bytes.
641 ///
642 /// This method moves the lexer's current position forward by the specified
643 /// number of bytes. It's commonly used after recognizing a token to move
644 /// past the token's characters.
645 ///
646 /// # Arguments
647 ///
648 /// * `length` - The number of bytes to advance
649 ///
650 /// # Returns
651 ///
652 /// The new byte offset position after advancing
653 #[inline]
654 pub fn advance(&mut self, length: usize) -> usize {
655 self.cursor.advance_bytes(length)
656 }
657
658 /// Advances the position by the current character's length.
659 ///
660 /// # Returns
661 ///
662 /// The character that was skipped, or `None` if at the end of the source
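    ///
    /// A sketch, assuming the `SimpleLanguage` setup from the other examples:
    ///
    /// ```ignore
    /// let source = SourceText::new("héllo");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert_eq!(state.bump(), Some('h'));
    /// assert_eq!(state.bump(), Some('é')); // advances the position by two bytes
    /// assert_eq!(state.get_position(), 3);
    /// ```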
663 #[inline]
664 pub fn bump(&mut self) -> Option<char> {
665 let ch = self.peek()?;
666 self.advance(ch.len_utf8());
667 Some(ch)
668 }
669
670 /// Advances the position by the token's length and adds the token to the lexer state.
671 ///
672 /// This method combines two common operations: advancing the lexer position
673 /// and adding a token to the token list. It calculates the advance distance
674 /// from the token's span, ensuring consistent positioning.
675 ///
676 /// # Arguments
677 ///
678 /// * `token` - The token to add to the lexer state
679 ///
680 /// # Returns
681 ///
682 /// The new byte offset position after advancing
683 ///
684 /// # Examples
685 ///
686 /// ```ignore
687 /// #![feature(new_range_api)]
688 /// # use core::range::Range;
689 /// # use oak_core::lexer::{LexerState, Token};
690 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
692 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
693 /// # enum SimpleToken { Identifier, End }
694 /// #
695 /// # impl TokenType for SimpleToken {
696 /// # const END_OF_STREAM: Self = SimpleToken::End;
697 /// # type Role = UniversalTokenRole;
698 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
699 /// # }
700 /// #
701 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
702 /// # enum SimpleElement {}
703 /// #
704 /// # impl ElementType for SimpleElement {
705 /// # type Role = UniversalElementRole;
706 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
707 /// # }
708 /// #
709 /// # #[derive(Clone)]
710 /// # struct SimpleLanguage;
711 /// #
712 /// # impl Language for SimpleLanguage {
713 /// # const NAME: &'static str = "simple";
714 /// # type TokenType = SimpleToken;
715 /// # type ElementType = SimpleElement;
716 /// # type TypedRoot = ();
717 /// # }
718 /// #
719 /// let source = SourceText::new("hello world");
720 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
721 ///
722 /// // Create a token for "hello"
723 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
724 ///
725 /// // Initially at position 0
726 /// assert_eq!(state.get_position(), 0);
727 ///
728 /// // Advance and add the token
729 /// let new_pos = state.advance_with(token);
730 ///
731 /// // Now at position 5 and token is added
732 /// assert_eq!(new_pos, 5);
733 /// assert_eq!(state.get_position(), 5);
734 /// assert_eq!(state.get_tokens().len(), 1);
735 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
736 /// ```
737 ///
738 /// # Note
739 ///
740 /// The caller must ensure that the token's span is valid and that the advance
741 /// does not split multi-byte UTF-8 characters. The token should be created
742 /// with proper character boundaries.
743 #[inline]
744 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
745 self.cursor.advance_bytes(token.length());
746 self.tokens.push(token);
747 self.cursor.position()
748 }
749
750 /// Consumes characters while the predicate returns true, returning the consumed range.
751 ///
752 /// This method iterates through the source text from the current position,
753 /// consuming characters as long as the predicate function returns true.
754 /// It's commonly used for recognizing patterns like identifiers, numbers,
755 /// or whitespace sequences.
756 ///
757 /// # Arguments
758 ///
759 /// * `pred` - A closure that takes a character and returns true if the character
760 /// should be consumed, false otherwise
761 ///
762 /// # Returns
763 ///
764 /// A byte range representing the span of consumed characters
765 ///
766 /// # Examples
767 ///
768 /// ```ignore
769 /// #![feature(new_range_api)]
770 /// # use core::range::Range;
771 /// # use oak_core::lexer::{LexerState, Token};
772 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
774 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
775 /// # enum SimpleToken { End }
776 /// #
777 /// # impl TokenType for SimpleToken {
778 /// # const END_OF_STREAM: Self = SimpleToken::End;
779 /// # type Role = UniversalTokenRole;
780 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
781 /// # }
782 /// #
783 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
784 /// # enum SimpleElement {}
785 /// #
786 /// # impl ElementType for SimpleElement {
787 /// # type Role = UniversalElementRole;
788 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
789 /// # }
790 /// #
791 /// # #[derive(Clone)]
792 /// # struct SimpleLanguage;
793 /// #
794 /// # impl Language for SimpleLanguage {
795 /// # const NAME: &'static str = "simple";
796 /// # type TokenType = SimpleToken;
797 /// # type ElementType = SimpleElement;
798 /// # type TypedRoot = ();
799 /// # }
800 /// #
801 /// let source = SourceText::new("hello123world");
802 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
803 ///
804 /// // Consume alphabetic characters
805 /// let range = state.take_while(|c| c.is_alphabetic());
806 ///
807 /// // Should have consumed "hello"
808 /// assert_eq!(range, Range { start: 0, end: 5 });
809 /// assert_eq!(state.get_position(), 5);
810 ///
811 /// // Consume numeric characters
812 /// let range = state.take_while(|c| c.is_numeric());
813 ///
814 /// // Should have consumed "123"
815 /// assert_eq!(range, Range { start: 5, end: 8 });
816 /// assert_eq!(state.get_position(), 8);
817 /// ```
818 ///
819 /// # Performance Note
820 ///
821 /// This method operates on a character-by-character basis, which means it
822 /// correctly handles multi-byte UTF-8 characters. For performance-critical
823 /// code, consider using byte-based methods when working with ASCII-only text.
824 pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
825 let start = self.cursor.position();
826 while let Some(ch) = self.peek() {
827 if pred(ch) {
828 self.advance(ch.len_utf8());
829 }
830 else {
831 break;
832 }
833 }
834 Range { start, end: self.cursor.position() }
835 }
836
837 /// Checks if the lexer has not reached the end of the source text.
838 ///
839 /// # Returns
840 ///
841 /// `true` if not at the end of the source, `false` otherwise
842 #[inline]
843 pub fn not_at_end(&self) -> bool {
844 self.cursor.position() < self.cursor.source().length()
845 }
846
847 /// Performs a safety check to prevent infinite loops during lexing.
848 ///
849 /// This method ensures that the lexer always makes progress by forcing
850 /// advancement when stuck at the same position. It's used as a safeguard
851 /// against infinite loops in lexer implementations.
852 ///
853 /// The method compares the current position with a previously saved "safe point"
854 /// position. If they're the same, it means the lexer hasn't made progress since
855 /// that safe point, potentially indicating an infinite loop. In this case, the
856 /// method forces advancement by at least one character.
857 ///
858 /// # Arguments
859 ///
860 /// * `safe_point` - The position to check against for potential deadlock
861 ///
862 /// # Examples
863 ///
864 /// ```ignore
865 /// #![feature(new_range_api)]
866 /// # use oak_core::lexer::{LexerState, Token};
867 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
869 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
870 /// # enum SimpleToken { End }
871 /// #
872 /// # impl TokenType for SimpleToken {
873 /// # const END_OF_STREAM: Self = SimpleToken::End;
874 /// # type Role = UniversalTokenRole;
875 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
876 /// # }
877 /// #
878 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
879 /// # enum SimpleElement {}
880 /// #
881 /// # impl ElementType for SimpleElement {
882 /// # type Role = UniversalElementRole;
883 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
884 /// # }
885 /// #
886 /// # struct SimpleLanguage;
887 /// #
888 /// # impl Language for SimpleLanguage {
889 /// # const NAME: &'static str = "simple";
890 /// # type TokenType = SimpleToken;
891 /// # type ElementType = SimpleElement;
892 /// # type TypedRoot = ();
893 /// # }
894 /// #
895 /// let source = SourceText::new("test");
896 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
897 ///
898 /// // Save the current position as a safe point
899 /// let safe_point = state.get_position();
900 ///
901 /// // In a real lexer, you would do some processing here
902 /// // If something went wrong and we didn't advance, this would prevent infinite loop
903 /// state.advance_if_dead_lock(safe_point);
904 ///
905 /// // If we were stuck, we would have advanced by at least 1
906 /// assert!(state.get_position() >= safe_point);
907 /// ```
908 ///
909 /// # Usage in Lexer Implementations
910 ///
911 /// This method is typically used at the beginning or end of lexing loops:
912 ///
913 /// ```ignore
914 /// loop {
915 /// let safe_point = state.get_position();
916 ///
917 /// // Try to recognize a token
918 /// if let Some(token) = try_recognize_token(&mut state) {
919 /// // Success, continue loop
920 /// continue;
921 /// }
922 ///
923 /// // If we get here, we didn't recognize anything
924 /// // This prevents infinite loops if recognition fails
925 /// state.advance_if_dead_lock(safe_point);
926 ///
927 /// if state.not_at_end() {
928 /// // Continue trying to recognize tokens
929 /// continue;
930 /// } else {
931 /// // Reached end of source
932 /// break;
933 /// }
934 /// }
935 /// ```
936 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
937 // Force advance if no progress was made
938 if self.cursor.position() == safe_point {
939 if let Some(ch) = self.current() {
940 // Skip current character
941 self.advance(ch.len_utf8());
942 }
943 else {
944 // Advance anyway to prevent infinite loop
945 self.advance(1);
946 }
947 // tracing::warn!("deadlock");
948 }
949 }
950
951 /// Finishes lexing and returns the final output with tokens and diagnostics.
952 ///
953 /// This method concludes the lexing process by converting the collected tokens
954 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
955 /// represents the overall success or failure of the lexing operation.
956 ///
957 /// If the result is `Ok`, the tokens are returned as the successful result.
958 /// If the result is `Err`, the error is returned as the failure result.
959 /// In both cases, any collected diagnostic errors are included in the output.
960 ///
961 /// # Arguments
962 ///
963 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
964 ///
965 /// # Returns
966 ///
967 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
968 ///
969 /// # Examples
970 ///
971 /// ```
972 /// #![feature(new_range_api)]
973 /// # use oak_core::lexer::{LexerState, Token};
974 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
976 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
977 /// # enum SimpleToken { Identifier, End }
978 /// #
979 /// # impl TokenType for SimpleToken {
980 /// # const END_OF_STREAM: Self = SimpleToken::End;
981 /// # type Role = UniversalTokenRole;
982 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
983 /// # }
984 /// #
985 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
986 /// # enum SimpleElement {}
987 /// #
988 /// # impl ElementType for SimpleElement {
989 /// # type Role = UniversalElementRole;
990 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
991 /// # }
992 /// #
993 /// # struct SimpleLanguage;
994 /// #
995 /// # impl Language for SimpleLanguage {
996 /// # const NAME: &'static str = "simple";
997 /// # type TokenType = SimpleToken;
998 /// # type ElementType = SimpleElement;
999 /// # type TypedRoot = ();
1000 /// # }
1001 /// #
1002 /// let source = SourceText::new("test");
1003 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1004 ///
1005 /// // Add some tokens during lexing
1006 /// state.add_token(SimpleToken::Identifier, 0, 4);
1007 ///
1008 /// // Finish with successful result
1009 /// let output = state.finish(Ok(()));
1010 ///
1011 /// // Check the results
1012 /// assert!(output.result.is_ok());
1013 /// assert_eq!(output.result.unwrap().len(), 1);
1014 /// assert_eq!(output.diagnostics.len(), 0);
1015 ///
1016 /// // Example with error
1017 /// let source2 = SourceText::new("test");
1018 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1019 /// state2.add_error(OakError::custom_error("Test error"));
1020 ///
1021 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1022 ///
1023 /// // Check the results
1024 /// assert!(output2.result.is_err());
1025 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1026 /// ```
1027 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1028 match result {
1029 Ok(_) => {
1030 let tokens: Tokens<L> = self.tokens.into();
1031 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1032 }
1033 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1034 }
1035 }
1036
1037 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1038 ///
1039 /// This method is similar to `finish` but additionally updates the incremental cache
1040 /// with the new tokens. It's used for incremental lexing where the results need to
1041 /// be cached for future reuse when the source text changes.
1042 ///
    /// The method first creates the output in the same way as `finish`, then stores
    /// it in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
1046 ///
1047 /// # Arguments
1048 ///
1049 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1050 /// * `cache` - The incremental cache to update with the new tokens
1051 ///
1052 /// # Returns
1053 ///
1054 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1055 ///
1056 /// # Examples
1057 ///
1058 /// ```ignore
1059 /// #![feature(new_range_api)]
1060 /// # use core::range::Range;
1061 /// # use oak_core::lexer::{LexerState, Token};
1062 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1063 /// # use oak_core::parser::session::ParseSession;
1064 /// #
1065 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1066 /// # enum SimpleToken { Identifier, End }
1067 /// #
1068 /// # impl TokenType for SimpleToken {
1069 /// # const END_OF_STREAM: Self = SimpleToken::End;
1070 /// # type Role = UniversalTokenRole;
1071 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1072 /// # }
1073 /// #
1074 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1075 /// # enum SimpleElement {}
1076 /// #
1077 /// # impl ElementType for SimpleElement {
1078 /// # type Role = UniversalElementRole;
1079 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1080 /// # }
1081 /// #
1082 /// # struct SimpleLanguage;
1083 /// #
1084 /// # impl Language for SimpleLanguage {
1085 /// # const NAME: &'static str = "simple";
1086 /// # type TokenType = SimpleToken;
1087 /// # type ElementType = SimpleElement;
1088 /// # type TypedRoot = ();
1089 /// # }
1090 /// #
1091 /// let source = SourceText::new("test");
1092 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1093 ///
1094 /// // Create a cache for incremental lexing
1095 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1096 ///
1097 /// // Add some tokens during lexing
1098 /// state.add_token(SimpleToken::Identifier, 0, 4);
1099 ///
1100 /// // Finish with cache update
1101 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1102 ///
1103 /// // Check the results
1104 /// assert!(output.result.is_ok());
1105 /// assert_eq!(output.result.unwrap().len(), 1);
1106 /// ```
1107 ///
1108 /// # Incremental Lexing Workflow
1109 ///
1110 /// This method is typically used as part of an incremental lexing workflow:
1111 ///
1112 /// ```ignore
1113 /// // First lexing
1114 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1115 /// // ... lexing logic ...
1116 /// let output = state.finish_with_cache(Ok(()), cache);
1117 ///
1118 /// // Later, when source changes
1119 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1120 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1121 /// // ... lexing logic (reusing unchanged tokens) ...
1122 /// let output = state.finish_with_cache(Ok(()), cache);
1123 /// ```
1124 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1125 let out = self.finish(result);
1126 cache.set_lex_output(out.clone());
1127 out
1128 }
1129}