oak_core/lexer/mod.rs
#![doc = include_str!("readme.md")]

use crate::{
    Language, TextEdit, TokenType,
    errors::{OakDiagnostics, OakError},
    source::{Source, SourceCursor},
};
pub use core::range::Range;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use triomphe::Arc;

/// Utilities for scanning comments.
pub mod scan_comment;
/// Utilities for scanning identifiers.
pub mod scan_identifier;
/// Utilities for scanning numbers.
pub mod scan_number;
/// Utilities for scanning string literals.
pub mod scan_string;
/// Utilities for scanning whitespace.
pub mod scan_white_space;

pub use scan_comment::CommentConfig;
pub use scan_string::StringConfig;
pub use scan_white_space::WhitespaceConfig;

/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens. Diagnostic messages produced during
/// lexing are carried separately by [`LexOutput`].
pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;

/// Output type for lexical analysis operations, including diagnostics.
pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;

/// Trait for tokenizing source code into sequences of tokens.
///
/// This trait defines the interface for converting source text into a sequence of
/// tokens that can be consumed by the parser. Implementations should handle
/// the specific lexical rules of their target language.
///
/// # Examples
///
/// ```ignore
/// struct MyLexer;
///
/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
/// enum MyToken {
///     Number,
///     Identifier,
///     End,
/// }
///
/// impl TokenType for MyToken {
///     const END_OF_STREAM: Self = MyToken::End;
///     type Role = UniversalTokenRole;
///     fn role(&self) -> Self::Role { UniversalTokenRole::None }
/// }
///
/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
/// enum MyElement {}
///
/// impl ElementType for MyElement {
///     type Role = UniversalElementRole;
///     fn role(&self) -> Self::Role { UniversalElementRole::None }
/// }
///
/// struct MyLanguage;
///
/// impl Language for MyLanguage {
///     const NAME: &'static str = "my-language";
///     type TokenType = MyToken;
///     type ElementType = MyElement;
///     type TypedRoot = ();
/// }
///
/// impl Lexer<MyLanguage> for MyLexer {
///     fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
///         // Tokenization logic here
///         todo!()
///     }
/// }
/// ```
pub trait Lexer<L: Language + Send + Sync> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs lexical analysis of the source text. When the
    /// cache is empty, it lexes from scratch; otherwise implementations may
    /// reuse cached tokens that are unaffected by the given edits.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the previous lex (empty for a full lex)
    /// * `cache` - The cache holding results from previous lexical analysis
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
}

/// Cache trait for lexical results.
///
/// This trait defines the interface for caching and accessing lexical analysis results.
/// It provides methods for storing and retrieving token information from previous
/// lexical analysis operations.
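///
/// # Examples
///
/// A minimal sketch of a single-slot cache. `MyLanguage` and `MyToken` are
/// placeholder types for illustration, not part of this crate:
///
/// ```ignore
/// #[derive(Default)]
/// struct SimpleCache {
///     last: Option<LexOutput<MyLanguage>>,
/// }
///
/// impl LexerCache<MyLanguage> for SimpleCache {
///     fn set_lex_output(&mut self, output: LexOutput<MyLanguage>) {
///         self.last = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<MyToken>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<MyToken>]> {
///         // Only a successful previous lex yields reusable tokens.
///         self.last.as_ref()?.result.as_ref().ok().map(|arc| &**arc)
///     }
/// }
/// ```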
#[allow(unused_variables)]
pub trait LexerCache<L: Language> {
    /// Sets the lexed output in the cache.
    ///
    /// # Arguments
    ///
    /// * `output` - The output from lexical analysis, including tokens and diagnostics
    fn set_lex_output(&mut self, output: LexOutput<L>);

    /// Gets a token from the cache by index.
    ///
    /// # Arguments
    ///
    /// * `index` - The index of the token to retrieve
    ///
    /// # Returns
    ///
    /// An `Option<Token<L::TokenType>>` containing the token if it exists,
    /// or `None` if the index is out of bounds or no tokens are cached
    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;

    /// Gets the total number of tokens in the cache.
    ///
    /// # Returns
    ///
    /// The number of cached tokens, or 0 if no tokens are cached
    fn count_tokens(&self) -> usize;

    /// Checks if the cache contains any tokens.
    ///
    /// # Returns
    ///
    /// `true` if the cache contains tokens, `false` otherwise
    fn has_tokens(&self) -> bool;

    /// Gets all cached tokens as a slice.
    ///
    /// # Returns
    ///
    /// An optional slice of tokens if available.
    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        None
    }
}

impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
    fn set_lex_output(&mut self, output: LexOutput<L>) {
        (**self).set_lex_output(output)
    }

    fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
        (**self).get_token(index)
    }

    fn count_tokens(&self) -> usize {
        (**self).count_tokens()
    }

    fn has_tokens(&self) -> bool {
        (**self).has_tokens()
    }

    fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
        (**self).get_tokens()
    }
}

/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text with their position information.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
    pub span: Range<usize>,
}

impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
    #[inline]
    pub fn length(&self) -> usize {
        self.span.end - self.span.start
    }
}

/// A stream of tokens with associated source text.
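///
/// # Examples
///
/// A minimal construction sketch; the `&'static str` token kind stands in
/// for a real `TokenType` implementation:
///
/// ```ignore
/// let tokens: Arc<[Token<&str>]> = Arc::from_iter([Token { kind: "ident", span: Range { start: 0, end: 5 } }]);
/// let stream = TokenStream { raw: String::from("hello"), tokens };
/// assert_eq!(stream.tokens.len(), 1);
/// assert_eq!(&stream.raw[stream.tokens[0].span.start..stream.tokens[0].span.end], "hello");
/// ```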
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
pub struct TokenStream<K: Copy> {
    /// The raw source text.
    pub raw: String,
    /// The tokens extracted from the source text.
    #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
    pub tokens: Arc<[Token<K>]>,
}

#[cfg(feature = "serde")]
mod arc_slice_serde {
    use super::*;
    use serde::{Deserialize, Deserializer, Serialize, Serializer};

    pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
    where
        K: Serialize,
        S: Serializer,
    {
        arc.as_ref().serialize(serializer)
    }

    pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
    where
        K: Deserialize<'de>,
        D: Deserializer<'de>,
    {
        let vec = Vec::<Token<K>>::deserialize(deserializer)?;
        Ok(Arc::from_iter(vec))
    }
}

/// State information for incremental lexical analysis.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    pub(crate) cursor: SourceCursor<'s, S>,
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    pub(crate) errors: Vec<OakError>,
}

impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
    /// Creates a new lexer state with the given source text.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source
    pub fn new(source: &'s S) -> Self {
        Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
    }

    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized with any cached tokens that can be safely reused
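    ///
    /// # Examples
    ///
    /// A usage sketch, assuming `cache` was populated by a previous call to
    /// [`LexerState::finish_with_cache`]:
    ///
    /// ```ignore
    /// // An edit touched nothing before byte offset 10, so tokens that end
    /// // at or before that offset may be reused.
    /// let mut state = LexerState::<_, MyLanguage>::new_with_cache(&source, 10, &cache);
    /// // The cursor resumes at the end of the last reused token.
    /// assert!(state.get_position() <= 10);
    /// ```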
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let len = source.length();
        let relex_from = relex_from.min(len);

        // Fast path: fully re-used
        if relex_from >= len {
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                tokens.extend_from_slice(cached)
            }
            else {
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
        }

        if relex_from == 0 {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let mut reused_tokens = Vec::new();
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // Binary search for the cut-off point since tokens are sorted by position
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Fallback for caches that don't support slice access
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
    }

    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }

    /// Gets the remaining text as a byte slice.
    ///
    /// Useful for byte-oriented scanning operations.
    #[inline]
    pub fn rest_bytes(&mut self) -> &[u8] {
        self.cursor.rest().as_bytes()
    }

    /// Checks if the lexer has consumed all input from the source.
    ///
    /// Returns `true` if the current position is at or beyond the end of the source.
    pub fn fully_reused(&self) -> bool {
        self.cursor.position() >= self.cursor.source().length()
    }

    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text.
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }

    /// Checks if the lexer has NOT consumed all input from the source.
    ///
    /// Returns `true` if there are still bytes left to be scanned.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.cursor.position() < self.cursor.source().length()
    }

    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Peeks at the character immediately following the current character.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }

    /// Peeks at the character at the specified byte offset relative to the current position.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }

    /// Advances the cursor by the specified number of bytes.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }

    /// Gets the total length of the source text in bytes.
    #[inline]
    pub fn get_length(&self) -> usize {
        self.cursor.source().length()
    }

    /// Gets a single character at the specified absolute byte offset.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }

    /// Peeks at the next byte without advancing the cursor.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }

    /// Advances the cursor by one byte and returns it.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }

    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
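    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` is a `LexerState` positioned at the start
    /// of the text `"42abc"`:
    ///
    /// ```ignore
    /// let range = state.take_while_byte(|b| b.is_ascii_digit());
    /// assert_eq!(range, Range { start: 0, end: 2 }); // consumed "42"
    /// ```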
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }

    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace.
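    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"  \t x"`:
    ///
    /// ```ignore
    /// let range = state.skip_ascii_whitespace();
    /// assert_eq!(range, Range { start: 0, end: 4 });
    /// assert_eq!(state.peek(), Some('x'));
    /// ```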
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }

    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits.
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }

    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }

    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed.
    /// Returns the range of the skipped characters.
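    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"abc\ndef"`:
    ///
    /// ```ignore
    /// let range = state.skip_until(b'\n');
    /// assert_eq!(range, Range { start: 0, end: 3 });
    /// assert_eq!(state.peek(), Some('\n')); // the target byte is not consumed
    /// ```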
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }

    /// Scans an ASCII identifier.
    ///
    /// An identifier must start with an alphabetic character or an underscore,
    /// and can be followed by any number of alphanumeric characters or underscores.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type to assign if an identifier is found.
    ///
    /// # Returns
    ///
    /// `true` if an identifier was successfully scanned and added.
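    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"foo_1 bar"`
    /// and `MyToken::Identifier` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_ascii_identifier(MyToken::Identifier));
    /// // The identifier token covers "foo_1".
    /// assert_eq!(state.get_tokens().last().unwrap().span, Range { start: 0, end: 5 });
    /// ```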
    #[inline]
    pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
        let start = self.get_position();
        if let Some(b) = self.peek_byte() {
            if b == b'_' || b.is_ascii_alphabetic() {
                self.advance_byte();
                self.skip_ascii_ident_continue();
                self.add_token(kind, start, self.get_position());
                return true;
            }
        }
        false
    }

    /// Scans a line comment starting with the given prefix.
    ///
    /// Consumes the prefix and all characters until the next newline or EOF.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the line comment.
    /// * `prefix` - The string sequence that starts the comment (e.g., "//").
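    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"// hi\nlet"`
    /// and `MyToken::Comment` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_line_comment(MyToken::Comment, "//"));
    /// // The comment token covers "// hi"; the newline is left unconsumed.
    /// assert_eq!(state.get_position(), 5);
    /// ```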
    #[inline]
    pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(prefix) {
            self.skip_until(b'\n');
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }

    /// Scans a block comment with given start and end sequences.
    ///
    /// This implementation is non-recursive and does not track nesting depth;
    /// scanning ends at the first occurrence of `end_seq`.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
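    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"/* hi */ let"`
    /// and `MyToken::Comment` is a token type of the host language:
    ///
    /// ```ignore
    /// assert!(state.scan_block_comment(MyToken::Comment, "/*", "*/"));
    /// assert_eq!(state.get_position(), 8); // cursor is past the closing "*/"
    /// ```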
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                self.advance_byte();
            }
            // Unclosed block comment is still a comment in many languages,
            // but we might want to add an error here in the future.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }

    /// Gets a reference to the tokens collected so far.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during the lexing process.
    #[inline]
    pub fn tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }

    /// Returns a reference to the underlying source.
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }

    /// Returns the text in the specified byte range.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }

    /// Returns the text from the specified byte offset to the end of the source.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }

    /// Checks if the source starts with the given pattern at the current position.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }

    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor.
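    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"->x"`:
    ///
    /// ```ignore
    /// assert!(state.consume_if_starts_with("->"));
    /// assert_eq!(state.get_position(), 2);
    /// assert!(!state.consume_if_starts_with("->")); // no match, cursor unchanged
    /// ```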
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }

    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Adds an error to the lexer state's diagnostics.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }

    /// Adds a token to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `kind` - The kind/type of the token.
    /// * `start` - The starting byte offset.
    /// * `end` - The ending byte offset.
    #[inline]
    pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
        self.tokens.push(Token { kind, span: Range { start, end } });
    }

    /// Adds an end-of-file (EOF) token to the lexer state.
    ///
    /// This method creates and adds an `END_OF_STREAM` token at the current position.
    /// It is typically called when the lexer reaches the end of the source text.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken {
    /// #     End,
    /// # }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// state.take_while(|_| true); // Advance to end
    /// state.add_eof();
    ///
    /// assert_eq!(state.tokens().len(), 1);
    /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
    /// ```
    #[inline]
    pub fn add_eof(&mut self) {
        let end = self.get_position();
        self.add_token(L::TokenType::END_OF_STREAM, end, end)
    }

    /// Gets the current character at the current position.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Advances the position by the current character's length.
    ///
    /// # Returns
    ///
    /// The character that was skipped, or `None` if at the end of the source
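    ///
    /// # Examples
    ///
    /// A sketch, assuming `state` starts at the beginning of `"héllo"`:
    ///
    /// ```ignore
    /// assert_eq!(state.bump(), Some('h'));
    /// assert_eq!(state.bump(), Some('é')); // advances by the char's UTF-8 length (2 bytes)
    /// assert_eq!(state.get_position(), 3);
    /// ```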
    #[inline]
    pub fn bump(&mut self) -> Option<char> {
        let ch = self.peek()?;
        self.advance(ch.len_utf8());
        Some(ch)
    }

    /// Advances the position by the token's length and adds the token to the lexer state.
    ///
    /// This method combines two common operations: advancing the lexer position
    /// and adding a token to the token list. It calculates the advance distance
    /// from the token's span, ensuring consistent positioning.
    ///
    /// # Arguments
    ///
    /// * `token` - The token to add to the lexer state
    ///
    /// # Returns
    ///
    /// The new byte offset position after advancing
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("hello world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Create a token for "hello"
    /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
    ///
    /// // Initially at position 0
    /// assert_eq!(state.get_position(), 0);
    ///
    /// // Advance and add the token
    /// let new_pos = state.advance_with(token);
    ///
    /// // Now at position 5 and token is added
    /// assert_eq!(new_pos, 5);
    /// assert_eq!(state.get_position(), 5);
    /// assert_eq!(state.get_tokens().len(), 1);
    /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
    /// ```
    ///
    /// # Note
    ///
    /// The caller must ensure that the token's span is valid and that the advance
    /// does not split multi-byte UTF-8 characters. The token should be created
    /// with proper character boundaries.
    #[inline]
    pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
        self.cursor.advance_bytes(token.length());
        self.tokens.push(token);
        self.cursor.position()
    }

    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// This method iterates through the source text from the current position,
    /// consuming characters as long as the predicate function returns true.
    /// It's commonly used for recognizing patterns like identifiers, numbers,
    /// or whitespace sequences.
    ///
    /// # Arguments
    ///
    /// * `pred` - A closure that takes a character and returns true if the character
    ///   should be consumed, false otherwise
    ///
    /// # Returns
    ///
    /// A byte range representing the span of consumed characters
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # #[derive(Clone)]
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("hello123world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Consume alphabetic characters
    /// let range = state.take_while(|c| c.is_alphabetic());
    ///
    /// // Should have consumed "hello"
    /// assert_eq!(range, Range { start: 0, end: 5 });
    /// assert_eq!(state.get_position(), 5);
    ///
    /// // Consume numeric characters
    /// let range = state.take_while(|c| c.is_numeric());
    ///
    /// // Should have consumed "123"
    /// assert_eq!(range, Range { start: 5, end: 8 });
    /// assert_eq!(state.get_position(), 8);
    /// ```
    ///
    /// # Performance Note
    ///
    /// This method operates on a character-by-character basis, which means it
    /// correctly handles multi-byte UTF-8 characters. For performance-critical
    /// code, consider using byte-based methods when working with ASCII-only text.
    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
        let start = self.cursor.position();
        while let Some(ch) = self.peek() {
            if pred(ch) { self.advance(ch.len_utf8()) } else { break }
        }
        Range { start, end: self.cursor.position() }
    }

    /// Performs a safety check to prevent infinite loops during lexing.
    ///
    /// This method ensures that the lexer always makes progress by forcing
    /// advancement when stuck at the same position. It's used as a safeguard
    /// against infinite loops in lexer implementations.
    ///
    /// The method compares the current position with a previously saved "safe point"
    /// position. If they're the same, it means the lexer hasn't made progress since
    /// that safe point, potentially indicating an infinite loop. In this case, the
    /// method forces advancement by at least one character.
    ///
    /// # Arguments
    ///
    /// * `safe_point` - The position to check against for potential deadlock
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Save the current position as a safe point
    /// let safe_point = state.get_position();
    ///
    /// // In a real lexer, you would do some processing here
    /// // If something went wrong and we didn't advance, this would prevent infinite loop
    /// state.advance_if_dead_lock(safe_point);
    ///
    /// // If we were stuck, we would have advanced by at least 1
    /// assert!(state.get_position() >= safe_point);
    /// ```
    ///
    /// # Usage in Lexer Implementations
    ///
    /// This method is typically used at the beginning or end of lexing loops:
    ///
    /// ```ignore
    /// loop {
    ///     let safe_point = state.get_position();
    ///
    ///     // Try to recognize a token
    ///     if let Some(token) = try_recognize_token(&mut state) {
    ///         // Success, continue loop
    ///         continue;
    ///     }
    ///
    ///     // If we get here, we didn't recognize anything
    ///     // This prevents infinite loops if recognition fails
    ///     state.advance_if_dead_lock(safe_point);
    ///
    ///     if state.not_at_end() {
    ///         // Continue trying to recognize tokens
    ///         continue;
    ///     } else {
    ///         // Reached end of source
    ///         break;
    ///     }
    /// }
    /// ```
    pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
        // Force advance if no progress was made
        if self.cursor.position() == safe_point {
            if let Some(ch) = self.current() {
                // Skip current character
                self.advance(ch.len_utf8())
            }
            else {
                // Advance anyway to prevent infinite loop
                self.advance(1)
            }
            // tracing::warn!("deadlock")
        }
    }

    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// This method concludes the lexing process by converting the collected tokens
    /// and errors into a `LexOutput` result. It takes a `Result` parameter that
    /// represents the overall success or failure of the lexing operation.
    ///
    /// If the result is `Ok`, the tokens are returned as the successful result.
    /// If the result is `Err`, the error is returned as the failure result.
    /// In both cases, any collected diagnostic errors are included in the output.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Add some tokens during lexing
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// // Finish with successful result
    /// let output = state.finish(Ok(()));
    ///
    /// // Check the results
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// assert_eq!(output.diagnostics.len(), 0);
    ///
    /// // Example with error
    /// let source2 = SourceText::new("test");
    /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
    /// state2.add_error(OakError::custom_error("Test error"));
    ///
    /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
    ///
    /// // Check the results
    /// assert!(output2.result.is_err());
    /// assert_eq!(output2.diagnostics.len(), 1); // The added error
    /// ```
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        match result {
            Ok(_) => {
                let tokens: Tokens<L> = self.tokens.into();
                OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
            }
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }

    /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
    ///
    /// This method is similar to `finish` but additionally updates the incremental cache
    /// with the new tokens. It's used for incremental lexing where the results need to
    /// be cached for future reuse when the source text changes.
    ///
    /// The method first creates the output in the same way as `finish`, then stores it
    /// in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    /// * `cache` - The incremental cache to update with the new tokens
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use core::range::Range;
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
    /// # use oak_core::parser::session::ParseSession;
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// // Create a cache for incremental lexing
    /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
    ///
    /// // Add some tokens during lexing
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// // Finish with cache update
    /// let output = state.finish_with_cache(Ok(()), &mut cache);
    ///
    /// // Check the results
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// ```
    ///
    /// # Incremental Lexing Workflow
    ///
    /// This method is typically used as part of an incremental lexing workflow:
    ///
    /// ```ignore
    /// // First lexing
    /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
    /// // ... lexing logic ...
    /// let output = state.finish_with_cache(Ok(()), cache);
    ///
    /// // Later, when source changes
    /// let relex_from = calculate_min_affected_offset(old_source, new_source);
    /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
    /// // ... lexing logic (reusing unchanged tokens) ...
    /// let output = state.finish_with_cache(Ok(()), cache);
    /// ```
    pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
        let out = self.finish(result);
        cache.set_lex_output(out.clone());
        out
    }
}