oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9#[cfg(feature = "serde")]
10use serde::{Deserialize, Serialize};
11use std::borrow::Cow;
12use triomphe::Arc;
13
14/// Utilities for scanning comments.
15pub mod scan_comment;
16/// Utilities for scanning identifiers.
17pub mod scan_identifier;
18/// Utilities for scanning numbers.
19pub mod scan_number;
20/// Utilities for scanning string literals.
21pub mod scan_string;
22/// Utilities for scanning whitespace.
23pub mod scan_white_space;
24
25pub use scan_comment::CommentConfig;
26pub use scan_string::StringConfig;
27pub use scan_white_space::WhitespaceConfig;
28
/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization: a shared,
/// immutable slice of tokens produced by the lexer. Diagnostics are
/// carried separately in [`LexOutput`].
pub type Tokens<L: Language> = Arc<[Token<L::TokenType>]>;
35
36/// Output type for lexical analysis operations, including diagnostics.
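///
/// # Examples
///
/// A sketch of consuming a [`LexOutput`]; the `result` and `diagnostics` field
/// names follow [`OakDiagnostics`], while the `lexer`, `source`, and `cache`
/// values are assumed to exist and errors are assumed to implement `Debug`:
///
/// ```ignore
/// let output: LexOutput<MyLanguage> = lexer.lex(&source, &[], &mut cache);
/// match output.result {
///     Ok(tokens) => println!("lexed {} tokens", tokens.len()),
///     Err(fatal) => eprintln!("lexing failed: {fatal:?}"),
/// }
/// for diagnostic in &output.diagnostics {
///     eprintln!("diagnostic: {diagnostic:?}");
/// }
/// ```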
37pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
38
39/// Trait for tokenizing source code into sequences of tokens.
40///
41/// This trait defines the interface for converting source text into a sequence of
42/// tokens that can be consumed by the parser. Implementations should handle
43/// the specific lexical rules of their target language.
44///
45/// # Examples
46///
47/// ```ignore
48/// struct MyLexer;
49///
50/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
51/// enum MyToken {
52/// Number,
53/// Identifier,
54/// End,
55/// }
56///
57/// impl TokenType for MyToken {
58/// const END_OF_STREAM: Self = MyToken::End;
59/// type Role = UniversalTokenRole;
60/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
61/// }
62///
63/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
64/// enum MyElement {}
65///
66/// impl ElementType for MyElement {
67/// type Role = UniversalElementRole;
68/// fn role(&self) -> Self::Role { UniversalElementRole::None }
69/// }
70///
71/// struct MyLanguage;
72///
73/// impl Language for MyLanguage {
74/// const NAME: &'static str = "my-language";
75/// type TokenType = MyToken;
76/// type ElementType = MyElement;
77/// type TypedRoot = ();
78/// }
79///
80/// impl Lexer<MyLanguage> for MyLexer {
81/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
82/// // Tokenization logic here
83/// todo!()
84/// }
85/// }
86/// ```
87pub trait Lexer<L: Language + Send + Sync + 'static> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs lexical analysis of the source text. Implementations
    /// may reuse tokens from `cache` and re-lex only the regions affected by
    /// `edits`, or simply tokenize from scratch.
    ///
    /// # Arguments
    ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The edits applied since the cached result was produced
    ///   (empty for a full lex)
    /// * `cache` - The cache holding results from a previous lexical analysis
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
    fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
102}
103
104/// Cache trait for lexical results.
105///
106/// This trait defines the interface for caching and accessing lexical analysis results.
107/// It provides methods for storing and retrieving token information from previous
108/// lexical analysis operations.
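///
/// # Examples
///
/// A minimal sketch of an implementation backed by the last lex output, assuming
/// the `MyLanguage`/`MyToken` types from the [`Lexer`] example:
///
/// ```ignore
/// #[derive(Default)]
/// struct SimpleCache {
///     last: Option<LexOutput<MyLanguage>>,
/// }
///
/// impl LexerCache<MyLanguage> for SimpleCache {
///     fn set_lex_output(&mut self, output: LexOutput<MyLanguage>) {
///         self.last = Some(output);
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<MyToken>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<MyToken>]> {
///         // Expose the tokens of the last successful lex, if any.
///         self.last.as_ref()?.result.as_deref().ok()
///     }
/// }
/// ```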
109#[allow(unused_variables)]
110pub trait LexerCache<L: Language> {
111 /// Sets the lexed output in the cache.
112 ///
113 /// # Arguments
114 ///
115 /// * `output` - The output from lexical analysis, including tokens and diagnostics
116 fn set_lex_output(&mut self, output: LexOutput<L>);
117
118 /// Gets a token from the cache by index.
119 ///
120 /// # Arguments
121 ///
122 /// * `index` - The index of the token to retrieve
123 ///
124 /// # Returns
125 ///
126 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
127 /// or `None` if the index is out of bounds or no tokens are cached
128 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
129
130 /// Gets the total number of tokens in the cache.
131 ///
132 /// # Returns
133 ///
134 /// The number of cached tokens, or 0 if no tokens are cached
135 fn count_tokens(&self) -> usize;
136
137 /// Checks if the cache contains any tokens.
138 ///
139 /// # Returns
140 ///
141 /// `true` if the cache contains tokens, `false` otherwise
142 fn has_tokens(&self) -> bool;
143
144 /// Gets all cached tokens as a slice.
145 ///
146 /// # Returns
147 ///
148 /// An optional slice of tokens if available.
149 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
150 None
151 }
152}
153
154impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
155 fn set_lex_output(&mut self, output: LexOutput<L>) {
156 (**self).set_lex_output(output);
157 }
158
159 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
160 (**self).get_token(index)
161 }
162
163 fn count_tokens(&self) -> usize {
164 (**self).count_tokens()
165 }
166
167 fn has_tokens(&self) -> bool {
168 (**self).has_tokens()
169 }
170
171 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
172 (**self).get_tokens()
173 }
174}
175
/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text together with their position information.
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
    pub span: Range<usize>,
}
189
190impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```ignore
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::Token;
    /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
205 /// ```
206 #[inline]
207 pub fn length(&self) -> usize {
208 self.span.end - self.span.start
209 }
210}
211
212/// A stream of tokens with associated source text.
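///
/// # Examples
///
/// A small sketch of bundling tokens with their raw text (the string token kinds
/// are purely illustrative):
///
/// ```ignore
/// let stream = TokenStream {
///     raw: "let x".to_string(),
///     tokens: Arc::from_iter([
///         Token { kind: "keyword", span: Range { start: 0, end: 3 } },
///         Token { kind: "ident", span: Range { start: 4, end: 5 } },
///     ]),
/// };
/// assert_eq!(stream.tokens.len(), 2);
/// let span = stream.tokens[1].span;
/// assert_eq!(&stream.raw[span.start..span.end], "x");
/// ```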
213#[derive(Debug, Clone)]
214#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
215#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
216pub struct TokenStream<K: Copy> {
217 /// The raw source text.
218 pub raw: String,
219 /// The tokens extracted from the source text.
220 #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
221 pub tokens: Arc<[Token<K>]>,
222}
223
224#[cfg(feature = "serde")]
225mod arc_slice_serde {
226 use super::*;
227 use serde::{Deserialize, Deserializer, Serialize, Serializer};
228
229 pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
230 where
231 K: Serialize,
232 S: Serializer,
233 {
234 arc.as_ref().serialize(serializer)
235 }
236
237 pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
238 where
239 K: Deserialize<'de>,
240 D: Deserializer<'de>,
241 {
242 let vec = Vec::<Token<K>>::deserialize(deserializer)?;
243 Ok(Arc::from_iter(vec))
244 }
245}
246
247/// State information for incremental lexical analysis.
248///
249/// This struct maintains the current position and context during
250/// tokenization, enabling incremental and resumable lexing operations.
251#[derive(Debug)]
252pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
253 pub(crate) cursor: SourceCursor<'s, S>,
254 pub(crate) tokens: Vec<Token<L::TokenType>>,
255 pub(crate) errors: Vec<OakError>,
256}
257
258impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
259 /// Creates a new lexer state with the given source text.
260 ///
261 /// # Arguments
262 ///
263 /// * `source` - The source text to lex
264 ///
265 /// # Returns
266 ///
267 /// A new `LexerState` initialized at the beginning of the source
268 pub fn new(source: &'s S) -> Self {
269 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
270 }
271
272 /// Creates a new lexer state with the given source text and incremental cache.
273 ///
274 /// # Arguments
275 ///
276 /// * `source` - The source text to lex
277 /// * `relex_from` - The minimum byte offset that may have been affected by edits
278 /// (use `source.length()` to indicate no edits)
279 /// * `cache` - The incremental cache containing previous lexing results
280 ///
281 /// # Returns
282 ///
283 /// A new `LexerState` initialized at the beginning of the source with cache support
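    ///
    /// # Examples
    ///
    /// A hedged sketch, assuming a `SimpleLanguage` like the one used in the other
    /// examples and a `cache` filled by a previous [`LexerState::finish_with_cache`]:
    ///
    /// ```ignore
    /// // Only bytes at or after `relex_from` may have changed.
    /// let relex_from = 10;
    /// let mut state = LexerState::<_, SimpleLanguage>::new_with_cache(&source, relex_from, &cache);
    ///
    /// // Cached tokens ending before `relex_from` (minus a small backtrack window)
    /// // are reused, and lexing resumes after the last reused token.
    /// assert!(state.get_position() <= relex_from);
    /// ```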
284 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
285 if !cache.has_tokens() {
286 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
287 }
288
289 let len = source.length();
290 let relex_from = relex_from.min(len);
291
292 // Fast path: fully re-used
293 if relex_from >= len {
294 let mut tokens = Vec::new();
295 if let Some(cached) = cache.get_tokens() {
296 tokens.extend_from_slice(cached);
297 }
298 else {
299 let count = cache.count_tokens();
300 tokens.reserve(count);
301 for i in 0..count {
302 if let Some(t) = cache.get_token(i) {
303 tokens.push(t);
304 }
305 }
306 }
307 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
308 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
309 }
310
311 if relex_from == 0 {
312 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
313 }
314
315 let mut reused_tokens = Vec::new();
316 const BACKTRACK_TOKENS: usize = 1;
317
318 if let Some(cached) = cache.get_tokens() {
319 // Binary search for the cut-off point since tokens are sorted by position
320 let idx = cached.partition_point(|t| t.span.end <= relex_from);
321 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
322 if keep > 0 {
323 reused_tokens.extend_from_slice(&cached[..keep]);
324 }
325 }
326 else {
327 // Fallback for caches that don't support slice access
328 let count = cache.count_tokens();
329 for i in 0..count {
330 let Some(token) = cache.get_token(i)
331 else {
332 break;
333 };
334 if token.span.end <= relex_from {
335 reused_tokens.push(token);
336 }
337 else {
338 break;
339 }
340 }
341 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
342 reused_tokens.truncate(keep);
343 }
344
345 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
346 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
347 }
348
349 /// Gets the remaining text from the current position to the end of the source.
350 ///
351 /// # Returns
352 ///
353 /// A string slice containing the remaining text
354 pub fn rest(&mut self) -> &str {
355 self.cursor.rest()
356 }
357
358 /// Gets the remaining text as a byte slice.
359 #[inline]
360 pub fn rest_bytes(&mut self) -> &[u8] {
361 self.cursor.rest().as_bytes()
362 }
363
364 /// Checks if the lexer has consumed all input from the source.
365 pub fn fully_reused(&self) -> bool {
366 self.cursor.position() >= self.cursor.source().length()
367 }
368
369 /// Gets the current byte offset position in the source text.
370 ///
371 /// # Returns
372 ///
373 /// The current byte offset from the start of the source text
374 #[inline]
375 pub fn get_position(&self) -> usize {
376 self.cursor.position()
377 }
378
379 /// Checks if the lexer has NOT consumed all input from the source.
380 #[inline]
381 pub fn not_at_end(&self) -> bool {
382 self.cursor.position() < self.cursor.source().length()
383 }
384
385 /// Peeks at the next character without advancing.
386 #[inline]
387 pub fn peek(&mut self) -> Option<char> {
388 self.cursor.peek_char()
389 }
390
391 /// Peeks at the character at the specified byte offset relative to the current position.
392 #[inline]
393 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
394 self.cursor.peek_next_n(n)
395 }
396
397 /// Advances the cursor by the specified number of bytes.
398 #[inline]
399 pub fn advance(&mut self, len: usize) {
400 self.cursor.advance_bytes(len);
401 }
402
403 /// Gets the total length of the source text in bytes.
404 #[inline]
405 pub fn get_length(&self) -> usize {
406 self.cursor.source().length()
407 }
408
409 /// Gets a single character at the specified byte offset.
410 #[inline]
411 pub fn get_char_at(&self, offset: usize) -> Option<char> {
412 self.cursor.source().get_char_at(offset)
413 }
414
415 /// Peeks at the next byte without advancing.
416 #[inline]
417 pub fn peek_byte(&mut self) -> Option<u8> {
418 self.cursor.peek_byte()
419 }
420
421 /// Advances the cursor by one byte and returns it.
422 #[inline]
423 pub fn advance_byte(&mut self) -> Option<u8> {
424 self.cursor.advance_byte()
425 }
426
427 /// Advances the cursor while the byte predicate is true.
428 #[inline]
429 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
430 self.cursor.take_while_byte(pred)
431 }
432
433 /// Skips common ASCII whitespace using SIMD if possible.
434 #[inline]
435 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
436 self.cursor.skip_ascii_whitespace()
437 }
438
439 /// Skips all ASCII digits at the current position.
440 #[inline]
441 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
442 self.cursor.skip_ascii_digits()
443 }
444
445 /// Skips all characters that can continue an ASCII identifier.
446 #[inline]
447 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
448 self.cursor.skip_ascii_ident_continue()
449 }
450
451 /// Skips all characters until the target byte is encountered.
452 #[inline]
453 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
454 self.cursor.skip_until(target)
455 }
456
457 /// Scans an ASCII identifier (starts with alpha/_, continues with alphanumeric/_).
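    ///
    /// # Examples
    ///
    /// A sketch assuming a `SimpleLanguage` whose token type has an `Identifier` variant:
    ///
    /// ```ignore
    /// let source = SourceText::new("foo_1 bar");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_ascii_identifier(SimpleToken::Identifier));
    /// // "foo_1" was consumed and recorded as a single token.
    /// assert_eq!(state.get_position(), 5);
    /// assert_eq!(state.get_tokens().len(), 1);
    /// ```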
458 #[inline]
459 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
460 let start = self.get_position();
461 if let Some(b) = self.peek_byte() {
462 if b == b'_' || b.is_ascii_alphabetic() {
463 self.advance_byte();
464 self.skip_ascii_ident_continue();
465 self.add_token(kind, start, self.get_position());
466 return true;
467 }
468 }
469 false
470 }
471
472 /// Scans a line comment starting with the given prefix.
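    ///
    /// # Examples
    ///
    /// A sketch assuming a `SimpleLanguage` whose token type has a `Comment` variant:
    ///
    /// ```ignore
    /// let source = SourceText::new("// hello\nnext");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    /// assert!(state.scan_line_comment(SimpleToken::Comment, "//"));
    /// // The comment token covers "// hello"; the newline itself is not consumed.
    /// assert_eq!(state.get_position(), 8);
    /// ```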
473 #[inline]
474 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
475 let start = self.get_position();
476 if self.consume_if_starts_with(prefix) {
477 self.skip_until(b'\n');
478 self.add_token(kind, start, self.get_position());
479 return true;
480 }
481 false
482 }
483
484 /// Scans a block comment with given start and end sequences.
485 #[inline]
486 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
487 let start = self.get_position();
488 if self.consume_if_starts_with(start_seq) {
489 while let Some(_b) = self.peek_byte() {
490 self.skip_until(end_seq.as_bytes()[0]);
491 if self.consume_if_starts_with(end_seq) {
492 self.add_token(kind, start, self.get_position());
493 return true;
494 }
495 self.advance_byte();
496 }
497 // Unclosed block comment is still a comment in many languages,
498 // but we might want to add an error here in the future.
499 self.add_token(kind, start, self.get_position());
500 return true;
501 }
502 false
503 }
504
505 /// Gets a reference to the tokens collected so far.
506 ///
507 /// # Returns
508 ///
509 /// A slice of tokens collected during the lexing process
510 #[inline]
511 pub fn tokens(&self) -> &[Token<L::TokenType>] {
512 &self.tokens
513 }
514
515 /// Sets the current position to the specified byte offset.
516 ///
517 /// # Arguments
518 ///
519 /// * `offset` - The new byte offset position
520 ///
521 /// # Returns
522 ///
523 /// The previous byte offset position
524 #[inline]
525 pub fn set_position(&mut self, offset: usize) -> usize {
526 self.cursor.set_position(offset)
527 }
528
529 /// Returns a reference to the underlying source.
530 pub fn source(&self) -> &'s S {
531 self.cursor.source()
532 }
533
534 /// Returns the text in the specified range.
535 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
536 self.cursor.source().get_text_in(range)
537 }
538
539 /// Returns the text from the specified offset to the end.
540 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
541 self.cursor.source().get_text_from(offset)
542 }
543
544 /// Checks if the source starts with the given pattern at the current position.
545 pub fn starts_with(&mut self, pattern: &str) -> bool {
546 self.cursor.starts_with(pattern)
547 }
548
549 /// Consumes the pattern if it exists at the current position.
550 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
551 self.cursor.consume_if_starts_with(pattern)
552 }
553
554 /// Gets the tokens collected so far in the lexer state.
555 ///
556 /// # Returns
557 ///
558 /// A slice of tokens collected during lexing
559 #[inline]
560 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
561 &self.tokens
562 }
563
564 /// Adds an error to the lexer state.
565 ///
566 /// # Arguments
567 ///
568 /// * `error` - The error to add to the diagnostics
569 #[inline]
570 pub fn add_error(&mut self, error: impl Into<OakError>) {
571 self.errors.push(error.into());
572 }
573
574 /// Adds a token to the lexer state.
575 ///
576 /// # Arguments
577 ///
578 /// * `kind` - The kind of the token
579 /// * `start` - The starting byte offset of the token
580 /// * `end` - The ending byte offset of the token
581 #[inline]
582 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
583 self.tokens.push(Token { kind, span: Range { start, end } });
584 }
585
586 /// Adds an end-of-file token to the lexer state.
587 ///
588 /// This method creates and adds an END_OF_STREAM token at the current position.
589 /// It's typically called when the lexer reaches the end of the source text
590 /// to mark the termination of the token stream.
591 ///
592 /// # Examples
593 ///
594 /// ```ignore
595 /// #![feature(new_range_api)]
596 /// # use core::range::Range;
597 /// # use oak_core::lexer::{LexerState, Token};
598 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
599 /// #
600 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
601 /// # enum SimpleToken {
602 /// # End,
603 /// # }
604 /// #
605 /// # impl TokenType for SimpleToken {
606 /// # const END_OF_STREAM: Self = SimpleToken::End;
607 /// # type Role = UniversalTokenRole;
608 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
609 /// # }
610 /// #
611 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
612 /// # enum SimpleElement {}
613 /// #
614 /// # impl ElementType for SimpleElement {
615 /// # type Role = UniversalElementRole;
616 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
617 /// # }
618 /// #
619 /// # #[derive(Clone)]
620 /// # struct SimpleLanguage;
621 /// #
622 /// # impl Language for SimpleLanguage {
623 /// # const NAME: &'static str = "simple";
624 /// # type TokenType = SimpleToken;
625 /// # type ElementType = SimpleElement;
626 /// # type TypedRoot = ();
627 /// # }
628 /// #
629 /// let source = SourceText::new("test");
630 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
631 /// state.take_while(|_| true); // Advance to end
632 /// state.add_eof();
633 ///
634 /// assert_eq!(state.tokens().len(), 1);
635 /// assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
636 /// ```
637 #[inline]
638 pub fn add_eof(&mut self) {
639 let end = self.get_position();
640 self.add_token(L::TokenType::END_OF_STREAM, end, end);
641 }
642
    /// Gets the character at the current position without advancing.
644 ///
645 /// # Returns
646 ///
647 /// The current character, or `None` if at the end of the source
648 #[inline]
649 pub fn current(&mut self) -> Option<char> {
650 self.cursor.peek_char()
651 }
652
653 /// Advances the position by the current character's length.
654 ///
655 /// # Returns
656 ///
657 /// The character that was skipped, or `None` if at the end of the source
658 #[inline]
659 pub fn bump(&mut self) -> Option<char> {
660 let ch = self.peek()?;
661 self.advance(ch.len_utf8());
662 Some(ch)
663 }
664
665 /// Advances the position by the token's length and adds the token to the lexer state.
666 ///
667 /// This method combines two common operations: advancing the lexer position
668 /// and adding a token to the token list. It calculates the advance distance
669 /// from the token's span, ensuring consistent positioning.
670 ///
671 /// # Arguments
672 ///
673 /// * `token` - The token to add to the lexer state
674 ///
675 /// # Returns
676 ///
677 /// The new byte offset position after advancing
678 ///
679 /// # Examples
680 ///
681 /// ```ignore
682 /// #![feature(new_range_api)]
683 /// # use core::range::Range;
684 /// # use oak_core::lexer::{LexerState, Token};
685 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
687 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
688 /// # enum SimpleToken { Identifier, End }
689 /// #
690 /// # impl TokenType for SimpleToken {
691 /// # const END_OF_STREAM: Self = SimpleToken::End;
692 /// # type Role = UniversalTokenRole;
693 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
694 /// # }
695 /// #
696 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
697 /// # enum SimpleElement {}
698 /// #
699 /// # impl ElementType for SimpleElement {
700 /// # type Role = UniversalElementRole;
701 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
702 /// # }
703 /// #
704 /// # #[derive(Clone)]
705 /// # struct SimpleLanguage;
706 /// #
707 /// # impl Language for SimpleLanguage {
708 /// # const NAME: &'static str = "simple";
709 /// # type TokenType = SimpleToken;
710 /// # type ElementType = SimpleElement;
711 /// # type TypedRoot = ();
712 /// # }
713 /// #
714 /// let source = SourceText::new("hello world");
715 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
716 ///
717 /// // Create a token for "hello"
718 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
719 ///
720 /// // Initially at position 0
721 /// assert_eq!(state.get_position(), 0);
722 ///
723 /// // Advance and add the token
724 /// let new_pos = state.advance_with(token);
725 ///
726 /// // Now at position 5 and token is added
727 /// assert_eq!(new_pos, 5);
728 /// assert_eq!(state.get_position(), 5);
729 /// assert_eq!(state.get_tokens().len(), 1);
730 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
731 /// ```
732 ///
733 /// # Note
734 ///
735 /// The caller must ensure that the token's span is valid and that the advance
736 /// does not split multi-byte UTF-8 characters. The token should be created
737 /// with proper character boundaries.
738 #[inline]
739 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
740 self.cursor.advance_bytes(token.length());
741 self.tokens.push(token);
742 self.cursor.position()
743 }
744
745 /// Consumes characters while the predicate returns true, returning the consumed range.
746 ///
747 /// This method iterates through the source text from the current position,
748 /// consuming characters as long as the predicate function returns true.
749 /// It's commonly used for recognizing patterns like identifiers, numbers,
750 /// or whitespace sequences.
751 ///
752 /// # Arguments
753 ///
754 /// * `pred` - A closure that takes a character and returns true if the character
755 /// should be consumed, false otherwise
756 ///
757 /// # Returns
758 ///
759 /// A byte range representing the span of consumed characters
760 ///
761 /// # Examples
762 ///
763 /// ```ignore
764 /// #![feature(new_range_api)]
765 /// # use core::range::Range;
766 /// # use oak_core::lexer::{LexerState, Token};
767 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
769 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
770 /// # enum SimpleToken { End }
771 /// #
772 /// # impl TokenType for SimpleToken {
773 /// # const END_OF_STREAM: Self = SimpleToken::End;
774 /// # type Role = UniversalTokenRole;
775 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
776 /// # }
777 /// #
778 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
779 /// # enum SimpleElement {}
780 /// #
781 /// # impl ElementType for SimpleElement {
782 /// # type Role = UniversalElementRole;
783 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
784 /// # }
785 /// #
786 /// # #[derive(Clone)]
787 /// # struct SimpleLanguage;
788 /// #
789 /// # impl Language for SimpleLanguage {
790 /// # const NAME: &'static str = "simple";
791 /// # type TokenType = SimpleToken;
792 /// # type ElementType = SimpleElement;
793 /// # type TypedRoot = ();
794 /// # }
795 /// #
796 /// let source = SourceText::new("hello123world");
797 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
798 ///
799 /// // Consume alphabetic characters
800 /// let range = state.take_while(|c| c.is_alphabetic());
801 ///
802 /// // Should have consumed "hello"
803 /// assert_eq!(range, Range { start: 0, end: 5 });
804 /// assert_eq!(state.get_position(), 5);
805 ///
806 /// // Consume numeric characters
807 /// let range = state.take_while(|c| c.is_numeric());
808 ///
809 /// // Should have consumed "123"
810 /// assert_eq!(range, Range { start: 5, end: 8 });
811 /// assert_eq!(state.get_position(), 8);
812 /// ```
813 ///
814 /// # Performance Note
815 ///
816 /// This method operates on a character-by-character basis, which means it
817 /// correctly handles multi-byte UTF-8 characters. For performance-critical
818 /// code, consider using byte-based methods when working with ASCII-only text.
819 pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
820 let start = self.cursor.position();
821 while let Some(ch) = self.peek() {
822 if pred(ch) {
823 self.advance(ch.len_utf8());
824 }
825 else {
826 break;
827 }
828 }
829 Range { start, end: self.cursor.position() }
830 }
831
832 /// Performs a safety check to prevent infinite loops during lexing.
833 ///
834 /// This method ensures that the lexer always makes progress by forcing
835 /// advancement when stuck at the same position. It's used as a safeguard
836 /// against infinite loops in lexer implementations.
837 ///
838 /// The method compares the current position with a previously saved "safe point"
839 /// position. If they're the same, it means the lexer hasn't made progress since
840 /// that safe point, potentially indicating an infinite loop. In this case, the
841 /// method forces advancement by at least one character.
842 ///
843 /// # Arguments
844 ///
845 /// * `safe_point` - The position to check against for potential deadlock
846 ///
847 /// # Examples
848 ///
849 /// ```ignore
850 /// #![feature(new_range_api)]
851 /// # use oak_core::lexer::{LexerState, Token};
852 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
854 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
855 /// # enum SimpleToken { End }
856 /// #
857 /// # impl TokenType for SimpleToken {
858 /// # const END_OF_STREAM: Self = SimpleToken::End;
859 /// # type Role = UniversalTokenRole;
860 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
861 /// # }
862 /// #
863 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
864 /// # enum SimpleElement {}
865 /// #
866 /// # impl ElementType for SimpleElement {
867 /// # type Role = UniversalElementRole;
868 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
869 /// # }
870 /// #
871 /// # struct SimpleLanguage;
872 /// #
873 /// # impl Language for SimpleLanguage {
874 /// # const NAME: &'static str = "simple";
875 /// # type TokenType = SimpleToken;
876 /// # type ElementType = SimpleElement;
877 /// # type TypedRoot = ();
878 /// # }
879 /// #
880 /// let source = SourceText::new("test");
881 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
882 ///
883 /// // Save the current position as a safe point
884 /// let safe_point = state.get_position();
885 ///
886 /// // In a real lexer, you would do some processing here
887 /// // If something went wrong and we didn't advance, this would prevent infinite loop
888 /// state.advance_if_dead_lock(safe_point);
889 ///
890 /// // If we were stuck, we would have advanced by at least 1
891 /// assert!(state.get_position() >= safe_point);
892 /// ```
893 ///
894 /// # Usage in Lexer Implementations
895 ///
896 /// This method is typically used at the beginning or end of lexing loops:
897 ///
898 /// ```ignore
899 /// loop {
900 /// let safe_point = state.get_position();
901 ///
902 /// // Try to recognize a token
903 /// if let Some(token) = try_recognize_token(&mut state) {
904 /// // Success, continue loop
905 /// continue;
906 /// }
907 ///
908 /// // If we get here, we didn't recognize anything
909 /// // This prevents infinite loops if recognition fails
910 /// state.advance_if_dead_lock(safe_point);
911 ///
912 /// if state.not_at_end() {
913 /// // Continue trying to recognize tokens
914 /// continue;
915 /// } else {
916 /// // Reached end of source
917 /// break;
918 /// }
919 /// }
920 /// ```
921 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
922 // Force advance if no progress was made
923 if self.cursor.position() == safe_point {
924 if let Some(ch) = self.current() {
925 // Skip current character
926 self.advance(ch.len_utf8());
927 }
928 else {
929 // Advance anyway to prevent infinite loop
930 self.advance(1);
931 }
932 // tracing::warn!("deadlock");
933 }
934 }
935
936 /// Finishes lexing and returns the final output with tokens and diagnostics.
937 ///
938 /// This method concludes the lexing process by converting the collected tokens
939 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
940 /// represents the overall success or failure of the lexing operation.
941 ///
942 /// If the result is `Ok`, the tokens are returned as the successful result.
943 /// If the result is `Err`, the error is returned as the failure result.
944 /// In both cases, any collected diagnostic errors are included in the output.
945 ///
946 /// # Arguments
947 ///
948 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
949 ///
950 /// # Returns
951 ///
952 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
953 ///
954 /// # Examples
955 ///
956 /// ```
957 /// #![feature(new_range_api)]
958 /// # use oak_core::lexer::{LexerState, Token};
959 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
961 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
962 /// # enum SimpleToken { Identifier, End }
963 /// #
964 /// # impl TokenType for SimpleToken {
965 /// # const END_OF_STREAM: Self = SimpleToken::End;
966 /// # type Role = UniversalTokenRole;
967 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
968 /// # }
969 /// #
970 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
971 /// # enum SimpleElement {}
972 /// #
973 /// # impl ElementType for SimpleElement {
974 /// # type Role = UniversalElementRole;
975 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
976 /// # }
977 /// #
978 /// # struct SimpleLanguage;
979 /// #
980 /// # impl Language for SimpleLanguage {
981 /// # const NAME: &'static str = "simple";
982 /// # type TokenType = SimpleToken;
983 /// # type ElementType = SimpleElement;
984 /// # type TypedRoot = ();
985 /// # }
986 /// #
987 /// let source = SourceText::new("test");
988 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
989 ///
990 /// // Add some tokens during lexing
991 /// state.add_token(SimpleToken::Identifier, 0, 4);
992 ///
993 /// // Finish with successful result
994 /// let output = state.finish(Ok(()));
995 ///
996 /// // Check the results
997 /// assert!(output.result.is_ok());
998 /// assert_eq!(output.result.unwrap().len(), 1);
999 /// assert_eq!(output.diagnostics.len(), 0);
1000 ///
1001 /// // Example with error
1002 /// let source2 = SourceText::new("test");
1003 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1004 /// state2.add_error(OakError::custom_error("Test error"));
1005 ///
1006 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1007 ///
1008 /// // Check the results
1009 /// assert!(output2.result.is_err());
1010 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1011 /// ```
1012 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1013 match result {
1014 Ok(_) => {
1015 let tokens: Tokens<L> = self.tokens.into();
1016 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1017 }
1018 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1019 }
1020 }
1021
1022 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1023 ///
1024 /// This method is similar to `finish` but additionally updates the incremental cache
1025 /// with the new tokens. It's used for incremental lexing where the results need to
1026 /// be cached for future reuse when the source text changes.
1027 ///
    /// The method first creates the output in the same way as `finish`, then stores
    /// it in the cache via [`LexerCache::set_lex_output`]. This enables the next call
    /// to `new_with_cache` to reuse these tokens where the source text hasn't changed.
1031 ///
1032 /// # Arguments
1033 ///
1034 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1035 /// * `cache` - The incremental cache to update with the new tokens
1036 ///
1037 /// # Returns
1038 ///
1039 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1040 ///
1041 /// # Examples
1042 ///
1043 /// ```ignore
1044 /// #![feature(new_range_api)]
1045 /// # use core::range::Range;
1046 /// # use oak_core::lexer::{LexerState, Token};
1047 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1048 /// # use oak_core::parser::session::ParseSession;
1049 /// #
1050 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1051 /// # enum SimpleToken { Identifier, End }
1052 /// #
1053 /// # impl TokenType for SimpleToken {
1054 /// # const END_OF_STREAM: Self = SimpleToken::End;
1055 /// # type Role = UniversalTokenRole;
1056 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1057 /// # }
1058 /// #
1059 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1060 /// # enum SimpleElement {}
1061 /// #
1062 /// # impl ElementType for SimpleElement {
1063 /// # type Role = UniversalElementRole;
1064 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1065 /// # }
1066 /// #
1067 /// # struct SimpleLanguage;
1068 /// #
1069 /// # impl Language for SimpleLanguage {
1070 /// # const NAME: &'static str = "simple";
1071 /// # type TokenType = SimpleToken;
1072 /// # type ElementType = SimpleElement;
1073 /// # type TypedRoot = ();
1074 /// # }
1075 /// #
1076 /// let source = SourceText::new("test");
1077 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1078 ///
1079 /// // Create a cache for incremental lexing
1080 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1081 ///
1082 /// // Add some tokens during lexing
1083 /// state.add_token(SimpleToken::Identifier, 0, 4);
1084 ///
1085 /// // Finish with cache update
1086 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1087 ///
1088 /// // Check the results
1089 /// assert!(output.result.is_ok());
1090 /// assert_eq!(output.result.unwrap().len(), 1);
1091 /// ```
1092 ///
1093 /// # Incremental Lexing Workflow
1094 ///
1095 /// This method is typically used as part of an incremental lexing workflow:
1096 ///
1097 /// ```ignore
1098 /// // First lexing
1099 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1100 /// // ... lexing logic ...
1101 /// let output = state.finish_with_cache(Ok(()), cache);
1102 ///
1103 /// // Later, when source changes
1104 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1105 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1106 /// // ... lexing logic (reusing unchanged tokens) ...
1107 /// let output = state.finish_with_cache(Ok(()), cache);
1108 /// ```
1109 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1110 let out = self.finish(result);
1111 cache.set_lex_output(out.clone());
1112 out
1113 }
1114}