oak_core/lexer/state.rs
1use crate::{
2 Language, TokenType,
3 errors::OakError,
4 lexer::{LexOutput, LexerCache, Token, Tokens},
5 source::{Source, SourceCursor},
6};
7pub use core::range::Range;
8use std::borrow::Cow;
9
/// Represents the state of the lexer during a tokenization session.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
/// It tracks the current position in the source text, collected tokens,
/// and any errors encountered.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    /// Cursor tracking the current byte position within the source.
    pub(crate) cursor: SourceCursor<'s, S>,
    /// Tokens collected so far during this session.
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    /// Non-fatal diagnostics accumulated while lexing.
    pub(crate) errors: Vec<OakError>,
    /// Exclusive upper byte bound for sub-range scanning (`None` = whole source);
    /// read by `get_length`.
    pub(crate) end_limit: Option<usize>,
}
23
24impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
25 /// Creates a new lexer state with the given source text.
26 ///
27 /// # Arguments
28 ///
29 /// * `source` - The source text to lex
30 ///
31 /// # Returns
32 ///
33 /// A new `LexerState` initialized at the beginning of the source
34 pub fn new(source: &'s S) -> Self {
35 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None }
36 }
37
    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        // Nothing cached: behave exactly like `new`.
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None };
        }

        let len = source.length();
        // Clamp so the offset never exceeds the (possibly shrunk) source.
        let relex_from = relex_from.min(len);

        // Full reuse: no edit touched any byte before the end of the source,
        // so every cached token can be kept and the cursor resumes after the
        // last one.
        if relex_from >= len {
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                // Fast path: the cache can hand us a contiguous slice.
                tokens.extend_from_slice(cached)
            }
            else {
                // Slow path: fetch tokens one by one.
                // NOTE(review): a `None` from `get_token(i)` is silently
                // skipped here, while the partial-reuse path below treats it
                // as a hard stop — confirm this asymmetry is intended.
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            // Resume after the last cached token, clamped to the source end.
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![], end_limit: None };
        }

        // Edit at the very start: nothing can be reused.
        if relex_from == 0 {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![], end_limit: None };
        }

        let mut reused_tokens = Vec::new();
        // Drop the last token before the edit as well, in case the edit can
        // merge into it (e.g. an identifier extended by the inserted text).
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // `idx` = number of tokens that end at or before the edit point.
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Per-token fallback: collect tokens wholly before the edit point,
            // then back off by `BACKTRACK_TOKENS`.
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        // Relexing resumes at the end of the last reused token.
        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![], end_limit: None }
    }
111
112 /// Creates a sub-state for scanning a sub-range of the source.
113 pub fn sub_state(&mut self, start: usize, end: usize) -> Self {
114 Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![], end_limit: Some(end) }
115 }
116
    /// Returns the source text provider.
    ///
    /// The borrow carries the `'s` lifetime of the source, not of `self`.
    pub fn get_source(&self) -> &'s S {
        self.cursor.source()
    }
121
    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// Delegates to the underlying [`SourceCursor`].
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }
130
    /// Gets the remaining text as a byte slice.
    ///
    /// Useful for byte-oriented scanning operations; same span as [`Self::rest`].
    #[inline]
    pub fn rest_bytes(&mut self) -> &[u8] {
        self.cursor.rest().as_bytes()
    }
138
    /// Checks if the lexer has consumed all input from the source.
    ///
    /// Returns `true` if the current position is at or beyond the end of the source.
    ///
    /// NOTE(review): despite the name, this does not inspect the cache — it is
    /// a plain end-of-input check (the inverse of [`Self::not_at_end`]).
    pub fn fully_reused(&self) -> bool {
        self.cursor.position() >= self.cursor.source().length()
    }
145
    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text.
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }
155
    /// Checks if the lexer has NOT consumed all input from the source.
    ///
    /// Returns `true` if there are still bytes left to be scanned.
    /// This checks against the full source length, not `end_limit`.
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.cursor.position() < self.cursor.source().length()
    }
163
    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
171
    /// Peeks at the character immediately following the current character.
    ///
    /// Returns `None` when no such character exists.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }
177
    /// Peeks at the character at the specified byte offset relative to the current position.
    ///
    /// Does not advance the cursor; returns `None` past the end of input.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }
183
    /// Advances the cursor by the specified number of bytes.
    ///
    /// Callers are responsible for keeping `len` on a UTF-8 character boundary.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }
189
190 /// Gets the total length of the source text in bytes.
191 #[inline]
192 pub fn get_length(&self) -> usize {
193 self.end_limit.unwrap_or_else(|| self.cursor.source().length())
194 }
195
    /// Gets a single character at the specified absolute byte offset.
    ///
    /// Returns `None` when the offset is out of range.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }
201
    /// Peeks at the next byte without advancing the cursor.
    ///
    /// Returns `None` at the end of the source.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }
207
    /// Advances the cursor by one byte and returns it.
    ///
    /// Returns `None` (without advancing) at the end of the source.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }
213
    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }
221
    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace (empty if none was skipped).
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }
230
    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits (empty if none was skipped).
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }
238
    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }
247
    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed; if it never occurs, the cursor
    /// stops at the end of input.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }
256
257 /// Skips all ASCII hex digits (0-9, a-f, A-F).
258 ///
259 /// Uses SIMD acceleration if available on the platform.
260 /// Returns the range of the skipped hex digits.
261 #[inline]
262 pub fn skip_ascii_hexdigits(&mut self) -> std::range::Range<usize> {
263 let start = self.get_position();
264 let rest = self.rest_bytes();
265 let skipped = crate::source::SimdScanner::skip_ascii_hexdigits(rest);
266 self.advance(skipped);
267 (start..self.get_position()).into()
268 }
269
270 /// Finds the first occurrence of the target byte in the remaining text.
271 ///
272 /// Uses SIMD acceleration if available on the platform.
273 /// Returns the byte offset relative to the current position, or None if not found.
274 #[inline]
275 pub fn find_byte(&mut self, target: u8) -> Option<usize> {
276 let rest = self.rest_bytes();
277 crate::source::SimdScanner::find_byte(rest, target)
278 }
279
280 /// Finds the first occurrence of any of the 4 bytes in the remaining text.
281 ///
282 /// Uses SIMD acceleration if available on the platform.
283 /// Returns the byte offset relative to the current position, or None if not found.
284 #[inline]
285 pub fn find_first_of_4(&mut self, a: u8, b: u8, c: u8, d: u8) -> Option<usize> {
286 let rest = self.rest_bytes();
287 crate::source::SimdScanner::find_first_of_4(rest, a, b, c, d)
288 }
289
290 /// Scans an ASCII identifier.
291 ///
292 /// An identifier must start with an alphabetic character or an underscore,
293 /// and can be followed by any number of alphanumeric characters or underscores.
294 ///
295 /// # Arguments
296 ///
297 /// * `kind` - The token type to assign if an identifier is found.
298 ///
299 /// # Returns
300 ///
301 /// `true` if an identifier was successfully scanned and added.
302 #[inline]
303 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
304 let start = self.get_position();
305 if let Some(b) = self.peek_byte() {
306 if b == b'_' || b.is_ascii_alphabetic() {
307 self.advance_byte();
308 self.skip_ascii_ident_continue();
309 self.add_token(kind, start, self.get_position());
310 return true;
311 }
312 }
313 false
314 }
315
316 /// Scans a line comment starting with the given prefix.
317 ///
318 /// Consumes the prefix and all characters until the next newline or EOF.
319 ///
320 /// # Arguments
321 ///
322 /// * `kind` - The token type for the line comment.
323 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
324 #[inline]
325 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
326 let start = self.get_position();
327 if self.consume_if_starts_with(prefix) {
328 self.skip_until(b'\n');
329 self.add_token(kind, start, self.get_position());
330 return true;
331 }
332 false
333 }
334
    /// Scans a block comment with given start and end sequences.
    ///
    /// Handles nested comments if the underlying implementation supports it,
    /// though this basic implementation is non-recursive.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
    ///
    /// # Notes
    ///
    /// An unterminated comment consumes everything up to EOF and still emits
    /// a token; no diagnostic is recorded for the missing terminator.
    /// NOTE(review): `end_seq` is indexed with `[0]`, so an empty `end_seq`
    /// would panic — confirm callers never pass one.
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                // Jump to the next byte that could begin the terminator.
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                // First byte matched but the full terminator did not (or EOF):
                // step one byte so the next skip_until makes progress.
                self.advance_byte();
            }
            // EOF without a terminator: emit the token covering the rest.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
362
    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing, in emission order.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }
372
    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }
386
    /// Returns a reference to the underlying source.
    ///
    /// NOTE(review): identical to [`Self::get_source`]; consider deprecating one.
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }
391
    /// Returns the text in the specified byte range.
    ///
    /// A `Cow` is returned because the source may have to materialize the span.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }
396
    /// Returns the text from the specified byte offset to the end of the source.
    ///
    /// A `Cow` is returned because the source may have to materialize the span.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }
401
    /// Checks if the source starts with the given pattern at the current position.
    ///
    /// Does not advance the cursor.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }
406
    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor;
    /// on `false` the cursor is left untouched.
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }
413
    /// Adds an error to the lexer state's diagnostics.
    ///
    /// Errors recorded here are surfaced later by `finish`.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }
423
424 /// Adds a token to the lexer state.
425 ///
426 /// # Arguments
427 ///
428 /// * `kind` - The kind/type of the token.
429 /// * `start` - The starting byte offset.
430 /// * `end` - The ending byte offset.
431 #[inline]
432 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
433 self.tokens.push(Token { kind, span: Range { start, end } });
434 }
435
436 /// Adds an end-of-file (EOF) token to the lexer state.
437 ///
438 /// This method creates and adds an `END_OF_STREAM` token at the current position.
439 /// It is typically called when the lexer reaches the end of the source text.
440 ///
441 /// # Examples
442 ///
443 /// ```ignore
444 /// #![feature(new_range_api)]
445 /// # use core::range::Range;
446 /// # use oak_core::lexer::{LexerState, Token};
447 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
448 /// #
449 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
450 /// # enum SimpleToken {
451 /// # End,
452 /// # }
453 /// #
454 /// # impl TokenType for SimpleToken {
455 /// # const END_OF_STREAM: Self = SimpleToken::End;
456 /// # type Role = UniversalTokenRole;
457 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
458 /// # }
459 /// #
460 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
461 /// # enum SimpleElement {}
462 /// #
463 /// # impl ElementType for SimpleElement {
464 /// # type Role = UniversalElementRole;
465 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
466 /// # }
467 /// #
468 /// # #[derive(Clone)]
469 /// # struct SimpleLanguage;
470 /// #
471 /// # impl Language for SimpleLanguage {
472 /// # const NAME: &'static str = "simple";
473 /// # type TokenType = SimpleToken;
474 /// # type ElementType = SimpleElement;
475 /// # type TypedRoot = ();
476 /// # }
477 /// #
478 /// let source = SourceText::new("test");
479 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
480 /// state.take_while(|_| true);
481 /// state.add_eof();
482 ///
483 /// assert_eq!(state.get_tokens().len(), 1);
484 /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
485 /// ```
486 #[inline]
487 pub fn add_eof(&mut self) {
488 let end = self.get_position();
489 self.add_token(L::TokenType::END_OF_STREAM, end, end)
490 }
491
    /// Gets the current character at the current position.
    ///
    /// NOTE(review): identical to [`Self::peek`]; consider deprecating one.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }
501
502 /// Advances the position by the current character's length.
503 ///
504 /// # Returns
505 ///
506 /// The character that was skipped, or `None` if at the end of the source
507 #[inline]
508 pub fn bump(&mut self) -> Option<char> {
509 let ch = self.peek()?;
510 self.advance(ch.len_utf8());
511 Some(ch)
512 }
513
514 /// Advances the position by the token's length and adds the token to the lexer state.
515 ///
516 /// This method combines two common operations: advancing the lexer position
517 /// and adding a token to the token list. It calculates the advance distance
518 /// from the token's span, ensuring consistent positioning.
519 ///
520 /// # Arguments
521 ///
522 /// * `token` - The token to add to the lexer state
523 ///
524 /// # Returns
525 ///
526 /// The new byte offset position after advancing
527 ///
528 /// # Examples
529 ///
530 /// ```ignore
531 /// #![feature(new_range_api)]
532 /// # use core::range::Range;
533 /// # use oak_core::lexer::{LexerState, Token};
534 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
535 /// #
536 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
537 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
538 /// # enum SimpleToken { Identifier, End }
539 /// #
540 /// # impl TokenType for SimpleToken {
541 /// # const END_OF_STREAM: Self = SimpleToken::End;
542 /// # type Role = UniversalTokenRole;
543 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
544 /// # }
545 /// #
546 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
547 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
548 /// # enum SimpleElement {}
549 /// #
550 /// # impl ElementType for SimpleElement {
551 /// # type Role = UniversalElementRole;
552 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
553 /// # }
554 /// #
555 /// # #[derive(Clone)]
556 /// # struct SimpleLanguage;
557 /// #
558 /// # impl Language for SimpleLanguage {
559 /// # const NAME: &'static str = "simple";
560 /// # type TokenType = SimpleToken;
561 /// # type ElementType = SimpleElement;
562 /// # type TypedRoot = ();
563 /// # }
564 /// #
565 /// let source = SourceText::new("hello world");
566 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
567 ///
568 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } }
569 ///
570 /// assert_eq!(state.get_position(), 0);
571 ///
572 /// let new_pos = state.advance_with(token);
573 ///
574 /// assert_eq!(new_pos, 5);
575 /// assert_eq!(state.get_position(), 5);
576 /// assert_eq!(state.get_tokens().len(), 1);
577 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
578 /// ```
579 ///
580 /// # Note
581 ///
582 /// The caller must ensure that the token's span is valid and that the advance
583 /// does not split multi-byte UTF-8 characters. The token should be created
584 /// with proper character boundaries.
585 #[inline]
586 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
587 self.cursor.advance_bytes(token.length());
588 self.tokens.push(token);
589 self.cursor.position()
590 }
591
592 /// Consumes characters while the predicate returns true, returning the consumed range.
593 ///
594 /// This method iterates through the source text from the current position,
595 /// consuming characters as long as the predicate function returns true.
596 /// It's commonly used for recognizing patterns like identifiers, numbers,
597 /// or whitespace sequences.
598 ///
599 /// # Arguments
600 ///
601 /// * `pred` - A closure that takes a character and returns true if the character
602 /// should be consumed, false otherwise
603 ///
604 /// # Returns
605 ///
606 /// A byte range representing the span of consumed characters
607 ///
608 /// # Examples
609 ///
610 /// ```ignore
611 /// #![feature(new_range_api)]
612 /// # use core::range::Range;
613 /// # use oak_core::lexer::{LexerState, Token};
614 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
615 /// #
616 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
617 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
618 /// # enum SimpleToken { End }
619 /// #
620 /// # impl TokenType for SimpleToken {
621 /// # const END_OF_STREAM: Self = SimpleToken::End;
622 /// # type Role = UniversalTokenRole;
623 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
624 /// # }
625 /// #
626 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
627 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
628 /// # enum SimpleElement {}
629 /// #
630 /// # impl ElementType for SimpleElement {
631 /// # type Role = UniversalElementRole;
632 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
633 /// # }
634 /// #
635 /// # #[derive(Clone)]
636 /// # struct SimpleLanguage;
637 /// #
638 /// # impl Language for SimpleLanguage {
639 /// # const NAME: &'static str = "simple";
640 /// # type TokenType = SimpleToken;
641 /// # type ElementType = SimpleElement;
642 /// # type TypedRoot = ();
643 /// # }
644 /// #
645 /// let source = SourceText::new("hello123world");
646 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
647 ///
648 /// let range = state.take_while(|c| c.is_alphabetic());
649 ///
650 /// assert_eq!(range, Range { start: 0, end: 5 });
651 /// assert_eq!(state.get_position(), 5);
652 ///
653 /// let range = state.take_while(|c| c.is_numeric());
654 ///
655 /// assert_eq!(range, Range { start: 5, end: 8 });
656 /// assert_eq!(state.get_position(), 8);
657 /// ```
658 ///
659 /// # Performance Note
660 ///
661 /// This method operates on a character-by-character basis, which means it
662 /// correctly handles multi-byte UTF-8 characters. For performance-critical
663 /// code, consider using byte-based methods when working with ASCII-only text.
664 pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
665 self.cursor.take_while(pred)
666 }
667
668 /// Performs a safety check to prevent infinite loops during lexing.
669 ///
670 /// This method ensures that the lexer always makes progress by forcing
671 /// advancement when stuck at the same position. It's used as a safeguard
672 /// against infinite loops in lexer implementations.
673 ///
674 /// The method compares the current position with a previously saved "safe point"
675 /// position. If they're the same, it means the lexer hasn't made progress since
676 /// that safe point, potentially indicating an infinite loop. In this case, the
677 /// method forces advancement by at least one character.
678 ///
679 /// # Arguments
680 ///
681 /// * `safe_point` - The position to check against for potential deadlock
682 ///
683 /// # Examples
684 ///
685 /// ```ignore
686 /// #![feature(new_range_api)]
687 /// # use oak_core::lexer::{LexerState, Token};
688 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
689 /// #
690 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
691 /// # enum SimpleToken { End }
692 /// #
693 /// # impl TokenType for SimpleToken {
694 /// # const END_OF_STREAM: Self = SimpleToken::End;
695 /// # type Role = UniversalTokenRole;
696 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
697 /// # }
698 /// #
699 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
700 /// # enum SimpleElement {}
701 /// #
702 /// # impl ElementType for SimpleElement {
703 /// # type Role = UniversalElementRole;
704 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
705 /// # }
706 /// #
707 /// # struct SimpleLanguage;
708 /// #
709 /// # impl Language for SimpleLanguage {
710 /// # const NAME: &'static str = "simple";
711 /// # type TokenType = SimpleToken;
712 /// # type ElementType = SimpleElement;
713 /// # type TypedRoot = ();
714 /// # }
715 /// #
716 /// let source = SourceText::new("test");
717 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
718 ///
719 /// let safe_point = state.get_position();
720 ///
721 /// state.advance_if_dead_lock(safe_point);
722 ///
723 /// assert!(state.get_position() >= safe_point);
724 /// ```
725 ///
726 /// # Usage in Lexer Implementations
727 ///
728 /// This method is typically used at the beginning or end of lexing loops:
729 ///
730 /// ```ignore
731 /// loop {
732 /// let safe_point = state.get_position();
733 ///
734 /// if let Some(token) = try_recognize_token(&mut state) {
735 /// continue;
736 /// }
737 ///
738 /// state.advance_if_dead_lock(safe_point);
739 ///
740 /// if state.not_at_end() {
741 /// continue;
742 /// } else {
743 /// break;
744 /// }
745 /// }
746 /// ```
747 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
748 if self.cursor.position() == safe_point {
749 if let Some(ch) = self.peek() { self.advance(ch.len_utf8()) } else { self.advance(1) }
750 }
751 }
752
    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// This method concludes the lexing process by converting the collected tokens
    /// and errors into a `LexOutput` result. It takes a `Result` parameter that
    /// represents the overall success or failure of the lexing operation.
    ///
    /// If the result is `Ok`, the tokens are returned as the successful result.
    /// If the result is `Err`, the error is returned as the failure result.
    /// In both cases, any collected diagnostic errors are included in the output.
    ///
    /// # Arguments
    ///
    /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
    ///
    /// # Examples
    ///
    /// ```
    /// #![feature(new_range_api)]
    /// # use oak_core::lexer::{LexerState, Token};
    /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
    /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
    /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
    /// # enum SimpleToken { Identifier, End }
    /// #
    /// # impl TokenType for SimpleToken {
    /// #     const END_OF_STREAM: Self = SimpleToken::End;
    /// #     type Role = UniversalTokenRole;
    /// #     fn role(&self) -> Self::Role { UniversalTokenRole::None }
    /// # }
    /// #
    /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
    /// # enum SimpleElement {}
    /// #
    /// # impl ElementType for SimpleElement {
    /// #     type Role = UniversalElementRole;
    /// #     fn role(&self) -> Self::Role { UniversalElementRole::None }
    /// # }
    /// #
    /// # struct SimpleLanguage;
    /// #
    /// # impl Language for SimpleLanguage {
    /// #     const NAME: &'static str = "simple";
    /// #     type TokenType = SimpleToken;
    /// #     type ElementType = SimpleElement;
    /// #     type TypedRoot = ();
    /// # }
    /// #
    /// let source = SourceText::new("test");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// state.add_token(SimpleToken::Identifier, 0, 4);
    ///
    /// let output = state.finish(Ok(()));
    ///
    /// assert!(output.result.is_ok());
    /// assert_eq!(output.result.unwrap().len(), 1);
    /// assert_eq!(output.diagnostics.len(), 0);
    ///
    /// let source2 = SourceText::new("test");
    /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
    /// state2.add_error(OakError::custom_error("Test error"));
    ///
    /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
    ///
    /// assert!(output2.result.is_err());
    /// assert_eq!(output2.diagnostics.len(), 1);
    /// ```
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        // Accumulated diagnostics travel with the output in both arms; only
        // the result payload differs.
        match result {
            Ok(_) => {
                let tokens: Tokens<L> = self.tokens.into();
                OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
            }
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }
835
836 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
837 ///
838 /// This method is similar to `finish` but additionally updates the incremental cache
839 /// with the new tokens. It's used for incremental lexing where the results need to
840 /// be cached for future reuse when the source text changes.
841 ///
842 /// The method first creates the output in the same way as `finish`, then updates
843 /// the cache's `last_lex` field with the new tokens. This enables the next call
844 /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
845 ///
846 /// # Arguments
847 ///
848 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
849 /// * `cache` - The incremental cache to update with the new tokens
850 ///
851 /// # Returns
852 ///
853 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
854 ///
855 /// # Examples
856 ///
857 /// ```ignore
858 /// #![feature(new_range_api)]
859 /// # use core::range::Range;
860 /// # use oak_core::lexer::{LexerState, Token};
861 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
862 /// # use oak_core::parser::session::ParseSession;
863 /// #
864 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
865 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
866 /// # enum SimpleToken { Identifier, End }
867 /// #
868 /// # impl TokenType for SimpleToken {
869 /// # const END_OF_STREAM: Self = SimpleToken::End;
870 /// # type Role = UniversalTokenRole;
871 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
872 /// # }
873 /// #
874 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
875 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
876 /// # enum SimpleElement {}
877 /// #
878 /// # impl ElementType for SimpleElement {
879 /// # type Role = UniversalElementRole;
880 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
881 /// # }
882 /// #
883 /// # struct SimpleLanguage;
884 /// #
885 /// # impl Language for SimpleLanguage {
886 /// # const NAME: &'static str = "simple";
887 /// # type TokenType = SimpleToken;
888 /// # type ElementType = SimpleElement;
889 /// # type TypedRoot = ();
890 /// # }
891 /// #
892 /// let source = SourceText::new("test");
893 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
894 ///
895 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
896 ///
897 /// state.add_token(SimpleToken::Identifier, 0, 4);
898 ///
899 /// let output = state.finish_with_cache(Ok(()), &mut cache);
900 ///
901 /// assert!(output.result.is_ok());
902 /// assert_eq!(output.result.unwrap().len(), 1);
903 /// ```
904 ///
905 /// # Incremental Lexing Workflow
906 ///
907 /// This method is typically used as part of an incremental lexing workflow:
908 ///
909 /// ```ignore
910 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
911 /// let output = state.finish_with_cache(Ok(()), cache);
912 ///
913 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
914 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
915 /// let output = state.finish_with_cache(Ok(()), cache);
916 /// ```
917 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
918 let out = self.finish(result);
919 cache.set_lex_output(out.clone());
920 out
921 }
922}
923
924use crate::OakDiagnostics;