oak_core/lexer/state.rs
1use crate::{
2 Language, TokenType,
3 errors::OakError,
4 lexer::{LexOutput, LexerCache, Token, Tokens},
5 source::{Source, SourceCursor},
6};
7pub use core::range::Range;
8use std::borrow::Cow;
9
/// Represents the state of the lexer during a tokenization session.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
/// It tracks the current position in the source text, collected tokens,
/// and any errors encountered.
#[derive(Debug)]
pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
    // Cursor tracking the current byte offset within the source text.
    pub(crate) cursor: SourceCursor<'s, S>,
    // Tokens collected so far, in source order.
    pub(crate) tokens: Vec<Token<L::TokenType>>,
    // Non-fatal diagnostics accumulated while lexing.
    pub(crate) errors: Vec<OakError>,
}
22
23impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
24 /// Creates a new lexer state with the given source text.
25 ///
26 /// # Arguments
27 ///
28 /// * `source` - The source text to lex
29 ///
30 /// # Returns
31 ///
32 /// A new `LexerState` initialized at the beginning of the source
33 pub fn new(source: &'s S) -> Self {
34 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
35 }
36
    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `relex_from` - The minimum byte offset that may have been affected by edits
    ///   (use `source.length()` to indicate no edits)
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
        // No previous result to reuse: start a full lex from offset 0.
        if !cache.has_tokens() {
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let len = source.length();
        // Clamp so a stale offset from the caller can never exceed the new length.
        let relex_from = relex_from.min(len);

        if relex_from >= len {
            // Nothing at or after `relex_from` exists, i.e. "no edits":
            // reuse every cached token and resume the cursor after the last one.
            let mut tokens = Vec::new();
            if let Some(cached) = cache.get_tokens() {
                tokens.extend_from_slice(cached)
            }
            else {
                // Cache cannot expose a contiguous slice; copy token by token.
                let count = cache.count_tokens();
                tokens.reserve(count);
                for i in 0..count {
                    if let Some(t) = cache.get_token(i) {
                        tokens.push(t)
                    }
                }
            }
            // Resume right after the last reused token (clamped to the source end).
            let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
            return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
        }

        if relex_from == 0 {
            // Edit touches the very beginning: nothing can be reused.
            return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
        }

        let mut reused_tokens = Vec::new();
        // Safety margin: drop this many tokens immediately before the edit point,
        // since an edit can change how the token just preceding it would be lexed.
        const BACKTRACK_TOKENS: usize = 1;

        if let Some(cached) = cache.get_tokens() {
            // Index of the first cached token whose end lies past the edit point.
            let idx = cached.partition_point(|t| t.span.end <= relex_from);
            let keep = idx.saturating_sub(BACKTRACK_TOKENS);
            if keep > 0 {
                reused_tokens.extend_from_slice(&cached[..keep])
            }
        }
        else {
            // Slower path: walk tokens one by one until we pass the edit point.
            let count = cache.count_tokens();
            for i in 0..count {
                let Some(token) = cache.get_token(i)
                else {
                    break;
                };
                if token.span.end <= relex_from {
                    reused_tokens.push(token);
                }
                else {
                    break;
                }
            }
            let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
            reused_tokens.truncate(keep);
        }

        // Relexing resumes at the end of the last reused token (0 when none survive).
        let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
        Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
    }
110
    /// Creates a sub-state for scanning a sub-range of the source.
    ///
    /// NOTE(review): `_end` is currently ignored — the sub-state's cursor starts
    /// at `start` but is not bounded, so callers must stop at the intended end
    /// themselves. Confirm whether an upper bound was meant to be enforced here.
    pub fn sub_state(&mut self, start: usize, _end: usize) -> Self {
        Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![] }
    }
115
    /// Returns the source text provider.
    ///
    /// Equivalent to [`Self::source`]; both return the cursor's underlying source.
    pub fn get_source(&self) -> &'s S {
        self.cursor.source()
    }
120
    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text, starting at the current
    /// cursor position.
    pub fn rest(&mut self) -> &str {
        self.cursor.rest()
    }
129
130 /// Gets the remaining text as a byte slice.
131 ///
132 /// Useful for byte-oriented scanning operations.
133 #[inline]
134 pub fn rest_bytes(&mut self) -> &[u8] {
135 self.cursor.rest().as_bytes()
136 }
137
138 /// Checks if the lexer has consumed all input from the source.
139 ///
140 /// Returns `true` if the current position is at or beyond the end of the source.
141 pub fn fully_reused(&self) -> bool {
142 self.cursor.position() >= self.cursor.source().length()
143 }
144
    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text (0-based).
    #[inline]
    pub fn get_position(&self) -> usize {
        self.cursor.position()
    }
154
155 /// Checks if the lexer has NOT consumed all input from the source.
156 ///
157 /// Returns `true` if there are still bytes left to be scanned.
158 #[inline]
159 pub fn not_at_end(&self) -> bool {
160 self.cursor.position() < self.cursor.source().length()
161 }
162
    /// Peeks at the next character without advancing the cursor.
    ///
    /// Returns `None` if at the end of the source.
    #[inline]
    pub fn peek(&mut self) -> Option<char> {
        self.cursor.peek_char()
    }

    /// Peeks at the character immediately following the current character.
    #[inline]
    pub fn peek_next(&mut self) -> Option<char> {
        self.cursor.peek_next_char()
    }

    /// Peeks at the character at the specified byte offset relative to the current position.
    #[inline]
    pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
        self.cursor.peek_next_n(n)
    }

    /// Advances the cursor by the specified number of bytes.
    ///
    /// NOTE(review): `len` is assumed to land on a character boundary — confirm
    /// callers never split a multi-byte UTF-8 sequence.
    #[inline]
    pub fn advance(&mut self, len: usize) {
        self.cursor.advance_bytes(len);
    }

    /// Gets the total length of the source text in bytes.
    #[inline]
    pub fn get_length(&self) -> usize {
        self.cursor.source().length()
    }

    /// Gets a single character at the specified absolute byte offset.
    #[inline]
    pub fn get_char_at(&self, offset: usize) -> Option<char> {
        self.cursor.source().get_char_at(offset)
    }

    /// Peeks at the next byte without advancing the cursor.
    #[inline]
    pub fn peek_byte(&mut self) -> Option<u8> {
        self.cursor.peek_byte()
    }

    /// Advances the cursor by one byte and returns it.
    ///
    /// Returns `None` at the end of the source.
    #[inline]
    pub fn advance_byte(&mut self) -> Option<u8> {
        self.cursor.advance_byte()
    }
212
    /// Advances the cursor while the byte predicate is true.
    ///
    /// Returns the byte range covered by the matched bytes.
    #[inline]
    pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
        self.cursor.take_while_byte(pred)
    }

    /// Skips common ASCII whitespace (space, tab, newline, carriage return).
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the range of the skipped whitespace.
    #[inline]
    pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_whitespace()
    }

    /// Skips all consecutive ASCII digits at the current position.
    ///
    /// Returns the range of the skipped digits (empty when none matched).
    #[inline]
    pub fn skip_ascii_digits(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_digits()
    }

    /// Skips all characters that can continue an ASCII identifier.
    ///
    /// This includes alphanumeric characters and underscores.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
        self.cursor.skip_ascii_ident_continue()
    }

    /// Skips all characters until the target byte is encountered.
    ///
    /// The target byte itself is NOT consumed.
    /// Returns the range of the skipped characters.
    #[inline]
    pub fn skip_until(&mut self, target: u8) -> Range<usize> {
        self.cursor.skip_until(target)
    }
255
256 /// Skips all ASCII hex digits (0-9, a-f, A-F).
257 ///
258 /// Uses SIMD acceleration if available on the platform.
259 /// Returns the range of the skipped hex digits.
260 #[inline]
261 pub fn skip_ascii_hexdigits(&mut self) -> std::range::Range<usize> {
262 let start = self.get_position();
263 let rest = self.rest_bytes();
264 let skipped = crate::source::SimdScanner::skip_ascii_hexdigits(rest);
265 self.advance(skipped);
266 (start..self.get_position()).into()
267 }
268
    /// Finds the first occurrence of the target byte in the remaining text.
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the byte offset relative to the current position, or None if not found.
    /// The cursor itself is not moved.
    #[inline]
    pub fn find_byte(&mut self, target: u8) -> Option<usize> {
        let rest = self.rest_bytes();
        crate::source::SimdScanner::find_byte(rest, target)
    }

    /// Finds the first occurrence of any of the 4 bytes in the remaining text.
    ///
    /// Uses SIMD acceleration if available on the platform.
    /// Returns the byte offset relative to the current position, or None if not found.
    /// The cursor itself is not moved.
    #[inline]
    pub fn find_first_of_4(&mut self, a: u8, b: u8, c: u8, d: u8) -> Option<usize> {
        let rest = self.rest_bytes();
        crate::source::SimdScanner::find_first_of_4(rest, a, b, c, d)
    }
288
289 /// Scans an ASCII identifier.
290 ///
291 /// An identifier must start with an alphabetic character or an underscore,
292 /// and can be followed by any number of alphanumeric characters or underscores.
293 ///
294 /// # Arguments
295 ///
296 /// * `kind` - The token type to assign if an identifier is found.
297 ///
298 /// # Returns
299 ///
300 /// `true` if an identifier was successfully scanned and added.
301 #[inline]
302 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
303 let start = self.get_position();
304 if let Some(b) = self.peek_byte() {
305 if b == b'_' || b.is_ascii_alphabetic() {
306 self.advance_byte();
307 self.skip_ascii_ident_continue();
308 self.add_token(kind, start, self.get_position());
309 return true;
310 }
311 }
312 false
313 }
314
315 /// Scans a line comment starting with the given prefix.
316 ///
317 /// Consumes the prefix and all characters until the next newline or EOF.
318 ///
319 /// # Arguments
320 ///
321 /// * `kind` - The token type for the line comment.
322 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
323 #[inline]
324 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
325 let start = self.get_position();
326 if self.consume_if_starts_with(prefix) {
327 self.skip_until(b'\n');
328 self.add_token(kind, start, self.get_position());
329 return true;
330 }
331 false
332 }
333
    /// Scans a block comment with given start and end sequences.
    ///
    /// Handles nested comments if the underlying implementation supports it,
    /// though this basic implementation is non-recursive.
    ///
    /// If the end sequence is never found, the rest of the source is consumed
    /// and emitted as an (unterminated) comment token; no error is reported here.
    ///
    /// # Arguments
    ///
    /// * `kind` - The token type for the block comment.
    /// * `start_seq` - The sequence that starts the block (e.g., "/*").
    /// * `end_seq` - The sequence that ends the block (e.g., "*/").
    ///
    /// # Panics
    ///
    /// Panics if `end_seq` is empty: its first byte is indexed unconditionally.
    #[inline]
    pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
        let start = self.get_position();
        if self.consume_if_starts_with(start_seq) {
            while let Some(_b) = self.peek_byte() {
                // Jump to the next candidate end marker (first byte of `end_seq`).
                self.skip_until(end_seq.as_bytes()[0]);
                if self.consume_if_starts_with(end_seq) {
                    self.add_token(kind, start, self.get_position());
                    return true;
                }
                // False alarm: step over the matched byte and keep searching.
                self.advance_byte();
            }
            // EOF reached without the end sequence: emit what we have.
            self.add_token(kind, start, self.get_position());
            return true;
        }
        false
    }
361
    /// Gets the tokens collected so far in the lexer state.
    ///
    /// # Returns
    ///
    /// A slice of tokens collected during lexing.
    #[inline]
    pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
        &self.tokens
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position.
    ///
    /// # Returns
    ///
    /// The previous byte offset position.
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        self.cursor.set_position(offset)
    }

    /// Returns a reference to the underlying source.
    ///
    /// Equivalent to [`Self::get_source`].
    pub fn source(&self) -> &'s S {
        self.cursor.source()
    }

    /// Returns the text in the specified byte range.
    pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
        self.cursor.source().get_text_in(range)
    }

    /// Returns the text from the specified byte offset to the end of the source.
    pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
        self.cursor.source().get_text_from(offset)
    }

    /// Checks if the source starts with the given pattern at the current position.
    ///
    /// Does not advance the cursor.
    pub fn starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.starts_with(pattern)
    }

    /// Consumes the pattern if it exists at the current position.
    ///
    /// Returns `true` if the pattern was found and consumed, advancing the cursor.
    pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
        self.cursor.consume_if_starts_with(pattern)
    }
412
    /// Adds an error to the lexer state's diagnostics.
    ///
    /// Errors collected here are non-fatal: they are carried through to the
    /// final [`Self::finish`] output as diagnostics.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add.
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }
422
423 /// Adds a token to the lexer state.
424 ///
425 /// # Arguments
426 ///
427 /// * `kind` - The kind/type of the token.
428 /// * `start` - The starting byte offset.
429 /// * `end` - The ending byte offset.
430 #[inline]
431 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
432 self.tokens.push(Token { kind, span: Range { start, end } });
433 }
434
435 /// Adds an end-of-file (EOF) token to the lexer state.
436 ///
437 /// This method creates and adds an `END_OF_STREAM` token at the current position.
438 /// It is typically called when the lexer reaches the end of the source text.
439 ///
440 /// # Examples
441 ///
442 /// ```ignore
443 /// #![feature(new_range_api)]
444 /// # use core::range::Range;
445 /// # use oak_core::lexer::{LexerState, Token};
446 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
447 /// #
448 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
449 /// # enum SimpleToken {
450 /// # End,
451 /// # }
452 /// #
453 /// # impl TokenType for SimpleToken {
454 /// # const END_OF_STREAM: Self = SimpleToken::End;
455 /// # type Role = UniversalTokenRole;
456 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
457 /// # }
458 /// #
459 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
460 /// # enum SimpleElement {}
461 /// #
462 /// # impl ElementType for SimpleElement {
463 /// # type Role = UniversalElementRole;
464 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
465 /// # }
466 /// #
467 /// # #[derive(Clone)]
468 /// # struct SimpleLanguage;
469 /// #
470 /// # impl Language for SimpleLanguage {
471 /// # const NAME: &'static str = "simple";
472 /// # type TokenType = SimpleToken;
473 /// # type ElementType = SimpleElement;
474 /// # type TypedRoot = ();
475 /// # }
476 /// #
477 /// let source = SourceText::new("test");
478 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
479 /// state.take_while(|_| true);
480 /// state.add_eof();
481 ///
482 /// assert_eq!(state.get_tokens().len(), 1);
483 /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
484 /// ```
485 #[inline]
486 pub fn add_eof(&mut self) {
487 let end = self.get_position();
488 self.add_token(L::TokenType::END_OF_STREAM, end, end)
489 }
490
491 /// Gets the current character at the current position.
492 ///
493 /// # Returns
494 ///
495 /// The current character, or `None` if at the end of the source
496 #[inline]
497 pub fn current(&mut self) -> Option<char> {
498 self.cursor.peek_char()
499 }
500
501 /// Advances the position by the current character's length.
502 ///
503 /// # Returns
504 ///
505 /// The character that was skipped, or `None` if at the end of the source
506 #[inline]
507 pub fn bump(&mut self) -> Option<char> {
508 let ch = self.peek()?;
509 self.advance(ch.len_utf8());
510 Some(ch)
511 }
512
513 /// Advances the position by the token's length and adds the token to the lexer state.
514 ///
515 /// This method combines two common operations: advancing the lexer position
516 /// and adding a token to the token list. It calculates the advance distance
517 /// from the token's span, ensuring consistent positioning.
518 ///
519 /// # Arguments
520 ///
521 /// * `token` - The token to add to the lexer state
522 ///
523 /// # Returns
524 ///
525 /// The new byte offset position after advancing
526 ///
527 /// # Examples
528 ///
529 /// ```ignore
530 /// #![feature(new_range_api)]
531 /// # use core::range::Range;
532 /// # use oak_core::lexer::{LexerState, Token};
533 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
534 /// #
535 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
536 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
537 /// # enum SimpleToken { Identifier, End }
538 /// #
539 /// # impl TokenType for SimpleToken {
540 /// # const END_OF_STREAM: Self = SimpleToken::End;
541 /// # type Role = UniversalTokenRole;
542 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
543 /// # }
544 /// #
545 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
546 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
547 /// # enum SimpleElement {}
548 /// #
549 /// # impl ElementType for SimpleElement {
550 /// # type Role = UniversalElementRole;
551 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
552 /// # }
553 /// #
554 /// # #[derive(Clone)]
555 /// # struct SimpleLanguage;
556 /// #
557 /// # impl Language for SimpleLanguage {
558 /// # const NAME: &'static str = "simple";
559 /// # type TokenType = SimpleToken;
560 /// # type ElementType = SimpleElement;
561 /// # type TypedRoot = ();
562 /// # }
563 /// #
564 /// let source = SourceText::new("hello world");
565 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
566 ///
567 /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } }
568 ///
569 /// assert_eq!(state.get_position(), 0);
570 ///
571 /// let new_pos = state.advance_with(token);
572 ///
573 /// assert_eq!(new_pos, 5);
574 /// assert_eq!(state.get_position(), 5);
575 /// assert_eq!(state.get_tokens().len(), 1);
576 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
577 /// ```
578 ///
579 /// # Note
580 ///
581 /// The caller must ensure that the token's span is valid and that the advance
582 /// does not split multi-byte UTF-8 characters. The token should be created
583 /// with proper character boundaries.
584 #[inline]
585 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
586 self.cursor.advance_bytes(token.length());
587 self.tokens.push(token);
588 self.cursor.position()
589 }
590
    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// Iterates from the current position, consuming characters as long as
    /// `pred` returns `true`. Commonly used to recognize identifiers, numbers,
    /// or whitespace runs.
    ///
    /// # Arguments
    ///
    /// * `pred` - A closure that takes a character and returns true if the character
    ///   should be consumed, false otherwise
    ///
    /// # Returns
    ///
    /// A byte range representing the span of consumed characters
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let source = SourceText::new("hello123world");
    /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
    ///
    /// let range = state.take_while(|c| c.is_alphabetic());
    /// assert_eq!(range, Range { start: 0, end: 5 });
    /// assert_eq!(state.get_position(), 5);
    ///
    /// let range = state.take_while(|c| c.is_numeric());
    /// assert_eq!(range, Range { start: 5, end: 8 });
    /// assert_eq!(state.get_position(), 8);
    /// ```
    ///
    /// # Performance Note
    ///
    /// This method operates on a character-by-character basis, which means it
    /// correctly handles multi-byte UTF-8 characters. For performance-critical
    /// code, consider using byte-based methods when working with ASCII-only text.
    pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
        self.cursor.take_while(pred)
    }
666
667 /// Performs a safety check to prevent infinite loops during lexing.
668 ///
669 /// This method ensures that the lexer always makes progress by forcing
670 /// advancement when stuck at the same position. It's used as a safeguard
671 /// against infinite loops in lexer implementations.
672 ///
673 /// The method compares the current position with a previously saved "safe point"
674 /// position. If they're the same, it means the lexer hasn't made progress since
675 /// that safe point, potentially indicating an infinite loop. In this case, the
676 /// method forces advancement by at least one character.
677 ///
678 /// # Arguments
679 ///
680 /// * `safe_point` - The position to check against for potential deadlock
681 ///
682 /// # Examples
683 ///
684 /// ```ignore
685 /// #![feature(new_range_api)]
686 /// # use oak_core::lexer::{LexerState, Token};
687 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
688 /// #
689 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
690 /// # enum SimpleToken { End }
691 /// #
692 /// # impl TokenType for SimpleToken {
693 /// # const END_OF_STREAM: Self = SimpleToken::End;
694 /// # type Role = UniversalTokenRole;
695 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
696 /// # }
697 /// #
698 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
699 /// # enum SimpleElement {}
700 /// #
701 /// # impl ElementType for SimpleElement {
702 /// # type Role = UniversalElementRole;
703 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
704 /// # }
705 /// #
706 /// # struct SimpleLanguage;
707 /// #
708 /// # impl Language for SimpleLanguage {
709 /// # const NAME: &'static str = "simple";
710 /// # type TokenType = SimpleToken;
711 /// # type ElementType = SimpleElement;
712 /// # type TypedRoot = ();
713 /// # }
714 /// #
715 /// let source = SourceText::new("test");
716 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
717 ///
718 /// let safe_point = state.get_position();
719 ///
720 /// state.advance_if_dead_lock(safe_point);
721 ///
722 /// assert!(state.get_position() >= safe_point);
723 /// ```
724 ///
725 /// # Usage in Lexer Implementations
726 ///
727 /// This method is typically used at the beginning or end of lexing loops:
728 ///
729 /// ```ignore
730 /// loop {
731 /// let safe_point = state.get_position();
732 ///
733 /// if let Some(token) = try_recognize_token(&mut state) {
734 /// continue;
735 /// }
736 ///
737 /// state.advance_if_dead_lock(safe_point);
738 ///
739 /// if state.not_at_end() {
740 /// continue;
741 /// } else {
742 /// break;
743 /// }
744 /// }
745 /// ```
746 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
747 if self.cursor.position() == safe_point {
748 if let Some(ch) = self.peek() { self.advance(ch.len_utf8()) } else { self.advance(1) }
749 }
750 }
751
752 /// Finishes lexing and returns the final output with tokens and diagnostics.
753 ///
754 /// This method concludes the lexing process by converting the collected tokens
755 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
756 /// represents the overall success or failure of the lexing operation.
757 ///
758 /// If the result is `Ok`, the tokens are returned as the successful result.
759 /// If the result is `Err`, the error is returned as the failure result.
760 /// In both cases, any collected diagnostic errors are included in the output.
761 ///
762 /// # Arguments
763 ///
764 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
765 ///
766 /// # Returns
767 ///
768 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
769 ///
770 /// # Examples
771 ///
772 /// ```
773 /// #![feature(new_range_api)]
774 /// # use oak_core::lexer::{LexerState, Token};
775 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
776 /// #
777 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
778 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
779 /// # enum SimpleToken { Identifier, End }
780 /// #
781 /// # impl TokenType for SimpleToken {
782 /// # const END_OF_STREAM: Self = SimpleToken::End;
783 /// # type Role = UniversalTokenRole;
784 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
785 /// # }
786 /// #
787 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
788 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
789 /// # enum SimpleElement {}
790 /// #
791 /// # impl ElementType for SimpleElement {
792 /// # type Role = UniversalElementRole;
793 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
794 /// # }
795 /// #
796 /// # struct SimpleLanguage;
797 /// #
798 /// # impl Language for SimpleLanguage {
799 /// # const NAME: &'static str = "simple";
800 /// # type TokenType = SimpleToken;
801 /// # type ElementType = SimpleElement;
802 /// # type TypedRoot = ();
803 /// # }
804 /// #
805 /// let source = SourceText::new("test");
806 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
807 ///
808 /// state.add_token(SimpleToken::Identifier, 0, 4);
809 ///
810 /// let output = state.finish(Ok(()));
811 ///
812 /// assert!(output.result.is_ok());
813 /// assert_eq!(output.result.unwrap().len(), 1);
814 /// assert_eq!(output.diagnostics.len(), 0);
815 ///
816 /// let source2 = SourceText::new("test");
817 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
818 /// state2.add_error(OakError::custom_error("Test error"));
819 ///
820 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
821 ///
822 /// assert!(output2.result.is_err());
823 /// assert_eq!(output2.diagnostics.len(), 1);
824 /// ```
825 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
826 match result {
827 Ok(_) => {
828 let tokens: Tokens<L> = self.tokens.into();
829 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
830 }
831 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
832 }
833 }
834
835 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
836 ///
837 /// This method is similar to `finish` but additionally updates the incremental cache
838 /// with the new tokens. It's used for incremental lexing where the results need to
839 /// be cached for future reuse when the source text changes.
840 ///
841 /// The method first creates the output in the same way as `finish`, then updates
842 /// the cache's `last_lex` field with the new tokens. This enables the next call
843 /// to `new_with_cache` to reuse these tokens if the source text hasn't changed.
844 ///
845 /// # Arguments
846 ///
847 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
848 /// * `cache` - The incremental cache to update with the new tokens
849 ///
850 /// # Returns
851 ///
852 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
853 ///
854 /// # Examples
855 ///
856 /// ```ignore
857 /// #![feature(new_range_api)]
858 /// # use core::range::Range;
859 /// # use oak_core::lexer::{LexerState, Token};
860 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
861 /// # use oak_core::parser::session::ParseSession;
862 /// #
863 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
864 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
865 /// # enum SimpleToken { Identifier, End }
866 /// #
867 /// # impl TokenType for SimpleToken {
868 /// # const END_OF_STREAM: Self = SimpleToken::End;
869 /// # type Role = UniversalTokenRole;
870 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
871 /// # }
872 /// #
873 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
874 /// # #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
875 /// # enum SimpleElement {}
876 /// #
877 /// # impl ElementType for SimpleElement {
878 /// # type Role = UniversalElementRole;
879 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
880 /// # }
881 /// #
882 /// # struct SimpleLanguage;
883 /// #
884 /// # impl Language for SimpleLanguage {
885 /// # const NAME: &'static str = "simple";
886 /// # type TokenType = SimpleToken;
887 /// # type ElementType = SimpleElement;
888 /// # type TypedRoot = ();
889 /// # }
890 /// #
891 /// let source = SourceText::new("test");
892 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
893 ///
894 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
895 ///
896 /// state.add_token(SimpleToken::Identifier, 0, 4);
897 ///
898 /// let output = state.finish_with_cache(Ok(()), &mut cache);
899 ///
900 /// assert!(output.result.is_ok());
901 /// assert_eq!(output.result.unwrap().len(), 1);
902 /// ```
903 ///
904 /// # Incremental Lexing Workflow
905 ///
906 /// This method is typically used as part of an incremental lexing workflow:
907 ///
908 /// ```ignore
909 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
910 /// let output = state.finish_with_cache(Ok(()), cache);
911 ///
912 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
913 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
914 /// let output = state.finish_with_cache(Ok(()), cache);
915 /// ```
916 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
917 let out = self.finish(result);
918 cache.set_lex_output(out.clone());
919 out
920 }
921}
922
923use crate::OakDiagnostics;