oak_core/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3use crate::{
4 Language, TextEdit, TokenType,
5 errors::{OakDiagnostics, OakError},
6 source::{Source, SourceCursor},
7};
8pub use core::range::Range;
9#[cfg(feature = "serde")]
10use serde::{Deserialize, Serialize};
11use std::borrow::Cow;
12use triomphe::Arc;
13
14/// Utilities for scanning comments.
15mod scan_comment;
16/// Utilities for scanning identifiers.
17mod scan_identifier;
18/// Utilities for scanning numbers.
19mod scan_number;
20/// Utilities for scanning string literals.
21mod scan_string;
22/// Utilities for scanning whitespace.
23mod scan_white_space;
24
25pub use self::{scan_comment::CommentConfig, scan_string::StringConfig, scan_white_space::WhitespaceConfig};
26
/// A shared, immutable sequence of tokens produced by lexical analysis.
///
/// This type wraps an `Arc<[Token]>`, so cloning is cheap and the token
/// sequence can be shared between the lexer, the parser, and any caches.
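///
/// # Examples
///
/// A minimal sketch of building a `Tokens` value from a `Vec`; the language and
/// token types here are illustrative placeholders:
///
/// ```ignore
/// let tokens: Tokens<MyLanguage> = vec![
///     Token { kind: MyToken::Identifier, span: Range { start: 0, end: 5 } },
///     Token { kind: MyToken::End, span: Range { start: 5, end: 5 } },
/// ]
/// .into();
///
/// // `Tokens` derefs to `[Token<_>]`, so slice methods are available directly.
/// assert_eq!(tokens.len(), 2);
/// ```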
32#[derive(Debug, PartialEq, Eq)]
33#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
34#[cfg_attr(feature = "serde", serde(transparent, bound(serialize = "L::TokenType: Serialize", deserialize = "L::TokenType: Deserialize<'de>")))]
35pub struct Tokens<L: Language>(#[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))] pub Arc<[Token<L::TokenType>]>);
36
37impl<L: Language> Clone for Tokens<L> {
38 fn clone(&self) -> Self {
39 Self(self.0.clone())
40 }
41}
42
43impl<L: Language> Default for Tokens<L> {
44 fn default() -> Self {
45 Self(Arc::from_iter(std::iter::empty()))
46 }
47}
48
49impl<L: Language> core::ops::Deref for Tokens<L> {
50 type Target = [Token<L::TokenType>];
51
52 fn deref(&self) -> &Self::Target {
53 &self.0
54 }
55}
56
57impl<L: Language> From<Arc<[Token<L::TokenType>]>> for Tokens<L> {
58 fn from(arc: Arc<[Token<L::TokenType>]>) -> Self {
59 Self(arc)
60 }
61}
62
63impl<L: Language> From<Vec<Token<L::TokenType>>> for Tokens<L> {
64 fn from(vec: Vec<Token<L::TokenType>>) -> Self {
65 Self(Arc::from_iter(vec))
66 }
67}
68
69/// Output type for lexical analysis operations, including diagnostics.
70pub type LexOutput<L: Language> = OakDiagnostics<Tokens<L>>;
71
72/// Trait for tokenizing source code into sequences of tokens.
73///
74/// This trait defines the interface for converting source text into a sequence of
75/// tokens that can be consumed by the parser. Implementations should handle
76/// the specific lexical rules of their target language.
77///
78/// # Examples
79///
80/// ```ignore
81/// struct MyLexer;
82///
83/// #[derive(Debug, Clone, PartialEq, Eq, Copy)]
84/// enum MyToken {
85/// Number,
86/// Identifier,
87/// End,
88/// }
89///
90/// impl TokenType for MyToken {
91/// const END_OF_STREAM: Self = MyToken::End;
92/// type Role = UniversalTokenRole;
93/// fn role(&self) -> Self::Role { UniversalTokenRole::None }
94/// }
95///
96/// #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
97/// enum MyElement {}
98///
99/// impl ElementType for MyElement {
100/// type Role = UniversalElementRole;
101/// fn role(&self) -> Self::Role { UniversalElementRole::None }
102/// }
103///
104/// struct MyLanguage;
105///
106/// impl Language for MyLanguage {
107/// const NAME: &'static str = "my-language";
108/// type TokenType = MyToken;
109/// type ElementType = MyElement;
110/// type TypedRoot = ();
111/// }
112///
113/// impl Lexer<MyLanguage> for MyLexer {
114/// fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<MyLanguage>) -> LexOutput<MyLanguage> {
115/// // Tokenization logic here
116/// todo!()
117/// }
118/// }
119/// ```
120pub trait Lexer<L: Language + Send + Sync> {
121 /// Tokenizes the given source text into a sequence of tokens.
122 ///
    /// This method performs lexical analysis of the source text, producing a
    /// new token sequence. Implementations may consult `edits` and `cache` to
    /// reuse tokens from a previous run instead of lexing from scratch.
126 ///
127 /// # Arguments
128 ///
    /// * `text` - The source text to tokenize
    /// * `edits` - The text edits applied since the previous lex (empty for a full lex)
    /// * `cache` - Cache of tokens from a previous lex, used for incremental reuse
130 ///
131 /// # Returns
132 ///
133 /// A [`LexOutput`] containing the tokens and any diagnostic messages
134 fn lex<'a, S: Source + ?Sized>(&self, text: &S, edits: &[TextEdit], cache: &'a mut impl LexerCache<L>) -> LexOutput<L>;
135}
136
137/// Cache trait for lexical results.
138///
139/// This trait defines the interface for caching and accessing lexical analysis results.
140/// It provides methods for storing and retrieving token information from previous
141/// lexical analysis operations.
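///
/// # Examples
///
/// A minimal sketch of a cache that keeps only the most recent successful lex.
/// The struct name is illustrative, and it assumes `L::TokenType: Copy` (as the
/// token types in this crate's examples are):
///
/// ```ignore
/// struct LastLexCache<L: Language> {
///     tokens: Option<Tokens<L>>,
/// }
///
/// impl<L: Language> LexerCache<L> for LastLexCache<L> {
///     fn set_lex_output(&mut self, output: LexOutput<L>) {
///         // Keep the token list only if lexing succeeded.
///         self.tokens = output.result.ok();
///     }
///
///     fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
///         self.get_tokens()?.get(index).copied()
///     }
///
///     fn count_tokens(&self) -> usize {
///         self.get_tokens().map_or(0, |tokens| tokens.len())
///     }
///
///     fn has_tokens(&self) -> bool {
///         self.count_tokens() > 0
///     }
///
///     fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
///         // `Tokens` derefs to a token slice.
///         self.tokens.as_deref()
///     }
/// }
/// ```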
142#[allow(unused_variables)]
143pub trait LexerCache<L: Language> {
144 /// Sets the lexed output in the cache.
145 ///
146 /// # Arguments
147 ///
148 /// * `output` - The output from lexical analysis, including tokens and diagnostics
149 fn set_lex_output(&mut self, output: LexOutput<L>);
150
151 /// Gets a token from the cache by index.
152 ///
153 /// # Arguments
154 ///
155 /// * `index` - The index of the token to retrieve
156 ///
157 /// # Returns
158 ///
159 /// An `Option<Token<L::TokenType>>` containing the token if it exists,
160 /// or `None` if the index is out of bounds or no tokens are cached
161 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>>;
162
163 /// Gets the total number of tokens in the cache.
164 ///
165 /// # Returns
166 ///
167 /// The number of cached tokens, or 0 if no tokens are cached
168 fn count_tokens(&self) -> usize;
169
170 /// Checks if the cache contains any tokens.
171 ///
172 /// # Returns
173 ///
174 /// `true` if the cache contains tokens, `false` otherwise
175 fn has_tokens(&self) -> bool;
176
177 /// Gets all cached tokens as a slice.
178 ///
179 /// # Returns
180 ///
181 /// An optional slice of tokens if available.
182 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
183 None
184 }
185}
186
187impl<'a, L: Language, C: LexerCache<L> + ?Sized> LexerCache<L> for &'a mut C {
188 fn set_lex_output(&mut self, output: LexOutput<L>) {
189 (**self).set_lex_output(output)
190 }
191
192 fn get_token(&self, index: usize) -> Option<Token<L::TokenType>> {
193 (**self).get_token(index)
194 }
195
196 fn count_tokens(&self) -> usize {
197 (**self).count_tokens()
198 }
199
200 fn has_tokens(&self) -> bool {
201 (**self).has_tokens()
202 }
203
204 fn get_tokens(&self) -> Option<&[Token<L::TokenType>]> {
205 (**self).get_tokens()
206 }
207}
208
209/// A no-op implementation of `LexerCache`.
210#[derive(Debug, Clone, Copy, Default)]
211pub struct NoLexerCache;
212
213impl<L: Language> LexerCache<L> for NoLexerCache {
214 fn set_lex_output(&mut self, _output: LexOutput<L>) {}
215
216 fn get_token(&self, _index: usize) -> Option<Token<L::TokenType>> {
217 None
218 }
219
220 fn count_tokens(&self) -> usize {
221 0
222 }
223
224 fn has_tokens(&self) -> bool {
225 false
226 }
227}
228
/// Represents a single token in the source code.
230///
231/// Tokens are the fundamental units of lexical analysis, representing
232/// categorized pieces of source text with their position information.
233#[derive(Debug, Clone, PartialEq, Eq, Copy)]
234#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
235pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
237 pub kind: K,
    /// The byte range in the source text that this token occupies
239 #[cfg_attr(feature = "serde", serde(with = "crate::serde_range"))]
240 pub span: Range<usize>,
241}
242
243impl<K> Token<K> {
    /// Returns the length of this token in bytes.
245 ///
246 /// # Returns
247 ///
    /// The number of bytes between the start and end of the token's span
249 ///
250 /// # Examples
251 ///
252 /// ```ignore
253 /// #![feature(new_range_api)]
254 /// # use oak_core::lexer::Token;
255 /// # use core::range::Range;
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
258 /// ```
259 #[inline]
260 pub fn length(&self) -> usize {
261 self.span.end - self.span.start
262 }
263}
264
265/// A stream of tokens with associated source text.
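///
/// A minimal construction sketch; the string kinds here are illustrative:
///
/// ```ignore
/// let stream = TokenStream {
///     raw: "let x".to_string(),
///     tokens: Arc::from_iter([
///         Token { kind: "keyword", span: Range { start: 0, end: 3 } },
///         Token { kind: "ident", span: Range { start: 4, end: 5 } },
///     ]),
/// };
/// assert_eq!(stream.tokens.len(), 2);
/// ```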
266#[derive(Debug, Clone)]
267#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
268#[cfg_attr(feature = "serde", serde(bound(serialize = "K: Serialize", deserialize = "K: Deserialize<'de>")))]
269pub struct TokenStream<K: Copy> {
270 /// The raw source text.
271 pub raw: String,
272 /// The tokens extracted from the source text.
273 #[cfg_attr(feature = "serde", serde(with = "arc_slice_serde"))]
274 pub tokens: Arc<[Token<K>]>,
275}
276
277#[cfg(feature = "serde")]
278mod arc_slice_serde {
279 use super::*;
280 use serde::{Deserialize, Deserializer, Serialize, Serializer};
281
282 pub fn serialize<K, S>(arc: &Arc<[Token<K>]>, serializer: S) -> Result<S::Ok, S::Error>
283 where
284 K: Serialize,
285 S: Serializer,
286 {
287 arc.as_ref().serialize(serializer)
288 }
289
290 pub fn deserialize<'de, K, D>(deserializer: D) -> Result<Arc<[Token<K>]>, D::Error>
291 where
292 K: Deserialize<'de>,
293 D: Deserializer<'de>,
294 {
295 let vec = Vec::<Token<K>>::deserialize(deserializer)?;
296 Ok(Arc::from_iter(vec))
297 }
298}
299
300/// Represents the state of the lexer during a tokenization session.
301///
302/// This struct maintains the current position and context during
303/// tokenization, enabling incremental and resumable lexing operations.
304/// It tracks the current position in the source text, collected tokens,
305/// and any errors encountered.
306#[derive(Debug)]
307pub struct LexerState<'s, S: Source + ?Sized, L: Language> {
308 pub(crate) cursor: SourceCursor<'s, S>,
309 pub(crate) tokens: Vec<Token<L::TokenType>>,
310 pub(crate) errors: Vec<OakError>,
311}
312
313impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L> {
314 /// Creates a new lexer state with the given source text.
315 ///
316 /// # Arguments
317 ///
318 /// * `source` - The source text to lex
319 ///
320 /// # Returns
321 ///
322 /// A new `LexerState` initialized at the beginning of the source
323 pub fn new(source: &'s S) -> Self {
324 Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] }
325 }
326
327 /// Creates a new lexer state with the given source text and incremental cache.
328 ///
329 /// # Arguments
330 ///
331 /// * `source` - The source text to lex
332 /// * `relex_from` - The minimum byte offset that may have been affected by edits
333 /// (use `source.length()` to indicate no edits)
334 /// * `cache` - The incremental cache containing previous lexing results
335 ///
336 /// # Returns
337 ///
338 /// A new `LexerState` initialized at the beginning of the source with cache support
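    ///
    /// # Examples
    ///
    /// A sketch of the intended incremental workflow; `MyLanguage` and `cache`
    /// are illustrative placeholders:
    ///
    /// ```ignore
    /// // No edits yet: passing `source.length()` lets all cached tokens be reused.
    /// let state = LexerState::<_, MyLanguage>::new_with_cache(&source, source.length(), &cache);
    ///
    /// // After an edit touching byte offset 10, only tokens that end before the
    /// // edit (minus a small backtrack margin) are reused; lexing resumes there.
    /// let state = LexerState::<_, MyLanguage>::new_with_cache(&edited_source, 10, &cache);
    /// ```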
339 pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self {
340 if !cache.has_tokens() {
341 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
342 }
343
344 let len = source.length();
345 let relex_from = relex_from.min(len);
346
347 // Fast path: fully re-used
348 if relex_from >= len {
349 let mut tokens = Vec::new();
350 if let Some(cached) = cache.get_tokens() {
351 tokens.extend_from_slice(cached)
352 }
353 else {
354 let count = cache.count_tokens();
355 tokens.reserve(count);
356 for i in 0..count {
357 if let Some(t) = cache.get_token(i) {
358 tokens.push(t)
359 }
360 }
361 }
362 let offset = tokens.last().map(|t| t.span.end).unwrap_or(0).min(len);
363 return Self { cursor: SourceCursor::new_at(source, offset), tokens, errors: vec![] };
364 }
365
366 if relex_from == 0 {
367 return Self { cursor: SourceCursor::new(source), tokens: vec![], errors: vec![] };
368 }
369
370 let mut reused_tokens = Vec::new();
371 const BACKTRACK_TOKENS: usize = 1;
372
373 if let Some(cached) = cache.get_tokens() {
374 // Binary search for the cut-off point since tokens are sorted by position
375 let idx = cached.partition_point(|t| t.span.end <= relex_from);
376 let keep = idx.saturating_sub(BACKTRACK_TOKENS);
377 if keep > 0 {
378 reused_tokens.extend_from_slice(&cached[..keep])
379 }
380 }
381 else {
382 // Fallback for caches that don't support slice access
383 let count = cache.count_tokens();
384 for i in 0..count {
385 let Some(token) = cache.get_token(i)
386 else {
387 break;
388 };
389 if token.span.end <= relex_from {
390 reused_tokens.push(token);
391 }
392 else {
393 break;
394 }
395 }
396 let keep = reused_tokens.len().saturating_sub(BACKTRACK_TOKENS);
397 reused_tokens.truncate(keep);
398 }
399
400 let stable_offset = reused_tokens.last().map(|t| t.span.end).unwrap_or(0);
401 Self { cursor: SourceCursor::new_at(source, stable_offset), tokens: reused_tokens, errors: vec![] }
402 }
403
    /// Creates a sub-state for scanning a sub-range of the source.
    ///
    /// The sub-state starts at `start`; the `_end` bound is currently not enforced by the cursor.
405 pub fn sub_state(&mut self, start: usize, _end: usize) -> Self {
406 Self { cursor: SourceCursor::new_at(self.cursor.source(), start), tokens: vec![], errors: vec![] }
407 }
408
409 /// Returns the source text provider.
410 pub fn get_source(&self) -> &'s S {
411 self.cursor.source()
412 }
413
414 /// Gets the remaining text from the current position to the end of the source.
415 ///
416 /// # Returns
417 ///
418 /// A string slice containing the remaining text
419 pub fn rest(&mut self) -> &str {
420 self.cursor.rest()
421 }
422
423 /// Gets the remaining text as a byte slice.
424 ///
425 /// Useful for byte-oriented scanning operations.
426 #[inline]
427 pub fn rest_bytes(&mut self) -> &[u8] {
428 self.cursor.rest().as_bytes()
429 }
430
431 /// Checks if the lexer has consumed all input from the source.
432 ///
433 /// Returns `true` if the current position is at or beyond the end of the source.
434 pub fn fully_reused(&self) -> bool {
435 self.cursor.position() >= self.cursor.source().length()
436 }
437
438 /// Gets the current byte offset position in the source text.
439 ///
440 /// # Returns
441 ///
442 /// The current byte offset from the start of the source text.
443 #[inline]
444 pub fn get_position(&self) -> usize {
445 self.cursor.position()
446 }
447
448 /// Checks if the lexer has NOT consumed all input from the source.
449 ///
450 /// Returns `true` if there are still bytes left to be scanned.
451 #[inline]
452 pub fn not_at_end(&self) -> bool {
453 self.cursor.position() < self.cursor.source().length()
454 }
455
456 /// Peeks at the next character without advancing the cursor.
457 ///
458 /// Returns `None` if at the end of the source.
459 #[inline]
460 pub fn peek(&mut self) -> Option<char> {
461 self.cursor.peek_char()
462 }
463
464 /// Peeks at the character immediately following the current character.
465 #[inline]
466 pub fn peek_next(&mut self) -> Option<char> {
467 self.cursor.peek_next_char()
468 }
469
470 /// Peeks at the character at the specified byte offset relative to the current position.
471 #[inline]
472 pub fn peek_next_n(&mut self, n: usize) -> Option<char> {
473 self.cursor.peek_next_n(n)
474 }
475
476 /// Advances the cursor by the specified number of bytes.
477 #[inline]
478 pub fn advance(&mut self, len: usize) {
479 self.cursor.advance_bytes(len);
480 }
481
482 /// Gets the total length of the source text in bytes.
483 #[inline]
484 pub fn get_length(&self) -> usize {
485 self.cursor.source().length()
486 }
487
488 /// Gets a single character at the specified absolute byte offset.
489 #[inline]
490 pub fn get_char_at(&self, offset: usize) -> Option<char> {
491 self.cursor.source().get_char_at(offset)
492 }
493
494 /// Peeks at the next byte without advancing the cursor.
495 #[inline]
496 pub fn peek_byte(&mut self) -> Option<u8> {
497 self.cursor.peek_byte()
498 }
499
500 /// Advances the cursor by one byte and returns it.
501 #[inline]
502 pub fn advance_byte(&mut self) -> Option<u8> {
503 self.cursor.advance_byte()
504 }
505
506 /// Advances the cursor while the byte predicate is true.
507 ///
508 /// Returns the byte range covered by the matched bytes.
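    ///
    /// # Examples
    ///
    /// A small sketch, assuming `state` is a `LexerState` positioned on a run of digits:
    ///
    /// ```ignore
    /// // Consume a run of ASCII digits starting at the current position.
    /// let digits = state.take_while_byte(|b| b.is_ascii_digit());
    /// let text = state.get_text_in(digits);
    /// ```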
509 #[inline]
510 pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize> {
511 self.cursor.take_while_byte(pred)
512 }
513
514 /// Skips common ASCII whitespace (space, tab, newline, carriage return).
515 ///
516 /// Uses SIMD acceleration if available on the platform.
517 /// Returns the range of the skipped whitespace.
518 #[inline]
519 pub fn skip_ascii_whitespace(&mut self) -> Range<usize> {
520 self.cursor.skip_ascii_whitespace()
521 }
522
523 /// Skips all consecutive ASCII digits at the current position.
524 ///
525 /// Returns the range of the skipped digits.
526 #[inline]
527 pub fn skip_ascii_digits(&mut self) -> Range<usize> {
528 self.cursor.skip_ascii_digits()
529 }
530
531 /// Skips all characters that can continue an ASCII identifier.
532 ///
533 /// This includes alphanumeric characters and underscores.
534 /// Returns the range of the skipped characters.
535 #[inline]
536 pub fn skip_ascii_ident_continue(&mut self) -> Range<usize> {
537 self.cursor.skip_ascii_ident_continue()
538 }
539
540 /// Skips all characters until the target byte is encountered.
541 ///
542 /// The target byte itself is NOT consumed.
543 /// Returns the range of the skipped characters.
544 #[inline]
545 pub fn skip_until(&mut self, target: u8) -> Range<usize> {
546 self.cursor.skip_until(target)
547 }
548
549 /// Scans an ASCII identifier.
550 ///
551 /// An identifier must start with an alphabetic character or an underscore,
552 /// and can be followed by any number of alphanumeric characters or underscores.
553 ///
554 /// # Arguments
555 ///
556 /// * `kind` - The token type to assign if an identifier is found.
557 ///
558 /// # Returns
559 ///
560 /// `true` if an identifier was successfully scanned and added.
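    ///
    /// # Examples
    ///
    /// A sketch of typical use inside a lexer loop; `MyToken::Identifier` is an
    /// illustrative token type:
    ///
    /// ```ignore
    /// // On input "foo1 bar", this scans `foo1` as a single identifier token.
    /// if state.scan_ascii_identifier(MyToken::Identifier) {
    ///     continue;
    /// }
    /// ```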
561 #[inline]
562 pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool {
563 let start = self.get_position();
564 if let Some(b) = self.peek_byte() {
565 if b == b'_' || b.is_ascii_alphabetic() {
566 self.advance_byte();
567 self.skip_ascii_ident_continue();
568 self.add_token(kind, start, self.get_position());
569 return true;
570 }
571 }
572 false
573 }
574
575 /// Scans a line comment starting with the given prefix.
576 ///
577 /// Consumes the prefix and all characters until the next newline or EOF.
578 ///
579 /// # Arguments
580 ///
581 /// * `kind` - The token type for the line comment.
582 /// * `prefix` - The string sequence that starts the comment (e.g., "//").
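    ///
    /// # Examples
    ///
    /// A sketch with an illustrative token type:
    ///
    /// ```ignore
    /// // Consumes "// trailing text" up to (but not including) the newline.
    /// if state.scan_line_comment(MyToken::Comment, "//") {
    ///     continue;
    /// }
    /// ```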
583 #[inline]
584 pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool {
585 let start = self.get_position();
586 if self.consume_if_starts_with(prefix) {
587 self.skip_until(b'\n');
588 self.add_token(kind, start, self.get_position());
589 return true;
590 }
591 false
592 }
593
594 /// Scans a block comment with given start and end sequences.
595 ///
    /// This implementation does not handle nesting: the comment ends at the first
    /// occurrence of `end_seq`, and an unterminated comment extends to the end of input.
598 ///
599 /// # Arguments
600 ///
601 /// * `kind` - The token type for the block comment.
602 /// * `start_seq` - The sequence that starts the block (e.g., "/*").
603 /// * `end_seq` - The sequence that ends the block (e.g., "*/").
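    ///
    /// # Examples
    ///
    /// A sketch with an illustrative token type:
    ///
    /// ```ignore
    /// // Consumes "/* ... */" as a single comment token; an unterminated
    /// // comment is consumed through to the end of input.
    /// if state.scan_block_comment(MyToken::Comment, "/*", "*/") {
    ///     continue;
    /// }
    /// ```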
604 #[inline]
605 pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool {
606 let start = self.get_position();
607 if self.consume_if_starts_with(start_seq) {
608 while let Some(_b) = self.peek_byte() {
609 self.skip_until(end_seq.as_bytes()[0]);
610 if self.consume_if_starts_with(end_seq) {
611 self.add_token(kind, start, self.get_position());
612 return true;
613 }
614 self.advance_byte();
615 }
616 // Unclosed block comment is still a comment in many languages,
617 // but we might want to add an error here in the future.
618 self.add_token(kind, start, self.get_position());
619 return true;
620 }
621 false
622 }
623
624 /// Gets the tokens collected so far in the lexer state.
625 ///
626 /// # Returns
627 ///
628 /// A slice of tokens collected during lexing.
629 #[inline]
630 pub fn get_tokens(&self) -> &[Token<L::TokenType>] {
631 &self.tokens
632 }
633
634 /// Sets the current position to the specified byte offset.
635 ///
636 /// # Arguments
637 ///
638 /// * `offset` - The new byte offset position.
639 ///
640 /// # Returns
641 ///
642 /// The previous byte offset position.
643 #[inline]
644 pub fn set_position(&mut self, offset: usize) -> usize {
645 self.cursor.set_position(offset)
646 }
647
648 /// Returns a reference to the underlying source.
649 pub fn source(&self) -> &'s S {
650 self.cursor.source()
651 }
652
653 /// Returns the text in the specified byte range.
654 pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str> {
655 self.cursor.source().get_text_in(range)
656 }
657
658 /// Returns the text from the specified byte offset to the end of the source.
659 pub fn get_text_from(&self, offset: usize) -> Cow<'_, str> {
660 self.cursor.source().get_text_from(offset)
661 }
662
663 /// Checks if the source starts with the given pattern at the current position.
664 pub fn starts_with(&mut self, pattern: &str) -> bool {
665 self.cursor.starts_with(pattern)
666 }
667
668 /// Consumes the pattern if it exists at the current position.
669 ///
670 /// Returns `true` if the pattern was found and consumed, advancing the cursor.
671 pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool {
672 self.cursor.consume_if_starts_with(pattern)
673 }
674
675 /// Adds an error to the lexer state's diagnostics.
676 ///
677 /// # Arguments
678 ///
679 /// * `error` - The error to add.
680 #[inline]
681 pub fn add_error(&mut self, error: impl Into<OakError>) {
682 self.errors.push(error.into());
683 }
684
685 /// Adds a token to the lexer state.
686 ///
687 /// # Arguments
688 ///
689 /// * `kind` - The kind/type of the token.
690 /// * `start` - The starting byte offset.
691 /// * `end` - The ending byte offset.
692 #[inline]
693 pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize) {
694 self.tokens.push(Token { kind, span: Range { start, end } });
695 }
696
697 /// Adds an end-of-file (EOF) token to the lexer state.
698 ///
699 /// This method creates and adds an `END_OF_STREAM` token at the current position.
700 /// It is typically called when the lexer reaches the end of the source text.
701 ///
702 /// # Examples
703 ///
704 /// ```ignore
705 /// #![feature(new_range_api)]
706 /// # use core::range::Range;
707 /// # use oak_core::lexer::{LexerState, Token};
708 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
709 /// #
710 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
711 /// # enum SimpleToken {
712 /// # End,
713 /// # }
714 /// #
715 /// # impl TokenType for SimpleToken {
716 /// # const END_OF_STREAM: Self = SimpleToken::End;
717 /// # type Role = UniversalTokenRole;
718 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
719 /// # }
720 /// #
721 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
722 /// # enum SimpleElement {}
723 /// #
724 /// # impl ElementType for SimpleElement {
725 /// # type Role = UniversalElementRole;
726 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
727 /// # }
728 /// #
729 /// # #[derive(Clone)]
730 /// # struct SimpleLanguage;
731 /// #
732 /// # impl Language for SimpleLanguage {
733 /// # const NAME: &'static str = "simple";
734 /// # type TokenType = SimpleToken;
735 /// # type ElementType = SimpleElement;
736 /// # type TypedRoot = ();
737 /// # }
738 /// #
739 /// let source = SourceText::new("test");
740 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
741 /// state.take_while(|_| true); // Advance to end
742 /// state.add_eof();
743 ///
    /// assert_eq!(state.get_tokens().len(), 1);
    /// assert_eq!(state.get_tokens()[0].span, Range { start: 4, end: 4 });
746 /// ```
747 #[inline]
748 pub fn add_eof(&mut self) {
749 let end = self.get_position();
750 self.add_token(L::TokenType::END_OF_STREAM, end, end)
751 }
752
753 /// Gets the current character at the current position.
754 ///
755 /// # Returns
756 ///
757 /// The current character, or `None` if at the end of the source
758 #[inline]
759 pub fn current(&mut self) -> Option<char> {
760 self.cursor.peek_char()
761 }
762
763 /// Advances the position by the current character's length.
764 ///
765 /// # Returns
766 ///
767 /// The character that was skipped, or `None` if at the end of the source
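    ///
    /// # Examples
    ///
    /// A sketch of a character-by-character scanning loop:
    ///
    /// ```ignore
    /// while let Some(ch) = state.bump() {
    ///     // Inspect `ch` and emit tokens as needed...
    /// }
    /// ```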
768 #[inline]
769 pub fn bump(&mut self) -> Option<char> {
770 let ch = self.peek()?;
771 self.advance(ch.len_utf8());
772 Some(ch)
773 }
774
775 /// Advances the position by the token's length and adds the token to the lexer state.
776 ///
777 /// This method combines two common operations: advancing the lexer position
778 /// and adding a token to the token list. It calculates the advance distance
779 /// from the token's span, ensuring consistent positioning.
780 ///
781 /// # Arguments
782 ///
783 /// * `token` - The token to add to the lexer state
784 ///
785 /// # Returns
786 ///
787 /// The new byte offset position after advancing
788 ///
789 /// # Examples
790 ///
791 /// ```ignore
792 /// #![feature(new_range_api)]
793 /// # use core::range::Range;
794 /// # use oak_core::lexer::{LexerState, Token};
795 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
797 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
798 /// # enum SimpleToken { Identifier, End }
799 /// #
800 /// # impl TokenType for SimpleToken {
801 /// # const END_OF_STREAM: Self = SimpleToken::End;
802 /// # type Role = UniversalTokenRole;
803 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
804 /// # }
805 /// #
806 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
807 /// # enum SimpleElement {}
808 /// #
809 /// # impl ElementType for SimpleElement {
810 /// # type Role = UniversalElementRole;
811 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
812 /// # }
813 /// #
814 /// # #[derive(Clone)]
815 /// # struct SimpleLanguage;
816 /// #
817 /// # impl Language for SimpleLanguage {
818 /// # const NAME: &'static str = "simple";
819 /// # type TokenType = SimpleToken;
820 /// # type ElementType = SimpleElement;
821 /// # type TypedRoot = ();
822 /// # }
823 /// #
824 /// let source = SourceText::new("hello world");
825 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
826 ///
827 /// // Create a token for "hello"
    /// let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
829 ///
830 /// // Initially at position 0
831 /// assert_eq!(state.get_position(), 0);
832 ///
833 /// // Advance and add the token
834 /// let new_pos = state.advance_with(token);
835 ///
836 /// // Now at position 5 and token is added
837 /// assert_eq!(new_pos, 5);
838 /// assert_eq!(state.get_position(), 5);
839 /// assert_eq!(state.get_tokens().len(), 1);
840 /// assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
841 /// ```
842 ///
843 /// # Note
844 ///
845 /// The caller must ensure that the token's span is valid and that the advance
846 /// does not split multi-byte UTF-8 characters. The token should be created
847 /// with proper character boundaries.
848 #[inline]
849 pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize {
850 self.cursor.advance_bytes(token.length());
851 self.tokens.push(token);
852 self.cursor.position()
853 }
854
855 /// Consumes characters while the predicate returns true, returning the consumed range.
856 ///
857 /// This method iterates through the source text from the current position,
858 /// consuming characters as long as the predicate function returns true.
859 /// It's commonly used for recognizing patterns like identifiers, numbers,
860 /// or whitespace sequences.
861 ///
862 /// # Arguments
863 ///
864 /// * `pred` - A closure that takes a character and returns true if the character
865 /// should be consumed, false otherwise
866 ///
867 /// # Returns
868 ///
869 /// A byte range representing the span of consumed characters
870 ///
871 /// # Examples
872 ///
873 /// ```ignore
874 /// #![feature(new_range_api)]
875 /// # use core::range::Range;
876 /// # use oak_core::lexer::{LexerState, Token};
877 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
879 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
880 /// # enum SimpleToken { End }
881 /// #
882 /// # impl TokenType for SimpleToken {
883 /// # const END_OF_STREAM: Self = SimpleToken::End;
884 /// # type Role = UniversalTokenRole;
885 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
886 /// # }
887 /// #
888 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
889 /// # enum SimpleElement {}
890 /// #
891 /// # impl ElementType for SimpleElement {
892 /// # type Role = UniversalElementRole;
893 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
894 /// # }
895 /// #
896 /// # #[derive(Clone)]
897 /// # struct SimpleLanguage;
898 /// #
899 /// # impl Language for SimpleLanguage {
900 /// # const NAME: &'static str = "simple";
901 /// # type TokenType = SimpleToken;
902 /// # type ElementType = SimpleElement;
903 /// # type TypedRoot = ();
904 /// # }
905 /// #
906 /// let source = SourceText::new("hello123world");
907 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
908 ///
909 /// // Consume alphabetic characters
910 /// let range = state.take_while(|c| c.is_alphabetic());
911 ///
912 /// // Should have consumed "hello"
913 /// assert_eq!(range, Range { start: 0, end: 5 });
914 /// assert_eq!(state.get_position(), 5);
915 ///
916 /// // Consume numeric characters
917 /// let range = state.take_while(|c| c.is_numeric());
918 ///
919 /// // Should have consumed "123"
920 /// assert_eq!(range, Range { start: 5, end: 8 });
921 /// assert_eq!(state.get_position(), 8);
922 /// ```
923 ///
924 /// # Performance Note
925 ///
926 /// This method operates on a character-by-character basis, which means it
927 /// correctly handles multi-byte UTF-8 characters. For performance-critical
928 /// code, consider using byte-based methods when working with ASCII-only text.
929 pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize> {
930 self.cursor.take_while(pred)
931 }
932
933 /// Performs a safety check to prevent infinite loops during lexing.
934 ///
935 /// This method ensures that the lexer always makes progress by forcing
936 /// advancement when stuck at the same position. It's used as a safeguard
937 /// against infinite loops in lexer implementations.
938 ///
939 /// The method compares the current position with a previously saved "safe point"
940 /// position. If they're the same, it means the lexer hasn't made progress since
941 /// that safe point, potentially indicating an infinite loop. In this case, the
942 /// method forces advancement by at least one character.
943 ///
944 /// # Arguments
945 ///
946 /// * `safe_point` - The position to check against for potential deadlock
947 ///
948 /// # Examples
949 ///
950 /// ```ignore
951 /// #![feature(new_range_api)]
952 /// # use oak_core::lexer::{LexerState, Token};
953 /// # use oak_core::{Language, TokenType, SourceText, UniversalTokenRole, TokenRole, UniversalElementRole, ElementRole, ElementType};
    /// #
955 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
956 /// # enum SimpleToken { End }
957 /// #
958 /// # impl TokenType for SimpleToken {
959 /// # const END_OF_STREAM: Self = SimpleToken::End;
960 /// # type Role = UniversalTokenRole;
961 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
962 /// # }
963 /// #
964 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
965 /// # enum SimpleElement {}
966 /// #
967 /// # impl ElementType for SimpleElement {
968 /// # type Role = UniversalElementRole;
969 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
970 /// # }
971 /// #
972 /// # struct SimpleLanguage;
973 /// #
974 /// # impl Language for SimpleLanguage {
975 /// # const NAME: &'static str = "simple";
976 /// # type TokenType = SimpleToken;
977 /// # type ElementType = SimpleElement;
978 /// # type TypedRoot = ();
979 /// # }
980 /// #
981 /// let source = SourceText::new("test");
982 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
983 ///
984 /// // Save the current position as a safe point
985 /// let safe_point = state.get_position();
986 ///
987 /// // In a real lexer, you would do some processing here
988 /// // If something went wrong and we didn't advance, this would prevent infinite loop
989 /// state.advance_if_dead_lock(safe_point);
990 ///
991 /// // If we were stuck, we would have advanced by at least 1
992 /// assert!(state.get_position() >= safe_point);
993 /// ```
994 ///
995 /// # Usage in Lexer Implementations
996 ///
997 /// This method is typically used at the beginning or end of lexing loops:
998 ///
999 /// ```ignore
1000 /// loop {
1001 /// let safe_point = state.get_position();
1002 ///
1003 /// // Try to recognize a token
1004 /// if let Some(token) = try_recognize_token(&mut state) {
1005 /// // Success, continue loop
1006 /// continue;
1007 /// }
1008 ///
1009 /// // If we get here, we didn't recognize anything
1010 /// // This prevents infinite loops if recognition fails
1011 /// state.advance_if_dead_lock(safe_point);
1012 ///
1013 /// if state.not_at_end() {
1014 /// // Continue trying to recognize tokens
1015 /// continue;
1016 /// } else {
1017 /// // Reached end of source
1018 /// break;
1019 /// }
1020 /// }
1021 /// ```
1022 pub fn advance_if_dead_lock(&mut self, safe_point: usize) {
1023 // Force advance if no progress was made
1024 if self.cursor.position() == safe_point {
1025 if let Some(ch) = self.peek() {
1026 // Skip current character
1027 self.advance(ch.len_utf8())
1028 }
1029 else {
1030 // Advance anyway to prevent infinite loop
1031 self.advance(1)
1032 }
1033 // tracing::warn!("deadlock")
1034 }
1035 }
1036
1037 /// Finishes lexing and returns the final output with tokens and diagnostics.
1038 ///
1039 /// This method concludes the lexing process by converting the collected tokens
1040 /// and errors into a `LexOutput` result. It takes a `Result` parameter that
1041 /// represents the overall success or failure of the lexing operation.
1042 ///
1043 /// If the result is `Ok`, the tokens are returned as the successful result.
1044 /// If the result is `Err`, the error is returned as the failure result.
1045 /// In both cases, any collected diagnostic errors are included in the output.
1046 ///
1047 /// # Arguments
1048 ///
1049 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1050 ///
1051 /// # Returns
1052 ///
1053 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1054 ///
1055 /// # Examples
1056 ///
1057 /// ```
1058 /// #![feature(new_range_api)]
1059 /// # use oak_core::lexer::{LexerState, Token};
1060 /// # use oak_core::{Language, TokenType, SourceText, OakError, OakDiagnostics, UniversalTokenRole, UniversalElementRole, ElementType};
    /// #
1062 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1063 /// # enum SimpleToken { Identifier, End }
1064 /// #
1065 /// # impl TokenType for SimpleToken {
1066 /// # const END_OF_STREAM: Self = SimpleToken::End;
1067 /// # type Role = UniversalTokenRole;
1068 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1069 /// # }
1070 /// #
1071 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1072 /// # enum SimpleElement {}
1073 /// #
1074 /// # impl ElementType for SimpleElement {
1075 /// # type Role = UniversalElementRole;
1076 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1077 /// # }
1078 /// #
1079 /// # struct SimpleLanguage;
1080 /// #
1081 /// # impl Language for SimpleLanguage {
1082 /// # const NAME: &'static str = "simple";
1083 /// # type TokenType = SimpleToken;
1084 /// # type ElementType = SimpleElement;
1085 /// # type TypedRoot = ();
1086 /// # }
1087 /// #
1088 /// let source = SourceText::new("test");
1089 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1090 ///
1091 /// // Add some tokens during lexing
1092 /// state.add_token(SimpleToken::Identifier, 0, 4);
1093 ///
1094 /// // Finish with successful result
1095 /// let output = state.finish(Ok(()));
1096 ///
1097 /// // Check the results
1098 /// assert!(output.result.is_ok());
1099 /// assert_eq!(output.result.unwrap().len(), 1);
1100 /// assert_eq!(output.diagnostics.len(), 0);
1101 ///
1102 /// // Example with error
1103 /// let source2 = SourceText::new("test");
1104 /// let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
1105 /// state2.add_error(OakError::custom_error("Test error"));
1106 ///
1107 /// let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
1108 ///
1109 /// // Check the results
1110 /// assert!(output2.result.is_err());
1111 /// assert_eq!(output2.diagnostics.len(), 1); // The added error
1112 /// ```
1113 pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
1114 match result {
1115 Ok(_) => {
1116 let tokens: Tokens<L> = self.tokens.into();
1117 OakDiagnostics { result: Ok(tokens), diagnostics: self.errors }
1118 }
1119 Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
1120 }
1121 }
1122
1123 /// Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
1124 ///
1125 /// This method is similar to `finish` but additionally updates the incremental cache
1126 /// with the new tokens. It's used for incremental lexing where the results need to
1127 /// be cached for future reuse when the source text changes.
1128 ///
    /// The method first creates the output in the same way as `finish`, then stores it
    /// in the cache via [`LexerCache::set_lex_output`]. This enables the next call to
    /// `new_with_cache` to reuse these tokens if the source text hasn't changed.
1132 ///
1133 /// # Arguments
1134 ///
1135 /// * `result` - The result of the lexing operation (Ok for success, Err for failure)
1136 /// * `cache` - The incremental cache to update with the new tokens
1137 ///
1138 /// # Returns
1139 ///
1140 /// A `LexOutput` containing the tokens (if successful) and any diagnostic errors
1141 ///
1142 /// # Examples
1143 ///
1144 /// ```ignore
1145 /// #![feature(new_range_api)]
1146 /// # use core::range::Range;
1147 /// # use oak_core::lexer::{LexerState, Token};
1148 /// # use oak_core::{Language, TokenType, SourceText, OakError, LexOutput, UniversalTokenRole, UniversalElementRole, ElementType};
1149 /// # use oak_core::parser::session::ParseSession;
1150 /// #
1151 /// # #[derive(Debug, PartialEq, Clone, Copy, Eq, Hash)]
1152 /// # enum SimpleToken { Identifier, End }
1153 /// #
1154 /// # impl TokenType for SimpleToken {
1155 /// # const END_OF_STREAM: Self = SimpleToken::End;
1156 /// # type Role = UniversalTokenRole;
1157 /// # fn role(&self) -> Self::Role { UniversalTokenRole::None }
1158 /// # }
1159 /// #
1160 /// # #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1161 /// # enum SimpleElement {}
1162 /// #
1163 /// # impl ElementType for SimpleElement {
1164 /// # type Role = UniversalElementRole;
1165 /// # fn role(&self) -> Self::Role { UniversalElementRole::None }
1166 /// # }
1167 /// #
1168 /// # struct SimpleLanguage;
1169 /// #
1170 /// # impl Language for SimpleLanguage {
1171 /// # const NAME: &'static str = "simple";
1172 /// # type TokenType = SimpleToken;
1173 /// # type ElementType = SimpleElement;
1174 /// # type TypedRoot = ();
1175 /// # }
1176 /// #
1177 /// let source = SourceText::new("test");
1178 /// let mut state = LexerState::<_, SimpleLanguage>::new(&source);
1179 ///
1180 /// // Create a cache for incremental lexing
1181 /// let mut cache = ParseSession::<SimpleLanguage>::new(16);
1182 ///
1183 /// // Add some tokens during lexing
1184 /// state.add_token(SimpleToken::Identifier, 0, 4);
1185 ///
1186 /// // Finish with cache update
1187 /// let output = state.finish_with_cache(Ok(()), &mut cache);
1188 ///
1189 /// // Check the results
1190 /// assert!(output.result.is_ok());
1191 /// assert_eq!(output.result.unwrap().len(), 1);
1192 /// ```
1193 ///
1194 /// # Incremental Lexing Workflow
1195 ///
1196 /// This method is typically used as part of an incremental lexing workflow:
1197 ///
1198 /// ```ignore
1199 /// // First lexing
1200 /// let mut state = LexerState::new_with_cache(source, source.length(), cache);
1201 /// // ... lexing logic ...
1202 /// let output = state.finish_with_cache(Ok(()), cache);
1203 ///
1204 /// // Later, when source changes
1205 /// let relex_from = calculate_min_affected_offset(old_source, new_source);
1206 /// let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
1207 /// // ... lexing logic (reusing unchanged tokens) ...
1208 /// let output = state.finish_with_cache(Ok(()), cache);
1209 /// ```
1210 pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L> {
1211 let out = self.finish(result);
1212 cache.set_lex_output(out.clone());
1213 out
1214 }
1215}