pub struct LexerState<'s, S: Source + ?Sized, L: Language> { /* private fields */ }
State information for incremental lexical analysis.
This struct maintains the current position and context during tokenization, enabling incremental and resumable lexing operations.
Implementations
impl<'s, S: Source + ?Sized, L: Language> LexerState<'s, S, L>
pub fn new_with_cache(source: &'s S, relex_from: usize, cache: &impl LexerCache<L>) -> Self
Creates a new lexer state with the given source text and incremental cache.
§Arguments
source - The source text to lex
relex_from - The minimum byte offset that may have been affected by edits (use source.length() to indicate no edits)
cache - The incremental cache containing previous lexing results
§Returns
A new LexerState initialized at the beginning of the source with cache support
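§Examples
A minimal sketch, assuming the SimpleLanguage and ParseSession helpers used in the other examples on this page:
let source = SourceText::new("let x = 1;");
let cache = ParseSession::<SimpleLanguage>::new(16);
// Passing source.length() as relex_from signals that no edits occurred.
let state = LexerState::new_with_cache(&source, source.length(), &cache);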
pub fn rest(&mut self) -> &str
Gets the remaining text from the current position to the end of the source.
§Returns
A string slice containing the remaining text
pub fn rest_bytes(&mut self) -> &[u8]
Gets the remaining text as a byte slice.
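§Examples
A minimal sketch covering both rest and rest_bytes, assuming the SimpleLanguage helper used in the other examples on this page:
let source = SourceText::new("abc def");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
state.advance(4); // move past "abc "
assert_eq!(state.rest(), "def");
assert_eq!(state.rest_bytes(), b"def");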
pub fn fully_reused(&self) -> bool
Checks if the lexer has consumed all input from the source.
pub fn get_position(&self) -> usize
Gets the current byte offset position in the source text.
§Returns
The current byte offset from the start of the source text
pub fn get_length(&self) -> usize
Gets the total length of the source text in bytes.
pub fn get_char_at(&self, offset: usize) -> Option<char>
Gets a single character at the specified byte offset.
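§Examples
A minimal sketch, assuming the SimpleLanguage helper used in the other examples on this page:
let source = SourceText::new("ab");
let state = LexerState::<_, SimpleLanguage>::new(&source);
assert_eq!(state.get_char_at(1), Some('b'));
assert_eq!(state.get_char_at(2), None); // offset past the end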
pub fn advance_byte(&mut self) -> Option<u8>
Advances the cursor by one byte and returns that byte, or None if at the end of the source.
pub fn take_while_byte(&mut self, pred: impl FnMut(u8) -> bool) -> Range<usize>
Advances the cursor while the byte predicate is true.
pub fn skip_ascii_whitespace(&mut self) -> Range<usize>
Skips common ASCII whitespace using SIMD if possible.
pub fn skip_ascii_digits(&mut self) -> Range<usize>
Skips all ASCII digits at the current position.
pub fn skip_ascii_ident_continue(&mut self) -> Range<usize>
Skips all characters that can continue an ASCII identifier.
pub fn skip_until(&mut self, target: u8) -> Range<usize>
Skips all characters until the target byte is encountered.
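§Examples
A minimal sketch chaining the byte-level skip helpers, assuming the SimpleLanguage helper used in the other examples on this page:
#![feature(new_range_api)]
let source = SourceText::new("  42;");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
assert_eq!(state.skip_ascii_whitespace(), Range { start: 0, end: 2 });
assert_eq!(state.skip_ascii_digits(), Range { start: 2, end: 4 });
assert_eq!(state.skip_until(b';'), Range { start: 4, end: 4 }); // already at ';', nothing skipped
assert_eq!(state.current(), Some(';'));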
pub fn scan_ascii_identifier(&mut self, kind: L::TokenType) -> bool
Scans an ASCII identifier (starts with an ASCII letter or _, continues with ASCII alphanumerics or _).
pub fn scan_line_comment(&mut self, kind: L::TokenType, prefix: &str) -> bool
Scans a line comment starting with the given prefix.
pub fn scan_block_comment(&mut self, kind: L::TokenType, start_seq: &str, end_seq: &str) -> bool
Scans a block comment with given start and end sequences.
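§Examples
A minimal sketch, assuming these scanners record the token and advance past it on success (as the kind parameter suggests), with the SimpleLanguage and SimpleToken helpers used in the other examples on this page:
let source = SourceText::new("foo_bar = 1");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
assert!(state.scan_ascii_identifier(SimpleToken::Identifier));
assert_eq!(state.get_position(), 7); // past "foo_bar"
assert_eq!(state.tokens().len(), 1);
scan_line_comment and scan_block_comment follow the same pattern, taking the comment prefix or the start/end delimiter sequences as arguments.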
pub fn tokens(&self) -> &[Token<L::TokenType>]
Gets a reference to the tokens collected so far.
§Returns
A slice of tokens collected during the lexing process
pub fn set_position(&mut self, offset: usize) -> usize
Sets the current position to the specified byte offset.
pub fn get_text_in(&self, range: Range<usize>) -> Cow<'_, str>
Returns the text in the specified range.
pub fn get_text_from(&self, offset: usize) -> Cow<'_, str>
Returns the text from the specified offset to the end.
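§Examples
A minimal sketch, assuming the SimpleLanguage helper used in the other examples on this page:
#![feature(new_range_api)]
let source = SourceText::new("hello world");
let state = LexerState::<_, SimpleLanguage>::new(&source);
assert_eq!(state.get_text_in(Range { start: 0, end: 5 }), "hello");
assert_eq!(state.get_text_from(6), "world");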
pub fn starts_with(&mut self, pattern: &str) -> bool
Checks if the source starts with the given pattern at the current position.
pub fn consume_if_starts_with(&mut self, pattern: &str) -> bool
Consumes the pattern if it exists at the current position.
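§Examples
A minimal sketch contrasting the two lookahead methods, assuming the SimpleLanguage helper used in the other examples on this page:
let source = SourceText::new("=> rest");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
assert!(state.starts_with("=>")); // look ahead only
assert_eq!(state.get_position(), 0);
assert!(state.consume_if_starts_with("=>")); // match and advance
assert_eq!(state.get_position(), 2);
assert!(!state.consume_if_starts_with("=>")); // no match, no movement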
pub fn get_tokens(&self) -> &[Token<L::TokenType>]
Gets the tokens collected so far in the lexer state.
§Returns
A slice of tokens collected during lexing
pub fn add_token(&mut self, kind: L::TokenType, start: usize, end: usize)
Adds a token to the lexer state.
§Arguments
kind - The kind of the token
start - The starting byte offset of the token
end - The ending byte offset of the token
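§Examples
A minimal sketch, assuming the SimpleLanguage and SimpleToken helpers used in the other examples on this page. Note that add_token only records the token; use advance_with to record and advance in one step.
let source = SourceText::new("test");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
state.add_token(SimpleToken::Identifier, 0, 4);
assert_eq!(state.tokens().len(), 1);
assert_eq!(state.get_position(), 0); // the cursor did not move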
pub fn add_eof(&mut self)
Adds an end-of-file token to the lexer state.
This method creates and adds an END_OF_STREAM token at the current position. It’s typically called when the lexer reaches the end of the source text to mark the termination of the token stream.
§Examples
#![feature(new_range_api)]
let source = SourceText::new("test");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
state.take_while(|_| true); // Advance to end
state.add_eof();
assert_eq!(state.tokens().len(), 1);
assert_eq!(state.tokens()[0].span, Range { start: 4, end: 4 });
pub fn current(&mut self) -> Option<char>
Gets the character at the current position.
§Returns
The current character, or None if at the end of the source
pub fn peek(&mut self) -> Option<char>
Peeks at the next character without advancing the position.
§Returns
The next character, or None if at the end of the source
pub fn peek_next_n(&mut self, n: usize) -> Option<char>
Peeks at the character n positions ahead without advancing the position.
pub fn advance(&mut self, length: usize) -> usize
Advances the position by the specified number of bytes.
This method moves the lexer’s current position forward by the specified number of bytes. It’s commonly used after recognizing a token to move past the token’s characters.
§Arguments
length- The number of bytes to advance
§Returns
The new byte offset position after advancing
pub fn bump(&mut self) -> Option<char>
Advances the position by the current character’s length.
§Returns
The character that was skipped, or None if at the end of the source
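§Examples
A minimal sketch, assuming peek looks one character past current, with the SimpleLanguage helper used in the other examples on this page:
let source = SourceText::new("ab");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
assert_eq!(state.current(), Some('a'));
assert_eq!(state.peek(), Some('b'));
assert_eq!(state.bump(), Some('a')); // consume 'a'
assert_eq!(state.current(), Some('b'));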
pub fn advance_with(&mut self, token: Token<L::TokenType>) -> usize
Advances the position by the token’s length and adds the token to the lexer state.
This method combines two common operations: advancing the lexer position and adding a token to the token list. It calculates the advance distance from the token’s span, ensuring consistent positioning.
§Arguments
token- The token to add to the lexer state
§Returns
The new byte offset position after advancing
§Examples
#![feature(new_range_api)]
let source = SourceText::new("hello world");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
// Create a token for "hello"
let token = Token { kind: SimpleToken::Identifier, span: Range { start: 0, end: 5 } };
// Initially at position 0
assert_eq!(state.get_position(), 0);
// Advance and add the token
let new_pos = state.advance_with(token);
// Now at position 5 and token is added
assert_eq!(new_pos, 5);
assert_eq!(state.get_position(), 5);
assert_eq!(state.get_tokens().len(), 1);
assert_eq!(state.get_tokens()[0].kind, SimpleToken::Identifier);
§Note
The caller must ensure that the token’s span is valid and that the advance does not split multi-byte UTF-8 characters. The token should be created with proper character boundaries.
pub fn take_while(&mut self, pred: impl FnMut(char) -> bool) -> Range<usize>
Consumes characters while the predicate returns true, returning the consumed range.
This method iterates through the source text from the current position, consuming characters as long as the predicate function returns true. It’s commonly used for recognizing patterns like identifiers, numbers, or whitespace sequences.
§Arguments
pred- A closure that takes a character and returns true if the character should be consumed, false otherwise
§Returns
A byte range representing the span of consumed characters
§Examples
#![feature(new_range_api)]
let source = SourceText::new("hello123world");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
// Consume alphabetic characters
let range = state.take_while(|c| c.is_alphabetic());
// Should have consumed "hello"
assert_eq!(range, Range { start: 0, end: 5 });
assert_eq!(state.get_position(), 5);
// Consume numeric characters
let range = state.take_while(|c| c.is_numeric());
// Should have consumed "123"
assert_eq!(range, Range { start: 5, end: 8 });
assert_eq!(state.get_position(), 8);
§Performance Note
This method operates on a character-by-character basis, which means it correctly handles multi-byte UTF-8 characters. For performance-critical code, consider using byte-based methods when working with ASCII-only text.
pub fn not_at_end(&self) -> bool
Checks if the lexer has not reached the end of the source text.
§Returns
true if not at the end of the source, false otherwise
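§Examples
A minimal sketch, assuming the SimpleLanguage helper used in the other examples on this page:
let source = SourceText::new("ab");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
assert!(state.not_at_end());
state.advance(2);
assert!(!state.not_at_end());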
pub fn advance_if_dead_lock(&mut self, safe_point: usize)
Performs a safety check to prevent infinite loops during lexing.
This method ensures that the lexer always makes progress by forcing advancement when stuck at the same position. It’s used as a safeguard against infinite loops in lexer implementations.
The method compares the current position with a previously saved “safe point” position. If they’re the same, it means the lexer hasn’t made progress since that safe point, potentially indicating an infinite loop. In this case, the method forces advancement by at least one character.
§Arguments
safe_point- The position to check against for potential deadlock
§Examples
#![feature(new_range_api)]
let source = SourceText::new("test");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
// Save the current position as a safe point
let safe_point = state.get_position();
// In a real lexer, you would do some processing here
// If something went wrong and we didn't advance, this would prevent infinite loop
state.advance_if_dead_lock(safe_point);
// If we were stuck, we would have advanced by at least 1
assert!(state.get_position() >= safe_point);
§Usage in Lexer Implementations
This method is typically used at the beginning or end of lexing loops:
loop {
let safe_point = state.get_position();
// Try to recognize a token
if let Some(token) = try_recognize_token(&mut state) {
// Success, continue loop
continue;
}
// If we get here, we didn't recognize anything
// This prevents infinite loops if recognition fails
state.advance_if_dead_lock(safe_point);
if state.not_at_end() {
// Continue trying to recognize tokens
continue;
} else {
// Reached end of source
break;
}
}
pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L>
Finishes lexing and returns the final output with tokens and diagnostics.
This method concludes the lexing process by converting the collected tokens
and errors into a LexOutput result. It takes a Result parameter that
represents the overall success or failure of the lexing operation.
If the result is Ok, the tokens are returned as the successful result.
If the result is Err, the error is returned as the failure result.
In both cases, any collected diagnostic errors are included in the output.
§Arguments
result- The result of the lexing operation (Ok for success, Err for failure)
§Returns
A LexOutput containing the tokens (if successful) and any diagnostic errors
§Examples
#![feature(new_range_api)]
let source = SourceText::new("test");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
// Add some tokens during lexing
state.add_token(SimpleToken::Identifier, 0, 4);
// Finish with successful result
let output = state.finish(Ok(()));
// Check the results
assert!(output.result.is_ok());
assert_eq!(output.result.unwrap().len(), 1);
assert_eq!(output.diagnostics.len(), 0);
// Example with error
let source2 = SourceText::new("test");
let mut state2 = LexerState::<_, SimpleLanguage>::new(&source2);
state2.add_error(OakError::custom_error("Test error"));
let output2 = state2.finish(Err(OakError::custom_error("Fatal error")));
// Check the results
assert!(output2.result.is_err());
assert_eq!(output2.diagnostics.len(), 1); // The added error
pub fn finish_with_cache(self, result: Result<(), OakError>, cache: &mut impl LexerCache<L>) -> LexOutput<L>
Finishes lexing and returns the final output with tokens, diagnostics, and updated cache.
This method is similar to finish but additionally updates the incremental cache
with the new tokens. It’s used for incremental lexing where the results need to
be cached for future reuse when the source text changes.
The method first creates the output in the same way as finish, then updates
the cache’s last_lex field with the new tokens. This enables the next call
to new_with_cache to reuse these tokens if the source text hasn’t changed.
§Arguments
result - The result of the lexing operation (Ok for success, Err for failure)
cache - The incremental cache to update with the new tokens
§Returns
A LexOutput containing the tokens (if successful) and any diagnostic errors
§Examples
#![feature(new_range_api)]
let source = SourceText::new("test");
let mut state = LexerState::<_, SimpleLanguage>::new(&source);
// Create a cache for incremental lexing
let mut cache = ParseSession::<SimpleLanguage>::new(16);
// Add some tokens during lexing
state.add_token(SimpleToken::Identifier, 0, 4);
// Finish with cache update
let output = state.finish_with_cache(Ok(()), &mut cache);
// Check the results
assert!(output.result.is_ok());
assert_eq!(output.result.unwrap().len(), 1);
§Incremental Lexing Workflow
This method is typically used as part of an incremental lexing workflow:
// First lexing
let mut state = LexerState::new_with_cache(source, source.length(), cache);
// ... lexing logic ...
let output = state.finish_with_cache(Ok(()), cache);
// Later, when source changes
let relex_from = calculate_min_affected_offset(old_source, new_source);
let mut state = LexerState::new_with_cache(new_source, relex_from, cache);
// ... lexing logic (reusing unchanged tokens) ...
let output = state.finish_with_cache(Ok(()), cache);