oak_core/lexer/mod.rs

//! Lexical analysis and tokenization for the Oak Core parsing framework.
//!
//! This module provides traits and utilities for converting source text into
//! sequences of tokens that can be consumed by parsers. It includes support
//! for common lexical patterns and incremental tokenization.

pub use self::{scan_comment::*, scan_string::*, scan_white_space::*};
use crate::{
    GreenBuilder, IncrementalCache, Language,
    errors::{OakDiagnostics, OakError},
    source::Source,
};
use std::{ops::Deref, range::Range};

/// Common lexical patterns and utilities shared across different languages.
///
/// This module provides reusable components for common lexical constructs such as
/// whitespace handling, number literals, string literals, and identifier recognition.
/// These utilities can be used by language-specific lexers to avoid reimplementing
/// basic tokenization patterns.
mod scan_white_space;

mod scan_comment;

pub mod scan_string;

/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization, containing
/// a vector of tokens and any diagnostics produced during
/// the lexing process.
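///
/// # Examples
///
/// A sketch of consuming a [`LexOutput`] (illustrative only; `MyLang` stands in
/// for a concrete [`Language`] implementation and is not defined here):
///
/// ```rust,ignore
/// let output: LexOutput<MyLang> = lexer.lex(source);
/// // `result` holds the tokens on success; `diagnostics` holds collected errors.
/// match output.result {
///     Ok(tokens) => println!("lexed {} tokens", tokens.len()),
///     Err(fatal) => eprintln!("lexing failed: {fatal}"),
/// }
/// for diagnostic in &output.diagnostics {
///     eprintln!("{diagnostic}");
/// }
/// ```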
pub type LexOutput<L: Language> = OakDiagnostics<Vec<Token<L::SyntaxKind>>>;

/// Trait for tokenizing source code into sequences of tokens.
///
/// This trait defines the interface for converting source text into a sequence of
/// tokens that can be consumed by the parser. Implementations should handle
/// the specific lexical rules of their target language.
///
/// # Examples
///
/// ```rust,ignore
/// # use oak_core::{IncrementalCache, Language, LexOutput, Lexer, Source};
/// struct MyLexer;
///
/// enum MyToken {
///     Number,
///     Identifier,
/// }
///
/// impl Language for MyToken {
///     type SyntaxKind = MyToken;
/// }
///
/// impl Lexer<MyToken> for MyLexer {
///     fn lex_incremental(
///         &self,
///         source: impl Source,
///         changed: usize,
///         cache: IncrementalCache<MyToken>,
///     ) -> LexOutput<MyToken> {
///         // Tokenization logic here
///         todo!()
///     }
/// }
/// ```
pub trait Lexer<L: Language + Send + Sync + 'static> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs a full lexical analysis of the source text,
    /// creating a new sequence of tokens from scratch. It uses a default
    /// cache configuration.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to tokenize
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
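    ///
    /// # Examples
    ///
    /// A sketch of a full, from-scratch lex from the caller's side (illustrative
    /// only; `MyLexer` is a hypothetical [`Lexer`] implementation and `source` any
    /// value implementing [`Source`]):
    ///
    /// ```rust,ignore
    /// let lexer = MyLexer;
    /// let output = lexer.lex(source);
    /// for token in output.result.unwrap_or_default() {
    ///     println!("{:?} at {}..{}", token.kind, token.span.start, token.span.end);
    /// }
    /// ```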
    fn lex(&self, source: impl Source) -> LexOutput<L> {
        let mut pool = GreenBuilder::new(0);
        let cache = IncrementalCache::new(&mut pool);
        self.lex_incremental(source, 0, cache)
    }

    /// Tokenizes source text using an existing cache for incremental parsing.
    ///
    /// This method enables efficient re-lexing by reusing information from previous
    /// parsing operations, only processing the changed portions of the source.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to tokenize
    /// * `changed` - The number of bytes that have changed since the last parse
    /// * `cache` - The incremental cache containing previous parsing results
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
    fn lex_incremental(&self, source: impl Source, changed: usize, cache: IncrementalCache<L>) -> LexOutput<L>;
}

/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text with their position information.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    pub span: Range<usize>,
}

impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use oak_core::Token;
    /// # use core::range::Range;
    ///
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
    #[inline]
    pub fn length(&self) -> usize {
        self.span.end - self.span.start
    }
}

/// State information for incremental lexical analysis.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
#[derive(Debug)]
pub struct LexerState<S, L: Language> {
    /// The source text being tokenized
    pub(crate) source: S,
    /// Current byte offset position in the source text
    pub(crate) offset: usize,
    /// Tokens produced so far
    pub(crate) tokens: Vec<Token<L::SyntaxKind>>,
    /// Errors collected so far
    pub(crate) errors: Vec<OakError>,
}

impl<S: Source, L: Language> Deref for LexerState<S, L> {
    type Target = S;

    fn deref(&self) -> &Self::Target {
        &self.source
    }
}

impl<S: Source, L: Language> LexerState<S, L> {
    /// Creates a new lexer state with the given source text.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source
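    ///
    /// # Examples
    ///
    /// A minimal sketch of a hand-written lex loop built on `LexerState`
    /// (illustrative only; `MyLang` and its `MyKind` syntax kind are hypothetical,
    /// and `source` is assumed to implement [`Source`]):
    ///
    /// ```rust,ignore
    /// let mut state = LexerState::<_, MyLang>::new(source);
    /// while state.not_at_end() {
    ///     let safe_point = state.get_position();
    ///     let span = state.take_while(|c| c.is_ascii_digit());
    ///     if span.start < span.end {
    ///         state.add_token(MyKind::Number, span.start, span.end);
    ///     }
    ///     // Guarantee forward progress even if nothing matched.
    ///     state.safe_check(safe_point);
    /// }
    /// let output = state.finish(Ok(()));
    /// ```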
    pub fn new(source: S) -> Self {
        Self { source, offset: 0, tokens: vec![], errors: vec![] }
    }

    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `changed` - The number of bytes that have changed since the last lex
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: S, changed: usize, cache: IncrementalCache<L>) -> Self {
        // Note: `changed` and `cache` are not yet consulted here; the state starts
        // from the beginning of the source.
        Self { source, offset: 0, tokens: vec![], errors: vec![] }
    }

    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&self) -> &str {
        self.source.get_text_from(self.offset)
    }

    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text
    #[inline]
    pub fn get_position(&self) -> usize {
        self.offset
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position
    ///
    /// # Returns
    ///
    /// The previous byte offset position
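    ///
    /// # Examples
    ///
    /// A sketch of save-and-rewind speculation (illustrative only; `state` is
    /// assumed to be a `LexerState` over some [`Source`]):
    ///
    /// ```rust,ignore
    /// let saved = state.get_position();
    /// // Try to consume a speculative two-character prefix such as `0x`.
    /// if state.current() == Some('0') {
    ///     state.advance(1);
    ///     if state.current() != Some('x') {
    ///         // Speculation failed: rewind to the saved offset.
    ///         state.set_position(saved);
    ///     }
    /// }
    /// ```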
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        let last = self.offset;
        self.offset = offset;
        last
    }

    /// Gets the total length of the source text in bytes.
    ///
    /// # Returns
    ///
    /// The total number of bytes in the source text
    pub fn get_length(&self) -> usize {
        self.source.length()
    }

    /// Adds an error to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add to the diagnostics
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }

    /// Adds a token to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `kind` - The kind of the token
    /// * `start` - The starting byte offset of the token
    /// * `end` - The ending byte offset of the token
    #[inline]
    pub fn add_token(&mut self, kind: L::SyntaxKind, start: usize, end: usize) {
        self.tokens.push(Token { kind, span: Range { start, end } });
    }

    /// Gets the character at the current position.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&self) -> Option<char> {
        self.peek_next_n(0)
    }

    /// Peeks at the character at the current position (the next character to be
    /// consumed) without advancing.
    ///
    /// # Returns
    ///
    /// The next character to be consumed, or `None` if at the end of the source
    #[inline]
    pub fn peek(&self) -> Option<char> {
        self.peek_next_n(0)
    }

    /// Peeks at the character `n` positions ahead without advancing the position.
    ///
    /// # Arguments
    ///
    /// * `n` - The number of characters to look ahead of the current position
    ///   (`0` is the current character)
    ///
    /// # Returns
    ///
    /// The character `n` positions ahead, or `None` if beyond the end of the source
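    ///
    /// # Examples
    ///
    /// A sketch of two-character lookahead for a `//` line comment (illustrative
    /// only; `state` is assumed to be a `LexerState` over some [`Source`]):
    ///
    /// ```rust,ignore
    /// if state.peek_next_n(0) == Some('/') && state.peek_next_n(1) == Some('/') {
    ///     // A line comment starts here; consume up to the newline.
    ///     let span = state.take_while(|c| c != '\n');
    /// }
    /// ```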
    pub fn peek_next_n(&self, n: usize) -> Option<char> {
        let rest = self.source.get_text_from(self.offset);
        rest.chars().nth(n)
    }

    /// Advances the position by the specified number of bytes.
    ///
    /// # Arguments
    ///
    /// * `length` - The number of bytes to advance
    ///
    /// # Returns
    ///
    /// The new byte offset position
    #[inline]
    pub fn advance(&mut self, length: usize) -> usize {
        let end = self.offset + length;
        self.offset = end;
        end
    }

    /// Appends a token and advances the position by that token's length in bytes.
    ///
    /// # Arguments
    ///
    /// * `token` - The token to append; the offset advances by `token.length()`
    ///
    /// # Returns
    ///
    /// The new byte offset position
    ///
    /// # Note
    ///
    /// The caller must ensure that the advance lands on a character boundary.
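    ///
    /// # Examples
    ///
    /// A sketch of emitting a single-byte punctuation token (illustrative only;
    /// `MyKind` is a hypothetical syntax kind and `state` a `LexerState`):
    ///
    /// ```rust,ignore
    /// let start = state.get_position();
    /// // Emit a one-byte token and move past it in a single call.
    /// state.advance_with(Token { kind: MyKind::Plus, span: Range { start, end: start + 1 } });
    /// ```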
    #[inline]
    pub fn advance_with(&mut self, token: Token<L::SyntaxKind>) -> usize {
        self.offset += token.length();
        self.tokens.push(token);
        self.offset
    }

    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// # Arguments
    ///
    /// * `pred` - The predicate function that determines whether to consume a character
    ///
    /// # Returns
    ///
    /// The byte range of consumed characters
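    ///
    /// # Examples
    ///
    /// A sketch of scanning an identifier-like run of characters (illustrative only;
    /// `MyKind` is a hypothetical syntax kind and `state` a `LexerState`):
    ///
    /// ```rust,ignore
    /// let span = state.take_while(|c| c.is_alphanumeric() || c == '_');
    /// if span.start < span.end {
    ///     state.add_token(MyKind::Identifier, span.start, span.end);
    /// }
    /// ```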
    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
        let start = self.offset;
        while let Some(ch) = self.peek() {
            if pred(ch) {
                self.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }
        Range { start, end: self.offset }
    }

    /// Checks if the lexer has not reached the end of the source text.
    ///
    /// # Returns
    ///
    /// `true` if not at the end of the source, `false` otherwise
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.offset < self.source.length()
    }

    /// Performs a safety check to prevent infinite loops during lexing.
    ///
    /// This method ensures that the lexer always makes progress by forcing
    /// advancement when stuck at the same position. It's used as a safeguard
    /// against infinite loops in lexer implementations.
    ///
    /// # Arguments
    ///
    /// * `safe_point` - The position recorded before the current iteration; if the
    ///   lexer is still at this position, it is forced to advance past one character
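    ///
    /// # Examples
    ///
    /// A sketch of the forced-progress guarantee (illustrative only; `state` is
    /// assumed to be a `LexerState` over some [`Source`]):
    ///
    /// ```rust,ignore
    /// let safe_point = state.get_position();
    /// // ... scanners ran but none of them advanced the offset ...
    /// state.safe_check(safe_point);
    /// // The offset has moved past at least one character (or one byte at the end),
    /// // so the surrounding loop cannot spin forever.
    /// assert!(state.get_position() > safe_point);
    /// ```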
    pub fn safe_check(&mut self, safe_point: usize) {
        // If no progress has been made, force an advance
        if self.offset == safe_point {
            match self.peek_next_n(0) {
                // Skip the current character
                Some(c) => self.offset += c.len_utf8(),
                // Advance no matter what, to prevent an infinite loop
                None => self.offset += 1,
            }
            // tracing::warn!("deadlock");
        }
    }

    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// # Arguments
    ///
    /// * `result` - `Ok(())` if lexing completed, or the fatal error that stopped it
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the collected tokens and any errors encountered
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        match result {
            Ok(_) => OakDiagnostics { result: Ok(self.tokens), diagnostics: self.errors },
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }
}