oak_core/lexer/mod.rs
//! Lexical analysis and tokenization for the Oak Core parsing framework.
//!
//! This module provides traits and utilities for converting source text into
//! sequences of tokens that can be consumed by parsers. It includes support
//! for common lexical patterns and incremental tokenization.

pub use self::{scan_comment::*, scan_string::*, scan_white_space::*};
use crate::{
    GreenBuilder, IncrementalCache, Language,
    errors::{OakDiagnostics, OakError},
    source::Source,
};
use std::{ops::Deref, range::Range};

/// Whitespace scanning utilities shared across different languages.
///
/// Together with the comment and string scanners below, these reusable components
/// can be used by language-specific lexers to avoid reimplementing basic
/// tokenization patterns.
mod scan_white_space;

/// Comment scanning utilities shared across different languages.
mod scan_comment;

/// String literal scanning utilities shared across different languages.
pub mod scan_string;

/// Output type for lexical analysis operations.
///
/// This type alias represents the result of tokenization, containing
/// a vector of tokens and any diagnostic messages that occurred during
/// the lexing process.
pub type LexOutput<L: Language> = OakDiagnostics<Vec<Token<L::SyntaxKind>>>;

/// Trait for tokenizing source code into sequences of tokens.
///
/// This trait defines the interface for converting source text into a sequence of
/// tokens that can be consumed by the parser. Implementations should handle
/// the specific lexical rules of their target language.
///
/// # Examples
///
/// ```rust
/// # use oak_core::{Lexer, Language, Source, LexOutput, IncrementalCache};
///
/// struct MyLexer;
/// enum MyToken {
///     Number,
///     Identifier,
/// }
///
/// impl Language for MyToken {
///     type SyntaxKind = MyToken;
/// }
///
/// impl Lexer<MyToken> for MyLexer {
///     fn lex_incremental(&self, source: impl Source, changed: usize, cache: IncrementalCache<MyToken>) -> LexOutput<MyToken> {
///         // Tokenization logic here
///         todo!()
///     }
/// }
/// ```
pub trait Lexer<L: Language + Send + Sync + 'static> {
    /// Tokenizes the given source text into a sequence of tokens.
    ///
    /// This method performs a full lexical analysis of the source text,
    /// creating a new sequence of tokens from scratch. It uses a default
    /// cache configuration.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to tokenize
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
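    ///
    /// # Examples
    ///
    /// An illustrative sketch; `MyLexer` is the hypothetical lexer from the trait-level
    /// example, and the `&str` argument is assumed to implement [`Source`]:
    ///
    /// ```rust,ignore
    /// let output = MyLexer.lex("let answer = 42;");
    /// // `result` holds the tokens on success; `diagnostics` carries any errors.
    /// for token in output.result.unwrap() {
    ///     println!("{:?} spans bytes {}..{}", token.kind, token.span.start, token.span.end);
    /// }
    /// ```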
    fn lex(&self, source: impl Source) -> LexOutput<L> {
        let mut pool = GreenBuilder::new(0);
        let cache = IncrementalCache::new(&mut pool);
        self.lex_incremental(source, 0, cache)
    }

    /// Tokenizes source text using an existing cache for incremental parsing.
    ///
    /// This method enables efficient re-lexing by reusing information from previous
    /// parsing operations, only processing the changed portions of the source.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to tokenize
    /// * `changed` - The number of bytes that have changed since the last parse
    /// * `cache` - The incremental cache containing previous parsing results
    ///
    /// # Returns
    ///
    /// A [`LexOutput`] containing the tokens and any diagnostic messages
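    ///
    /// # Examples
    ///
    /// An illustrative sketch; `MyLexer` and `edited_source` are hypothetical, and the
    /// cache is built the same way as in the default [`Lexer::lex`] implementation:
    ///
    /// ```rust,ignore
    /// let mut pool = GreenBuilder::new(0);
    /// let cache = IncrementalCache::new(&mut pool);
    /// // Re-lex after an edit that touched 3 bytes of the source.
    /// let output = MyLexer.lex_incremental(edited_source, 3, cache);
    /// ```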
    fn lex_incremental(&self, source: impl Source, changed: usize, cache: IncrementalCache<L>) -> LexOutput<L>;
}

/// Represents a single token in the source code.
///
/// Tokens are the fundamental units of lexical analysis, representing
/// categorized pieces of source text with their position information.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token<K> {
    /// The kind/category of this token (e.g., keyword, identifier, number)
    pub kind: K,
    /// The byte range in the source text that this token occupies
    pub span: Range<usize>,
}

impl<K> Token<K> {
    /// Returns the length of this token in bytes.
    ///
    /// # Returns
    ///
    /// The number of bytes between the start and end of the token's span
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use oak_core::Token;
    /// # use core::range::Range;
    ///
    /// let token = Token { kind: "ident", span: Range { start: 0, end: 5 } };
    /// assert_eq!(token.length(), 5);
    /// ```
    #[inline]
    pub fn length(&self) -> usize {
        self.span.end - self.span.start
    }
}

/// State information for incremental lexical analysis.
///
/// This struct maintains the current position and context during
/// tokenization, enabling incremental and resumable lexing operations.
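///
/// # Examples
///
/// An illustrative sketch of a typical lexing loop; `MyLang` and its `SyntaxKind` type
/// `MyLangKind` (with an `Identifier` variant) are hypothetical stand-ins, and the
/// `&str` source is assumed to implement [`Source`]:
///
/// ```rust,ignore
/// let mut state: LexerState<_, MyLang> = LexerState::new("hello world");
/// while state.not_at_end() {
///     let safe_point = state.get_position();
///     // Consume one identifier-like run and record it as a token.
///     let span = state.take_while(|c| c.is_ascii_alphanumeric());
///     if span.start < span.end {
///         state.add_token(MyLangKind::Identifier, span.start, span.end);
///     }
///     // Guarantee forward progress even if nothing matched.
///     state.safe_check(safe_point);
/// }
/// let output = state.finish(Ok(()));
/// ```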
#[derive(Debug)]
pub struct LexerState<S, L: Language> {
    /// The source text being tokenized
    pub(crate) source: S,
    /// Current byte offset position in the source text
    pub(crate) offset: usize,
    /// Tokens produced so far
    pub(crate) tokens: Vec<Token<L::SyntaxKind>>,
    /// Errors collected so far
    pub(crate) errors: Vec<OakError>,
}

impl<S: Source, L: Language> Deref for LexerState<S, L> {
    type Target = S;

    fn deref(&self) -> &Self::Target {
        &self.source
    }
}

impl<S: Source, L: Language> LexerState<S, L> {
    /// Creates a new lexer state with the given source text.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source
    pub fn new(source: S) -> Self {
        Self { source, offset: 0, tokens: vec![], errors: vec![] }
    }

    /// Creates a new lexer state with the given source text and incremental cache.
    ///
    /// # Arguments
    ///
    /// * `source` - The source text to lex
    /// * `changed` - The number of bytes that have changed since the last lex
    /// * `cache` - The incremental cache containing previous lexing results
    ///
    /// # Returns
    ///
    /// A new `LexerState` initialized at the beginning of the source with cache support
    pub fn new_with_cache(source: S, changed: usize, cache: IncrementalCache<L>) -> Self {
        Self { source, offset: 0, tokens: vec![], errors: vec![] }
    }

    /// Gets the remaining text from the current position to the end of the source.
    ///
    /// # Returns
    ///
    /// A string slice containing the remaining text
    pub fn rest(&self) -> &str {
        self.source.get_text_from(self.offset)
    }

    /// Gets the current byte offset position in the source text.
    ///
    /// # Returns
    ///
    /// The current byte offset from the start of the source text
    #[inline]
    pub fn get_position(&self) -> usize {
        self.offset
    }

    /// Sets the current position to the specified byte offset.
    ///
    /// # Arguments
    ///
    /// * `offset` - The new byte offset position
    ///
    /// # Returns
    ///
    /// The previous byte offset position
    #[inline]
    pub fn set_position(&mut self, offset: usize) -> usize {
        let last = self.offset;
        self.offset = offset;
        last
    }

    /// Gets the total length of the source text in bytes.
    ///
    /// # Returns
    ///
    /// The total number of bytes in the source text
    pub fn get_length(&self) -> usize {
        self.source.length()
    }

    /// Adds an error to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `error` - The error to add to the diagnostics
    #[inline]
    pub fn add_error(&mut self, error: impl Into<OakError>) {
        self.errors.push(error.into());
    }

    /// Adds a token to the lexer state.
    ///
    /// # Arguments
    ///
    /// * `kind` - The kind of the token
    /// * `start` - The starting byte offset of the token
    /// * `end` - The ending byte offset of the token
    #[inline]
    pub fn add_token(&mut self, kind: L::SyntaxKind, start: usize, end: usize) {
        self.tokens.push(Token { kind, span: Range { start, end } });
    }

    /// Gets the character at the current position.
    ///
    /// # Returns
    ///
    /// The current character, or `None` if at the end of the source
    #[inline]
    pub fn current(&self) -> Option<char> {
        self.peek_next_n(0)
    }

    /// Peeks at the character at the current position without advancing.
    ///
    /// Equivalent to [`LexerState::current`].
    ///
    /// # Returns
    ///
    /// The character at the current position, or `None` if at the end of the source
    #[inline]
    pub fn peek(&self) -> Option<char> {
        self.peek_next_n(0)
    }

    /// Peeks at the character `n` positions ahead without advancing the position.
    ///
    /// # Arguments
    ///
    /// * `n` - The number of characters to peek ahead
    ///
    /// # Returns
    ///
    /// The character `n` positions ahead, or `None` if beyond the end of the source
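    ///
    /// # Examples
    ///
    /// An illustrative sketch; `state` stands in for any `LexerState` whose remaining
    /// input is `"abc"`:
    ///
    /// ```rust,ignore
    /// assert_eq!(state.peek_next_n(0), Some('a'));
    /// assert_eq!(state.peek_next_n(2), Some('c'));
    /// assert_eq!(state.peek_next_n(3), None);
    /// ```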
    pub fn peek_next_n(&self, n: usize) -> Option<char> {
        let rest = self.source.get_text_from(self.offset);
        rest.chars().nth(n)
    }

    /// Advances the position by the specified number of bytes.
    ///
    /// # Arguments
    ///
    /// * `length` - The number of bytes to advance
    ///
    /// # Returns
    ///
    /// The new byte offset position
    #[inline]
    pub fn advance(&mut self, length: usize) -> usize {
        let end = self.offset + length;
        self.offset = end;
        end
    }

    /// Advances the position by the given token's length and records the token.
    ///
    /// # Arguments
    ///
    /// * `token` - The token to add; the position advances by its length
    ///
    /// # Returns
    ///
    /// The new byte offset position
    ///
    /// # Note
    ///
    /// The caller must ensure that the resulting position falls on a character boundary.
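    ///
    /// # Examples
    ///
    /// An illustrative sketch; `state` (assumed to start at offset 0) and the
    /// `MyLangKind::Number` variant are hypothetical:
    ///
    /// ```rust,ignore
    /// let token = Token { kind: MyLangKind::Number, span: Range { start: 0, end: 2 } };
    /// // The position moves to byte 2 and the token is appended to the output.
    /// assert_eq!(state.advance_with(token), 2);
    /// ```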
    #[inline]
    pub fn advance_with(&mut self, token: Token<L::SyntaxKind>) -> usize {
        self.offset += token.length();
        self.tokens.push(token);
        self.offset
    }

    /// Consumes characters while the predicate returns true, returning the consumed range.
    ///
    /// # Arguments
    ///
    /// * `pred` - The predicate function that determines whether to consume a character
    ///
    /// # Returns
    ///
    /// The byte range of consumed characters
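    ///
    /// # Examples
    ///
    /// An illustrative sketch; `state` stands in for any `LexerState` whose remaining
    /// input is `"abc 123"` and whose offset is 0:
    ///
    /// ```rust,ignore
    /// let span = state.take_while(|c| c.is_ascii_alphabetic());
    /// // "abc" was consumed; the offset now points at the space.
    /// assert_eq!((span.start, span.end), (0, 3));
    /// ```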
    pub fn take_while(&mut self, mut pred: impl FnMut(char) -> bool) -> Range<usize> {
        let start = self.offset;
        while let Some(ch) = self.peek() {
            if pred(ch) {
                self.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }
        Range { start, end: self.offset }
    }

    /// Checks if the lexer has not reached the end of the source text.
    ///
    /// # Returns
    ///
    /// `true` if not at the end of the source, `false` otherwise
    #[inline]
    pub fn not_at_end(&self) -> bool {
        self.offset < self.source.length()
    }

    /// Performs a safety check to prevent infinite loops during lexing.
    ///
    /// This method ensures that the lexer always makes progress by forcing
    /// advancement when stuck at the same position. It's used as a safeguard
    /// against infinite loops in lexer implementations.
    ///
    /// # Arguments
    ///
    /// * `safe_point` - The position to check against for lack of progress
    pub fn safe_check(&mut self, safe_point: usize) {
        // If no progress has been made, force an advance.
        if self.offset == safe_point {
            match self.peek_next_n(0) {
                // Skip the current character.
                Some(c) => self.offset += c.len_utf8(),
                // Advance regardless, to prevent an infinite loop.
                None => self.offset += 1,
            }
            // tracing::warn!("deadlock");
        }
    }

    /// Finishes lexing and returns the final output with tokens and diagnostics.
    ///
    /// # Arguments
    ///
    /// * `result` - `Ok(())` if lexing completed, or the fatal error that stopped it
    ///
    /// # Returns
    ///
    /// A `LexOutput` containing the collected tokens and any errors encountered
    pub fn finish(self, result: Result<(), OakError>) -> LexOutput<L> {
        match result {
            Ok(_) => OakDiagnostics { result: Ok(self.tokens), diagnostics: self.errors },
            Err(e) => OakDiagnostics { result: Err(e), diagnostics: self.errors },
        }
    }
}