oak_typst/lexer/
mod.rs

1use crate::{kind::TypstSyntaxKind, language::TypstLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the lexer state specialized to the Typst language.
type State<S> = LexerState<S, TypstLanguage>;

// Shared scanner configurations, built once on first use.
// Whitespace: also treats non-ASCII Unicode whitespace as whitespace.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Line comments start with `//`; block comments are scanned by hand in `skip_comment`.
static TYPST_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
// Double-quoted string literals with backslash escapes.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Typst source text, driven through the `oak_core` incremental
/// lexing machinery.
#[derive(Clone)]
pub struct TypstLexer<'config> {
    // Borrowed language configuration. NOTE(review): stored but not read by
    // any scanning method visible in this file — confirm intended use.
    config: &'config TypstLanguage,
}
19
20impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<TypstLanguage>,
26    ) -> LexOutput<TypstLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> TypstLexer<'config> {
34    pub fn new(config: &'config TypstLanguage) -> Self {
35        Self { config }
36    }
37
38    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.skip_comment(state) {
47                continue;
48            }
49
50            if self.lex_string_literal(state) {
51                continue;
52            }
53
54            if self.lex_number_literal(state) {
55                continue;
56            }
57
58            if self.lex_identifier_or_keyword(state) {
59                continue;
60            }
61
62            if self.lex_operators(state) {
63                continue;
64            }
65
66            if self.lex_single_char_tokens(state) {
67                continue;
68            }
69
70            state.safe_check(safe_point);
71        }
72
73        // 添加 EOF token
74        let eof_pos = state.get_position();
75        state.add_token(TypstSyntaxKind::Eof, eof_pos, eof_pos);
76        Ok(())
77    }
78
79    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
80        match TYPST_WHITESPACE.scan(state.rest(), state.get_position(), TypstSyntaxKind::Whitespace) {
81            Some(token) => {
82                state.advance_with(token);
83                return true;
84            }
85            None => {}
86        }
87        false
88    }
89
90    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
91        // 行注释
92        if let Some(token) = TYPST_COMMENT.scan(state.rest(), state.get_position(), TypstSyntaxKind::LineComment) {
93            state.advance_with(token);
94            return true;
95        }
96
97        // 块注释
98        if state.rest().starts_with("/*") {
99            let start = state.get_position();
100            let mut pos = 2;
101            let text = state.rest();
102
103            while pos < text.len() {
104                if text[pos..].starts_with("*/") {
105                    pos += 2;
106                    break;
107                }
108                pos += 1;
109            }
110
111            state.advance(pos);
112            state.add_token(TypstSyntaxKind::BlockComment, start, state.get_position());
113            return true;
114        }
115
116        false
117    }
118
119    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
120        match TYPST_STRING.scan(state.rest(), state.get_position(), TypstSyntaxKind::StringLiteral) {
121            Some(token) => {
122                state.advance_with(token);
123                return true;
124            }
125            None => {}
126        }
127        false
128    }
129
130    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
131        let text = state.rest();
132        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
133            return false;
134        }
135
136        let start = state.get_position();
137        let mut pos = 0;
138        let chars: Vec<char> = text.chars().collect();
139
140        // 整数部分
141        while pos < chars.len() && chars[pos].is_ascii_digit() {
142            pos += 1;
143        }
144
145        // 小数部分
146        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
147            pos += 1; // 跳过 '.'
148            while pos < chars.len() && chars[pos].is_ascii_digit() {
149                pos += 1;
150            }
151        }
152
153        // 指数部分
154        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
155            pos += 1;
156            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
157                pos += 1;
158            }
159            while pos < chars.len() && chars[pos].is_ascii_digit() {
160                pos += 1;
161            }
162        }
163
164        if pos > 0 {
165            state.advance(pos);
166            state.add_token(TypstSyntaxKind::NumericLiteral, start, state.get_position());
167            return true;
168        }
169
170        false
171    }
172
173    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
174        let text = state.rest();
175        if text.is_empty() {
176            return false;
177        }
178
179        let first_char = text.chars().next().unwrap();
180        if !first_char.is_ascii_alphabetic() && first_char != '_' {
181            return false;
182        }
183
184        let start = state.get_position();
185        let mut pos = 0;
186        let chars: Vec<char> = text.chars().collect();
187
188        // 第一个字符
189        pos += 1;
190
191        // 后续字符
192        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric() || chars[pos] == '_') {
193            pos += 1;
194        }
195
196        if pos > 0 {
197            let identifier_text = &text[..pos];
198            let kind = self.keyword_or_identifier(identifier_text);
199            state.advance(pos);
200            state.add_token(kind, start, state.get_position());
201            return true;
202        }
203
204        false
205    }
206
207    fn keyword_or_identifier(&self, text: &str) -> TypstSyntaxKind {
208        match text {
209            "let" => TypstSyntaxKind::Let,
210            "if" => TypstSyntaxKind::If,
211            "else" => TypstSyntaxKind::Else,
212            "for" => TypstSyntaxKind::For,
213            "while" => TypstSyntaxKind::While,
214            "break" => TypstSyntaxKind::Break,
215            "continue" => TypstSyntaxKind::Continue,
216            "return" => TypstSyntaxKind::Return,
217            "true" => TypstSyntaxKind::True,
218            "false" => TypstSyntaxKind::False,
219            "set" => TypstSyntaxKind::Set,
220            "show" => TypstSyntaxKind::Show,
221            "import" => TypstSyntaxKind::Import,
222            "include" => TypstSyntaxKind::Include,
223            _ => TypstSyntaxKind::Identifier,
224        }
225    }
226
227    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
228        let text = state.rest();
229        if text.is_empty() {
230            return false;
231        }
232
233        let start = state.get_position();
234        let chars: Vec<char> = text.chars().collect();
235
236        let (kind, len) = match chars[0] {
237            '=' => {
238                if chars.len() > 1 && chars[1] == '=' {
239                    (TypstSyntaxKind::EqualEqual, 2)
240                }
241                else {
242                    (TypstSyntaxKind::Equal, 1)
243                }
244            }
245            '!' => {
246                if chars.len() > 1 && chars[1] == '=' {
247                    (TypstSyntaxKind::NotEqual, 2)
248                }
249                else {
250                    (TypstSyntaxKind::Not, 1)
251                }
252            }
253            '<' => {
254                if chars.len() > 1 && chars[1] == '=' {
255                    (TypstSyntaxKind::LessEqual, 2)
256                }
257                else {
258                    (TypstSyntaxKind::Less, 1)
259                }
260            }
261            '>' => {
262                if chars.len() > 1 && chars[1] == '=' {
263                    (TypstSyntaxKind::GreaterEqual, 2)
264                }
265                else {
266                    (TypstSyntaxKind::Greater, 1)
267                }
268            }
269            '&' => {
270                if chars.len() > 1 && chars[1] == '&' {
271                    (TypstSyntaxKind::And, 2)
272                }
273                else {
274                    return false;
275                }
276            }
277            '|' => {
278                if chars.len() > 1 && chars[1] == '|' {
279                    (TypstSyntaxKind::Or, 2)
280                }
281                else {
282                    return false;
283                }
284            }
285            '+' => (TypstSyntaxKind::Plus, 1),
286            '-' => (TypstSyntaxKind::Minus, 1),
287            '*' => (TypstSyntaxKind::Star, 1),
288            '/' => (TypstSyntaxKind::Slash, 1),
289            '%' => (TypstSyntaxKind::Percent, 1),
290            _ => return false,
291        };
292
293        state.advance(len);
294        state.add_token(kind, start, state.get_position());
295        true
296    }
297
298    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
299        let text = state.rest();
300        if text.is_empty() {
301            return false;
302        }
303
304        let start = state.get_position();
305        let ch = text.chars().next().unwrap();
306
307        let kind = match ch {
308            '(' => TypstSyntaxKind::LeftParen,
309            ')' => TypstSyntaxKind::RightParen,
310            '{' => TypstSyntaxKind::LeftBrace,
311            '}' => TypstSyntaxKind::RightBrace,
312            '[' => TypstSyntaxKind::LeftBracket,
313            ']' => TypstSyntaxKind::RightBracket,
314            ';' => TypstSyntaxKind::Semicolon,
315            ',' => TypstSyntaxKind::Comma,
316            '.' => TypstSyntaxKind::Dot,
317            ':' => TypstSyntaxKind::Colon,
318            '#' => TypstSyntaxKind::Hash,
319            '@' => TypstSyntaxKind::At,
320            '$' => TypstSyntaxKind::Dollar,
321            '_' => TypstSyntaxKind::Underscore,
322            _ => TypstSyntaxKind::Error,
323        };
324
325        state.advance(1);
326        state.add_token(kind, start, state.get_position());
327        true
328    }
329}