oak_typst/lexer/
mod.rs

1use crate::{kind::TypstSyntaxKind, language::TypstLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, TypstLanguage>;
10
11static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
13static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct TypstLexer<'config> {
17    _config: &'config TypstLanguage,
18}
19
20impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
22        let mut state = State::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish(result)
28    }
29}
30
31impl<'config> TypstLexer<'config> {
32    pub fn new(config: &'config TypstLanguage) -> Self {
33        Self { _config: config }
34    }
35
36    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if TYPST_WHITESPACE.scan(state, TypstSyntaxKind::Whitespace) {
41                continue;
42            }
43
44            if TYPST_COMMENT.scan(state, TypstSyntaxKind::LineComment, TypstSyntaxKind::BlockComment) {
45                continue;
46            }
47
48            if TYPST_STRING.scan(state, TypstSyntaxKind::StringLiteral) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operators(state) {
61                continue;
62            }
63
64            if self.lex_single_char_tokens(state) {
65                continue;
66            }
67
68            state.advance_if_dead_lock(safe_point);
69        }
70
71        Ok(())
72    }
73
74    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
75        let start = state.get_position();
76        let text = state.rest();
77        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
78            return false;
79        }
80
81        let mut pos = 0;
82        let chars: Vec<char> = text.chars().collect();
83
84        // 整数部分
85        while pos < chars.len() && chars[pos].is_ascii_digit() {
86            pos += 1;
87        }
88
89        // 小数部分
90        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
91            pos += 1; // 跳过 '.'
92            while pos < chars.len() && chars[pos].is_ascii_digit() {
93                pos += 1;
94            }
95        }
96
97        // 指数部分
98        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
99            pos += 1;
100            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
101                pos += 1;
102            }
103            while pos < chars.len() && chars[pos].is_ascii_digit() {
104                pos += 1;
105            }
106        }
107
108        if pos > 0 {
109            state.advance(pos);
110            state.add_token(TypstSyntaxKind::NumericLiteral, start, state.get_position());
111            return true;
112        }
113
114        false
115    }
116
117    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
118        let start = state.get_position();
119        let text = state.rest();
120        if text.is_empty() {
121            return false;
122        }
123
124        let first_char = text.chars().next().unwrap();
125        if !first_char.is_ascii_alphabetic() && first_char != '_' {
126            return false;
127        }
128
129        let mut pos = 0;
130        let chars: Vec<char> = text.chars().collect();
131
132        // 第一个字符
133        pos += 1;
134
135        // 后续字符
136        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric() || chars[pos] == '_') {
137            pos += 1;
138        }
139
140        if pos > 0 {
141            let identifier_text = &text[..pos];
142            let kind = self.keyword_or_identifier(identifier_text);
143            state.advance(pos);
144            state.add_token(kind, start, state.get_position());
145            return true;
146        }
147
148        false
149    }
150
151    fn keyword_or_identifier(&self, text: &str) -> TypstSyntaxKind {
152        match text {
153            "let" => TypstSyntaxKind::Let,
154            "if" => TypstSyntaxKind::If,
155            "else" => TypstSyntaxKind::Else,
156            "for" => TypstSyntaxKind::For,
157            "while" => TypstSyntaxKind::While,
158            "break" => TypstSyntaxKind::Break,
159            "continue" => TypstSyntaxKind::Continue,
160            "return" => TypstSyntaxKind::Return,
161            "true" => TypstSyntaxKind::True,
162            "false" => TypstSyntaxKind::False,
163            "set" => TypstSyntaxKind::Set,
164            "show" => TypstSyntaxKind::Show,
165            "import" => TypstSyntaxKind::Import,
166            "include" => TypstSyntaxKind::Include,
167            _ => TypstSyntaxKind::Identifier,
168        }
169    }
170
171    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
172        let start = state.get_position();
173        let text = state.rest();
174        if text.is_empty() {
175            return false;
176        }
177
178        let chars: Vec<char> = text.chars().collect();
179
180        let (kind, len) = match chars[0] {
181            '=' => {
182                if chars.len() > 1 && chars[1] == '=' {
183                    (TypstSyntaxKind::EqualEqual, 2)
184                }
185                else {
186                    (TypstSyntaxKind::Equal, 1)
187                }
188            }
189            '!' => {
190                if chars.len() > 1 && chars[1] == '=' {
191                    (TypstSyntaxKind::NotEqual, 2)
192                }
193                else {
194                    (TypstSyntaxKind::Not, 1)
195                }
196            }
197            '<' => {
198                if chars.len() > 1 && chars[1] == '=' {
199                    (TypstSyntaxKind::LessEqual, 2)
200                }
201                else {
202                    (TypstSyntaxKind::Less, 1)
203                }
204            }
205            '>' => {
206                if chars.len() > 1 && chars[1] == '=' {
207                    (TypstSyntaxKind::GreaterEqual, 2)
208                }
209                else {
210                    (TypstSyntaxKind::Greater, 1)
211                }
212            }
213            '&' => {
214                if chars.len() > 1 && chars[1] == '&' {
215                    (TypstSyntaxKind::And, 2)
216                }
217                else {
218                    return false;
219                }
220            }
221            '|' => {
222                if chars.len() > 1 && chars[1] == '|' {
223                    (TypstSyntaxKind::Or, 2)
224                }
225                else {
226                    return false;
227                }
228            }
229            '+' => (TypstSyntaxKind::Plus, 1),
230            '-' => (TypstSyntaxKind::Minus, 1),
231            '*' => (TypstSyntaxKind::Star, 1),
232            '/' => (TypstSyntaxKind::Slash, 1),
233            '%' => (TypstSyntaxKind::Percent, 1),
234            _ => return false,
235        };
236
237        state.advance(len);
238        state.add_token(kind, start, state.get_position());
239        true
240    }
241
242    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
243        let start = state.get_position();
244        let text = state.rest();
245        if text.is_empty() {
246            return false;
247        }
248
249        let ch = text.chars().next().unwrap();
250
251        let kind = match ch {
252            '(' => TypstSyntaxKind::LeftParen,
253            ')' => TypstSyntaxKind::RightParen,
254            '{' => TypstSyntaxKind::LeftBrace,
255            '}' => TypstSyntaxKind::RightBrace,
256            '[' => TypstSyntaxKind::LeftBracket,
257            ']' => TypstSyntaxKind::RightBracket,
258            ';' => TypstSyntaxKind::Semicolon,
259            ',' => TypstSyntaxKind::Comma,
260            '.' => TypstSyntaxKind::Dot,
261            ':' => TypstSyntaxKind::Colon,
262            '#' => TypstSyntaxKind::Hash,
263            '@' => TypstSyntaxKind::At,
264            '$' => TypstSyntaxKind::Dollar,
265            '_' => TypstSyntaxKind::Underscore,
266            _ => TypstSyntaxKind::Error,
267        };
268
269        state.advance(1);
270        state.add_token(kind, start, state.get_position());
271        true
272    }
273}