//! Typst lexer (`oak_typst/lexer/mod.rs`).

use crate::{kind::TypstSyntaxKind, language::TypstLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
    source::{Source, TextEdit},
};
use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, TypstLanguage>;
10
11static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
13static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone, Debug)]
16pub struct TypstLexer<'config> {
17    _config: &'config TypstLanguage,
18}
19
20impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
22        let mut state = State::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish(result)
28    }
29}
30
31impl<'config> TypstLexer<'config> {
32    pub fn new(config: &'config TypstLanguage) -> Self {
33        Self { _config: config }
34    }
35
36    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.lex_whitespace(state) {
41                continue;
42            }
43
44            if TYPST_COMMENT.scan(state, TypstSyntaxKind::LineComment, TypstSyntaxKind::BlockComment) {
45                continue;
46            }
47
48            if TYPST_STRING.scan(state, TypstSyntaxKind::StringLiteral) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operators(state) {
61                continue;
62            }
63
64            if self.lex_single_char_tokens(state) {
65                continue;
66            }
67
68            state.advance_if_dead_lock(safe_point);
69        }
70
71        Ok(())
72    }
73
74    fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
75        if let Some(ch) = state.peek() {
76            if ch == '\n' || ch == '\r' {
77                let start = state.get_position();
78                state.advance(1);
79                if ch == '\r' && state.peek() == Some('\n') {
80                    state.advance(1);
81                }
82                state.add_token(TypstSyntaxKind::Newline, start, state.get_position());
83                return true;
84            }
85        }
86        TYPST_WHITESPACE.scan(state, TypstSyntaxKind::Whitespace)
87    }
88
89    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
90        let start = state.get_position();
91        let text = state.rest();
92        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
93            return false;
94        }
95
96        let mut pos = 0;
97        let chars: Vec<char> = text.chars().collect();
98
99        // 整数部分
100        while pos < chars.len() && chars[pos].is_ascii_digit() {
101            pos += 1;
102        }
103
104        // 小数部分
105        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
106            pos += 1; // 跳过 '.'
107            while pos < chars.len() && chars[pos].is_ascii_digit() {
108                pos += 1;
109            }
110        }
111
112        // 指数部分
113        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
114            pos += 1;
115            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
116                pos += 1;
117            }
118            while pos < chars.len() && chars[pos].is_ascii_digit() {
119                pos += 1;
120            }
121        }
122
123        if pos > 0 {
124            state.advance(pos);
125            state.add_token(TypstSyntaxKind::NumericLiteral, start, state.get_position());
126            return true;
127        }
128
129        false
130    }
131
132    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
133        let start = state.get_position();
134        let text = state.rest();
135        if text.is_empty() {
136            return false;
137        }
138
139        let first_char = text.chars().next().unwrap();
140        if !first_char.is_ascii_alphabetic() {
141            return false;
142        }
143
144        let mut pos = 0;
145        let chars: Vec<char> = text.chars().collect();
146
147        // 第一个字符
148        pos += 1;
149
150        // 后续字符
151        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
152            pos += 1;
153        }
154
155        if pos > 0 {
156            let identifier_text = &text[..pos];
157            let kind = self.keyword_or_identifier(identifier_text);
158            state.advance(pos);
159            state.add_token(kind, start, state.get_position());
160            return true;
161        }
162
163        false
164    }
165
166    fn keyword_or_identifier(&self, text: &str) -> TypstSyntaxKind {
167        match text {
168            "let" => TypstSyntaxKind::Let,
169            "if" => TypstSyntaxKind::If,
170            "else" => TypstSyntaxKind::Else,
171            "for" => TypstSyntaxKind::For,
172            "while" => TypstSyntaxKind::While,
173            "break" => TypstSyntaxKind::Break,
174            "continue" => TypstSyntaxKind::Continue,
175            "return" => TypstSyntaxKind::Return,
176            "true" => TypstSyntaxKind::True,
177            "false" => TypstSyntaxKind::False,
178            "set" => TypstSyntaxKind::Set,
179            "show" => TypstSyntaxKind::Show,
180            "import" => TypstSyntaxKind::Import,
181            "include" => TypstSyntaxKind::Include,
182            _ => TypstSyntaxKind::Identifier,
183        }
184    }
185
186    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
187        let start = state.get_position();
188        let text = state.rest();
189        if text.is_empty() {
190            return false;
191        }
192
193        let chars: Vec<char> = text.chars().collect();
194
195        let (kind, len) = match chars[0] {
196            '=' => {
197                let mut count = 1;
198                while count < chars.len() && chars[count] == '=' {
199                    count += 1;
200                }
201                (TypstSyntaxKind::Equal, count)
202            }
203            '!' => {
204                if chars.len() > 1 && chars[1] == '=' {
205                    (TypstSyntaxKind::NotEqual, 2)
206                }
207                else {
208                    (TypstSyntaxKind::Not, 1)
209                }
210            }
211            '<' => {
212                if chars.len() > 1 && chars[1] == '=' {
213                    (TypstSyntaxKind::LessEqual, 2)
214                }
215                else {
216                    (TypstSyntaxKind::Less, 1)
217                }
218            }
219            '>' => {
220                if chars.len() > 1 && chars[1] == '=' {
221                    (TypstSyntaxKind::GreaterEqual, 2)
222                }
223                else {
224                    (TypstSyntaxKind::Greater, 1)
225                }
226            }
227            '&' => {
228                if chars.len() > 1 && chars[1] == '&' {
229                    (TypstSyntaxKind::And, 2)
230                }
231                else {
232                    return false;
233                }
234            }
235            '|' => {
236                if chars.len() > 1 && chars[1] == '|' {
237                    (TypstSyntaxKind::Or, 2)
238                }
239                else {
240                    return false;
241                }
242            }
243            '+' => (TypstSyntaxKind::Plus, 1),
244            '-' => (TypstSyntaxKind::Minus, 1),
245            '*' => (TypstSyntaxKind::Star, 1),
246            '/' => (TypstSyntaxKind::Slash, 1),
247            '%' => (TypstSyntaxKind::Percent, 1),
248            _ => return false,
249        };
250
251        state.advance(len);
252        state.add_token(kind, start, state.get_position());
253        true
254    }
255
256    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
257        let start = state.get_position();
258        let text = state.rest();
259        if text.is_empty() {
260            return false;
261        }
262
263        let ch = text.chars().next().unwrap();
264
265        let kind = match ch {
266            '(' => TypstSyntaxKind::LeftParen,
267            ')' => TypstSyntaxKind::RightParen,
268            '{' => TypstSyntaxKind::LeftBrace,
269            '}' => TypstSyntaxKind::RightBrace,
270            '[' => TypstSyntaxKind::LeftBracket,
271            ']' => TypstSyntaxKind::RightBracket,
272            ';' => TypstSyntaxKind::Semicolon,
273            ',' => TypstSyntaxKind::Comma,
274            '.' => TypstSyntaxKind::Dot,
275            ':' => TypstSyntaxKind::Colon,
276            '#' => TypstSyntaxKind::Hash,
277            '@' => TypstSyntaxKind::At,
278            '$' => TypstSyntaxKind::Dollar,
279            '_' => TypstSyntaxKind::Underscore,
280            '`' => TypstSyntaxKind::Backtick,
281            _ => TypstSyntaxKind::Error,
282        };
283
284        state.advance(1);
285        state.add_token(kind, start, state.get_position());
286        true
287    }
288}