// oak_typst/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2/// Token type definitions for Typst lexer.
3pub mod token_type;
4
5use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9    source::{Source, TextEdit},
10};
11use std::sync::LazyLock;
12
/// Shorthand for the generic lexer state specialized to the Typst language.
type State<'s, S> = LexerState<'s, S, TypstLanguage>;

// Shared scanner configurations, built lazily on first use.
// `nested_blocks: true` lets `/* /* */ */` comments balance correctly.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18
19#[derive(Clone, Debug)]
20/// Lexer for Typst source code.
21pub struct TypstLexer<'config> {
22    config: &'config TypstLanguage,
23}
24
25impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
26    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
27        let mut state = State::new(source);
28        let result = self.run(&mut state);
29        if result.is_ok() {
30            state.add_eof();
31        }
32        state.finish(result)
33    }
34}
35
36impl<'config> TypstLexer<'config> {
37    /// Creates a new TypstLexer with the given language configuration.
38    pub fn new(config: &'config TypstLanguage) -> Self {
39        Self { config }
40    }
41
42    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
43        while state.not_at_end() {
44            let safe_point = state.get_position();
45
46            if self.lex_whitespace(state) {
47                continue;
48            }
49
50            if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
51                continue;
52            }
53
54            if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
55                continue;
56            }
57
58            if self.lex_number_literal(state) {
59                continue;
60            }
61
62            if self.lex_markup(state) {
63                continue;
64            }
65
66            if self.lex_identifier_or_keyword(state) {
67                continue;
68            }
69
70            if self.lex_operators(state) {
71                continue;
72            }
73
74            if self.lex_single_char_tokens(state) {
75                continue;
76            }
77
78            if self.lex_text(state) {
79                continue;
80            }
81
82            state.advance_if_dead_lock(safe_point)
83        }
84
85        Ok(())
86    }
87
88    fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        if let Some(ch) = state.peek() {
90            if ch == '\n' || ch == '\r' {
91                let start = state.get_position();
92                state.advance(1);
93                if ch == '\r' && state.peek() == Some('\n') {
94                    state.advance(1);
95                }
96                state.add_token(TypstTokenType::Newline, start, state.get_position());
97                return true;
98            }
99        }
100        TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
101    }
102
103    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
104        let start = state.get_position();
105        let text = state.rest();
106        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
107            return false;
108        }
109
110        let mut pos = 0;
111        let chars: Vec<char> = text.chars().collect();
112
113        // Integer part
114        while pos < chars.len() && chars[pos].is_ascii_digit() {
115            pos += 1;
116        }
117
118        // Fractional part
119        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
120            pos += 1; // Skip '.'
121            while pos < chars.len() && chars[pos].is_ascii_digit() {
122                pos += 1;
123            }
124        }
125
126        // Exponent part
127        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
128            pos += 1;
129            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
130                pos += 1;
131            }
132            while pos < chars.len() && chars[pos].is_ascii_digit() {
133                pos += 1;
134            }
135        }
136
137        if pos > 0 {
138            state.advance(pos);
139            state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
140            return true;
141        }
142
143        false
144    }
145
146    fn lex_markup<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
147        let start = state.get_position();
148
149        // Check for the beginning of a line
150        let is_line_start = start == 0 || matches!(state.source().get_char_at(start - 1), Some('\n') | Some('\r'));
151
152        if let Some(ch) = state.peek() {
153            match ch {
154                '=' if is_line_start => {
155                    let mut count = 0;
156                    while state.peek() == Some('=') {
157                        count += 1;
158                        state.advance(1);
159                    }
160                    if state.peek() == Some(' ') || state.peek() == Some('\t') {
161                        state.add_token(TypstTokenType::Heading, start, state.get_position());
162                        return true;
163                    }
164                }
165                '-' | '+' if is_line_start => {
166                    state.advance(1);
167                    if state.peek() == Some(' ') || state.peek() == Some('\t') {
168                        state.add_token(TypstTokenType::ListItem, start, state.get_position());
169                        return true;
170                    }
171                }
172                '0'..='9' if is_line_start => {
173                    let mut pos = 0;
174                    while let Some(c) = state.peek_next_n(pos) {
175                        if c.is_ascii_digit() {
176                            pos += 1;
177                        }
178                        else {
179                            break;
180                        }
181                    }
182                    if pos > 0 && state.peek_next_n(pos) == Some('.') {
183                        pos += 1; // '.'
184                        if state.peek_next_n(pos) == Some(' ') || state.peek_next_n(pos) == Some('\t') {
185                            state.advance(pos);
186                            state.add_token(TypstTokenType::EnumItem, start, state.get_position());
187                            return true;
188                        }
189                    }
190                }
191                '*' => {
192                    let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
193                    if !is_escaped {
194                        state.advance(1);
195                        state.add_token(TypstTokenType::Strong, start, state.get_position());
196                        return true;
197                    }
198                }
199                '_' => {
200                    let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
201                    if !is_escaped {
202                        state.advance(1);
203                        state.add_token(TypstTokenType::Emphasis, start, state.get_position());
204                        return true;
205                    }
206                }
207                _ => {}
208            }
209        }
210
211        state.set_position(start);
212        false
213    }
214
215    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
216        let start = state.get_position();
217        let text = state.rest();
218        if text.is_empty() {
219            return false;
220        }
221
222        let first_char = text.chars().next().unwrap();
223        if !first_char.is_ascii_alphabetic() {
224            return false;
225        }
226
227        let mut pos = 0;
228        let chars: Vec<char> = text.chars().collect();
229
230        // First character
231        pos += 1;
232
233        // Subsequent characters
234        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
235            pos += 1;
236        }
237
238        if pos > 0 {
239            let identifier_text = &text[..pos];
240            let kind = self.keyword_or_identifier(identifier_text);
241            state.advance(pos);
242            state.add_token(kind, start, state.get_position());
243            return true;
244        }
245
246        false
247    }
248
249    fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
250        match text {
251            "let" => TypstTokenType::Let,
252            "if" => TypstTokenType::If,
253            "else" => TypstTokenType::Else,
254            "for" => TypstTokenType::For,
255            "while" => TypstTokenType::While,
256            "break" => TypstTokenType::Break,
257            "continue" => TypstTokenType::Continue,
258            "return" => TypstTokenType::Return,
259            "true" => TypstTokenType::True,
260            "false" => TypstTokenType::False,
261            "set" => TypstTokenType::Set,
262            "show" => TypstTokenType::Show,
263            "import" => TypstTokenType::Import,
264            "include" => TypstTokenType::Include,
265            _ => TypstTokenType::Identifier,
266        }
267    }
268
269    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
270        let start = state.get_position();
271        let text = state.rest();
272        if text.is_empty() {
273            return false;
274        }
275
276        let chars: Vec<char> = text.chars().collect();
277
278        let (kind, len) = match chars[0] {
279            '=' => {
280                let mut count = 1;
281                while count < chars.len() && chars[count] == '=' {
282                    count += 1;
283                }
284                (TypstTokenType::Equal, count)
285            }
286            '!' => {
287                if chars.len() > 1 && chars[1] == '=' {
288                    (TypstTokenType::NotEqual, 2)
289                }
290                else {
291                    (TypstTokenType::Not, 1)
292                }
293            }
294            '<' => {
295                if chars.len() > 1 && chars[1] == '=' {
296                    (TypstTokenType::LessEqual, 2)
297                }
298                else {
299                    (TypstTokenType::Less, 1)
300                }
301            }
302            '>' => {
303                if chars.len() > 1 && chars[1] == '=' {
304                    (TypstTokenType::GreaterEqual, 2)
305                }
306                else {
307                    (TypstTokenType::Greater, 1)
308                }
309            }
310            '&' => {
311                if chars.len() > 1 && chars[1] == '&' {
312                    (TypstTokenType::And, 2)
313                }
314                else {
315                    return false;
316                }
317            }
318            '|' => {
319                if chars.len() > 1 && chars[1] == '|' {
320                    (TypstTokenType::Or, 2)
321                }
322                else {
323                    return false;
324                }
325            }
326            '+' => (TypstTokenType::Plus, 1),
327            '-' => (TypstTokenType::Minus, 1),
328            '*' => (TypstTokenType::Star, 1),
329            '/' => (TypstTokenType::Slash, 1),
330            '%' => (TypstTokenType::Percent, 1),
331            _ => return false,
332        };
333
334        state.advance(len);
335        state.add_token(kind, start, state.get_position());
336        true
337    }
338
339    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
340        let start = state.get_position();
341        let text = state.rest();
342        if text.is_empty() {
343            return false;
344        }
345
346        let ch = text.chars().next().unwrap();
347
348        let kind = match ch {
349            '(' => TypstTokenType::LeftParen,
350            ')' => TypstTokenType::RightParen,
351            '{' => TypstTokenType::LeftBrace,
352            '}' => TypstTokenType::RightBrace,
353            '[' => TypstTokenType::LeftBracket,
354            ']' => TypstTokenType::RightBracket,
355            ';' => TypstTokenType::Semicolon,
356            ',' => TypstTokenType::Comma,
357            '.' => TypstTokenType::Dot,
358            ':' => TypstTokenType::Colon,
359            '#' => TypstTokenType::Hash,
360            '@' => TypstTokenType::At,
361            '$' => TypstTokenType::Dollar,
362            '_' => TypstTokenType::Underscore,
363            '`' => TypstTokenType::Backtick,
364            _ => return false,
365        };
366
367        state.advance(1);
368        state.add_token(kind, start, state.get_position());
369        true
370    }
371
372    fn lex_text<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
373        let start = state.get_position();
374        let mut has_text = false;
375
376        while let Some(ch) = state.peek() {
377            // Text should also consume alphanumeric characters if they don't form identifiers
378            // But here we want to break on anything that could be a special token
379            if ch.is_whitespace()
380                || ch == '/'
381                || ch == '"'
382                || ch == '='
383                || ch == '-'
384                || ch == '+'
385                || ch == '!'
386                || ch == '<'
387                || ch == '>'
388                || ch == '&'
389                || ch == '|'
390                || ch == '('
391                || ch == ')'
392                || ch == '{'
393                || ch == '}'
394                || ch == '['
395                || ch == ']'
396                || ch == ';'
397                || ch == ','
398                || ch == '.'
399                || ch == ':'
400                || ch == '#'
401                || ch == '@'
402                || ch == '$'
403                || ch == '`'
404                || ch == '\\'
405            {
406                break;
407            }
408
409            // Special handling for markup chars that were not handled by lex_markup
410            if ch == '*' || ch == '_' {
411                break;
412            }
413
414            state.advance(ch.len_utf8());
415            has_text = true;
416        }
417
418        if has_text {
419            state.add_token(TypstTokenType::Text, start, state.get_position());
420            true
421        }
422        else {
423            false
424        }
425    }
426}