// oak_typst/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8    source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
12type State<'s, S> = LexerState<'s, S, TypstLanguage>;
13
14static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
16static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Hand-written lexer for the Typst markup language.
///
/// Borrows a [`TypstLanguage`] configuration for `'config`. The scanning
/// routines visible in this module do not read the configuration yet; it is
/// kept so the lexer can become configurable without an interface change.
#[derive(Clone, Debug)]
pub struct TypstLexer<'config> {
    // Language configuration supplied at construction (see `TypstLexer::new`).
    // NOTE(review): appears unused by the visible lexing code — confirm before removing.
    config: &'config TypstLanguage,
}
22
23impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
25        let mut state = State::new(source);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish(result)
31    }
32}
33
34impl<'config> TypstLexer<'config> {
    /// Creates a lexer bound to the given language configuration.
    pub fn new(config: &'config TypstLanguage) -> Self {
        Self { config }
    }
38
39    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.lex_whitespace(state) {
44                continue;
45            }
46
47            if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
48                continue;
49            }
50
51            if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_markup(state) {
60                continue;
61            }
62
63            if self.lex_identifier_or_keyword(state) {
64                continue;
65            }
66
67            if self.lex_operators(state) {
68                continue;
69            }
70
71            if self.lex_single_char_tokens(state) {
72                continue;
73            }
74
75            if self.lex_text(state) {
76                continue;
77            }
78
79            state.advance_if_dead_lock(safe_point)
80        }
81
82        Ok(())
83    }
84
85    fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86        if let Some(ch) = state.peek() {
87            if ch == '\n' || ch == '\r' {
88                let start = state.get_position();
89                state.advance(1);
90                if ch == '\r' && state.peek() == Some('\n') {
91                    state.advance(1);
92                }
93                state.add_token(TypstTokenType::Newline, start, state.get_position());
94                return true;
95            }
96        }
97        TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
98    }
99
100    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
101        let start = state.get_position();
102        let text = state.rest();
103        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
104            return false;
105        }
106
107        let mut pos = 0;
108        let chars: Vec<char> = text.chars().collect();
109
110        // Integer part
111        while pos < chars.len() && chars[pos].is_ascii_digit() {
112            pos += 1;
113        }
114
115        // Fractional part
116        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
117            pos += 1; // Skip '.'
118            while pos < chars.len() && chars[pos].is_ascii_digit() {
119                pos += 1;
120            }
121        }
122
123        // Exponent part
124        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
125            pos += 1;
126            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
127                pos += 1;
128            }
129            while pos < chars.len() && chars[pos].is_ascii_digit() {
130                pos += 1;
131            }
132        }
133
134        if pos > 0 {
135            state.advance(pos);
136            state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
137            return true;
138        }
139
140        false
141    }
142
    /// Scans markup constructs: headings (`=` runs), list items (`-`/`+`)
    /// and enumeration items (`1.`) at line starts, plus inline strong (`*`)
    /// and emphasis (`_`) markers anywhere.
    ///
    /// Consumption is speculative: if no construct fully matches, the
    /// position is restored to `start` and `false` is returned so later
    /// lexers can try the same characters.
    fn lex_markup<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        let start = state.get_position();

        // Check for the beginning of a line
        let is_line_start = start == 0 || matches!(state.source().get_char_at(start - 1), Some('\n') | Some('\r'));

        if let Some(ch) = state.peek() {
            match ch {
                '=' if is_line_start => {
                    // Consume the whole run of '='; the Heading token covers
                    // the markers only, not the following blank or title text.
                    let mut count = 0;
                    while state.peek() == Some('=') {
                        count += 1;
                        state.advance(1);
                    }
                    // A heading requires a blank after the markers; otherwise
                    // we fall through to the reset at the bottom.
                    if state.peek() == Some(' ') || state.peek() == Some('\t') {
                        state.add_token(TypstTokenType::Heading, start, state.get_position());
                        return true;
                    }
                }
                '-' | '+' if is_line_start => {
                    state.advance(1);
                    // A bullet must be separated from its content by a blank.
                    if state.peek() == Some(' ') || state.peek() == Some('\t') {
                        state.add_token(TypstTokenType::ListItem, start, state.get_position());
                        return true;
                    }
                }
                '0'..='9' if is_line_start => {
                    // Lookahead-only scan: digits, then '.', then a blank.
                    // Nothing is consumed until the whole pattern matches.
                    let mut pos = 0;
                    while let Some(c) = state.peek_next_n(pos) {
                        if c.is_ascii_digit() {
                            pos += 1;
                        }
                        else {
                            break;
                        }
                    }
                    if pos > 0 && state.peek_next_n(pos) == Some('.') {
                        pos += 1; // '.'
                        if state.peek_next_n(pos) == Some(' ') || state.peek_next_n(pos) == Some('\t') {
                            // The EnumItem token covers digits + dot, e.g. "12.".
                            state.advance(pos);
                            state.add_token(TypstTokenType::EnumItem, start, state.get_position());
                            return true;
                        }
                    }
                }
                '*' => {
                    // A backslash immediately before the marker escapes it.
                    let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
                    if !is_escaped {
                        state.advance(1);
                        state.add_token(TypstTokenType::Strong, start, state.get_position());
                        return true;
                    }
                }
                '_' => {
                    // Same escape rule as for '*'.
                    let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
                    if !is_escaped {
                        state.advance(1);
                        state.add_token(TypstTokenType::Emphasis, start, state.get_position());
                        return true;
                    }
                }
                _ => {}
            }
        }

        // No markup recognized: undo any speculative consumption above.
        state.set_position(start);
        false
    }
211
212    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
213        let start = state.get_position();
214        let text = state.rest();
215        if text.is_empty() {
216            return false;
217        }
218
219        let first_char = text.chars().next().unwrap();
220        if !first_char.is_ascii_alphabetic() {
221            return false;
222        }
223
224        let mut pos = 0;
225        let chars: Vec<char> = text.chars().collect();
226
227        // First character
228        pos += 1;
229
230        // Subsequent characters
231        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
232            pos += 1;
233        }
234
235        if pos > 0 {
236            let identifier_text = &text[..pos];
237            let kind = self.keyword_or_identifier(identifier_text);
238            state.advance(pos);
239            state.add_token(kind, start, state.get_position());
240            return true;
241        }
242
243        false
244    }
245
246    fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
247        match text {
248            "let" => TypstTokenType::Let,
249            "if" => TypstTokenType::If,
250            "else" => TypstTokenType::Else,
251            "for" => TypstTokenType::For,
252            "while" => TypstTokenType::While,
253            "break" => TypstTokenType::Break,
254            "continue" => TypstTokenType::Continue,
255            "return" => TypstTokenType::Return,
256            "true" => TypstTokenType::True,
257            "false" => TypstTokenType::False,
258            "set" => TypstTokenType::Set,
259            "show" => TypstTokenType::Show,
260            "import" => TypstTokenType::Import,
261            "include" => TypstTokenType::Include,
262            _ => TypstTokenType::Identifier,
263        }
264    }
265
266    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
267        let start = state.get_position();
268        let text = state.rest();
269        if text.is_empty() {
270            return false;
271        }
272
273        let chars: Vec<char> = text.chars().collect();
274
275        let (kind, len) = match chars[0] {
276            '=' => {
277                let mut count = 1;
278                while count < chars.len() && chars[count] == '=' {
279                    count += 1;
280                }
281                (TypstTokenType::Equal, count)
282            }
283            '!' => {
284                if chars.len() > 1 && chars[1] == '=' {
285                    (TypstTokenType::NotEqual, 2)
286                }
287                else {
288                    (TypstTokenType::Not, 1)
289                }
290            }
291            '<' => {
292                if chars.len() > 1 && chars[1] == '=' {
293                    (TypstTokenType::LessEqual, 2)
294                }
295                else {
296                    (TypstTokenType::Less, 1)
297                }
298            }
299            '>' => {
300                if chars.len() > 1 && chars[1] == '=' {
301                    (TypstTokenType::GreaterEqual, 2)
302                }
303                else {
304                    (TypstTokenType::Greater, 1)
305                }
306            }
307            '&' => {
308                if chars.len() > 1 && chars[1] == '&' {
309                    (TypstTokenType::And, 2)
310                }
311                else {
312                    return false;
313                }
314            }
315            '|' => {
316                if chars.len() > 1 && chars[1] == '|' {
317                    (TypstTokenType::Or, 2)
318                }
319                else {
320                    return false;
321                }
322            }
323            '+' => (TypstTokenType::Plus, 1),
324            '-' => (TypstTokenType::Minus, 1),
325            '*' => (TypstTokenType::Star, 1),
326            '/' => (TypstTokenType::Slash, 1),
327            '%' => (TypstTokenType::Percent, 1),
328            _ => return false,
329        };
330
331        state.advance(len);
332        state.add_token(kind, start, state.get_position());
333        true
334    }
335
336    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
337        let start = state.get_position();
338        let text = state.rest();
339        if text.is_empty() {
340            return false;
341        }
342
343        let ch = text.chars().next().unwrap();
344
345        let kind = match ch {
346            '(' => TypstTokenType::LeftParen,
347            ')' => TypstTokenType::RightParen,
348            '{' => TypstTokenType::LeftBrace,
349            '}' => TypstTokenType::RightBrace,
350            '[' => TypstTokenType::LeftBracket,
351            ']' => TypstTokenType::RightBracket,
352            ';' => TypstTokenType::Semicolon,
353            ',' => TypstTokenType::Comma,
354            '.' => TypstTokenType::Dot,
355            ':' => TypstTokenType::Colon,
356            '#' => TypstTokenType::Hash,
357            '@' => TypstTokenType::At,
358            '$' => TypstTokenType::Dollar,
359            '_' => TypstTokenType::Underscore,
360            '`' => TypstTokenType::Backtick,
361            _ => return false,
362        };
363
364        state.advance(1);
365        state.add_token(kind, start, state.get_position());
366        true
367    }
368
369    fn lex_text<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
370        let start = state.get_position();
371        let mut has_text = false;
372
373        while let Some(ch) = state.peek() {
374            // Text should also consume alphanumeric characters if they don't form identifiers
375            // But here we want to break on anything that could be a special token
376            if ch.is_whitespace()
377                || ch == '/'
378                || ch == '"'
379                || ch == '='
380                || ch == '-'
381                || ch == '+'
382                || ch == '!'
383                || ch == '<'
384                || ch == '>'
385                || ch == '&'
386                || ch == '|'
387                || ch == '('
388                || ch == ')'
389                || ch == '{'
390                || ch == '}'
391                || ch == '['
392                || ch == ']'
393                || ch == ';'
394                || ch == ','
395                || ch == '.'
396                || ch == ':'
397                || ch == '#'
398                || ch == '@'
399                || ch == '$'
400                || ch == '`'
401                || ch == '\\'
402            {
403                break;
404            }
405
406            // Special handling for markup chars that were not handled by lex_markup
407            if ch == '*' || ch == '_' {
408                break;
409            }
410
411            state.advance(ch.len_utf8());
412            has_text = true;
413        }
414
415        if has_text {
416            state.add_token(TypstTokenType::Text, start, state.get_position());
417            true
418        }
419        else {
420            false
421        }
422    }
423}