// oak_typst/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8    source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
/// Shorthand for the generic lexer state specialized to the Typst language.
type State<'s, S> = LexerState<'s, S, TypstLanguage>;

/// Shared whitespace scanner; also treats Unicode whitespace as whitespace.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner: `//` line comments and `/* ... */` block comments, with nesting allowed.
static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// String scanner: double-quoted strings with backslash escapes.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Lexer for Typst source text.
///
/// Borrows the language configuration for the lexer's lifetime; the
/// configuration is not consulted by any scanning routine yet (hence the
/// leading-underscore field name).
#[derive(Clone, Debug)]
pub struct TypstLexer<'config> {
    // NOTE(review): kept for future configuration-driven lexing — unused today.
    _config: &'config TypstLanguage,
}
22
23impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
25        let mut state = State::new(source);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish(result)
31    }
32}
33
34impl<'config> TypstLexer<'config> {
35    pub fn new(config: &'config TypstLanguage) -> Self {
36        Self { _config: config }
37    }
38
39    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.lex_whitespace(state) {
44                continue;
45            }
46
47            if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
48                continue;
49            }
50
51            if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operators(state) {
64                continue;
65            }
66
67            if self.lex_single_char_tokens(state) {
68                continue;
69            }
70
71            state.advance_if_dead_lock(safe_point)
72        }
73
74        Ok(())
75    }
76
77    fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
78        if let Some(ch) = state.peek() {
79            if ch == '\n' || ch == '\r' {
80                let start = state.get_position();
81                state.advance(1);
82                if ch == '\r' && state.peek() == Some('\n') {
83                    state.advance(1);
84                }
85                state.add_token(TypstTokenType::Newline, start, state.get_position());
86                return true;
87            }
88        }
89        TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
90    }
91
92    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93        let start = state.get_position();
94        let text = state.rest();
95        if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
96            return false;
97        }
98
99        let mut pos = 0;
100        let chars: Vec<char> = text.chars().collect();
101
102        // 整数部分
103        while pos < chars.len() && chars[pos].is_ascii_digit() {
104            pos += 1;
105        }
106
107        // 小数部分
108        if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
109            pos += 1; // 跳过 '.'
110            while pos < chars.len() && chars[pos].is_ascii_digit() {
111                pos += 1;
112            }
113        }
114
115        // 指数部分
116        if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
117            pos += 1;
118            if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
119                pos += 1;
120            }
121            while pos < chars.len() && chars[pos].is_ascii_digit() {
122                pos += 1;
123            }
124        }
125
126        if pos > 0 {
127            state.advance(pos);
128            state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
129            return true;
130        }
131
132        false
133    }
134
135    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
136        let start = state.get_position();
137        let text = state.rest();
138        if text.is_empty() {
139            return false;
140        }
141
142        let first_char = text.chars().next().unwrap();
143        if !first_char.is_ascii_alphabetic() {
144            return false;
145        }
146
147        let mut pos = 0;
148        let chars: Vec<char> = text.chars().collect();
149
150        // 第一个字符
151        pos += 1;
152
153        // 后续字符
154        while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
155            pos += 1;
156        }
157
158        if pos > 0 {
159            let identifier_text = &text[..pos];
160            let kind = self.keyword_or_identifier(identifier_text);
161            state.advance(pos);
162            state.add_token(kind, start, state.get_position());
163            return true;
164        }
165
166        false
167    }
168
169    fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
170        match text {
171            "let" => TypstTokenType::Let,
172            "if" => TypstTokenType::If,
173            "else" => TypstTokenType::Else,
174            "for" => TypstTokenType::For,
175            "while" => TypstTokenType::While,
176            "break" => TypstTokenType::Break,
177            "continue" => TypstTokenType::Continue,
178            "return" => TypstTokenType::Return,
179            "true" => TypstTokenType::True,
180            "false" => TypstTokenType::False,
181            "set" => TypstTokenType::Set,
182            "show" => TypstTokenType::Show,
183            "import" => TypstTokenType::Import,
184            "include" => TypstTokenType::Include,
185            _ => TypstTokenType::Identifier,
186        }
187    }
188
189    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
190        let start = state.get_position();
191        let text = state.rest();
192        if text.is_empty() {
193            return false;
194        }
195
196        let chars: Vec<char> = text.chars().collect();
197
198        let (kind, len) = match chars[0] {
199            '=' => {
200                let mut count = 1;
201                while count < chars.len() && chars[count] == '=' {
202                    count += 1;
203                }
204                (TypstTokenType::Equal, count)
205            }
206            '!' => {
207                if chars.len() > 1 && chars[1] == '=' {
208                    (TypstTokenType::NotEqual, 2)
209                }
210                else {
211                    (TypstTokenType::Not, 1)
212                }
213            }
214            '<' => {
215                if chars.len() > 1 && chars[1] == '=' {
216                    (TypstTokenType::LessEqual, 2)
217                }
218                else {
219                    (TypstTokenType::Less, 1)
220                }
221            }
222            '>' => {
223                if chars.len() > 1 && chars[1] == '=' {
224                    (TypstTokenType::GreaterEqual, 2)
225                }
226                else {
227                    (TypstTokenType::Greater, 1)
228                }
229            }
230            '&' => {
231                if chars.len() > 1 && chars[1] == '&' {
232                    (TypstTokenType::And, 2)
233                }
234                else {
235                    return false;
236                }
237            }
238            '|' => {
239                if chars.len() > 1 && chars[1] == '|' {
240                    (TypstTokenType::Or, 2)
241                }
242                else {
243                    return false;
244                }
245            }
246            '+' => (TypstTokenType::Plus, 1),
247            '-' => (TypstTokenType::Minus, 1),
248            '*' => (TypstTokenType::Star, 1),
249            '/' => (TypstTokenType::Slash, 1),
250            '%' => (TypstTokenType::Percent, 1),
251            _ => return false,
252        };
253
254        state.advance(len);
255        state.add_token(kind, start, state.get_position());
256        true
257    }
258
259    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
260        let start = state.get_position();
261        let text = state.rest();
262        if text.is_empty() {
263            return false;
264        }
265
266        let ch = text.chars().next().unwrap();
267
268        let kind = match ch {
269            '(' => TypstTokenType::LeftParen,
270            ')' => TypstTokenType::RightParen,
271            '{' => TypstTokenType::LeftBrace,
272            '}' => TypstTokenType::RightBrace,
273            '[' => TypstTokenType::LeftBracket,
274            ']' => TypstTokenType::RightBracket,
275            ';' => TypstTokenType::Semicolon,
276            ',' => TypstTokenType::Comma,
277            '.' => TypstTokenType::Dot,
278            ':' => TypstTokenType::Colon,
279            '#' => TypstTokenType::Hash,
280            '@' => TypstTokenType::At,
281            '$' => TypstTokenType::Dollar,
282            '_' => TypstTokenType::Underscore,
283            '`' => TypstTokenType::Backtick,
284            _ => TypstTokenType::Error,
285        };
286
287        state.advance(1);
288        state.add_token(kind, start, state.get_position());
289        true
290    }
291}