1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8 source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
/// Shorthand for the framework lexer state specialized to Typst.
type State<'s, S> = LexerState<'s, S, TypstLanguage>;

// Shared, lazily-initialized scanner configurations for the generic oak_core scanners.
// Whitespace: treat Unicode whitespace (not only ASCII blanks) as whitespace.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Comments: `//` line comments and `/* ... */` block comments; block comments may nest.
static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
// Strings: double-quoted only, with backslash as the escape character.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Tokenizer for Typst source text.
///
/// Borrows the language configuration for its lifetime. NOTE(review): the
/// `config` field is stored but never read by the scanning code in this file —
/// presumably reserved for future configurable behavior; confirm before removing.
#[derive(Clone, Debug)]
pub struct TypstLexer<'config> {
    /// Borrowed language configuration supplied to [`TypstLexer::new`].
    config: &'config TypstLanguage,
}
22
23impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
25 let mut state = State::new(source);
26 let result = self.run(&mut state);
27 if result.is_ok() {
28 state.add_eof();
29 }
30 state.finish(result)
31 }
32}
33
34impl<'config> TypstLexer<'config> {
35 pub fn new(config: &'config TypstLanguage) -> Self {
36 Self { config }
37 }
38
39 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.lex_whitespace(state) {
44 continue;
45 }
46
47 if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
48 continue;
49 }
50
51 if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
52 continue;
53 }
54
55 if self.lex_number_literal(state) {
56 continue;
57 }
58
59 if self.lex_markup(state) {
60 continue;
61 }
62
63 if self.lex_identifier_or_keyword(state) {
64 continue;
65 }
66
67 if self.lex_operators(state) {
68 continue;
69 }
70
71 if self.lex_single_char_tokens(state) {
72 continue;
73 }
74
75 if self.lex_text(state) {
76 continue;
77 }
78
79 state.advance_if_dead_lock(safe_point)
80 }
81
82 Ok(())
83 }
84
85 fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86 if let Some(ch) = state.peek() {
87 if ch == '\n' || ch == '\r' {
88 let start = state.get_position();
89 state.advance(1);
90 if ch == '\r' && state.peek() == Some('\n') {
91 state.advance(1);
92 }
93 state.add_token(TypstTokenType::Newline, start, state.get_position());
94 return true;
95 }
96 }
97 TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
98 }
99
100 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
101 let start = state.get_position();
102 let text = state.rest();
103 if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
104 return false;
105 }
106
107 let mut pos = 0;
108 let chars: Vec<char> = text.chars().collect();
109
110 while pos < chars.len() && chars[pos].is_ascii_digit() {
112 pos += 1;
113 }
114
115 if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
117 pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
119 pos += 1;
120 }
121 }
122
123 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
125 pos += 1;
126 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
127 pos += 1;
128 }
129 while pos < chars.len() && chars[pos].is_ascii_digit() {
130 pos += 1;
131 }
132 }
133
134 if pos > 0 {
135 state.advance(pos);
136 state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
137 return true;
138 }
139
140 false
141 }
142
143 fn lex_markup<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
144 let start = state.get_position();
145
146 let is_line_start = start == 0 || matches!(state.source().get_char_at(start - 1), Some('\n') | Some('\r'));
148
149 if let Some(ch) = state.peek() {
150 match ch {
151 '=' if is_line_start => {
152 let mut count = 0;
153 while state.peek() == Some('=') {
154 count += 1;
155 state.advance(1);
156 }
157 if state.peek() == Some(' ') || state.peek() == Some('\t') {
158 state.add_token(TypstTokenType::Heading, start, state.get_position());
159 return true;
160 }
161 }
162 '-' | '+' if is_line_start => {
163 state.advance(1);
164 if state.peek() == Some(' ') || state.peek() == Some('\t') {
165 state.add_token(TypstTokenType::ListItem, start, state.get_position());
166 return true;
167 }
168 }
169 '0'..='9' if is_line_start => {
170 let mut pos = 0;
171 while let Some(c) = state.peek_next_n(pos) {
172 if c.is_ascii_digit() {
173 pos += 1;
174 }
175 else {
176 break;
177 }
178 }
179 if pos > 0 && state.peek_next_n(pos) == Some('.') {
180 pos += 1; if state.peek_next_n(pos) == Some(' ') || state.peek_next_n(pos) == Some('\t') {
182 state.advance(pos);
183 state.add_token(TypstTokenType::EnumItem, start, state.get_position());
184 return true;
185 }
186 }
187 }
188 '*' => {
189 let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
190 if !is_escaped {
191 state.advance(1);
192 state.add_token(TypstTokenType::Strong, start, state.get_position());
193 return true;
194 }
195 }
196 '_' => {
197 let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
198 if !is_escaped {
199 state.advance(1);
200 state.add_token(TypstTokenType::Emphasis, start, state.get_position());
201 return true;
202 }
203 }
204 _ => {}
205 }
206 }
207
208 state.set_position(start);
209 false
210 }
211
212 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
213 let start = state.get_position();
214 let text = state.rest();
215 if text.is_empty() {
216 return false;
217 }
218
219 let first_char = text.chars().next().unwrap();
220 if !first_char.is_ascii_alphabetic() {
221 return false;
222 }
223
224 let mut pos = 0;
225 let chars: Vec<char> = text.chars().collect();
226
227 pos += 1;
229
230 while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
232 pos += 1;
233 }
234
235 if pos > 0 {
236 let identifier_text = &text[..pos];
237 let kind = self.keyword_or_identifier(identifier_text);
238 state.advance(pos);
239 state.add_token(kind, start, state.get_position());
240 return true;
241 }
242
243 false
244 }
245
246 fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
247 match text {
248 "let" => TypstTokenType::Let,
249 "if" => TypstTokenType::If,
250 "else" => TypstTokenType::Else,
251 "for" => TypstTokenType::For,
252 "while" => TypstTokenType::While,
253 "break" => TypstTokenType::Break,
254 "continue" => TypstTokenType::Continue,
255 "return" => TypstTokenType::Return,
256 "true" => TypstTokenType::True,
257 "false" => TypstTokenType::False,
258 "set" => TypstTokenType::Set,
259 "show" => TypstTokenType::Show,
260 "import" => TypstTokenType::Import,
261 "include" => TypstTokenType::Include,
262 _ => TypstTokenType::Identifier,
263 }
264 }
265
266 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
267 let start = state.get_position();
268 let text = state.rest();
269 if text.is_empty() {
270 return false;
271 }
272
273 let chars: Vec<char> = text.chars().collect();
274
275 let (kind, len) = match chars[0] {
276 '=' => {
277 let mut count = 1;
278 while count < chars.len() && chars[count] == '=' {
279 count += 1;
280 }
281 (TypstTokenType::Equal, count)
282 }
283 '!' => {
284 if chars.len() > 1 && chars[1] == '=' {
285 (TypstTokenType::NotEqual, 2)
286 }
287 else {
288 (TypstTokenType::Not, 1)
289 }
290 }
291 '<' => {
292 if chars.len() > 1 && chars[1] == '=' {
293 (TypstTokenType::LessEqual, 2)
294 }
295 else {
296 (TypstTokenType::Less, 1)
297 }
298 }
299 '>' => {
300 if chars.len() > 1 && chars[1] == '=' {
301 (TypstTokenType::GreaterEqual, 2)
302 }
303 else {
304 (TypstTokenType::Greater, 1)
305 }
306 }
307 '&' => {
308 if chars.len() > 1 && chars[1] == '&' {
309 (TypstTokenType::And, 2)
310 }
311 else {
312 return false;
313 }
314 }
315 '|' => {
316 if chars.len() > 1 && chars[1] == '|' {
317 (TypstTokenType::Or, 2)
318 }
319 else {
320 return false;
321 }
322 }
323 '+' => (TypstTokenType::Plus, 1),
324 '-' => (TypstTokenType::Minus, 1),
325 '*' => (TypstTokenType::Star, 1),
326 '/' => (TypstTokenType::Slash, 1),
327 '%' => (TypstTokenType::Percent, 1),
328 _ => return false,
329 };
330
331 state.advance(len);
332 state.add_token(kind, start, state.get_position());
333 true
334 }
335
336 fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
337 let start = state.get_position();
338 let text = state.rest();
339 if text.is_empty() {
340 return false;
341 }
342
343 let ch = text.chars().next().unwrap();
344
345 let kind = match ch {
346 '(' => TypstTokenType::LeftParen,
347 ')' => TypstTokenType::RightParen,
348 '{' => TypstTokenType::LeftBrace,
349 '}' => TypstTokenType::RightBrace,
350 '[' => TypstTokenType::LeftBracket,
351 ']' => TypstTokenType::RightBracket,
352 ';' => TypstTokenType::Semicolon,
353 ',' => TypstTokenType::Comma,
354 '.' => TypstTokenType::Dot,
355 ':' => TypstTokenType::Colon,
356 '#' => TypstTokenType::Hash,
357 '@' => TypstTokenType::At,
358 '$' => TypstTokenType::Dollar,
359 '_' => TypstTokenType::Underscore,
360 '`' => TypstTokenType::Backtick,
361 _ => return false,
362 };
363
364 state.advance(1);
365 state.add_token(kind, start, state.get_position());
366 true
367 }
368
369 fn lex_text<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
370 let start = state.get_position();
371 let mut has_text = false;
372
373 while let Some(ch) = state.peek() {
374 if ch.is_whitespace()
377 || ch == '/'
378 || ch == '"'
379 || ch == '='
380 || ch == '-'
381 || ch == '+'
382 || ch == '!'
383 || ch == '<'
384 || ch == '>'
385 || ch == '&'
386 || ch == '|'
387 || ch == '('
388 || ch == ')'
389 || ch == '{'
390 || ch == '}'
391 || ch == '['
392 || ch == ']'
393 || ch == ';'
394 || ch == ','
395 || ch == '.'
396 || ch == ':'
397 || ch == '#'
398 || ch == '@'
399 || ch == '$'
400 || ch == '`'
401 || ch == '\\'
402 {
403 break;
404 }
405
406 if ch == '*' || ch == '_' {
408 break;
409 }
410
411 state.advance(ch.len_utf8());
412 has_text = true;
413 }
414
415 if has_text {
416 state.add_token(TypstTokenType::Text, start, state.get_position());
417 true
418 }
419 else {
420 false
421 }
422 }
423}