1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
6use oak_core::{
7 Lexer, LexerCache, LexerState, OakError,
8 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
9 source::{Source, TextEdit},
10};
11use std::sync::LazyLock;
12
/// Shorthand for the lexer state specialized to the Typst language.
type State<'s, S> = LexerState<'s, S, TypstLanguage>;

// Shared scanner configurations, built lazily once per process.
// Whitespace scanner; `unicode_whitespace: true` also matches non-ASCII blanks.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Comment scanner: `//` line comments and `/* ... */` blocks, with nesting allowed.
static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
// String scanner: double-quoted strings with backslash escapes.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18
/// Hand-written lexer for Typst source text.
#[derive(Clone, Debug)]
pub struct TypstLexer<'config> {
    // Language configuration borrowed for the lexer's lifetime.
    // NOTE(review): no scanning routine in this file reads `config` yet;
    // it appears to be kept for future configuration hooks — confirm.
    config: &'config TypstLanguage,
}
24
25impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
26 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
27 let mut state = State::new(source);
28 let result = self.run(&mut state);
29 if result.is_ok() {
30 state.add_eof();
31 }
32 state.finish(result)
33 }
34}
35
36impl<'config> TypstLexer<'config> {
    /// Creates a lexer borrowing the given language configuration.
    pub fn new(config: &'config TypstLanguage) -> Self {
        Self { config }
    }
41
42 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
43 while state.not_at_end() {
44 let safe_point = state.get_position();
45
46 if self.lex_whitespace(state) {
47 continue;
48 }
49
50 if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
51 continue;
52 }
53
54 if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
55 continue;
56 }
57
58 if self.lex_number_literal(state) {
59 continue;
60 }
61
62 if self.lex_markup(state) {
63 continue;
64 }
65
66 if self.lex_identifier_or_keyword(state) {
67 continue;
68 }
69
70 if self.lex_operators(state) {
71 continue;
72 }
73
74 if self.lex_single_char_tokens(state) {
75 continue;
76 }
77
78 if self.lex_text(state) {
79 continue;
80 }
81
82 state.advance_if_dead_lock(safe_point)
83 }
84
85 Ok(())
86 }
87
88 fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89 if let Some(ch) = state.peek() {
90 if ch == '\n' || ch == '\r' {
91 let start = state.get_position();
92 state.advance(1);
93 if ch == '\r' && state.peek() == Some('\n') {
94 state.advance(1);
95 }
96 state.add_token(TypstTokenType::Newline, start, state.get_position());
97 return true;
98 }
99 }
100 TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
101 }
102
103 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
104 let start = state.get_position();
105 let text = state.rest();
106 if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
107 return false;
108 }
109
110 let mut pos = 0;
111 let chars: Vec<char> = text.chars().collect();
112
113 while pos < chars.len() && chars[pos].is_ascii_digit() {
115 pos += 1;
116 }
117
118 if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
120 pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
122 pos += 1;
123 }
124 }
125
126 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
128 pos += 1;
129 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
130 pos += 1;
131 }
132 while pos < chars.len() && chars[pos].is_ascii_digit() {
133 pos += 1;
134 }
135 }
136
137 if pos > 0 {
138 state.advance(pos);
139 state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
140 return true;
141 }
142
143 false
144 }
145
146 fn lex_markup<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
147 let start = state.get_position();
148
149 let is_line_start = start == 0 || matches!(state.source().get_char_at(start - 1), Some('\n') | Some('\r'));
151
152 if let Some(ch) = state.peek() {
153 match ch {
154 '=' if is_line_start => {
155 let mut count = 0;
156 while state.peek() == Some('=') {
157 count += 1;
158 state.advance(1);
159 }
160 if state.peek() == Some(' ') || state.peek() == Some('\t') {
161 state.add_token(TypstTokenType::Heading, start, state.get_position());
162 return true;
163 }
164 }
165 '-' | '+' if is_line_start => {
166 state.advance(1);
167 if state.peek() == Some(' ') || state.peek() == Some('\t') {
168 state.add_token(TypstTokenType::ListItem, start, state.get_position());
169 return true;
170 }
171 }
172 '0'..='9' if is_line_start => {
173 let mut pos = 0;
174 while let Some(c) = state.peek_next_n(pos) {
175 if c.is_ascii_digit() {
176 pos += 1;
177 }
178 else {
179 break;
180 }
181 }
182 if pos > 0 && state.peek_next_n(pos) == Some('.') {
183 pos += 1; if state.peek_next_n(pos) == Some(' ') || state.peek_next_n(pos) == Some('\t') {
185 state.advance(pos);
186 state.add_token(TypstTokenType::EnumItem, start, state.get_position());
187 return true;
188 }
189 }
190 }
191 '*' => {
192 let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
193 if !is_escaped {
194 state.advance(1);
195 state.add_token(TypstTokenType::Strong, start, state.get_position());
196 return true;
197 }
198 }
199 '_' => {
200 let is_escaped = start > 0 && state.source().get_char_at(start - 1) == Some('\\');
201 if !is_escaped {
202 state.advance(1);
203 state.add_token(TypstTokenType::Emphasis, start, state.get_position());
204 return true;
205 }
206 }
207 _ => {}
208 }
209 }
210
211 state.set_position(start);
212 false
213 }
214
215 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
216 let start = state.get_position();
217 let text = state.rest();
218 if text.is_empty() {
219 return false;
220 }
221
222 let first_char = text.chars().next().unwrap();
223 if !first_char.is_ascii_alphabetic() {
224 return false;
225 }
226
227 let mut pos = 0;
228 let chars: Vec<char> = text.chars().collect();
229
230 pos += 1;
232
233 while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
235 pos += 1;
236 }
237
238 if pos > 0 {
239 let identifier_text = &text[..pos];
240 let kind = self.keyword_or_identifier(identifier_text);
241 state.advance(pos);
242 state.add_token(kind, start, state.get_position());
243 return true;
244 }
245
246 false
247 }
248
249 fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
250 match text {
251 "let" => TypstTokenType::Let,
252 "if" => TypstTokenType::If,
253 "else" => TypstTokenType::Else,
254 "for" => TypstTokenType::For,
255 "while" => TypstTokenType::While,
256 "break" => TypstTokenType::Break,
257 "continue" => TypstTokenType::Continue,
258 "return" => TypstTokenType::Return,
259 "true" => TypstTokenType::True,
260 "false" => TypstTokenType::False,
261 "set" => TypstTokenType::Set,
262 "show" => TypstTokenType::Show,
263 "import" => TypstTokenType::Import,
264 "include" => TypstTokenType::Include,
265 _ => TypstTokenType::Identifier,
266 }
267 }
268
269 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
270 let start = state.get_position();
271 let text = state.rest();
272 if text.is_empty() {
273 return false;
274 }
275
276 let chars: Vec<char> = text.chars().collect();
277
278 let (kind, len) = match chars[0] {
279 '=' => {
280 let mut count = 1;
281 while count < chars.len() && chars[count] == '=' {
282 count += 1;
283 }
284 (TypstTokenType::Equal, count)
285 }
286 '!' => {
287 if chars.len() > 1 && chars[1] == '=' {
288 (TypstTokenType::NotEqual, 2)
289 }
290 else {
291 (TypstTokenType::Not, 1)
292 }
293 }
294 '<' => {
295 if chars.len() > 1 && chars[1] == '=' {
296 (TypstTokenType::LessEqual, 2)
297 }
298 else {
299 (TypstTokenType::Less, 1)
300 }
301 }
302 '>' => {
303 if chars.len() > 1 && chars[1] == '=' {
304 (TypstTokenType::GreaterEqual, 2)
305 }
306 else {
307 (TypstTokenType::Greater, 1)
308 }
309 }
310 '&' => {
311 if chars.len() > 1 && chars[1] == '&' {
312 (TypstTokenType::And, 2)
313 }
314 else {
315 return false;
316 }
317 }
318 '|' => {
319 if chars.len() > 1 && chars[1] == '|' {
320 (TypstTokenType::Or, 2)
321 }
322 else {
323 return false;
324 }
325 }
326 '+' => (TypstTokenType::Plus, 1),
327 '-' => (TypstTokenType::Minus, 1),
328 '*' => (TypstTokenType::Star, 1),
329 '/' => (TypstTokenType::Slash, 1),
330 '%' => (TypstTokenType::Percent, 1),
331 _ => return false,
332 };
333
334 state.advance(len);
335 state.add_token(kind, start, state.get_position());
336 true
337 }
338
339 fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
340 let start = state.get_position();
341 let text = state.rest();
342 if text.is_empty() {
343 return false;
344 }
345
346 let ch = text.chars().next().unwrap();
347
348 let kind = match ch {
349 '(' => TypstTokenType::LeftParen,
350 ')' => TypstTokenType::RightParen,
351 '{' => TypstTokenType::LeftBrace,
352 '}' => TypstTokenType::RightBrace,
353 '[' => TypstTokenType::LeftBracket,
354 ']' => TypstTokenType::RightBracket,
355 ';' => TypstTokenType::Semicolon,
356 ',' => TypstTokenType::Comma,
357 '.' => TypstTokenType::Dot,
358 ':' => TypstTokenType::Colon,
359 '#' => TypstTokenType::Hash,
360 '@' => TypstTokenType::At,
361 '$' => TypstTokenType::Dollar,
362 '_' => TypstTokenType::Underscore,
363 '`' => TypstTokenType::Backtick,
364 _ => return false,
365 };
366
367 state.advance(1);
368 state.add_token(kind, start, state.get_position());
369 true
370 }
371
372 fn lex_text<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
373 let start = state.get_position();
374 let mut has_text = false;
375
376 while let Some(ch) = state.peek() {
377 if ch.is_whitespace()
380 || ch == '/'
381 || ch == '"'
382 || ch == '='
383 || ch == '-'
384 || ch == '+'
385 || ch == '!'
386 || ch == '<'
387 || ch == '>'
388 || ch == '&'
389 || ch == '|'
390 || ch == '('
391 || ch == ')'
392 || ch == '{'
393 || ch == '}'
394 || ch == '['
395 || ch == ']'
396 || ch == ';'
397 || ch == ','
398 || ch == '.'
399 || ch == ':'
400 || ch == '#'
401 || ch == '@'
402 || ch == '$'
403 || ch == '`'
404 || ch == '\\'
405 {
406 break;
407 }
408
409 if ch == '*' || ch == '_' {
411 break;
412 }
413
414 state.advance(ch.len_utf8());
415 has_text = true;
416 }
417
418 if has_text {
419 state.add_token(TypstTokenType::Text, start, state.get_position());
420 true
421 }
422 else {
423 false
424 }
425 }
426}