1use crate::{kind::TypstSyntaxKind, language::TypstLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the framework lexer state specialised to Typst.
type State<S> = LexerState<S, TypstLanguage>;
10
// Shared scanner configurations, built lazily on first use and reused by every lexer instance.
// Whitespace: `unicode_whitespace: true` makes the scanner accept Unicode space characters, not just ASCII.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// Line comments start with `//` and run to end of line (block comments are handled by hand in `skip_comment`).
static TYPST_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["//"] });
// Strings are double-quoted with backslash escapes.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Tokenizer for Typst source text, driven by the `oak_core` lexer framework.
#[derive(Clone)]
pub struct TypstLexer<'config> {
    // Borrowed language configuration.
    // NOTE(review): no method in this file reads `config` — presumably reserved
    // for future configurable lexing; confirm before removing.
    config: &'config TypstLanguage,
}
19
20impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
21 fn lex_incremental(
22 &self,
23 source: impl Source,
24 changed: usize,
25 cache: IncrementalCache<TypstLanguage>,
26 ) -> LexOutput<TypstLanguage> {
27 let mut state = LexerState::new_with_cache(source, changed, cache);
28 let result = self.run(&mut state);
29 state.finish(result)
30 }
31}
32
33impl<'config> TypstLexer<'config> {
34 pub fn new(config: &'config TypstLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39 while state.not_at_end() {
40 let safe_point = state.get_position();
41
42 if self.skip_whitespace(state) {
43 continue;
44 }
45
46 if self.skip_comment(state) {
47 continue;
48 }
49
50 if self.lex_string_literal(state) {
51 continue;
52 }
53
54 if self.lex_number_literal(state) {
55 continue;
56 }
57
58 if self.lex_identifier_or_keyword(state) {
59 continue;
60 }
61
62 if self.lex_operators(state) {
63 continue;
64 }
65
66 if self.lex_single_char_tokens(state) {
67 continue;
68 }
69
70 state.safe_check(safe_point);
71 }
72
73 let eof_pos = state.get_position();
75 state.add_token(TypstSyntaxKind::Eof, eof_pos, eof_pos);
76 Ok(())
77 }
78
79 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
80 match TYPST_WHITESPACE.scan(state.rest(), state.get_position(), TypstSyntaxKind::Whitespace) {
81 Some(token) => {
82 state.advance_with(token);
83 return true;
84 }
85 None => {}
86 }
87 false
88 }
89
90 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
91 if let Some(token) = TYPST_COMMENT.scan(state.rest(), state.get_position(), TypstSyntaxKind::LineComment) {
93 state.advance_with(token);
94 return true;
95 }
96
97 if state.rest().starts_with("/*") {
99 let start = state.get_position();
100 let mut pos = 2;
101 let text = state.rest();
102
103 while pos < text.len() {
104 if text[pos..].starts_with("*/") {
105 pos += 2;
106 break;
107 }
108 pos += 1;
109 }
110
111 state.advance(pos);
112 state.add_token(TypstSyntaxKind::BlockComment, start, state.get_position());
113 return true;
114 }
115
116 false
117 }
118
119 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
120 match TYPST_STRING.scan(state.rest(), state.get_position(), TypstSyntaxKind::StringLiteral) {
121 Some(token) => {
122 state.advance_with(token);
123 return true;
124 }
125 None => {}
126 }
127 false
128 }
129
130 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
131 let text = state.rest();
132 if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
133 return false;
134 }
135
136 let start = state.get_position();
137 let mut pos = 0;
138 let chars: Vec<char> = text.chars().collect();
139
140 while pos < chars.len() && chars[pos].is_ascii_digit() {
142 pos += 1;
143 }
144
145 if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
147 pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
149 pos += 1;
150 }
151 }
152
153 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
155 pos += 1;
156 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
157 pos += 1;
158 }
159 while pos < chars.len() && chars[pos].is_ascii_digit() {
160 pos += 1;
161 }
162 }
163
164 if pos > 0 {
165 state.advance(pos);
166 state.add_token(TypstSyntaxKind::NumericLiteral, start, state.get_position());
167 return true;
168 }
169
170 false
171 }
172
173 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
174 let text = state.rest();
175 if text.is_empty() {
176 return false;
177 }
178
179 let first_char = text.chars().next().unwrap();
180 if !first_char.is_ascii_alphabetic() && first_char != '_' {
181 return false;
182 }
183
184 let start = state.get_position();
185 let mut pos = 0;
186 let chars: Vec<char> = text.chars().collect();
187
188 pos += 1;
190
191 while pos < chars.len() && (chars[pos].is_ascii_alphanumeric() || chars[pos] == '_') {
193 pos += 1;
194 }
195
196 if pos > 0 {
197 let identifier_text = &text[..pos];
198 let kind = self.keyword_or_identifier(identifier_text);
199 state.advance(pos);
200 state.add_token(kind, start, state.get_position());
201 return true;
202 }
203
204 false
205 }
206
207 fn keyword_or_identifier(&self, text: &str) -> TypstSyntaxKind {
208 match text {
209 "let" => TypstSyntaxKind::Let,
210 "if" => TypstSyntaxKind::If,
211 "else" => TypstSyntaxKind::Else,
212 "for" => TypstSyntaxKind::For,
213 "while" => TypstSyntaxKind::While,
214 "break" => TypstSyntaxKind::Break,
215 "continue" => TypstSyntaxKind::Continue,
216 "return" => TypstSyntaxKind::Return,
217 "true" => TypstSyntaxKind::True,
218 "false" => TypstSyntaxKind::False,
219 "set" => TypstSyntaxKind::Set,
220 "show" => TypstSyntaxKind::Show,
221 "import" => TypstSyntaxKind::Import,
222 "include" => TypstSyntaxKind::Include,
223 _ => TypstSyntaxKind::Identifier,
224 }
225 }
226
227 fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
228 let text = state.rest();
229 if text.is_empty() {
230 return false;
231 }
232
233 let start = state.get_position();
234 let chars: Vec<char> = text.chars().collect();
235
236 let (kind, len) = match chars[0] {
237 '=' => {
238 if chars.len() > 1 && chars[1] == '=' {
239 (TypstSyntaxKind::EqualEqual, 2)
240 }
241 else {
242 (TypstSyntaxKind::Equal, 1)
243 }
244 }
245 '!' => {
246 if chars.len() > 1 && chars[1] == '=' {
247 (TypstSyntaxKind::NotEqual, 2)
248 }
249 else {
250 (TypstSyntaxKind::Not, 1)
251 }
252 }
253 '<' => {
254 if chars.len() > 1 && chars[1] == '=' {
255 (TypstSyntaxKind::LessEqual, 2)
256 }
257 else {
258 (TypstSyntaxKind::Less, 1)
259 }
260 }
261 '>' => {
262 if chars.len() > 1 && chars[1] == '=' {
263 (TypstSyntaxKind::GreaterEqual, 2)
264 }
265 else {
266 (TypstSyntaxKind::Greater, 1)
267 }
268 }
269 '&' => {
270 if chars.len() > 1 && chars[1] == '&' {
271 (TypstSyntaxKind::And, 2)
272 }
273 else {
274 return false;
275 }
276 }
277 '|' => {
278 if chars.len() > 1 && chars[1] == '|' {
279 (TypstSyntaxKind::Or, 2)
280 }
281 else {
282 return false;
283 }
284 }
285 '+' => (TypstSyntaxKind::Plus, 1),
286 '-' => (TypstSyntaxKind::Minus, 1),
287 '*' => (TypstSyntaxKind::Star, 1),
288 '/' => (TypstSyntaxKind::Slash, 1),
289 '%' => (TypstSyntaxKind::Percent, 1),
290 _ => return false,
291 };
292
293 state.advance(len);
294 state.add_token(kind, start, state.get_position());
295 true
296 }
297
298 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
299 let text = state.rest();
300 if text.is_empty() {
301 return false;
302 }
303
304 let start = state.get_position();
305 let ch = text.chars().next().unwrap();
306
307 let kind = match ch {
308 '(' => TypstSyntaxKind::LeftParen,
309 ')' => TypstSyntaxKind::RightParen,
310 '{' => TypstSyntaxKind::LeftBrace,
311 '}' => TypstSyntaxKind::RightBrace,
312 '[' => TypstSyntaxKind::LeftBracket,
313 ']' => TypstSyntaxKind::RightBracket,
314 ';' => TypstSyntaxKind::Semicolon,
315 ',' => TypstSyntaxKind::Comma,
316 '.' => TypstSyntaxKind::Dot,
317 ':' => TypstSyntaxKind::Colon,
318 '#' => TypstSyntaxKind::Hash,
319 '@' => TypstSyntaxKind::At,
320 '$' => TypstSyntaxKind::Dollar,
321 '_' => TypstSyntaxKind::Underscore,
322 _ => TypstSyntaxKind::Error,
323 };
324
325 state.advance(1);
326 state.add_token(kind, start, state.get_position());
327 true
328 }
329}