1use crate::{kind::TypstSyntaxKind, language::TypstLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'s, S> = LexerState<'s, S, TypstLanguage>;
10
11static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
13static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone, Debug)]
16pub struct TypstLexer<'config> {
17 _config: &'config TypstLanguage,
18}
19
20impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
22 let mut state = State::new(source);
23 let result = self.run(&mut state);
24 if result.is_ok() {
25 state.add_eof();
26 }
27 state.finish(result)
28 }
29}
30
31impl<'config> TypstLexer<'config> {
32 pub fn new(config: &'config TypstLanguage) -> Self {
33 Self { _config: config }
34 }
35
36 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 let safe_point = state.get_position();
39
40 if self.lex_whitespace(state) {
41 continue;
42 }
43
44 if TYPST_COMMENT.scan(state, TypstSyntaxKind::LineComment, TypstSyntaxKind::BlockComment) {
45 continue;
46 }
47
48 if TYPST_STRING.scan(state, TypstSyntaxKind::StringLiteral) {
49 continue;
50 }
51
52 if self.lex_number_literal(state) {
53 continue;
54 }
55
56 if self.lex_identifier_or_keyword(state) {
57 continue;
58 }
59
60 if self.lex_operators(state) {
61 continue;
62 }
63
64 if self.lex_single_char_tokens(state) {
65 continue;
66 }
67
68 state.advance_if_dead_lock(safe_point);
69 }
70
71 Ok(())
72 }
73
74 fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
75 if let Some(ch) = state.peek() {
76 if ch == '\n' || ch == '\r' {
77 let start = state.get_position();
78 state.advance(1);
79 if ch == '\r' && state.peek() == Some('\n') {
80 state.advance(1);
81 }
82 state.add_token(TypstSyntaxKind::Newline, start, state.get_position());
83 return true;
84 }
85 }
86 TYPST_WHITESPACE.scan(state, TypstSyntaxKind::Whitespace)
87 }
88
89 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
90 let start = state.get_position();
91 let text = state.rest();
92 if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
93 return false;
94 }
95
96 let mut pos = 0;
97 let chars: Vec<char> = text.chars().collect();
98
99 while pos < chars.len() && chars[pos].is_ascii_digit() {
101 pos += 1;
102 }
103
104 if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
106 pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
108 pos += 1;
109 }
110 }
111
112 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
114 pos += 1;
115 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
116 pos += 1;
117 }
118 while pos < chars.len() && chars[pos].is_ascii_digit() {
119 pos += 1;
120 }
121 }
122
123 if pos > 0 {
124 state.advance(pos);
125 state.add_token(TypstSyntaxKind::NumericLiteral, start, state.get_position());
126 return true;
127 }
128
129 false
130 }
131
132 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
133 let start = state.get_position();
134 let text = state.rest();
135 if text.is_empty() {
136 return false;
137 }
138
139 let first_char = text.chars().next().unwrap();
140 if !first_char.is_ascii_alphabetic() {
141 return false;
142 }
143
144 let mut pos = 0;
145 let chars: Vec<char> = text.chars().collect();
146
147 pos += 1;
149
150 while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
152 pos += 1;
153 }
154
155 if pos > 0 {
156 let identifier_text = &text[..pos];
157 let kind = self.keyword_or_identifier(identifier_text);
158 state.advance(pos);
159 state.add_token(kind, start, state.get_position());
160 return true;
161 }
162
163 false
164 }
165
166 fn keyword_or_identifier(&self, text: &str) -> TypstSyntaxKind {
167 match text {
168 "let" => TypstSyntaxKind::Let,
169 "if" => TypstSyntaxKind::If,
170 "else" => TypstSyntaxKind::Else,
171 "for" => TypstSyntaxKind::For,
172 "while" => TypstSyntaxKind::While,
173 "break" => TypstSyntaxKind::Break,
174 "continue" => TypstSyntaxKind::Continue,
175 "return" => TypstSyntaxKind::Return,
176 "true" => TypstSyntaxKind::True,
177 "false" => TypstSyntaxKind::False,
178 "set" => TypstSyntaxKind::Set,
179 "show" => TypstSyntaxKind::Show,
180 "import" => TypstSyntaxKind::Import,
181 "include" => TypstSyntaxKind::Include,
182 _ => TypstSyntaxKind::Identifier,
183 }
184 }
185
186 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
187 let start = state.get_position();
188 let text = state.rest();
189 if text.is_empty() {
190 return false;
191 }
192
193 let chars: Vec<char> = text.chars().collect();
194
195 let (kind, len) = match chars[0] {
196 '=' => {
197 let mut count = 1;
198 while count < chars.len() && chars[count] == '=' {
199 count += 1;
200 }
201 (TypstSyntaxKind::Equal, count)
202 }
203 '!' => {
204 if chars.len() > 1 && chars[1] == '=' {
205 (TypstSyntaxKind::NotEqual, 2)
206 }
207 else {
208 (TypstSyntaxKind::Not, 1)
209 }
210 }
211 '<' => {
212 if chars.len() > 1 && chars[1] == '=' {
213 (TypstSyntaxKind::LessEqual, 2)
214 }
215 else {
216 (TypstSyntaxKind::Less, 1)
217 }
218 }
219 '>' => {
220 if chars.len() > 1 && chars[1] == '=' {
221 (TypstSyntaxKind::GreaterEqual, 2)
222 }
223 else {
224 (TypstSyntaxKind::Greater, 1)
225 }
226 }
227 '&' => {
228 if chars.len() > 1 && chars[1] == '&' {
229 (TypstSyntaxKind::And, 2)
230 }
231 else {
232 return false;
233 }
234 }
235 '|' => {
236 if chars.len() > 1 && chars[1] == '|' {
237 (TypstSyntaxKind::Or, 2)
238 }
239 else {
240 return false;
241 }
242 }
243 '+' => (TypstSyntaxKind::Plus, 1),
244 '-' => (TypstSyntaxKind::Minus, 1),
245 '*' => (TypstSyntaxKind::Star, 1),
246 '/' => (TypstSyntaxKind::Slash, 1),
247 '%' => (TypstSyntaxKind::Percent, 1),
248 _ => return false,
249 };
250
251 state.advance(len);
252 state.add_token(kind, start, state.get_position());
253 true
254 }
255
256 fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
257 let start = state.get_position();
258 let text = state.rest();
259 if text.is_empty() {
260 return false;
261 }
262
263 let ch = text.chars().next().unwrap();
264
265 let kind = match ch {
266 '(' => TypstSyntaxKind::LeftParen,
267 ')' => TypstSyntaxKind::RightParen,
268 '{' => TypstSyntaxKind::LeftBrace,
269 '}' => TypstSyntaxKind::RightBrace,
270 '[' => TypstSyntaxKind::LeftBracket,
271 ']' => TypstSyntaxKind::RightBracket,
272 ';' => TypstSyntaxKind::Semicolon,
273 ',' => TypstSyntaxKind::Comma,
274 '.' => TypstSyntaxKind::Dot,
275 ':' => TypstSyntaxKind::Colon,
276 '#' => TypstSyntaxKind::Hash,
277 '@' => TypstSyntaxKind::At,
278 '$' => TypstSyntaxKind::Dollar,
279 '_' => TypstSyntaxKind::Underscore,
280 '`' => TypstSyntaxKind::Backtick,
281 _ => TypstSyntaxKind::Error,
282 };
283
284 state.advance(1);
285 state.add_token(kind, start, state.get_position());
286 true
287 }
288}