1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::TypstLanguage, lexer::token_type::TypstTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8 source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
/// Shorthand for the framework lexer state specialised to Typst.
type State<'s, S> = LexerState<'s, S, TypstLanguage>;
13
/// Shared whitespace scanner; `unicode_whitespace` makes it accept all
/// Unicode whitespace, not only ASCII spaces/tabs.
static TYPST_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Shared comment scanner: `//` line comments and `/* ... */` block
/// comments, with nesting allowed inside block comments.
static TYPST_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
/// Shared string scanner: double-quoted literals with `\` as the escape
/// character.
static TYPST_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Hand-written lexer for Typst source text.
///
/// Borrows the language configuration for the lexer's lifetime; the
/// configuration is currently unused (hence the leading underscore) but
/// kept so tuning options can be added without changing the interface.
#[derive(Clone, Debug)]
pub struct TypstLexer<'config> {
    /// Language configuration supplied at construction time.
    _config: &'config TypstLanguage,
}
22
23impl<'config> Lexer<TypstLanguage> for TypstLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], _cache: &'a mut impl LexerCache<TypstLanguage>) -> LexOutput<TypstLanguage> {
25 let mut state = State::new(source);
26 let result = self.run(&mut state);
27 if result.is_ok() {
28 state.add_eof();
29 }
30 state.finish(result)
31 }
32}
33
34impl<'config> TypstLexer<'config> {
35 pub fn new(config: &'config TypstLanguage) -> Self {
36 Self { _config: config }
37 }
38
39 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.lex_whitespace(state) {
44 continue;
45 }
46
47 if TYPST_COMMENT.scan(state, TypstTokenType::LineComment, TypstTokenType::BlockComment) {
48 continue;
49 }
50
51 if TYPST_STRING.scan(state, TypstTokenType::StringLiteral) {
52 continue;
53 }
54
55 if self.lex_number_literal(state) {
56 continue;
57 }
58
59 if self.lex_identifier_or_keyword(state) {
60 continue;
61 }
62
63 if self.lex_operators(state) {
64 continue;
65 }
66
67 if self.lex_single_char_tokens(state) {
68 continue;
69 }
70
71 state.advance_if_dead_lock(safe_point)
72 }
73
74 Ok(())
75 }
76
77 fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
78 if let Some(ch) = state.peek() {
79 if ch == '\n' || ch == '\r' {
80 let start = state.get_position();
81 state.advance(1);
82 if ch == '\r' && state.peek() == Some('\n') {
83 state.advance(1);
84 }
85 state.add_token(TypstTokenType::Newline, start, state.get_position());
86 return true;
87 }
88 }
89 TYPST_WHITESPACE.scan(state, TypstTokenType::Whitespace)
90 }
91
92 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
93 let start = state.get_position();
94 let text = state.rest();
95 if text.is_empty() || !text.chars().next().unwrap().is_ascii_digit() {
96 return false;
97 }
98
99 let mut pos = 0;
100 let chars: Vec<char> = text.chars().collect();
101
102 while pos < chars.len() && chars[pos].is_ascii_digit() {
104 pos += 1;
105 }
106
107 if pos < chars.len() && chars[pos] == '.' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit() {
109 pos += 1; while pos < chars.len() && chars[pos].is_ascii_digit() {
111 pos += 1;
112 }
113 }
114
115 if pos < chars.len() && (chars[pos] == 'e' || chars[pos] == 'E') {
117 pos += 1;
118 if pos < chars.len() && (chars[pos] == '+' || chars[pos] == '-') {
119 pos += 1;
120 }
121 while pos < chars.len() && chars[pos].is_ascii_digit() {
122 pos += 1;
123 }
124 }
125
126 if pos > 0 {
127 state.advance(pos);
128 state.add_token(TypstTokenType::NumericLiteral, start, state.get_position());
129 return true;
130 }
131
132 false
133 }
134
135 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
136 let start = state.get_position();
137 let text = state.rest();
138 if text.is_empty() {
139 return false;
140 }
141
142 let first_char = text.chars().next().unwrap();
143 if !first_char.is_ascii_alphabetic() {
144 return false;
145 }
146
147 let mut pos = 0;
148 let chars: Vec<char> = text.chars().collect();
149
150 pos += 1;
152
153 while pos < chars.len() && (chars[pos].is_ascii_alphanumeric()) {
155 pos += 1;
156 }
157
158 if pos > 0 {
159 let identifier_text = &text[..pos];
160 let kind = self.keyword_or_identifier(identifier_text);
161 state.advance(pos);
162 state.add_token(kind, start, state.get_position());
163 return true;
164 }
165
166 false
167 }
168
169 fn keyword_or_identifier(&self, text: &str) -> TypstTokenType {
170 match text {
171 "let" => TypstTokenType::Let,
172 "if" => TypstTokenType::If,
173 "else" => TypstTokenType::Else,
174 "for" => TypstTokenType::For,
175 "while" => TypstTokenType::While,
176 "break" => TypstTokenType::Break,
177 "continue" => TypstTokenType::Continue,
178 "return" => TypstTokenType::Return,
179 "true" => TypstTokenType::True,
180 "false" => TypstTokenType::False,
181 "set" => TypstTokenType::Set,
182 "show" => TypstTokenType::Show,
183 "import" => TypstTokenType::Import,
184 "include" => TypstTokenType::Include,
185 _ => TypstTokenType::Identifier,
186 }
187 }
188
189 fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
190 let start = state.get_position();
191 let text = state.rest();
192 if text.is_empty() {
193 return false;
194 }
195
196 let chars: Vec<char> = text.chars().collect();
197
198 let (kind, len) = match chars[0] {
199 '=' => {
200 let mut count = 1;
201 while count < chars.len() && chars[count] == '=' {
202 count += 1;
203 }
204 (TypstTokenType::Equal, count)
205 }
206 '!' => {
207 if chars.len() > 1 && chars[1] == '=' {
208 (TypstTokenType::NotEqual, 2)
209 }
210 else {
211 (TypstTokenType::Not, 1)
212 }
213 }
214 '<' => {
215 if chars.len() > 1 && chars[1] == '=' {
216 (TypstTokenType::LessEqual, 2)
217 }
218 else {
219 (TypstTokenType::Less, 1)
220 }
221 }
222 '>' => {
223 if chars.len() > 1 && chars[1] == '=' {
224 (TypstTokenType::GreaterEqual, 2)
225 }
226 else {
227 (TypstTokenType::Greater, 1)
228 }
229 }
230 '&' => {
231 if chars.len() > 1 && chars[1] == '&' {
232 (TypstTokenType::And, 2)
233 }
234 else {
235 return false;
236 }
237 }
238 '|' => {
239 if chars.len() > 1 && chars[1] == '|' {
240 (TypstTokenType::Or, 2)
241 }
242 else {
243 return false;
244 }
245 }
246 '+' => (TypstTokenType::Plus, 1),
247 '-' => (TypstTokenType::Minus, 1),
248 '*' => (TypstTokenType::Star, 1),
249 '/' => (TypstTokenType::Slash, 1),
250 '%' => (TypstTokenType::Percent, 1),
251 _ => return false,
252 };
253
254 state.advance(len);
255 state.add_token(kind, start, state.get_position());
256 true
257 }
258
259 fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
260 let start = state.get_position();
261 let text = state.rest();
262 if text.is_empty() {
263 return false;
264 }
265
266 let ch = text.chars().next().unwrap();
267
268 let kind = match ch {
269 '(' => TypstTokenType::LeftParen,
270 ')' => TypstTokenType::RightParen,
271 '{' => TypstTokenType::LeftBrace,
272 '}' => TypstTokenType::RightBrace,
273 '[' => TypstTokenType::LeftBracket,
274 ']' => TypstTokenType::RightBracket,
275 ';' => TypstTokenType::Semicolon,
276 ',' => TypstTokenType::Comma,
277 '.' => TypstTokenType::Dot,
278 ':' => TypstTokenType::Colon,
279 '#' => TypstTokenType::Hash,
280 '@' => TypstTokenType::At,
281 '$' => TypstTokenType::Dollar,
282 '_' => TypstTokenType::Underscore,
283 '`' => TypstTokenType::Backtick,
284 _ => TypstTokenType::Error,
285 };
286
287 state.advance(1);
288 state.add_token(kind, start, state.get_position());
289 true
290 }
291}