1use crate::{kind::DjangoSyntaxKind, language::DjangoLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the framework lexer state specialised to the Django language.
type State<S> = LexerState<S, DjangoLanguage>;

/// Whitespace scanner config; `unicode_whitespace: true` also matches non-ASCII spaces.
static DJANGO_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// NOTE(review): not referenced anywhere in this file — `skip_comment` hand-rolls
// `{# ... #}` scanning instead of using this config. Confirm whether it should
// be wired in or removed.
static DJANGO_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["{#"] });
// NOTE(review): only used by `lex_string_literal`, which `run` never calls
// (`lex_string` hand-rolls quote scanning instead). Verify which path is intended.
static DJANGO_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: Some('\\') });
14
/// Hand-written lexer for Django templates.
///
/// Borrows the language configuration for the lifetime of the lexer.
#[derive(Clone)]
pub struct DjangoLexer<'config> {
    // Language configuration. NOTE(review): stored but not consulted by any
    // scanning method visible in this file — confirm it is used elsewhere.
    config: &'config DjangoLanguage,
}
19
20impl<'config> DjangoLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config DjangoLanguage) -> Self {
        Self { config }
    }
24
    /// Main scanning loop: tries each sub-lexer in priority order until the
    /// source is exhausted, then appends a zero-width `Eof` token.
    ///
    /// The call order is load-bearing: comments must win over `{#` tag
    /// punctuation, strings over operators, and everything over the
    /// `lex_html_text` fallback. Do not reorder without re-checking overlaps
    /// (e.g. `%` is claimed by both `lex_django_tags` and `lex_operator`).
    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Position at the start of this iteration; used below to detect
            // that no sub-lexer consumed anything.
            let safe_point = state.get_position();
            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_string(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_identifier_or_keyword(state) {
                continue;
            }

            if self.lex_django_tags(state) {
                continue;
            }

            if self.lex_operator(state) {
                continue;
            }

            if self.lex_delimiter(state) {
                continue;
            }

            if self.lex_html_text(state) {
                continue;
            }

            // No sub-lexer matched. Presumably `safe_check` forces progress
            // (error token / advance) so the loop cannot spin — TODO confirm
            // against the oak_core implementation.
            state.safe_check(safe_point);
        }

        // Zero-width EOF marker at the final position.
        let eof_pos = state.get_position();
        state.add_token(DjangoSyntaxKind::Eof, eof_pos, eof_pos);
        Ok(())
    }
72
73 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
75 match DJANGO_WHITESPACE.scan(state.rest(), state.get_position(), DjangoSyntaxKind::Whitespace) {
76 Some(token) => {
77 let start = state.get_position();
78 state.advance(token.length());
79 state.add_token(DjangoSyntaxKind::Whitespace, start, state.get_position());
80 true
81 }
82 None => false,
83 }
84 }
85
86 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
88 if state.rest().starts_with("{#") {
89 let start = state.get_position();
90 state.advance(2); while state.not_at_end() {
94 if state.rest().starts_with("#}") {
95 state.advance(2); break;
97 }
98 state.advance(1);
99 }
100
101 state.add_token(DjangoSyntaxKind::Comment, start, state.get_position());
102 true
103 }
104 else {
105 false
106 }
107 }
108
109 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
111 match DJANGO_STRING.scan(state.rest(), state.get_position(), DjangoSyntaxKind::String) {
112 Some(token) => {
113 let start = state.get_position();
114 state.advance(token.length());
115 state.add_token(DjangoSyntaxKind::String, start, state.get_position());
116 true
117 }
118 None => false,
119 }
120 }
121
122 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
124 let start_pos = state.get_position();
125
126 if let Some('\n') = state.peek() {
127 state.advance(1);
128 state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
129 true
130 }
131 else if let Some('\r') = state.peek() {
132 state.advance(1);
133 if let Some('\n') = state.peek() {
134 state.advance(1);
135 }
136 state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
137 true
138 }
139 else {
140 false
141 }
142 }
143
144 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
146 let start_pos = state.get_position();
147
148 if let Some(ch) = state.peek() {
149 if ch.is_alphabetic() || ch == '_' {
150 state.advance(ch.len_utf8());
151
152 while let Some(ch) = state.peek() {
153 if ch.is_alphanumeric() || ch == '_' {
154 state.advance(ch.len_utf8());
155 }
156 else {
157 break;
158 }
159 }
160
161 let end_pos = state.get_position();
162 let text = state.get_text_in((start_pos..end_pos).into());
163
164 let token_kind = match text {
165 "if" => DjangoSyntaxKind::If,
166 "elif" => DjangoSyntaxKind::Elif,
167 "else" => DjangoSyntaxKind::Else,
168 "endif" => DjangoSyntaxKind::Endif,
169 "for" => DjangoSyntaxKind::For,
170 "empty" => DjangoSyntaxKind::Empty,
171 "endfor" => DjangoSyntaxKind::Endfor,
172 "block" => DjangoSyntaxKind::Block,
173 "endblock" => DjangoSyntaxKind::Endblock,
174 "extends" => DjangoSyntaxKind::Extends,
175 "include" => DjangoSyntaxKind::Include,
176 "load" => DjangoSyntaxKind::Load,
177 "with" => DjangoSyntaxKind::With,
178 "endwith" => DjangoSyntaxKind::Endwith,
179 "autoescape" => DjangoSyntaxKind::Autoescape,
180 "endautoescape" => DjangoSyntaxKind::Endautoescape,
181 "csrf_token" => DjangoSyntaxKind::Csrf,
182 "url" => DjangoSyntaxKind::Url,
183 "static" => DjangoSyntaxKind::Static,
184 "now" => DjangoSyntaxKind::Now,
185 "cycle" => DjangoSyntaxKind::Cycle,
186 "filter" => DjangoSyntaxKind::Filter,
187 "endfilter" => DjangoSyntaxKind::Endfilter,
188 "spaceless" => DjangoSyntaxKind::Spaceless,
189 "endspaceless" => DjangoSyntaxKind::Endspaceless,
190 "verbatim" => DjangoSyntaxKind::Verbatim,
191 "endverbatim" => DjangoSyntaxKind::Endverbatim,
192 "and" => DjangoSyntaxKind::And,
193 "or" => DjangoSyntaxKind::Or,
194 "not" => DjangoSyntaxKind::Not,
195 "in" => DjangoSyntaxKind::In,
196 _ => DjangoSyntaxKind::Identifier,
197 };
198
199 state.add_token(token_kind, start_pos, state.get_position());
200 true
201 }
202 else {
203 false
204 }
205 }
206 else {
207 false
208 }
209 }
210
211 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
213 let start_pos = state.get_position();
214
215 if let Some(ch) = state.peek() {
216 if ch.is_ascii_digit() {
217 state.advance(ch.len_utf8());
218
219 while let Some(ch) = state.peek() {
221 if ch.is_ascii_digit() {
222 state.advance(ch.len_utf8());
223 }
224 else {
225 break;
226 }
227 }
228
229 if let Some('.') = state.peek() {
231 let dot_pos = state.get_position();
232 state.advance(1);
233
234 if let Some(ch) = state.peek() {
235 if ch.is_ascii_digit() {
236 while let Some(ch) = state.peek() {
237 if ch.is_ascii_digit() {
238 state.advance(ch.len_utf8());
239 }
240 else {
241 break;
242 }
243 }
244 }
245 else {
246 state.set_position(dot_pos);
248 }
249 }
250 else {
251 state.set_position(dot_pos);
253 }
254 }
255
256 state.add_token(DjangoSyntaxKind::Number, start_pos, state.get_position());
257 true
258 }
259 else {
260 false
261 }
262 }
263 else {
264 false
265 }
266 }
267
268 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
271 let start_pos = state.get_position();
272
273 if let Some(quote) = state.peek() {
274 if quote == '"' || quote == '\'' {
275 state.advance(1);
276
277 while let Some(ch) = state.peek() {
278 if ch == quote {
279 state.advance(1);
280 state.add_token(DjangoSyntaxKind::String, start_pos, state.get_position());
281 return true;
282 }
283 else if ch == '\\' {
284 state.advance(1);
285 if state.peek().is_some() {
286 state.advance(1);
287 }
288 }
289 else {
290 state.advance(ch.len_utf8());
291 }
292 }
293
294 state.add_token(DjangoSyntaxKind::Error, start_pos, state.get_position());
297 true
298 }
299 else {
300 false
301 }
302 }
303 else {
304 false
305 }
306 }
307
308 fn lex_django_tags<S: Source>(&self, state: &mut State<S>) -> bool {
310 let start_pos = state.get_position();
311
312 if let Some('{') = state.peek() {
313 state.advance(1);
314
315 if let Some(next_ch) = state.peek() {
316 match next_ch {
317 '{' => {
318 state.advance(1);
320 state.add_token(DjangoSyntaxKind::VariableStart, start_pos, state.get_position());
321 true
322 }
323 '%' => {
324 state.advance(1);
326 state.add_token(DjangoSyntaxKind::TagStart, start_pos, state.get_position());
327 true
328 }
329 '#' => {
330 state.advance(1);
332 state.add_token(DjangoSyntaxKind::CommentStart, start_pos, state.get_position());
333 true
334 }
335 _ => {
336 state.set_position(start_pos);
338 false
339 }
340 }
341 }
342 else {
343 state.set_position(start_pos);
345 false
346 }
347 }
348 else if let Some('%') = state.peek() {
349 state.advance(1);
350 if let Some('}') = state.peek() {
351 state.advance(1);
352 state.add_token(DjangoSyntaxKind::TagEnd, start_pos, state.get_position());
353 true
354 }
355 else {
356 state.set_position(start_pos);
357 false
358 }
359 }
360 else if let Some('#') = state.peek() {
361 state.advance(1);
362 if let Some('}') = state.peek() {
363 state.advance(1);
364 state.add_token(DjangoSyntaxKind::CommentEnd, start_pos, state.get_position());
365 true
366 }
367 else {
368 state.set_position(start_pos);
369 false
370 }
371 }
372 else {
373 false
374 }
375 }
376
377 fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
379 let start_pos = state.get_position();
380
381 if let Some(ch) = state.peek() {
382 let token_kind = match ch {
383 '=' => {
384 state.advance(1);
385 if let Some('=') = state.peek() {
386 state.advance(1);
387 DjangoSyntaxKind::EqualEqual
388 }
389 else {
390 DjangoSyntaxKind::Equal
391 }
392 }
393 '!' => {
394 state.advance(1);
395 if let Some('=') = state.peek() {
396 state.advance(1);
397 DjangoSyntaxKind::NotEqual
398 }
399 else {
400 return false;
401 }
402 }
403 '<' => {
404 state.advance(1);
405 if let Some('=') = state.peek() {
406 state.advance(1);
407 DjangoSyntaxKind::LessEqual
408 }
409 else {
410 DjangoSyntaxKind::Less
411 }
412 }
413 '>' => {
414 state.advance(1);
415 if let Some('=') = state.peek() {
416 state.advance(1);
417 DjangoSyntaxKind::GreaterEqual
418 }
419 else {
420 DjangoSyntaxKind::Greater
421 }
422 }
423 '+' => {
424 state.advance(1);
425 DjangoSyntaxKind::Plus
426 }
427 '-' => {
428 state.advance(1);
429 DjangoSyntaxKind::Minus
430 }
431 '*' => {
432 state.advance(1);
433 DjangoSyntaxKind::Star
434 }
435 '/' => {
436 state.advance(1);
437 DjangoSyntaxKind::Slash
438 }
439 '%' => {
440 state.advance(1);
441 DjangoSyntaxKind::Percent
442 }
443 '|' => {
444 state.advance(1);
445 DjangoSyntaxKind::Pipe
446 }
447 _ => return false,
448 };
449
450 state.add_token(token_kind, start_pos, state.get_position());
451 true
452 }
453 else {
454 false
455 }
456 }
457
458 fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
461 let start_pos = state.get_position();
462
463 if let Some(ch) = state.peek() {
464 let token_kind = match ch {
465 '(' => DjangoSyntaxKind::LeftParen,
466 ')' => DjangoSyntaxKind::RightParen,
467 '[' => DjangoSyntaxKind::LeftBracket,
468 ']' => DjangoSyntaxKind::RightBracket,
469 ',' => DjangoSyntaxKind::Comma,
470 '.' => DjangoSyntaxKind::Dot,
471 ':' => DjangoSyntaxKind::Colon,
472 ';' => DjangoSyntaxKind::Semicolon,
473 _ => return false,
474 };
475
476 state.advance(ch.len_utf8());
477 state.add_token(token_kind, start_pos, state.get_position());
478 true
479 }
480 else {
481 false
482 }
483 }
484
485 fn lex_html_text<S: Source>(&self, state: &mut State<S>) -> bool {
487 let start_pos = state.get_position();
488
489 while let Some(ch) = state.peek() {
490 if ch == '{' || ch == '%' || ch == '#' {
492 break;
493 }
494 if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
496 break;
497 }
498 state.advance(ch.len_utf8());
499 }
500
501 if state.get_position() > start_pos {
502 state.add_token(DjangoSyntaxKind::HtmlText, start_pos, state.get_position());
503 true
504 }
505 else {
506 false
507 }
508 }
509}
510
511impl<'config> Lexer<DjangoLanguage> for DjangoLexer<'config> {
512 fn lex_incremental(
513 &self,
514 source: impl Source,
515 _changed: usize,
516 _cache: IncrementalCache<DjangoLanguage>,
517 ) -> LexOutput<DjangoLanguage> {
518 let mut state = LexerState::new_with_cache(source, _changed, _cache);
519 let result = self.run(&mut state);
520 state.finish(result)
521 }
522}