1use crate::{kind::DjangoSyntaxKind, language::DjangoLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, DjangoLanguage>;
10
11static DJANGO_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static _DJANGO_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "{#", block_start: "{#", block_end: "#}", nested_blocks: false });
13static DJANGO_STRING_DOUBLE: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static DJANGO_STRING_SINGLE: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
16#[derive(Clone)]
17pub struct DjangoLexer<'config> {
18 _config: &'config DjangoLanguage,
19}
20
21impl<'config> Lexer<DjangoLanguage> for DjangoLexer<'config> {
22 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DjangoLanguage>) -> LexOutput<DjangoLanguage> {
23 let mut state = LexerState::new(source);
24 let result = self.run(&mut state);
25 if result.is_ok() {
26 state.add_eof();
27 }
28 state.finish_with_cache(result, cache)
29 }
30}
31
32impl<'config> DjangoLexer<'config> {
33 pub fn new(config: &'config DjangoLanguage) -> Self {
34 Self { _config: config }
35 }
36
37 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38 while state.not_at_end() {
39 let safe_point = state.get_position();
40 if self.skip_whitespace(state) {
41 continue;
42 }
43
44 if self.skip_comment(state) {
45 continue;
46 }
47
48 if self.lex_string(state) || self.lex_string_manual(state) {
49 continue;
50 }
51
52 if self.lex_number(state) {
53 continue;
54 }
55
56 if self.lex_identifier_or_keyword(state) {
57 continue;
58 }
59
60 if self.lex_django_tags(state) {
61 continue;
62 }
63
64 if self.lex_operator(state) {
65 continue;
66 }
67
68 if self.lex_delimiter(state) {
69 continue;
70 }
71
72 if self.lex_html_text(state) {
73 continue;
74 }
75
76 state.advance_if_dead_lock(safe_point);
77 }
78
79 Ok(())
80 }
81
82 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84 DJANGO_WHITESPACE.scan(state, DjangoSyntaxKind::Whitespace)
85 }
86
87 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89 if state.rest().starts_with("{#") {
90 let start = state.get_position();
91 state.advance(2); while state.not_at_end() {
95 if state.rest().starts_with("#}") {
96 state.advance(2); break;
98 }
99 state.advance(1);
100 }
101
102 state.add_token(DjangoSyntaxKind::Comment, start, state.get_position());
103 return true;
104 }
105 false
106 }
107
108 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 DJANGO_STRING_DOUBLE.scan(state, DjangoSyntaxKind::String) || DJANGO_STRING_SINGLE.scan(state, DjangoSyntaxKind::String)
111 }
112
113 fn _lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
115 let start_pos = state.get_position();
116
117 if let Some('\n') = state.peek() {
118 state.advance(1);
119 state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
120 true
121 }
122 else if let Some('\r') = state.peek() {
123 state.advance(1);
124 if let Some('\n') = state.peek() {
125 state.advance(1);
126 }
127 state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
128 true
129 }
130 else {
131 false
132 }
133 }
134
135 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
137 let start_pos = state.get_position();
138
139 if let Some(ch) = state.peek() {
140 if ch.is_alphabetic() || ch == '_' {
141 state.advance(ch.len_utf8());
142
143 while let Some(ch) = state.peek() {
144 if ch.is_alphanumeric() || ch == '_' {
145 state.advance(ch.len_utf8());
146 }
147 else {
148 break;
149 }
150 }
151
152 let end_pos = state.get_position();
153 let text = state.get_text_in((start_pos..end_pos).into());
154
155 let token_kind = match text.as_ref() {
156 "if" => DjangoSyntaxKind::If,
157 "elif" => DjangoSyntaxKind::Elif,
158 "else" => DjangoSyntaxKind::Else,
159 "endif" => DjangoSyntaxKind::Endif,
160 "for" => DjangoSyntaxKind::For,
161 "empty" => DjangoSyntaxKind::Empty,
162 "endfor" => DjangoSyntaxKind::Endfor,
163 "block" => DjangoSyntaxKind::Block,
164 "endblock" => DjangoSyntaxKind::Endblock,
165 "extends" => DjangoSyntaxKind::Extends,
166 "include" => DjangoSyntaxKind::Include,
167 "load" => DjangoSyntaxKind::Load,
168 "with" => DjangoSyntaxKind::With,
169 "endwith" => DjangoSyntaxKind::Endwith,
170 "autoescape" => DjangoSyntaxKind::Autoescape,
171 "endautoescape" => DjangoSyntaxKind::Endautoescape,
172 "csrf_token" => DjangoSyntaxKind::Csrf,
173 "url" => DjangoSyntaxKind::Url,
174 "static" => DjangoSyntaxKind::Static,
175 "now" => DjangoSyntaxKind::Now,
176 "cycle" => DjangoSyntaxKind::Cycle,
177 "filter" => DjangoSyntaxKind::Filter,
178 "endfilter" => DjangoSyntaxKind::Endfilter,
179 "spaceless" => DjangoSyntaxKind::Spaceless,
180 "endspaceless" => DjangoSyntaxKind::Endspaceless,
181 "verbatim" => DjangoSyntaxKind::Verbatim,
182 "endverbatim" => DjangoSyntaxKind::Endverbatim,
183 "and" => DjangoSyntaxKind::And,
184 "or" => DjangoSyntaxKind::Or,
185 "not" => DjangoSyntaxKind::Not,
186 "in" => DjangoSyntaxKind::In,
187 _ => DjangoSyntaxKind::Identifier,
188 };
189
190 state.add_token(token_kind, start_pos, state.get_position());
191 true
192 }
193 else {
194 false
195 }
196 }
197 else {
198 false
199 }
200 }
201
202 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
205 let start_pos = state.get_position();
206
207 if let Some(ch) = state.peek() {
208 if ch.is_ascii_digit() {
209 state.advance(ch.len_utf8());
210
211 while let Some(ch) = state.peek() {
213 if ch.is_ascii_digit() {
214 state.advance(ch.len_utf8());
215 }
216 else {
217 break;
218 }
219 }
220
221 if let Some('.') = state.peek() {
223 let dot_pos = state.get_position();
224 state.advance(1);
225
226 if let Some(ch) = state.peek() {
227 if ch.is_ascii_digit() {
228 while let Some(ch) = state.peek() {
229 if ch.is_ascii_digit() {
230 state.advance(ch.len_utf8());
231 }
232 else {
233 break;
234 }
235 }
236 }
237 else {
238 state.set_position(dot_pos);
240 }
241 }
242 else {
243 state.set_position(dot_pos);
245 }
246 }
247
248 state.add_token(DjangoSyntaxKind::Number, start_pos, state.get_position());
249 true
250 }
251 else {
252 false
253 }
254 }
255 else {
256 false
257 }
258 }
259
260 fn lex_string_manual<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
263 let start_pos = state.get_position();
264
265 if let Some(quote) = state.peek() {
266 if quote == '"' || quote == '\'' {
267 state.advance(1);
268
269 while let Some(ch) = state.peek() {
270 if ch == quote {
271 state.advance(1);
272 state.add_token(DjangoSyntaxKind::String, start_pos, state.get_position());
273 return true;
274 }
275 else if ch == '\\' {
276 state.advance(1);
277 if state.peek().is_some() {
278 state.advance(1);
279 }
280 }
281 else {
282 state.advance(ch.len_utf8());
283 }
284 }
285
286 state.add_token(DjangoSyntaxKind::Error, start_pos, state.get_position());
289 true
290 }
291 else {
292 false
293 }
294 }
295 else {
296 false
297 }
298 }
299
300 fn lex_django_tags<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
302 let start_pos = state.get_position();
303
304 if let Some('{') = state.peek() {
305 state.advance(1);
306
307 if let Some(next_ch) = state.peek() {
308 match next_ch {
309 '{' => {
310 state.advance(1);
312 state.add_token(DjangoSyntaxKind::VariableStart, start_pos, state.get_position());
313 true
314 }
315 '%' => {
316 state.advance(1);
318 state.add_token(DjangoSyntaxKind::TagStart, start_pos, state.get_position());
319 true
320 }
321 '#' => {
322 state.advance(1);
324 state.add_token(DjangoSyntaxKind::CommentStart, start_pos, state.get_position());
325 true
326 }
327 _ => {
328 state.set_position(start_pos);
330 false
331 }
332 }
333 }
334 else {
335 state.set_position(start_pos);
337 false
338 }
339 }
340 else if let Some('%') = state.peek() {
341 state.advance(1);
342 if let Some('}') = state.peek() {
343 state.advance(1);
344 state.add_token(DjangoSyntaxKind::TagEnd, start_pos, state.get_position());
345 true
346 }
347 else {
348 state.set_position(start_pos);
349 false
350 }
351 }
352 else if let Some('}') = state.peek() {
353 state.advance(1);
354 if let Some('}') = state.peek() {
355 state.advance(1);
356 state.add_token(DjangoSyntaxKind::VariableEnd, start_pos, state.get_position());
357 true
358 }
359 else {
360 state.set_position(start_pos);
361 false
362 }
363 }
364 else {
365 false
366 }
367 }
368
369 fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371 let start_pos = state.get_position();
372
373 if let Some(ch) = state.peek() {
374 let kind = match ch {
375 '=' => {
376 state.advance(1);
377 if let Some('=') = state.peek() {
378 state.advance(1);
379 Some(DjangoSyntaxKind::EqualEqual)
380 }
381 else {
382 Some(DjangoSyntaxKind::Equal)
383 }
384 }
385 '!' => {
386 state.advance(1);
387 if let Some('=') = state.peek() {
388 state.advance(1);
389 Some(DjangoSyntaxKind::NotEqual)
390 }
391 else {
392 None
393 }
394 }
395 '<' => {
396 state.advance(1);
397 if let Some('=') = state.peek() {
398 state.advance(1);
399 Some(DjangoSyntaxKind::LessEqual)
400 }
401 else {
402 Some(DjangoSyntaxKind::Less)
403 }
404 }
405 '>' => {
406 state.advance(1);
407 if let Some('=') = state.peek() {
408 state.advance(1);
409 Some(DjangoSyntaxKind::GreaterEqual)
410 }
411 else {
412 Some(DjangoSyntaxKind::Greater)
413 }
414 }
415 '|' => {
416 state.advance(1);
417 Some(DjangoSyntaxKind::Pipe)
418 }
419 ':' => {
420 state.advance(1);
421 Some(DjangoSyntaxKind::Colon)
422 }
423 '.' => {
424 state.advance(1);
425 Some(DjangoSyntaxKind::Dot)
426 }
427 ',' => {
428 state.advance(1);
429 Some(DjangoSyntaxKind::Comma)
430 }
431 '+' => {
432 state.advance(1);
433 Some(DjangoSyntaxKind::Plus)
434 }
435 '-' => {
436 state.advance(1);
437 Some(DjangoSyntaxKind::Minus)
438 }
439 '*' => {
440 state.advance(1);
441 Some(DjangoSyntaxKind::Star)
442 }
443 '/' => {
444 state.advance(1);
445 Some(DjangoSyntaxKind::Slash)
446 }
447 _ => None,
448 };
449
450 if let Some(kind) = kind {
451 state.add_token(kind, start_pos, state.get_position());
452 true
453 }
454 else {
455 state.set_position(start_pos);
456 false
457 }
458 }
459 else {
460 false
461 }
462 }
463
464 fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
466 let start_pos = state.get_position();
467
468 if let Some(ch) = state.peek() {
469 let kind = match ch {
470 '(' => Some(DjangoSyntaxKind::LeftParen),
471 ')' => Some(DjangoSyntaxKind::RightParen),
472 '[' => Some(DjangoSyntaxKind::LeftBracket),
473 ']' => Some(DjangoSyntaxKind::RightBracket),
474 ';' => Some(DjangoSyntaxKind::Semicolon),
475 _ => None,
476 };
477
478 if let Some(kind) = kind {
479 state.advance(1);
480 state.add_token(kind, start_pos, state.get_position());
481 true
482 }
483 else {
484 false
485 }
486 }
487 else {
488 false
489 }
490 }
491
492 fn lex_html_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
494 let start_pos = state.get_position();
495
496 while let Some(ch) = state.peek() {
497 if ch == '{' {
499 let current_pos = state.get_position();
500 state.advance(1);
501 if let Some(next_ch) = state.peek() {
502 if next_ch == '{' || next_ch == '%' || next_ch == '#' {
503 state.set_position(current_pos);
504 break;
505 }
506 }
507 }
508 state.advance(ch.len_utf8());
509 }
510
511 if state.get_position() > start_pos {
512 state.add_token(DjangoSyntaxKind::HtmlText, start_pos, state.get_position());
513 true
514 }
515 else {
516 false
517 }
518 }
519}