//! oak_django/lexer/mod.rs — lexer for the Django template language.

1use crate::{kind::DjangoSyntaxKind, language::DjangoLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialized for the Django language.
type State<S> = LexerState<S, DjangoLanguage>;

// Shared scanner configurations, built lazily on first use.
static DJANGO_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
// NOTE(review): DJANGO_COMMENT is not referenced anywhere in this file —
// `skip_comment` matches "{#" by hand instead. Confirm whether it is used
// elsewhere before removing.
static DJANGO_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &["{#"] });
// Quote and escape configuration used by `lex_string_literal`.
static DJANGO_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: Some('\\') });
14
/// Tokenizer for Django templates.
///
/// Borrows the language configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct DjangoLexer<'config> {
    // Language configuration. NOTE(review): stored but never read in this
    // file — presumably reserved for future options; confirm before removing.
    config: &'config DjangoLanguage,
}
19
20impl<'config> DjangoLexer<'config> {
21    pub fn new(config: &'config DjangoLanguage) -> Self {
22        Self { config }
23    }
24
25    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
26        while state.not_at_end() {
27            let safe_point = state.get_position();
28            if self.skip_whitespace(state) {
29                continue;
30            }
31
32            if self.skip_comment(state) {
33                continue;
34            }
35
36            if self.lex_string(state) {
37                continue;
38            }
39
40            if self.lex_number(state) {
41                continue;
42            }
43
44            if self.lex_identifier_or_keyword(state) {
45                continue;
46            }
47
48            if self.lex_django_tags(state) {
49                continue;
50            }
51
52            if self.lex_operator(state) {
53                continue;
54            }
55
56            if self.lex_delimiter(state) {
57                continue;
58            }
59
60            if self.lex_html_text(state) {
61                continue;
62            }
63
64            state.safe_check(safe_point);
65        }
66
67        // 添加 EOF kind
68        let eof_pos = state.get_position();
69        state.add_token(DjangoSyntaxKind::Eof, eof_pos, eof_pos);
70        Ok(())
71    }
72
73    /// 跳过空白字符
74    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
75        match DJANGO_WHITESPACE.scan(state.rest(), state.get_position(), DjangoSyntaxKind::Whitespace) {
76            Some(token) => {
77                let start = state.get_position();
78                state.advance(token.length());
79                state.add_token(DjangoSyntaxKind::Whitespace, start, state.get_position());
80                true
81            }
82            None => false,
83        }
84    }
85
86    /// 跳过注释
87    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
88        if state.rest().starts_with("{#") {
89            let start = state.get_position();
90            state.advance(2); // 跳过 "{#"
91
92            // 查找注释结束标记 "#}"
93            while state.not_at_end() {
94                if state.rest().starts_with("#}") {
95                    state.advance(2); // 跳过 "#}"
96                    break;
97                }
98                state.advance(1);
99            }
100
101            state.add_token(DjangoSyntaxKind::Comment, start, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    /// 处理字符串字面量
110    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
111        match DJANGO_STRING.scan(state.rest(), state.get_position(), DjangoSyntaxKind::String) {
112            Some(token) => {
113                let start = state.get_position();
114                state.advance(token.length());
115                state.add_token(DjangoSyntaxKind::String, start, state.get_position());
116                true
117            }
118            None => false,
119        }
120    }
121
122    /// 处理换行
123    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
124        let start_pos = state.get_position();
125
126        if let Some('\n') = state.peek() {
127            state.advance(1);
128            state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
129            true
130        }
131        else if let Some('\r') = state.peek() {
132            state.advance(1);
133            if let Some('\n') = state.peek() {
134                state.advance(1);
135            }
136            state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
137            true
138        }
139        else {
140            false
141        }
142    }
143
144    /// 处理标识符和关键字
145    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
146        let start_pos = state.get_position();
147
148        if let Some(ch) = state.peek() {
149            if ch.is_alphabetic() || ch == '_' {
150                state.advance(ch.len_utf8());
151
152                while let Some(ch) = state.peek() {
153                    if ch.is_alphanumeric() || ch == '_' {
154                        state.advance(ch.len_utf8());
155                    }
156                    else {
157                        break;
158                    }
159                }
160
161                let end_pos = state.get_position();
162                let text = state.get_text_in((start_pos..end_pos).into());
163
164                let token_kind = match text {
165                    "if" => DjangoSyntaxKind::If,
166                    "elif" => DjangoSyntaxKind::Elif,
167                    "else" => DjangoSyntaxKind::Else,
168                    "endif" => DjangoSyntaxKind::Endif,
169                    "for" => DjangoSyntaxKind::For,
170                    "empty" => DjangoSyntaxKind::Empty,
171                    "endfor" => DjangoSyntaxKind::Endfor,
172                    "block" => DjangoSyntaxKind::Block,
173                    "endblock" => DjangoSyntaxKind::Endblock,
174                    "extends" => DjangoSyntaxKind::Extends,
175                    "include" => DjangoSyntaxKind::Include,
176                    "load" => DjangoSyntaxKind::Load,
177                    "with" => DjangoSyntaxKind::With,
178                    "endwith" => DjangoSyntaxKind::Endwith,
179                    "autoescape" => DjangoSyntaxKind::Autoescape,
180                    "endautoescape" => DjangoSyntaxKind::Endautoescape,
181                    "csrf_token" => DjangoSyntaxKind::Csrf,
182                    "url" => DjangoSyntaxKind::Url,
183                    "static" => DjangoSyntaxKind::Static,
184                    "now" => DjangoSyntaxKind::Now,
185                    "cycle" => DjangoSyntaxKind::Cycle,
186                    "filter" => DjangoSyntaxKind::Filter,
187                    "endfilter" => DjangoSyntaxKind::Endfilter,
188                    "spaceless" => DjangoSyntaxKind::Spaceless,
189                    "endspaceless" => DjangoSyntaxKind::Endspaceless,
190                    "verbatim" => DjangoSyntaxKind::Verbatim,
191                    "endverbatim" => DjangoSyntaxKind::Endverbatim,
192                    "and" => DjangoSyntaxKind::And,
193                    "or" => DjangoSyntaxKind::Or,
194                    "not" => DjangoSyntaxKind::Not,
195                    "in" => DjangoSyntaxKind::In,
196                    _ => DjangoSyntaxKind::Identifier,
197                };
198
199                state.add_token(token_kind, start_pos, state.get_position());
200                true
201            }
202            else {
203                false
204            }
205        }
206        else {
207            false
208        }
209    }
210
211    /// 处理数字
212    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
213        let start_pos = state.get_position();
214
215        if let Some(ch) = state.peek() {
216            if ch.is_ascii_digit() {
217                state.advance(ch.len_utf8());
218
219                // 处理整数部分
220                while let Some(ch) = state.peek() {
221                    if ch.is_ascii_digit() {
222                        state.advance(ch.len_utf8());
223                    }
224                    else {
225                        break;
226                    }
227                }
228
229                // 处理小数部分
230                if let Some('.') = state.peek() {
231                    let dot_pos = state.get_position();
232                    state.advance(1);
233
234                    if let Some(ch) = state.peek() {
235                        if ch.is_ascii_digit() {
236                            while let Some(ch) = state.peek() {
237                                if ch.is_ascii_digit() {
238                                    state.advance(ch.len_utf8());
239                                }
240                                else {
241                                    break;
242                                }
243                            }
244                        }
245                        else {
246                            // 回退点号
247                            state.set_position(dot_pos);
248                        }
249                    }
250                    else {
251                        // 回退点号
252                        state.set_position(dot_pos);
253                    }
254                }
255
256                state.add_token(DjangoSyntaxKind::Number, start_pos, state.get_position());
257                true
258            }
259            else {
260                false
261            }
262        }
263        else {
264            false
265        }
266    }
267
268    /// 处理字符
269
270    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
271        let start_pos = state.get_position();
272
273        if let Some(quote) = state.peek() {
274            if quote == '"' || quote == '\'' {
275                state.advance(1);
276
277                while let Some(ch) = state.peek() {
278                    if ch == quote {
279                        state.advance(1);
280                        state.add_token(DjangoSyntaxKind::String, start_pos, state.get_position());
281                        return true;
282                    }
283                    else if ch == '\\' {
284                        state.advance(1);
285                        if state.peek().is_some() {
286                            state.advance(1);
287                        }
288                    }
289                    else {
290                        state.advance(ch.len_utf8());
291                    }
292                }
293
294                // 未闭合的字符
295
296                state.add_token(DjangoSyntaxKind::Error, start_pos, state.get_position());
297                true
298            }
299            else {
300                false
301            }
302        }
303        else {
304            false
305        }
306    }
307
308    /// 处理 Django 标签
309    fn lex_django_tags<S: Source>(&self, state: &mut State<S>) -> bool {
310        let start_pos = state.get_position();
311
312        if let Some('{') = state.peek() {
313            state.advance(1);
314
315            if let Some(next_ch) = state.peek() {
316                match next_ch {
317                    '{' => {
318                        // 变量标签 {{
319                        state.advance(1);
320                        state.add_token(DjangoSyntaxKind::VariableStart, start_pos, state.get_position());
321                        true
322                    }
323                    '%' => {
324                        // 模板标签 {%
325                        state.advance(1);
326                        state.add_token(DjangoSyntaxKind::TagStart, start_pos, state.get_position());
327                        true
328                    }
329                    '#' => {
330                        // 注释标签 {#
331                        state.advance(1);
332                        state.add_token(DjangoSyntaxKind::CommentStart, start_pos, state.get_position());
333                        true
334                    }
335                    _ => {
336                        // 回退
337                        state.set_position(start_pos);
338                        false
339                    }
340                }
341            }
342            else {
343                // 回退
344                state.set_position(start_pos);
345                false
346            }
347        }
348        else if let Some('%') = state.peek() {
349            state.advance(1);
350            if let Some('}') = state.peek() {
351                state.advance(1);
352                state.add_token(DjangoSyntaxKind::TagEnd, start_pos, state.get_position());
353                true
354            }
355            else {
356                state.set_position(start_pos);
357                false
358            }
359        }
360        else if let Some('#') = state.peek() {
361            state.advance(1);
362            if let Some('}') = state.peek() {
363                state.advance(1);
364                state.add_token(DjangoSyntaxKind::CommentEnd, start_pos, state.get_position());
365                true
366            }
367            else {
368                state.set_position(start_pos);
369                false
370            }
371        }
372        else {
373            false
374        }
375    }
376
377    /// 处理操作
378    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
379        let start_pos = state.get_position();
380
381        if let Some(ch) = state.peek() {
382            let token_kind = match ch {
383                '=' => {
384                    state.advance(1);
385                    if let Some('=') = state.peek() {
386                        state.advance(1);
387                        DjangoSyntaxKind::EqualEqual
388                    }
389                    else {
390                        DjangoSyntaxKind::Equal
391                    }
392                }
393                '!' => {
394                    state.advance(1);
395                    if let Some('=') = state.peek() {
396                        state.advance(1);
397                        DjangoSyntaxKind::NotEqual
398                    }
399                    else {
400                        return false;
401                    }
402                }
403                '<' => {
404                    state.advance(1);
405                    if let Some('=') = state.peek() {
406                        state.advance(1);
407                        DjangoSyntaxKind::LessEqual
408                    }
409                    else {
410                        DjangoSyntaxKind::Less
411                    }
412                }
413                '>' => {
414                    state.advance(1);
415                    if let Some('=') = state.peek() {
416                        state.advance(1);
417                        DjangoSyntaxKind::GreaterEqual
418                    }
419                    else {
420                        DjangoSyntaxKind::Greater
421                    }
422                }
423                '+' => {
424                    state.advance(1);
425                    DjangoSyntaxKind::Plus
426                }
427                '-' => {
428                    state.advance(1);
429                    DjangoSyntaxKind::Minus
430                }
431                '*' => {
432                    state.advance(1);
433                    DjangoSyntaxKind::Star
434                }
435                '/' => {
436                    state.advance(1);
437                    DjangoSyntaxKind::Slash
438                }
439                '%' => {
440                    state.advance(1);
441                    DjangoSyntaxKind::Percent
442                }
443                '|' => {
444                    state.advance(1);
445                    DjangoSyntaxKind::Pipe
446                }
447                _ => return false,
448            };
449
450            state.add_token(token_kind, start_pos, state.get_position());
451            true
452        }
453        else {
454            false
455        }
456    }
457
458    /// 处理分隔
459
460    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
461        let start_pos = state.get_position();
462
463        if let Some(ch) = state.peek() {
464            let token_kind = match ch {
465                '(' => DjangoSyntaxKind::LeftParen,
466                ')' => DjangoSyntaxKind::RightParen,
467                '[' => DjangoSyntaxKind::LeftBracket,
468                ']' => DjangoSyntaxKind::RightBracket,
469                ',' => DjangoSyntaxKind::Comma,
470                '.' => DjangoSyntaxKind::Dot,
471                ':' => DjangoSyntaxKind::Colon,
472                ';' => DjangoSyntaxKind::Semicolon,
473                _ => return false,
474            };
475
476            state.advance(ch.len_utf8());
477            state.add_token(token_kind, start_pos, state.get_position());
478            true
479        }
480        else {
481            false
482        }
483    }
484
485    /// 处理 HTML 文本
486    fn lex_html_text<S: Source>(&self, state: &mut State<S>) -> bool {
487        let start_pos = state.get_position();
488
489        while let Some(ch) = state.peek() {
490            // 遇到 Django 标签开始符号时停止
491            if ch == '{' || ch == '%' || ch == '#' {
492                break;
493            }
494            // 遇到特殊字符时停
495            if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
496                break;
497            }
498            state.advance(ch.len_utf8());
499        }
500
501        if state.get_position() > start_pos {
502            state.add_token(DjangoSyntaxKind::HtmlText, start_pos, state.get_position());
503            true
504        }
505        else {
506            false
507        }
508    }
509}
510
511impl<'config> Lexer<DjangoLanguage> for DjangoLexer<'config> {
512    fn lex_incremental(
513        &self,
514        source: impl Source,
515        _changed: usize,
516        _cache: IncrementalCache<DjangoLanguage>,
517    ) -> LexOutput<DjangoLanguage> {
518        let mut state = LexerState::new_with_cache(source, _changed, _cache);
519        let result = self.run(&mut state);
520        state.finish(result)
521    }
522}