//! Django template lexer (`oak_django/lexer/mod.rs`).

1use crate::{kind::DjangoSyntaxKind, language::DjangoLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, DjangoLanguage>;
10
11static DJANGO_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static _DJANGO_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "{#", block_start: "{#", block_end: "#}", nested_blocks: false });
13static DJANGO_STRING_DOUBLE: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14static DJANGO_STRING_SINGLE: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
15
16#[derive(Clone)]
17pub struct DjangoLexer<'config> {
18    _config: &'config DjangoLanguage,
19}
20
21impl<'config> Lexer<DjangoLanguage> for DjangoLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<DjangoLanguage>) -> LexOutput<DjangoLanguage> {
23        let mut state = LexerState::new(source);
24        let result = self.run(&mut state);
25        if result.is_ok() {
26            state.add_eof();
27        }
28        state.finish_with_cache(result, cache)
29    }
30}
31
32impl<'config> DjangoLexer<'config> {
33    pub fn new(config: &'config DjangoLanguage) -> Self {
34        Self { _config: config }
35    }
36
37    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.skip_comment(state) {
45                continue;
46            }
47
48            if self.lex_string(state) || self.lex_string_manual(state) {
49                continue;
50            }
51
52            if self.lex_number(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_django_tags(state) {
61                continue;
62            }
63
64            if self.lex_operator(state) {
65                continue;
66            }
67
68            if self.lex_delimiter(state) {
69                continue;
70            }
71
72            if self.lex_html_text(state) {
73                continue;
74            }
75
76            state.advance_if_dead_lock(safe_point);
77        }
78
79        Ok(())
80    }
81
82    /// 跳过空白字符
83    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
84        DJANGO_WHITESPACE.scan(state, DjangoSyntaxKind::Whitespace)
85    }
86
87    /// 跳过注释
88    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
89        if state.rest().starts_with("{#") {
90            let start = state.get_position();
91            state.advance(2); // 跳过 "{#"
92
93            // 查找注释结束标记 "#}"
94            while state.not_at_end() {
95                if state.rest().starts_with("#}") {
96                    state.advance(2); // 跳过 "#}"
97                    break;
98                }
99                state.advance(1);
100            }
101
102            state.add_token(DjangoSyntaxKind::Comment, start, state.get_position());
103            return true;
104        }
105        false
106    }
107
108    /// 词法分析字符串
109    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        DJANGO_STRING_DOUBLE.scan(state, DjangoSyntaxKind::String) || DJANGO_STRING_SINGLE.scan(state, DjangoSyntaxKind::String)
111    }
112
113    /// 处理换行
114    fn _lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
115        let start_pos = state.get_position();
116
117        if let Some('\n') = state.peek() {
118            state.advance(1);
119            state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
120            true
121        }
122        else if let Some('\r') = state.peek() {
123            state.advance(1);
124            if let Some('\n') = state.peek() {
125                state.advance(1);
126            }
127            state.add_token(DjangoSyntaxKind::Newline, start_pos, state.get_position());
128            true
129        }
130        else {
131            false
132        }
133    }
134
135    /// 处理标识符和关键字
136    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
137        let start_pos = state.get_position();
138
139        if let Some(ch) = state.peek() {
140            if ch.is_alphabetic() || ch == '_' {
141                state.advance(ch.len_utf8());
142
143                while let Some(ch) = state.peek() {
144                    if ch.is_alphanumeric() || ch == '_' {
145                        state.advance(ch.len_utf8());
146                    }
147                    else {
148                        break;
149                    }
150                }
151
152                let end_pos = state.get_position();
153                let text = state.get_text_in((start_pos..end_pos).into());
154
155                let token_kind = match text.as_ref() {
156                    "if" => DjangoSyntaxKind::If,
157                    "elif" => DjangoSyntaxKind::Elif,
158                    "else" => DjangoSyntaxKind::Else,
159                    "endif" => DjangoSyntaxKind::Endif,
160                    "for" => DjangoSyntaxKind::For,
161                    "empty" => DjangoSyntaxKind::Empty,
162                    "endfor" => DjangoSyntaxKind::Endfor,
163                    "block" => DjangoSyntaxKind::Block,
164                    "endblock" => DjangoSyntaxKind::Endblock,
165                    "extends" => DjangoSyntaxKind::Extends,
166                    "include" => DjangoSyntaxKind::Include,
167                    "load" => DjangoSyntaxKind::Load,
168                    "with" => DjangoSyntaxKind::With,
169                    "endwith" => DjangoSyntaxKind::Endwith,
170                    "autoescape" => DjangoSyntaxKind::Autoescape,
171                    "endautoescape" => DjangoSyntaxKind::Endautoescape,
172                    "csrf_token" => DjangoSyntaxKind::Csrf,
173                    "url" => DjangoSyntaxKind::Url,
174                    "static" => DjangoSyntaxKind::Static,
175                    "now" => DjangoSyntaxKind::Now,
176                    "cycle" => DjangoSyntaxKind::Cycle,
177                    "filter" => DjangoSyntaxKind::Filter,
178                    "endfilter" => DjangoSyntaxKind::Endfilter,
179                    "spaceless" => DjangoSyntaxKind::Spaceless,
180                    "endspaceless" => DjangoSyntaxKind::Endspaceless,
181                    "verbatim" => DjangoSyntaxKind::Verbatim,
182                    "endverbatim" => DjangoSyntaxKind::Endverbatim,
183                    "and" => DjangoSyntaxKind::And,
184                    "or" => DjangoSyntaxKind::Or,
185                    "not" => DjangoSyntaxKind::Not,
186                    "in" => DjangoSyntaxKind::In,
187                    _ => DjangoSyntaxKind::Identifier,
188                };
189
190                state.add_token(token_kind, start_pos, state.get_position());
191                true
192            }
193            else {
194                false
195            }
196        }
197        else {
198            false
199        }
200    }
201
202    /// 处理数字
203    /// 词法分析数字
204    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
205        let start_pos = state.get_position();
206
207        if let Some(ch) = state.peek() {
208            if ch.is_ascii_digit() {
209                state.advance(ch.len_utf8());
210
211                // 处理整数部分
212                while let Some(ch) = state.peek() {
213                    if ch.is_ascii_digit() {
214                        state.advance(ch.len_utf8());
215                    }
216                    else {
217                        break;
218                    }
219                }
220
221                // 处理小数部分
222                if let Some('.') = state.peek() {
223                    let dot_pos = state.get_position();
224                    state.advance(1);
225
226                    if let Some(ch) = state.peek() {
227                        if ch.is_ascii_digit() {
228                            while let Some(ch) = state.peek() {
229                                if ch.is_ascii_digit() {
230                                    state.advance(ch.len_utf8());
231                                }
232                                else {
233                                    break;
234                                }
235                            }
236                        }
237                        else {
238                            // 回退点号
239                            state.set_position(dot_pos);
240                        }
241                    }
242                    else {
243                        // 回退点号
244                        state.set_position(dot_pos);
245                    }
246                }
247
248                state.add_token(DjangoSyntaxKind::Number, start_pos, state.get_position());
249                true
250            }
251            else {
252                false
253            }
254        }
255        else {
256            false
257        }
258    }
259
260    /// 处理字符
261
262    fn lex_string_manual<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
263        let start_pos = state.get_position();
264
265        if let Some(quote) = state.peek() {
266            if quote == '"' || quote == '\'' {
267                state.advance(1);
268
269                while let Some(ch) = state.peek() {
270                    if ch == quote {
271                        state.advance(1);
272                        state.add_token(DjangoSyntaxKind::String, start_pos, state.get_position());
273                        return true;
274                    }
275                    else if ch == '\\' {
276                        state.advance(1);
277                        if state.peek().is_some() {
278                            state.advance(1);
279                        }
280                    }
281                    else {
282                        state.advance(ch.len_utf8());
283                    }
284                }
285
286                // 未闭合的字符
287
288                state.add_token(DjangoSyntaxKind::Error, start_pos, state.get_position());
289                true
290            }
291            else {
292                false
293            }
294        }
295        else {
296            false
297        }
298    }
299
300    /// 处理 Django 标签
301    fn lex_django_tags<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
302        let start_pos = state.get_position();
303
304        if let Some('{') = state.peek() {
305            state.advance(1);
306
307            if let Some(next_ch) = state.peek() {
308                match next_ch {
309                    '{' => {
310                        // 变量标签 {{
311                        state.advance(1);
312                        state.add_token(DjangoSyntaxKind::VariableStart, start_pos, state.get_position());
313                        true
314                    }
315                    '%' => {
316                        // 模板标签 {%
317                        state.advance(1);
318                        state.add_token(DjangoSyntaxKind::TagStart, start_pos, state.get_position());
319                        true
320                    }
321                    '#' => {
322                        // 注释标签 {#
323                        state.advance(1);
324                        state.add_token(DjangoSyntaxKind::CommentStart, start_pos, state.get_position());
325                        true
326                    }
327                    _ => {
328                        // 回退
329                        state.set_position(start_pos);
330                        false
331                    }
332                }
333            }
334            else {
335                // 回退
336                state.set_position(start_pos);
337                false
338            }
339        }
340        else if let Some('%') = state.peek() {
341            state.advance(1);
342            if let Some('}') = state.peek() {
343                state.advance(1);
344                state.add_token(DjangoSyntaxKind::TagEnd, start_pos, state.get_position());
345                true
346            }
347            else {
348                state.set_position(start_pos);
349                false
350            }
351        }
352        else if let Some('}') = state.peek() {
353            state.advance(1);
354            if let Some('}') = state.peek() {
355                state.advance(1);
356                state.add_token(DjangoSyntaxKind::VariableEnd, start_pos, state.get_position());
357                true
358            }
359            else {
360                state.set_position(start_pos);
361                false
362            }
363        }
364        else {
365            false
366        }
367    }
368
369    /// 处理操作符
370    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371        let start_pos = state.get_position();
372
373        if let Some(ch) = state.peek() {
374            let kind = match ch {
375                '=' => {
376                    state.advance(1);
377                    if let Some('=') = state.peek() {
378                        state.advance(1);
379                        Some(DjangoSyntaxKind::EqualEqual)
380                    }
381                    else {
382                        Some(DjangoSyntaxKind::Equal)
383                    }
384                }
385                '!' => {
386                    state.advance(1);
387                    if let Some('=') = state.peek() {
388                        state.advance(1);
389                        Some(DjangoSyntaxKind::NotEqual)
390                    }
391                    else {
392                        None
393                    }
394                }
395                '<' => {
396                    state.advance(1);
397                    if let Some('=') = state.peek() {
398                        state.advance(1);
399                        Some(DjangoSyntaxKind::LessEqual)
400                    }
401                    else {
402                        Some(DjangoSyntaxKind::Less)
403                    }
404                }
405                '>' => {
406                    state.advance(1);
407                    if let Some('=') = state.peek() {
408                        state.advance(1);
409                        Some(DjangoSyntaxKind::GreaterEqual)
410                    }
411                    else {
412                        Some(DjangoSyntaxKind::Greater)
413                    }
414                }
415                '|' => {
416                    state.advance(1);
417                    Some(DjangoSyntaxKind::Pipe)
418                }
419                ':' => {
420                    state.advance(1);
421                    Some(DjangoSyntaxKind::Colon)
422                }
423                '.' => {
424                    state.advance(1);
425                    Some(DjangoSyntaxKind::Dot)
426                }
427                ',' => {
428                    state.advance(1);
429                    Some(DjangoSyntaxKind::Comma)
430                }
431                '+' => {
432                    state.advance(1);
433                    Some(DjangoSyntaxKind::Plus)
434                }
435                '-' => {
436                    state.advance(1);
437                    Some(DjangoSyntaxKind::Minus)
438                }
439                '*' => {
440                    state.advance(1);
441                    Some(DjangoSyntaxKind::Star)
442                }
443                '/' => {
444                    state.advance(1);
445                    Some(DjangoSyntaxKind::Slash)
446                }
447                _ => None,
448            };
449
450            if let Some(kind) = kind {
451                state.add_token(kind, start_pos, state.get_position());
452                true
453            }
454            else {
455                state.set_position(start_pos);
456                false
457            }
458        }
459        else {
460            false
461        }
462    }
463
464    /// 处理分隔符
465    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
466        let start_pos = state.get_position();
467
468        if let Some(ch) = state.peek() {
469            let kind = match ch {
470                '(' => Some(DjangoSyntaxKind::LeftParen),
471                ')' => Some(DjangoSyntaxKind::RightParen),
472                '[' => Some(DjangoSyntaxKind::LeftBracket),
473                ']' => Some(DjangoSyntaxKind::RightBracket),
474                ';' => Some(DjangoSyntaxKind::Semicolon),
475                _ => None,
476            };
477
478            if let Some(kind) = kind {
479                state.advance(1);
480                state.add_token(kind, start_pos, state.get_position());
481                true
482            }
483            else {
484                false
485            }
486        }
487        else {
488            false
489        }
490    }
491
492    /// 处理 HTML 文本
493    fn lex_html_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
494        let start_pos = state.get_position();
495
496        while let Some(ch) = state.peek() {
497            // 如果遇到 Django 标签的开始,停止
498            if ch == '{' {
499                let current_pos = state.get_position();
500                state.advance(1);
501                if let Some(next_ch) = state.peek() {
502                    if next_ch == '{' || next_ch == '%' || next_ch == '#' {
503                        state.set_position(current_pos);
504                        break;
505                    }
506                }
507            }
508            state.advance(ch.len_utf8());
509        }
510
511        if state.get_position() > start_pos {
512            state.add_token(DjangoSyntaxKind::HtmlText, start_pos, state.get_position());
513            true
514        }
515        else {
516            false
517        }
518    }
519}