oak_html/lexer/
mod.rs

1use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentBlock, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the generic lexer state specialised to the HTML language.
type State<S> = LexerState<S, HtmlLanguage>;

// Static HTML lexing configuration, built lazily on first use.

/// Whitespace scanner; `unicode_whitespace: true` accepts Unicode whitespace,
/// not just ASCII space/tab/newline.
static HTML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Block-comment scanner for `<!-- ... -->`; HTML comments do not nest.
static HTML_COMMENT: LazyLock<CommentBlock> =
    LazyLock::new(|| CommentBlock { block_markers: &[("<!--", "-->")], nested_blocks: false });

/// Attribute-value string scanner: single- or double-quoted, with no escape
/// character (HTML strings use entity references instead of backslash escapes).
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
18
/// HTML lexer that borrows its language configuration for lifetime `'config`.
#[derive(Clone)]
pub struct HtmlLexer<'config> {
    // Language configuration supplied at construction.
    // NOTE(review): no method in this file reads this field yet — presumably
    // reserved for configurable lexing behavior; confirm before removing.
    config: &'config HtmlLanguage,
}
23
24impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
25    fn lex_incremental(
26        &self,
27        source: impl Source,
28        changed: usize,
29        cache: IncrementalCache<HtmlLanguage>,
30    ) -> LexOutput<HtmlLanguage> {
31        let mut state = LexerState::new_with_cache(source, changed, cache);
32        let result = self.run(&mut state);
33        state.finish(result)
34    }
35}
36
37impl<'config> HtmlLexer<'config> {
    /// Creates a lexer borrowing the given language configuration.
    pub fn new(config: &'config HtmlLanguage) -> Self {
        Self { config }
    }
41
42    /// 主要的词法分析循环
43    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
44        while state.not_at_end() {
45            let safe_point = state.get_position();
46
47            if self.skip_whitespace(state) {
48                continue;
49            }
50
51            if self.lex_comment(state) {
52                continue;
53            }
54
55            if self.lex_doctype(state) {
56                continue;
57            }
58
59            if self.lex_cdata(state) {
60                continue;
61            }
62
63            if self.lex_processing_instruction(state) {
64                continue;
65            }
66
67            if self.lex_tag_operators(state) {
68                continue;
69            }
70
71            if self.lex_entity_reference(state) {
72                continue;
73            }
74
75            if self.lex_string_literal(state) {
76                continue;
77            }
78
79            if self.lex_identifier(state) {
80                continue;
81            }
82
83            if self.lex_single_char_tokens(state) {
84                continue;
85            }
86
87            if self.lex_text(state) {
88                continue;
89            }
90
91            // 安全点检查,防止无限循环
92            state.safe_check(safe_point);
93        }
94
95        Ok(())
96    }
97
98    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
99        match HTML_WHITESPACE.scan(state.rest(), state.get_position(), HtmlSyntaxKind::Whitespace) {
100            Some(token) => {
101                state.advance_with(token);
102                true
103            }
104            None => false,
105        }
106    }
107
108    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
109        match HTML_COMMENT.scan(state.rest(), state.get_position(), HtmlSyntaxKind::Comment) {
110            Some(token) => {
111                state.advance_with(token);
112                true
113            }
114            None => false,
115        }
116    }
117
118    fn lex_doctype<S: Source>(&self, state: &mut State<S>) -> bool {
119        let start_pos = state.get_position();
120
121        if let Some('<') = state.peek() {
122            if let Some('!') = state.peek_next_n(1) {
123                if let Some('D') = state.peek_next_n(2) {
124                    let doctype_start = "DOCTYPE";
125                    let mut matches = true;
126
127                    for (i, expected_ch) in doctype_start.chars().enumerate() {
128                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
129                            if actual_ch.to_ascii_uppercase() != expected_ch {
130                                matches = false;
131                                break;
132                            }
133                        }
134                        else {
135                            matches = false;
136                            break;
137                        }
138                    }
139
140                    if matches {
141                        state.advance(2 + doctype_start.len()); // Skip <!DOCTYPE
142
143                        // Find doctype end >
144                        while state.not_at_end() {
145                            if let Some('>') = state.peek() {
146                                state.advance(1); // Skip >
147                                state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
148                                return true;
149                            }
150                            if let Some(ch) = state.peek() {
151                                state.advance(ch.len_utf8());
152                            }
153                            else {
154                                break;
155                            }
156                        }
157
158                        // Unclosed doctype
159                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
160                        return true;
161                    }
162                }
163            }
164        }
165
166        false
167    }
168
169    fn lex_cdata<S: Source>(&self, state: &mut State<S>) -> bool {
170        let start_pos = state.get_position();
171
172        if let Some('<') = state.peek() {
173            if let Some('!') = state.peek_next_n(1) {
174                if let Some('[') = state.peek_next_n(2) {
175                    let cdata_start = "CDATA[";
176                    let mut matches = true;
177
178                    for (i, expected_ch) in cdata_start.chars().enumerate() {
179                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
180                            if actual_ch != expected_ch {
181                                matches = false;
182                                break;
183                            }
184                        }
185                        else {
186                            matches = false;
187                            break;
188                        }
189                    }
190
191                    if matches {
192                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
193
194                        // Find CDATA end ]]>
195                        while state.not_at_end() {
196                            if let Some(']') = state.peek() {
197                                if let Some(']') = state.peek_next_n(1) {
198                                    if let Some('>') = state.peek_next_n(2) {
199                                        state.advance(3); // Skip ]]>
200                                        state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
201                                        return true;
202                                    }
203                                }
204                            }
205                            if let Some(ch) = state.peek() {
206                                state.advance(ch.len_utf8());
207                            }
208                            else {
209                                break;
210                            }
211                        }
212
213                        // Unclosed CDATA
214                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
215                        return true;
216                    }
217                }
218            }
219        }
220
221        false
222    }
223
224    fn lex_processing_instruction<S: Source>(&self, state: &mut State<S>) -> bool {
225        let start_pos = state.get_position();
226
227        if let Some('<') = state.peek() {
228            if let Some('?') = state.peek_next_n(1) {
229                state.advance(2); // Skip <?
230
231                // Find processing instruction end ?>
232                while state.not_at_end() {
233                    if let Some('?') = state.peek() {
234                        if let Some('>') = state.peek_next_n(1) {
235                            state.advance(2); // Skip ?>
236                            state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
237                            return true;
238                        }
239                    }
240                    if let Some(ch) = state.peek() {
241                        state.advance(ch.len_utf8());
242                    }
243                    else {
244                        break;
245                    }
246                }
247
248                // Unclosed processing instruction
249                state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
250                return true;
251            }
252        }
253
254        false
255    }
256
257    fn lex_tag_operators<S: Source>(&self, state: &mut State<S>) -> bool {
258        let start_pos = state.get_position();
259
260        match state.peek() {
261            Some('<') => {
262                if let Some('/') = state.peek_next_n(1) {
263                    state.advance(2);
264                    state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
265                    true
266                }
267                else {
268                    state.advance(1);
269                    state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
270                    true
271                }
272            }
273            Some('/') => {
274                if let Some('>') = state.peek_next_n(1) {
275                    state.advance(2);
276                    state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
277                    true
278                }
279                else {
280                    false
281                }
282            }
283            Some('>') => {
284                state.advance(1);
285                state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
286                true
287            }
288            _ => false,
289        }
290    }
291
292    fn lex_entity_reference<S: Source>(&self, state: &mut State<S>) -> bool {
293        let start_pos = state.get_position();
294
295        if let Some('&') = state.peek() {
296            state.advance(1);
297
298            if let Some('#') = state.peek() {
299                state.advance(1);
300
301                // Character reference &#123; or &#x1A;
302                if let Some('x') = state.peek() {
303                    state.advance(1);
304                    // Hexadecimal character reference
305                    let mut has_digits = false;
306                    while let Some(ch) = state.peek() {
307                        if ch.is_ascii_hexdigit() {
308                            state.advance(1);
309                            has_digits = true;
310                        }
311                        else {
312                            break;
313                        }
314                    }
315
316                    if has_digits && state.peek() == Some(';') {
317                        state.advance(1);
318                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
319                        return true;
320                    }
321                }
322                else {
323                    // Decimal character reference
324                    let mut has_digits = false;
325                    while let Some(ch) = state.peek() {
326                        if ch.is_ascii_digit() {
327                            state.advance(1);
328                            has_digits = true;
329                        }
330                        else {
331                            break;
332                        }
333                    }
334
335                    if has_digits && state.peek() == Some(';') {
336                        state.advance(1);
337                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
338                        return true;
339                    }
340                }
341            }
342            else {
343                // Named entity reference &name;
344                let mut has_name = false;
345                while let Some(ch) = state.peek() {
346                    if ch.is_ascii_alphanumeric() {
347                        state.advance(1);
348                        has_name = true;
349                    }
350                    else {
351                        break;
352                    }
353                }
354
355                if has_name && state.peek() == Some(';') {
356                    state.advance(1);
357                    state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
358                    return true;
359                }
360            }
361
362            // Invalid entity reference
363            state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
364            return true;
365        }
366
367        false
368    }
369
370    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
371        match HTML_STRING.scan(state.rest(), 0, HtmlSyntaxKind::AttributeValue) {
372            Some(mut token) => {
373                // Adjust token span to absolute position
374                token.span.start += state.get_position();
375                token.span.end += state.get_position();
376                state.advance_with(token);
377                true
378            }
379            None => false,
380        }
381    }
382
383    fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
384        let start_pos = state.get_position();
385
386        if let Some(ch) = state.peek() {
387            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
388                state.advance(ch.len_utf8());
389
390                while let Some(ch) = state.peek() {
391                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
392                        state.advance(ch.len_utf8());
393                    }
394                    else {
395                        break;
396                    }
397                }
398
399                state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
400                return true;
401            }
402        }
403
404        false
405    }
406
407    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
408        let start_pos = state.get_position();
409
410        let kind = match state.peek() {
411            Some('=') => HtmlSyntaxKind::Equal,
412            Some('"') => HtmlSyntaxKind::Quote,
413            Some('\'') => HtmlSyntaxKind::Quote,
414            Some('!') => return false, // 已在其他地方处理
415            Some('?') => return false, // 已在其他地方处理
416            Some('&') => return false, // 已在其他地方处理
417            Some(';') => return false, // 已在其他地方处理
418            _ => return false,
419        };
420
421        if let Some(ch) = state.peek() {
422            state.advance(ch.len_utf8());
423            state.add_token(kind, start_pos, state.get_position());
424            true
425        }
426        else {
427            false
428        }
429    }
430
431    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
432        let start_pos = state.get_position();
433        let mut has_text = false;
434
435        while let Some(ch) = state.peek() {
436            match ch {
437                '<' | '&' => break,
438                _ if ch.is_whitespace() => break,
439                _ => {
440                    state.advance(ch.len_utf8());
441                    has_text = true;
442                }
443            }
444        }
445
446        if has_text {
447            state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
448            true
449        }
450        else {
451            false
452        }
453    }
454}