oak_html/lexer/
mod.rs

1use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{LexOutput, StringConfig},
5    source::{Source, TextEdit},
6};
7use std::{simd::prelude::*, sync::LazyLock};
8
9type State<'a, S> = LexerState<'a, S, HtmlLanguage>;
10
11// HTML 静态配置
12
13static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
14
/// Lexer that splits HTML source text into [`HtmlSyntaxKind`] tokens.
///
/// Holds a borrowed [`HtmlLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct HtmlLexer<'config> {
    /// Language configuration supplied to [`HtmlLexer::new`]. None of the
    /// scanning routines in this file read it (hence the leading underscore).
    _config: &'config HtmlLanguage,
}
19
20impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
22        let mut state = LexerState::new(source);
23        let result = self.run(&mut state);
24        state.finish_with_cache(result, cache)
25    }
26}
27
28impl<'config> HtmlLexer<'config> {
29    pub fn new(config: &'config HtmlLanguage) -> Self {
30        Self { _config: config }
31    }
32
33    /// 主要的词法分析循环
34    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35        while state.not_at_end() {
36            let safe_point = state.get_position();
37
38            if let Some(ch) = state.peek() {
39                match ch {
40                    ' ' | '\t' | '\n' | '\r' => {
41                        self.skip_whitespace(state);
42                    }
43                    '<' => {
44                        if let Some(next) = state.peek_next_n(1) {
45                            if next == '!' {
46                                if state.starts_with("<!--") {
47                                    self.lex_comment(state);
48                                }
49                                else if state.starts_with("<![CDATA[") {
50                                    self.lex_cdata(state);
51                                }
52                                else {
53                                    // Try Doctype
54                                    if !self.lex_doctype(state) {
55                                        // Fallback to tag operator (TagOpen) or Text?
56                                        // Original loop: tries doctype, cdata, then tag_operators.
57                                        // If doctype fails (e.g. <!FOO>), tag_operators will see < and consume it as TagOpen.
58                                        self.lex_tag_operators(state);
59                                    }
60                                }
61                            }
62                            else if next == '?' {
63                                self.lex_processing_instruction(state);
64                            }
65                            else {
66                                self.lex_tag_operators(state);
67                            }
68                        }
69                        else {
70                            self.lex_tag_operators(state);
71                        }
72                    }
73                    '/' | '>' => {
74                        if self.lex_tag_operators(state) {
75                            continue;
76                        }
77                        self.lex_text(state);
78                    }
79                    '&' => {
80                        self.lex_entity_reference(state);
81                    }
82                    '"' | '\'' => {
83                        self.lex_string_literal(state);
84                    }
85                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
86                        self.lex_identifier(state);
87                    }
88                    '=' => {
89                        self.lex_single_char_tokens(state);
90                    }
91                    _ => {
92                        if self.lex_text(state) {
93                            continue;
94                        }
95
96                        // 安全点检查,防止无限循环
97                        state.advance(ch.len_utf8());
98                        state.add_token(HtmlSyntaxKind::Error, safe_point, state.get_position());
99                    }
100                }
101            }
102
103            state.advance_if_dead_lock(safe_point);
104        }
105
106        Ok(())
107    }
108
109    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        let start = state.get_position();
111        let bytes = state.rest_bytes();
112        let mut i = 0;
113        let len = bytes.len();
114        const LANES: usize = 32;
115
116        while i + LANES <= len {
117            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
118            let is_le_space = chunk.simd_le(Simd::splat(32));
119
120            if !is_le_space.all() {
121                let not_space = !is_le_space;
122                let idx = not_space.first_set().unwrap();
123                i += idx;
124                state.advance(i);
125                state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
126                return true;
127            }
128            i += LANES;
129        }
130
131        while i < len {
132            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
133                break;
134            }
135            i += 1;
136        }
137
138        if i > 0 {
139            state.advance(i);
140            state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
141            true
142        }
143        else {
144            false
145        }
146    }
147
148    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149        if !state.starts_with("<!--") {
150            return false;
151        }
152
153        let start = state.get_position();
154        let len = {
155            let rest = state.rest();
156            match rest.find("-->") {
157                Some(end_at) => end_at + "-->".len(),
158                None => rest.len(),
159            }
160        };
161        state.advance(len);
162        state.add_token(HtmlSyntaxKind::Comment, start, state.get_position());
163        true
164    }
165
166    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
167        let start_pos = state.get_position();
168
169        if let Some('<') = state.peek() {
170            if let Some('!') = state.peek_next_n(1) {
171                if let Some('D') = state.peek_next_n(2) {
172                    let doctype_start = "DOCTYPE";
173                    let mut matches = true;
174
175                    for (i, expected_ch) in doctype_start.chars().enumerate() {
176                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
177                            if actual_ch.to_ascii_uppercase() != expected_ch {
178                                matches = false;
179                                break;
180                            }
181                        }
182                        else {
183                            matches = false;
184                            break;
185                        }
186                    }
187
188                    if matches {
189                        state.advance(2 + doctype_start.len()); // Skip <!DOCTYPE
190
191                        // Find doctype end >
192                        while state.not_at_end() {
193                            if let Some('>') = state.peek() {
194                                state.advance(1); // Skip >
195                                state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
196                                return true;
197                            }
198                            if let Some(ch) = state.peek() {
199                                state.advance(ch.len_utf8());
200                            }
201                            else {
202                                break;
203                            }
204                        }
205
206                        // Unclosed doctype
207                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
208                        return true;
209                    }
210                }
211            }
212        }
213
214        false
215    }
216
217    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
218        let start_pos = state.get_position();
219
220        if let Some('<') = state.peek() {
221            if let Some('!') = state.peek_next_n(1) {
222                if let Some('[') = state.peek_next_n(2) {
223                    let cdata_start = "CDATA[";
224                    let mut matches = true;
225
226                    for (i, expected_ch) in cdata_start.chars().enumerate() {
227                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
228                            if actual_ch != expected_ch {
229                                matches = false;
230                                break;
231                            }
232                        }
233                        else {
234                            matches = false;
235                            break;
236                        }
237                    }
238
239                    if matches {
240                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
241
242                        // Find CDATA end ]]>
243                        while state.not_at_end() {
244                            if let Some(']') = state.peek() {
245                                if let Some(']') = state.peek_next_n(1) {
246                                    if let Some('>') = state.peek_next_n(2) {
247                                        state.advance(3); // Skip ]]>
248                                        state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
249                                        return true;
250                                    }
251                                }
252                            }
253                            if let Some(ch) = state.peek() {
254                                state.advance(ch.len_utf8());
255                            }
256                            else {
257                                break;
258                            }
259                        }
260
261                        // Unclosed CDATA
262                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
263                        return true;
264                    }
265                }
266            }
267        }
268
269        false
270    }
271
272    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
273        let start_pos = state.get_position();
274
275        if let Some('<') = state.peek() {
276            if let Some('?') = state.peek_next_n(1) {
277                state.advance(2); // Skip <?
278
279                // Find processing instruction end ?>
280                while state.not_at_end() {
281                    if let Some('?') = state.peek() {
282                        if let Some('>') = state.peek_next_n(1) {
283                            state.advance(2); // Skip ?>
284                            state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
285                            return true;
286                        }
287                    }
288                    if let Some(ch) = state.peek() {
289                        state.advance(ch.len_utf8());
290                    }
291                    else {
292                        break;
293                    }
294                }
295
296                // Unclosed processing instruction
297                state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
298                return true;
299            }
300        }
301
302        false
303    }
304
305    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
306        let start_pos = state.get_position();
307
308        match state.peek() {
309            Some('<') => {
310                if let Some('/') = state.peek_next_n(1) {
311                    state.advance(2);
312                    state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
313                    true
314                }
315                else {
316                    state.advance(1);
317                    state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
318                    true
319                }
320            }
321            Some('/') => {
322                if let Some('>') = state.peek_next_n(1) {
323                    state.advance(2);
324                    state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
325                    true
326                }
327                else {
328                    false
329                }
330            }
331            Some('>') => {
332                state.advance(1);
333                state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
334                true
335            }
336            _ => false,
337        }
338    }
339
340    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
341        let start_pos = state.get_position();
342
343        if let Some('&') = state.peek() {
344            state.advance(1);
345
346            if let Some('#') = state.peek() {
347                state.advance(1);
348
349                // Character reference &#123; or &#x1A;
350                if let Some('x') = state.peek() {
351                    state.advance(1);
352                    // Hexadecimal character reference
353                    let mut has_digits = false;
354                    while let Some(ch) = state.peek() {
355                        if ch.is_ascii_hexdigit() {
356                            state.advance(1);
357                            has_digits = true;
358                        }
359                        else {
360                            break;
361                        }
362                    }
363
364                    if has_digits && state.peek() == Some(';') {
365                        state.advance(1);
366                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
367                        return true;
368                    }
369                }
370                else {
371                    // Decimal character reference
372                    let mut has_digits = false;
373                    while let Some(ch) = state.peek() {
374                        if ch.is_ascii_digit() {
375                            state.advance(1);
376                            has_digits = true;
377                        }
378                        else {
379                            break;
380                        }
381                    }
382
383                    if has_digits && state.peek() == Some(';') {
384                        state.advance(1);
385                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
386                        return true;
387                    }
388                }
389            }
390            else {
391                // Named entity reference &name;
392                let mut has_name = false;
393                while let Some(ch) = state.peek() {
394                    if ch.is_ascii_alphanumeric() {
395                        state.advance(1);
396                        has_name = true;
397                    }
398                    else {
399                        break;
400                    }
401                }
402
403                if has_name && state.peek() == Some(';') {
404                    state.advance(1);
405                    state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
406                    return true;
407                }
408            }
409
410            // Invalid entity reference
411            state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
412            return true;
413        }
414
415        false
416    }
417
418    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
419        HTML_STRING.scan(state, HtmlSyntaxKind::AttributeValue)
420    }
421
422    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
423        let start_pos = state.get_position();
424
425        if let Some(ch) = state.peek() {
426            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
427                state.advance(ch.len_utf8());
428
429                while let Some(ch) = state.peek() {
430                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
431                        state.advance(ch.len_utf8());
432                    }
433                    else {
434                        break;
435                    }
436                }
437
438                state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
439                return true;
440            }
441        }
442
443        false
444    }
445
446    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
447        let start_pos = state.get_position();
448
449        let kind = match state.peek() {
450            Some('=') => HtmlSyntaxKind::Equal,
451            Some('"') => HtmlSyntaxKind::Quote,
452            Some('\'') => HtmlSyntaxKind::Quote,
453            Some('!') => return false, // 已在其他地方处理
454            Some('?') => return false, // 已在其他地方处理
455            Some('&') => return false, // 已在其他地方处理
456            Some(';') => return false, // 已在其他地方处理
457            _ => return false,
458        };
459
460        if let Some(ch) = state.peek() {
461            state.advance(ch.len_utf8());
462            state.add_token(kind, start_pos, state.get_position());
463            true
464        }
465        else {
466            false
467        }
468    }
469
470    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
471        let start_pos = state.get_position();
472        let bytes = state.rest_bytes();
473        let mut i = 0;
474        let len = bytes.len();
475        const LANES: usize = 32;
476
477        while i + LANES <= len {
478            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
479
480            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
481            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
482            let is_le_space = chunk.simd_le(Simd::splat(32));
483
484            let stop = is_lt | is_amp | is_le_space;
485
486            if stop.any() {
487                let idx = stop.first_set().unwrap();
488                i += idx;
489                state.advance(i);
490                state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
491                return true;
492            }
493            i += LANES;
494        }
495
496        while i < len {
497            let ch = unsafe { *bytes.get_unchecked(i) };
498            if ch == b'<' || ch == b'&' || ch.is_ascii_whitespace() {
499                break;
500            }
501            i += 1;
502        }
503
504        if i > 0 {
505            state.advance(i);
506            state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
507            true
508        }
509        else {
510            false
511        }
512    }
513}