
oak_html/lexer/mod.rs

use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{LexOutput, StringConfig},
    source::{Source, TextEdit},
};
use std::{simd::prelude::*, sync::LazyLock};

type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

// Static HTML lexer configuration

static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });

#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    _config: &'config HtmlLanguage,
}

impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> HtmlLexer<'config> {
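    /// Creates an HTML lexer that borrows the given language configuration.
    ///
    /// A minimal construction sketch (hypothetical: assumes `HtmlLanguage`
    /// implements `Default`):
    ///
    /// ```ignore
    /// let config = HtmlLanguage::default(); // assumed constructor
    /// let lexer = HtmlLexer::new(&config);
    /// ```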
    pub fn new(config: &'config HtmlLanguage) -> Self {
        Self { _config: config }
    }

    /// The main lexing loop: dispatches on the current character and emits tokens until the end of input.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' | '\n' | '\r' => {
                        self.skip_whitespace(state);
                    }
                    '<' => {
                        if let Some(next) = state.peek_next_n(1) {
                            if next == '!' {
                                if state.starts_with("<!--") {
                                    self.lex_comment(state);
                                }
                                else if state.starts_with("<![CDATA[") {
                                    self.lex_cdata(state);
                                }
                                else {
                                    // Try `<!DOCTYPE ...>`; if it does not match (e.g. `<!FOO>`),
                                    // fall back to lex_tag_operators, which consumes `<` as TagOpen.
                                    if !self.lex_doctype(state) {
                                        self.lex_tag_operators(state);
                                    }
                                }
                            }
                            else if next == '?' {
                                self.lex_processing_instruction(state);
                            }
                            else {
                                self.lex_tag_operators(state);
                            }
                        }
                        else {
                            self.lex_tag_operators(state);
                        }
                    }
                    '/' | '>' => {
                        if self.lex_tag_operators(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '&' => {
                        self.lex_entity_reference(state);
                    }
                    '"' | '\'' => {
                        self.lex_string_literal(state);
                    }
                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
                        self.lex_identifier(state);
                    }
                    '=' => {
                        self.lex_single_char_tokens(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }

                        // No rule matched: consume the character as an Error token to avoid an infinite loop.
                        state.advance(ch.len_utf8());
                        state.add_token(HtmlSyntaxKind::Error, safe_point, state.get_position());
                    }
                }
            }

            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }

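    /// Consumes a run of whitespace and emits a single `Whitespace` token.
    /// The SIMD fast path scans 32 bytes at a time and treats any byte `<= 0x20`
    /// as whitespace; the scalar tail uses `is_ascii_whitespace`. Returns `false`
    /// if nothing was consumed.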
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
            let is_le_space = chunk.simd_le(Simd::splat(32));

            if !is_le_space.all() {
                let not_space = !is_le_space;
                let idx = not_space.first_set().unwrap();
                i += idx;
                state.advance(i);
                state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
                return true;
            }
            i += LANES;
        }

        while i < len {
            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
            true
        }
        else {
            false
        }
    }

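    /// Lexes a comment starting with `<!--`. An unterminated comment consumes
    /// the rest of the input and is still emitted as `Comment`.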
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        if !state.starts_with("<!--") {
            return false;
        }

        let start = state.get_position();
        let len = {
            let rest = state.rest();
            match rest.find("-->") {
                Some(end_at) => end_at + "-->".len(),
                None => rest.len(),
            }
        };
        state.advance(len);
        state.add_token(HtmlSyntaxKind::Comment, start, state.get_position());
        true
    }

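    /// Lexes a `<!DOCTYPE ...>` declaration (matched case-insensitively).
    /// Emits `Doctype` on success, an `Error` token if the declaration is never
    /// closed with `>`, and returns `false` if the input is not a doctype.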
    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('D' | 'd') = state.peek_next_n(2) {
                    // DOCTYPE is matched case-insensitively (e.g. `<!doctype html>`)
                    let doctype_start = "DOCTYPE";
                    let mut matches = true;

                    for (i, expected_ch) in doctype_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
                            if actual_ch.to_ascii_uppercase() != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(2 + doctype_start.len()); // Skip <!DOCTYPE

                        // Find doctype end >
                        while state.not_at_end() {
                            if let Some('>') = state.peek() {
                                state.advance(1); // Skip >
                                state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
                                return true;
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // Unclosed doctype
                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

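    /// Lexes a `<![CDATA[ ... ]]>` section into `CData`. An unterminated
    /// section is emitted as an `Error` token.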
    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('[') = state.peek_next_n(2) {
                    let cdata_start = "CDATA[";
                    let mut matches = true;

                    for (i, expected_ch) in cdata_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
                            if actual_ch != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[

                        // Find CDATA end ]]>
                        while state.not_at_end() {
                            if let Some(']') = state.peek() {
                                if let Some(']') = state.peek_next_n(1) {
                                    if let Some('>') = state.peek_next_n(2) {
                                        state.advance(3); // Skip ]]>
                                        state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
                                        return true;
                                    }
                                }
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // Unclosed CDATA
                        state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

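    /// Lexes a processing instruction `<? ... ?>` into `ProcessingInstruction`.
    /// An unterminated instruction is emitted as an `Error` token.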
    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('?') = state.peek_next_n(1) {
                state.advance(2); // Skip <?

                // Find processing instruction end ?>
                while state.not_at_end() {
                    if let Some('?') = state.peek() {
                        if let Some('>') = state.peek_next_n(1) {
                            state.advance(2); // Skip ?>
                            state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
                            return true;
                        }
                    }
                    if let Some(ch) = state.peek() {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                // Unclosed processing instruction
                state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

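    /// Lexes the tag delimiters: `</` as `TagSlashOpen`, `<` as `TagOpen`,
    /// `/>` as `TagSelfClose` and `>` as `TagClose`. A lone `/` returns `false`.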
    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        match state.peek() {
            Some('<') => {
                if let Some('/') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
                    true
                }
                else {
                    state.advance(1);
                    state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
                    true
                }
            }
            Some('/') => {
                if let Some('>') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
                    true
                }
                else {
                    false
                }
            }
            Some('>') => {
                state.advance(1);
                state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
                true
            }
            _ => false,
        }
    }

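    /// Lexes a named entity reference (`&name;`, emitted as `EntityRef`) or a
    /// numeric character reference (`&#123;` / `&#x1A;`, emitted as `CharRef`).
    /// Anything else after `&` is emitted as an `Error` token.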
    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('&') = state.peek() {
            state.advance(1);

            if let Some('#') = state.peek() {
                state.advance(1);

                // Character reference &#123; or &#x1A;
                if let Some('x' | 'X') = state.peek() {
                    state.advance(1);
                    // Hexadecimal character reference
                    let mut has_digits = false;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
                else {
                    // Decimal character reference
                    let mut has_digits = false;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_digit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            else {
                // Named entity reference &name;
                let mut has_name = false;
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() {
                        state.advance(1);
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }

                if has_name && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
                    return true;
                }
            }

            // Invalid entity reference
            state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
            return true;
        }

        false
    }

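    /// Lexes a quoted attribute value using the shared `HTML_STRING` config
    /// (single or double quotes, no escape character), emitting `AttributeValue`.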
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        HTML_STRING.scan(state, HtmlSyntaxKind::AttributeValue)
    }

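    /// Lexes a tag or attribute name: an ASCII letter, `_` or `:` followed by
    /// letters, digits, `_`, `-`, `.` or `:`. Emitted as `TagName`.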
    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
                state.advance(ch.len_utf8());

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

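    /// Lexes simple single-character tokens: `=` as `Equal` and quotes as
    /// `Quote` (the main loop currently routes quotes to `lex_string_literal`).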
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        let kind = match state.peek() {
            Some('=') => HtmlSyntaxKind::Equal,
            Some('"') => HtmlSyntaxKind::Quote,
            Some('\'') => HtmlSyntaxKind::Quote,
            Some('!') => return false, // Handled elsewhere
            Some('?') => return false, // Handled elsewhere
            Some('&') => return false, // Handled elsewhere
            Some(';') => return false, // Handled elsewhere
            _ => return false,
        };

        if let Some(ch) = state.peek() {
            state.advance(ch.len_utf8());
            state.add_token(kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

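    /// Consumes a run of text up to the next `<`, `&` or whitespace and emits a
    /// `Text` token, using the same 32-lane SIMD scan as `skip_whitespace`.
    /// Returns `false` if nothing was consumed.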
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });

            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
            let is_le_space = chunk.simd_le(Simd::splat(32));

            let stop = is_lt | is_amp | is_le_space;

            if stop.any() {
                let idx = stop.first_set().unwrap();
                i += idx;
                state.advance(i);
                state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
                return true;
            }
            i += LANES;
        }

        while i < len {
            let ch = unsafe { *bytes.get_unchecked(i) };
            if ch == b'<' || ch == b'&' || ch.is_ascii_whitespace() {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}