// oak_html/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2/// Token type module for HTML.
3pub mod token_type;
4
5use crate::{language::HtmlLanguage, lexer::token_type::HtmlTokenType};
6use oak_core::{
7    Lexer, LexerCache, LexerState, OakError,
8    lexer::{LexOutput, StringConfig},
9    source::{Source, TextEdit},
10};
11use std::{simd::prelude::*, sync::LazyLock};
12
/// Shorthand for the shared lexer state, specialized to [`HtmlLanguage`].
pub(crate) type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

// HTML static configuration

/// Shared string-scanning configuration: attribute values may be delimited
/// by `"` or `'`, and HTML defines no backslash-escape sequences inside
/// them (`escape: None`).
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
18
/// Lexer for the HTML language.
///
/// This lexer converts a raw string into a stream of HTML syntax tokens.
#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    // Language-level configuration the lexer was constructed with.
    // NOTE(review): the field is stored but not read by any lexing method
    // visible in this file — confirm it is still needed.
    config: &'config HtmlLanguage,
}
26
impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    /// Tokenizes the input source text using the provided cache.
    ///
    /// Builds a fresh lexer state starting at offset 0, runs the main
    /// lexing loop, appends an EOF token only on success, and folds the
    /// result (tokens or error) back through the cache.
    ///
    /// NOTE(review): `_edits` is currently ignored, so every call performs
    /// a full re-lex rather than an incremental one — confirm intentional.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only mark end-of-file when lexing succeeded; an error is
        // propagated through `result` instead.
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
38
39impl<'config> HtmlLexer<'config> {
40    /// Creates a new `HtmlLexer` with the given configuration.
41    pub fn new(config: &'config HtmlLanguage) -> Self {
42        Self { config }
43    }
44
45    /// The main lexing loop that iterates through the source text.
46    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
47        while state.not_at_end() {
48            let safe_point = state.get_position();
49
50            if let Some(ch) = state.peek() {
51                match ch {
52                    ' ' | '\t' | '\n' | '\r' => {
53                        self.skip_whitespace(state);
54                    }
55                    '<' => {
56                        if let Some(next) = state.peek_next_n(1) {
57                            if next == '!' {
58                                if state.starts_with("<!--") {
59                                    self.lex_comment(state);
60                                }
61                                else if state.starts_with("<![CDATA[") {
62                                    self.lex_cdata(state);
63                                }
64                                else {
65                                    // Try Doctype
66                                    if !self.lex_doctype(state) {
67                                        // Fallback to tag operator (TagOpen) or Text?
68                                        // Original loop: tries doctype, cdata, then tag_operators.
69                                        // If doctype fails (e.g. <!FOO>), tag_operators will see < and consume it as TagOpen.
70                                        self.lex_tag_operators(state);
71                                    }
72                                }
73                            }
74                            else if next == '?' {
75                                self.lex_processing_instruction(state);
76                            }
77                            else {
78                                self.lex_tag_operators(state);
79                            }
80                        }
81                        else {
82                            self.lex_tag_operators(state);
83                        }
84                    }
85                    '/' | '>' => {
86                        if self.lex_tag_operators(state) {
87                            continue;
88                        }
89                        self.lex_text(state);
90                    }
91                    '&' => {
92                        self.lex_entity_reference(state);
93                    }
94                    '"' | '\'' => {
95                        self.lex_string_literal(state);
96                    }
97                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
98                        self.lex_identifier(state);
99                    }
100                    '=' => {
101                        self.lex_single_char_tokens(state);
102                    }
103                    _ => {
104                        if self.lex_text(state) {
105                            continue;
106                        }
107
108                        // Safety check to prevent infinite loop
109                        state.advance(ch.len_utf8());
110                        state.add_token(HtmlTokenType::Error, safe_point, state.get_position());
111                    }
112                }
113            }
114
115            state.advance_if_dead_lock(safe_point)
116        }
117
118        Ok(())
119    }
120
121    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
122        let start = state.get_position();
123        let bytes = state.rest_bytes();
124        let mut i = 0;
125        let len = bytes.len();
126        const LANES: usize = 32;
127
128        while i + LANES <= len {
129            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
130            let is_le_space = chunk.simd_le(Simd::splat(32));
131
132            if !is_le_space.all() {
133                let not_space = !is_le_space;
134                let idx = not_space.first_set().unwrap();
135                i += idx;
136                state.advance(i);
137                state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
138                return true;
139            }
140            i += LANES;
141        }
142
143        while i < len {
144            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
145                break;
146            }
147            i += 1;
148        }
149
150        if i > 0 {
151            state.advance(i);
152            state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
153            true
154        }
155        else {
156            false
157        }
158    }
159
160    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
161        if !state.starts_with("<!--") {
162            return false;
163        }
164
165        let start = state.get_position();
166        let len = {
167            let rest = state.rest();
168            match rest.find("-->") {
169                Some(end_at) => end_at + "-->".len(),
170                None => rest.len(),
171            }
172        };
173        state.advance(len);
174        state.add_token(HtmlTokenType::Comment, start, state.get_position());
175        true
176    }
177
178    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
179        let start_pos = state.get_position();
180
181        if let Some('<') = state.peek() {
182            if let Some('!') = state.peek_next_n(1) {
183                if let Some('D') = state.peek_next_n(2) {
184                    let doctype_start = "DOCTYPE";
185                    let mut matches = true;
186
187                    for (i, expected_ch) in doctype_start.chars().enumerate() {
188                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
189                            if actual_ch.to_ascii_uppercase() != expected_ch {
190                                matches = false;
191                                break;
192                            }
193                        }
194                        else {
195                            matches = false;
196                            break;
197                        }
198                    }
199
200                    if matches {
201                        state.advance(2 + doctype_start.len()); // Skip <!DOCTYPE
202
203                        // Find doctype end >
204                        while state.not_at_end() {
205                            if let Some('>') = state.peek() {
206                                state.advance(1); // Skip >
207                                state.add_token(HtmlTokenType::Doctype, start_pos, state.get_position());
208                                return true;
209                            }
210                            if let Some(ch) = state.peek() {
211                                state.advance(ch.len_utf8());
212                            }
213                            else {
214                                break;
215                            }
216                        }
217
218                        // Unclosed doctype
219                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
220                        return true;
221                    }
222                }
223            }
224        }
225
226        false
227    }
228
229    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
230        let start_pos = state.get_position();
231
232        if let Some('<') = state.peek() {
233            if let Some('!') = state.peek_next_n(1) {
234                if let Some('[') = state.peek_next_n(2) {
235                    let cdata_start = "CDATA[";
236                    let mut matches = true;
237
238                    for (i, expected_ch) in cdata_start.chars().enumerate() {
239                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
240                            if actual_ch != expected_ch {
241                                matches = false;
242                                break;
243                            }
244                        }
245                        else {
246                            matches = false;
247                            break;
248                        }
249                    }
250
251                    if matches {
252                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
253
254                        // Find CDATA end ]]>
255                        while state.not_at_end() {
256                            if let Some(']') = state.peek() {
257                                if let Some(']') = state.peek_next_n(1) {
258                                    if let Some('>') = state.peek_next_n(2) {
259                                        state.advance(3); // Skip ]]>
260                                        state.add_token(HtmlTokenType::CData, start_pos, state.get_position());
261                                        return true;
262                                    }
263                                }
264                            }
265                            if let Some(ch) = state.peek() {
266                                state.advance(ch.len_utf8());
267                            }
268                            else {
269                                break;
270                            }
271                        }
272
273                        // Unclosed CDATA
274                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
275                        return true;
276                    }
277                }
278            }
279        }
280
281        false
282    }
283
284    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
285        let start_pos = state.get_position();
286
287        if let Some('<') = state.peek() {
288            if let Some('?') = state.peek_next_n(1) {
289                state.advance(2); // Skip <?
290
291                // Find processing instruction end ?>
292                while state.not_at_end() {
293                    if let Some('?') = state.peek() {
294                        if let Some('>') = state.peek_next_n(1) {
295                            state.advance(2); // Skip ?>
296                            state.add_token(HtmlTokenType::ProcessingInstruction, start_pos, state.get_position());
297                            return true;
298                        }
299                    }
300                    if let Some(ch) = state.peek() {
301                        state.advance(ch.len_utf8());
302                    }
303                    else {
304                        break;
305                    }
306                }
307
308                // Unclosed processing instruction
309                state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
310                return true;
311            }
312        }
313
314        false
315    }
316
317    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
318        let start_pos = state.get_position();
319
320        match state.peek() {
321            Some('<') => {
322                if let Some('/') = state.peek_next_n(1) {
323                    state.advance(2);
324                    state.add_token(HtmlTokenType::TagSlashOpen, start_pos, state.get_position());
325                    true
326                }
327                else {
328                    state.advance(1);
329                    state.add_token(HtmlTokenType::TagOpen, start_pos, state.get_position());
330                    true
331                }
332            }
333            Some('/') => {
334                if let Some('>') = state.peek_next_n(1) {
335                    state.advance(2);
336                    state.add_token(HtmlTokenType::TagSelfClose, start_pos, state.get_position());
337                    true
338                }
339                else {
340                    false
341                }
342            }
343            Some('>') => {
344                state.advance(1);
345                state.add_token(HtmlTokenType::TagClose, start_pos, state.get_position());
346                true
347            }
348            _ => false,
349        }
350    }
351
352    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
353        let start_pos = state.get_position();
354
355        if let Some('&') = state.peek() {
356            state.advance(1);
357
358            if let Some('#') = state.peek() {
359                state.advance(1);
360
361                // Character reference &#123; or &#x1A;
362                if let Some('x') = state.peek() {
363                    state.advance(1);
364                    // Hexadecimal character reference
365                    let mut has_digits = false;
366                    while let Some(ch) = state.peek() {
367                        if ch.is_ascii_hexdigit() {
368                            state.advance(1);
369                            has_digits = true;
370                        }
371                        else {
372                            break;
373                        }
374                    }
375
376                    if has_digits && state.peek() == Some(';') {
377                        state.advance(1);
378                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
379                        return true;
380                    }
381                }
382                else {
383                    // Decimal character reference
384                    let mut has_digits = false;
385                    while let Some(ch) = state.peek() {
386                        if ch.is_ascii_digit() {
387                            state.advance(1);
388                            has_digits = true;
389                        }
390                        else {
391                            break;
392                        }
393                    }
394
395                    if has_digits && state.peek() == Some(';') {
396                        state.advance(1);
397                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
398                        return true;
399                    }
400                }
401            }
402            else {
403                // Named entity reference &name;
404                let mut has_name = false;
405                while let Some(ch) = state.peek() {
406                    if ch.is_ascii_alphanumeric() {
407                        state.advance(1);
408                        has_name = true;
409                    }
410                    else {
411                        break;
412                    }
413                }
414
415                if has_name && state.peek() == Some(';') {
416                    state.advance(1);
417                    state.add_token(HtmlTokenType::EntityRef, start_pos, state.get_position());
418                    return true;
419                }
420            }
421
422            // Invalid entity reference
423            state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
424            return true;
425        }
426
427        false
428    }
429
430    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
431        HTML_STRING.scan(state, HtmlTokenType::AttributeValue)
432    }
433
434    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
435        let start_pos = state.get_position();
436
437        if let Some(ch) = state.peek() {
438            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
439                state.advance(ch.len_utf8());
440
441                while let Some(ch) = state.peek() {
442                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
443                        state.advance(ch.len_utf8());
444                    }
445                    else {
446                        break;
447                    }
448                }
449
450                state.add_token(HtmlTokenType::TagName, start_pos, state.get_position());
451                return true;
452            }
453        }
454
455        false
456    }
457
458    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
459        let start_pos = state.get_position();
460
461        let kind = match state.peek() {
462            Some('=') => HtmlTokenType::Equal,
463            Some('"') => HtmlTokenType::Quote,
464            Some('\'') => HtmlTokenType::Quote,
465            Some('!') => return false, // Already handled elsewhere
466            Some('?') => return false, // Already handled elsewhere
467            Some('&') => return false, // Already handled elsewhere
468            Some(';') => return false, // Already handled elsewhere
469            _ => return false,
470        };
471
472        if let Some(ch) = state.peek() {
473            state.advance(ch.len_utf8());
474            state.add_token(kind, start_pos, state.get_position());
475            true
476        }
477        else {
478            false
479        }
480    }
481
482    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
483        let start_pos = state.get_position();
484        let bytes = state.rest_bytes();
485        let mut i = 0;
486        let len = bytes.len();
487        const LANES: usize = 32;
488
489        while i + LANES <= len {
490            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
491
492            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
493            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
494            let is_le_space = chunk.simd_le(Simd::splat(32));
495
496            let stop = is_lt | is_amp | is_le_space;
497
498            if stop.any() {
499                let idx = stop.first_set().unwrap();
500                i += idx;
501                state.advance(i);
502                state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
503                return true;
504            }
505            i += LANES
506        }
507        while i < len {
508            let ch = unsafe { *bytes.get_unchecked(i) };
509            if ch == b'<' || ch == b'&' || ch <= 32 {
510                break;
511            }
512            i += 1
513        }
514
515        if i > 0 {
516            state.advance(i);
517            state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
518            true
519        }
520        else {
521            false
522        }
523    }
524}