// oak_html/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::HtmlLanguage, lexer::token_type::HtmlTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{LexOutput, StringConfig},
8    source::{Source, TextEdit},
9};
10use std::{simd::prelude::*, sync::LazyLock};
11
/// Shorthand for the lexer state specialized to the HTML language.
type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

// HTML static configuration

/// Shared string-scanning configuration for attribute values: single- or
/// double-quoted, with no escape character (HTML uses character references
/// rather than backslash escapes).
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
17
/// Lexer for the HTML language.
///
/// This lexer converts a raw string into a stream of HTML syntax tokens.
#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    // Borrowed language configuration. Not read by any lexing routine in this
    // file (hence the leading underscore); it ties the lexer's lifetime to its
    // configuration — presumably for future configuration-driven behavior.
    _config: &'config HtmlLanguage,
}
25
26impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
27    /// Tokenizes the input source text using the provided cache.
28    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
29        let mut state = State::new_with_cache(source, 0, cache);
30        let result = self.run(&mut state);
31        if result.is_ok() {
32            state.add_eof();
33        }
34        state.finish_with_cache(result, cache)
35    }
36}
37
38impl<'config> HtmlLexer<'config> {
39    /// Creates a new `HtmlLexer` with the given configuration.
40    pub fn new(config: &'config HtmlLanguage) -> Self {
41        Self { _config: config }
42    }
43
    /// The main lexing loop that iterates through the source text.
    ///
    /// Dispatches on the current character to the specialized `lex_*`
    /// helpers. In the `<!` case the probe order matters: comments
    /// (`<!--`) and CDATA (`<![CDATA[`) are checked before the doctype,
    /// and an unrecognized `<!` falls back to a plain `TagOpen`.
    /// `advance_if_dead_lock` at the bottom of the loop is the guard
    /// against a helper that neither consumes input nor errors; note the
    /// `continue` branches skip it, so helpers that return `true` must
    /// have consumed at least one byte.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Position at the top of this iteration; used both for the
            // dead-lock guard and as the start of any Error token.
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' | '\n' | '\r' => {
                        self.skip_whitespace(state);
                    }
                    '<' => {
                        if let Some(next) = state.peek_next_n(1) {
                            if next == '!' {
                                if state.starts_with("<!--") {
                                    self.lex_comment(state);
                                }
                                else if state.starts_with("<![CDATA[") {
                                    self.lex_cdata(state);
                                }
                                else {
                                    // Try Doctype
                                    if !self.lex_doctype(state) {
                                        // Fallback to tag operator (TagOpen) or Text?
                                        // Original loop: tries doctype, cdata, then tag_operators.
                                        // If doctype fails (e.g. <!FOO>), tag_operators will see < and consume it as TagOpen.
                                        self.lex_tag_operators(state);
                                    }
                                }
                            }
                            else if next == '?' {
                                self.lex_processing_instruction(state);
                            }
                            else {
                                self.lex_tag_operators(state);
                            }
                        }
                        else {
                            // Lone '<' at end of input: still emit TagOpen.
                            self.lex_tag_operators(state);
                        }
                    }
                    '/' | '>' => {
                        if self.lex_tag_operators(state) {
                            continue;
                        }
                        // A '/' not followed by '>' is ordinary text content.
                        self.lex_text(state);
                    }
                    '&' => {
                        self.lex_entity_reference(state);
                    }
                    '"' | '\'' => {
                        self.lex_string_literal(state);
                    }
                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
                        self.lex_identifier(state);
                    }
                    '=' => {
                        self.lex_single_char_tokens(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }

                        // Safety check to prevent infinite loop
                        state.advance(ch.len_utf8());
                        state.add_token(HtmlTokenType::Error, safe_point, state.get_position());
                    }
                }
            }

            state.advance_if_dead_lock(safe_point)
        }

        Ok(())
    }
119
120    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
121        let start = state.get_position();
122        let bytes = state.rest_bytes();
123        let mut i = 0;
124        let len = bytes.len();
125        const LANES: usize = 32;
126
127        while i + LANES <= len {
128            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
129            let is_le_space = chunk.simd_le(Simd::splat(32));
130
131            if !is_le_space.all() {
132                let not_space = !is_le_space;
133                let idx = not_space.first_set().unwrap();
134                i += idx;
135                state.advance(i);
136                state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
137                return true;
138            }
139            i += LANES;
140        }
141
142        while i < len {
143            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
144                break;
145            }
146            i += 1;
147        }
148
149        if i > 0 {
150            state.advance(i);
151            state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
152            true
153        }
154        else {
155            false
156        }
157    }
158
159    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160        if !state.starts_with("<!--") {
161            return false;
162        }
163
164        let start = state.get_position();
165        let len = {
166            let rest = state.rest();
167            match rest.find("-->") {
168                Some(end_at) => end_at + "-->".len(),
169                None => rest.len(),
170            }
171        };
172        state.advance(len);
173        state.add_token(HtmlTokenType::Comment, start, state.get_position());
174        true
175    }
176
177    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
178        let start_pos = state.get_position();
179
180        if let Some('<') = state.peek() {
181            if let Some('!') = state.peek_next_n(1) {
182                if let Some('D') = state.peek_next_n(2) {
183                    let doctype_start = "DOCTYPE";
184                    let mut matches = true;
185
186                    for (i, expected_ch) in doctype_start.chars().enumerate() {
187                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
188                            if actual_ch.to_ascii_uppercase() != expected_ch {
189                                matches = false;
190                                break;
191                            }
192                        }
193                        else {
194                            matches = false;
195                            break;
196                        }
197                    }
198
199                    if matches {
200                        state.advance(2 + doctype_start.len()); // Skip <!DOCTYPE
201
202                        // Find doctype end >
203                        while state.not_at_end() {
204                            if let Some('>') = state.peek() {
205                                state.advance(1); // Skip >
206                                state.add_token(HtmlTokenType::Doctype, start_pos, state.get_position());
207                                return true;
208                            }
209                            if let Some(ch) = state.peek() {
210                                state.advance(ch.len_utf8());
211                            }
212                            else {
213                                break;
214                            }
215                        }
216
217                        // Unclosed doctype
218                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
219                        return true;
220                    }
221                }
222            }
223        }
224
225        false
226    }
227
228    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
229        let start_pos = state.get_position();
230
231        if let Some('<') = state.peek() {
232            if let Some('!') = state.peek_next_n(1) {
233                if let Some('[') = state.peek_next_n(2) {
234                    let cdata_start = "CDATA[";
235                    let mut matches = true;
236
237                    for (i, expected_ch) in cdata_start.chars().enumerate() {
238                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
239                            if actual_ch != expected_ch {
240                                matches = false;
241                                break;
242                            }
243                        }
244                        else {
245                            matches = false;
246                            break;
247                        }
248                    }
249
250                    if matches {
251                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
252
253                        // Find CDATA end ]]>
254                        while state.not_at_end() {
255                            if let Some(']') = state.peek() {
256                                if let Some(']') = state.peek_next_n(1) {
257                                    if let Some('>') = state.peek_next_n(2) {
258                                        state.advance(3); // Skip ]]>
259                                        state.add_token(HtmlTokenType::CData, start_pos, state.get_position());
260                                        return true;
261                                    }
262                                }
263                            }
264                            if let Some(ch) = state.peek() {
265                                state.advance(ch.len_utf8());
266                            }
267                            else {
268                                break;
269                            }
270                        }
271
272                        // Unclosed CDATA
273                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
274                        return true;
275                    }
276                }
277            }
278        }
279
280        false
281    }
282
283    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
284        let start_pos = state.get_position();
285
286        if let Some('<') = state.peek() {
287            if let Some('?') = state.peek_next_n(1) {
288                state.advance(2); // Skip <?
289
290                // Find processing instruction end ?>
291                while state.not_at_end() {
292                    if let Some('?') = state.peek() {
293                        if let Some('>') = state.peek_next_n(1) {
294                            state.advance(2); // Skip ?>
295                            state.add_token(HtmlTokenType::ProcessingInstruction, start_pos, state.get_position());
296                            return true;
297                        }
298                    }
299                    if let Some(ch) = state.peek() {
300                        state.advance(ch.len_utf8());
301                    }
302                    else {
303                        break;
304                    }
305                }
306
307                // Unclosed processing instruction
308                state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
309                return true;
310            }
311        }
312
313        false
314    }
315
316    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
317        let start_pos = state.get_position();
318
319        match state.peek() {
320            Some('<') => {
321                if let Some('/') = state.peek_next_n(1) {
322                    state.advance(2);
323                    state.add_token(HtmlTokenType::TagSlashOpen, start_pos, state.get_position());
324                    true
325                }
326                else {
327                    state.advance(1);
328                    state.add_token(HtmlTokenType::TagOpen, start_pos, state.get_position());
329                    true
330                }
331            }
332            Some('/') => {
333                if let Some('>') = state.peek_next_n(1) {
334                    state.advance(2);
335                    state.add_token(HtmlTokenType::TagSelfClose, start_pos, state.get_position());
336                    true
337                }
338                else {
339                    false
340                }
341            }
342            Some('>') => {
343                state.advance(1);
344                state.add_token(HtmlTokenType::TagClose, start_pos, state.get_position());
345                true
346            }
347            _ => false,
348        }
349    }
350
351    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
352        let start_pos = state.get_position();
353
354        if let Some('&') = state.peek() {
355            state.advance(1);
356
357            if let Some('#') = state.peek() {
358                state.advance(1);
359
360                // Character reference &#123; or &#x1A;
361                if let Some('x') = state.peek() {
362                    state.advance(1);
363                    // Hexadecimal character reference
364                    let mut has_digits = false;
365                    while let Some(ch) = state.peek() {
366                        if ch.is_ascii_hexdigit() {
367                            state.advance(1);
368                            has_digits = true;
369                        }
370                        else {
371                            break;
372                        }
373                    }
374
375                    if has_digits && state.peek() == Some(';') {
376                        state.advance(1);
377                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
378                        return true;
379                    }
380                }
381                else {
382                    // Decimal character reference
383                    let mut has_digits = false;
384                    while let Some(ch) = state.peek() {
385                        if ch.is_ascii_digit() {
386                            state.advance(1);
387                            has_digits = true;
388                        }
389                        else {
390                            break;
391                        }
392                    }
393
394                    if has_digits && state.peek() == Some(';') {
395                        state.advance(1);
396                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
397                        return true;
398                    }
399                }
400            }
401            else {
402                // Named entity reference &name;
403                let mut has_name = false;
404                while let Some(ch) = state.peek() {
405                    if ch.is_ascii_alphanumeric() {
406                        state.advance(1);
407                        has_name = true;
408                    }
409                    else {
410                        break;
411                    }
412                }
413
414                if has_name && state.peek() == Some(';') {
415                    state.advance(1);
416                    state.add_token(HtmlTokenType::EntityRef, start_pos, state.get_position());
417                    return true;
418                }
419            }
420
421            // Invalid entity reference
422            state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
423            return true;
424        }
425
426        false
427    }
428
    /// Lexes a quoted attribute value by delegating to the shared
    /// [`HTML_STRING`] configuration (single or double quotes, no escape
    /// character), emitting an `AttributeValue` token.
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        HTML_STRING.scan(state, HtmlTokenType::AttributeValue)
    }
432
433    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
434        let start_pos = state.get_position();
435
436        if let Some(ch) = state.peek() {
437            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
438                state.advance(ch.len_utf8());
439
440                while let Some(ch) = state.peek() {
441                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
442                        state.advance(ch.len_utf8());
443                    }
444                    else {
445                        break;
446                    }
447                }
448
449                state.add_token(HtmlTokenType::TagName, start_pos, state.get_position());
450                return true;
451            }
452        }
453
454        false
455    }
456
457    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
458        let start_pos = state.get_position();
459
460        let kind = match state.peek() {
461            Some('=') => HtmlTokenType::Equal,
462            Some('"') => HtmlTokenType::Quote,
463            Some('\'') => HtmlTokenType::Quote,
464            Some('!') => return false, // Already handled elsewhere
465            Some('?') => return false, // Already handled elsewhere
466            Some('&') => return false, // Already handled elsewhere
467            Some(';') => return false, // Already handled elsewhere
468            _ => return false,
469        };
470
471        if let Some(ch) = state.peek() {
472            state.advance(ch.len_utf8());
473            state.add_token(kind, start_pos, state.get_position());
474            true
475        }
476        else {
477            false
478        }
479    }
480
481    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
482        let start_pos = state.get_position();
483        let bytes = state.rest_bytes();
484        let mut i = 0;
485        let len = bytes.len();
486        const LANES: usize = 32;
487
488        while i + LANES <= len {
489            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
490
491            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
492            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
493            let is_le_space = chunk.simd_le(Simd::splat(32));
494
495            let stop = is_lt | is_amp | is_le_space;
496
497            if stop.any() {
498                let idx = stop.first_set().unwrap();
499                i += idx;
500                state.advance(i);
501                state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
502                return true;
503            }
504            i += LANES
505        }
506        while i < len {
507            let ch = unsafe { *bytes.get_unchecked(i) };
508            if ch == b'<' || ch == b'&' || ch <= 32 {
509                break;
510            }
511            i += 1
512        }
513
514        if i > 0 {
515            state.advance(i);
516            state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
517            true
518        }
519        else {
520            false
521        }
522    }
523}