1use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{LexOutput, StringConfig},
5 source::{Source, TextEdit},
6};
7use std::{simd::prelude::*, sync::LazyLock};
8
/// Convenience alias for the lexer state specialized to HTML.
type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

/// Shared string-scanning configuration for attribute values: delimited by
/// `"` or `'`, with no escape character (`escape: None`) since HTML has no
/// backslash escapes inside attribute values.
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
14
/// Hand-written lexer for HTML documents.
///
/// Holds a reference to the language configuration; the field is currently
/// unused (hence the leading underscore) but kept so configuration options
/// can be added later without changing the constructor signature.
#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    _config: &'config HtmlLanguage,
}
19
impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    /// Tokenizes `source` from position 0, reusing `cache` where possible.
    ///
    /// NOTE(review): `_edits` is ignored — the lexer always relexes from the
    /// start rather than performing an incremental update. An EOF token is
    /// appended only when `run` succeeds; the result (ok or error) is then
    /// folded into the output via `finish_with_cache`.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
30
31impl<'config> HtmlLexer<'config> {
    /// Creates a lexer borrowing the given language configuration.
    pub fn new(config: &'config HtmlLanguage) -> Self {
        Self { _config: config }
    }
35
    /// Main dispatch loop: peeks one character (plus limited lookahead for
    /// `<`-prefixed constructs) and routes to the specialized sub-lexers
    /// until the source is exhausted.
    ///
    /// Every iteration records `safe_point` and ends with
    /// `advance_if_dead_lock`, which forces progress when no sub-lexer
    /// consumed input, guaranteeing termination on degenerate input.
    /// NOTE(review): the `continue` branches skip that dead-lock guard, so
    /// they rely on the sub-lexer having actually advanced — verify this
    /// invariant when editing the sub-lexers.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    // Run of ASCII whitespace.
                    ' ' | '\t' | '\n' | '\r' => {
                        self.skip_whitespace(state);
                    }
                    // `<` opens comments, CDATA sections, doctypes,
                    // processing instructions, or plain tags, decided by
                    // lookahead.
                    '<' => {
                        if let Some(next) = state.peek_next_n(1) {
                            if next == '!' {
                                if state.starts_with("<!--") {
                                    self.lex_comment(state);
                                }
                                else if state.starts_with("<![CDATA[") {
                                    self.lex_cdata(state);
                                }
                                else {
                                    // `<!` that is neither comment nor CDATA:
                                    // try a doctype, otherwise emit `<` alone.
                                    if !self.lex_doctype(state) {
                                        self.lex_tag_operators(state);
                                    }
                                }
                            }
                            else if next == '?' {
                                self.lex_processing_instruction(state);
                            }
                            else {
                                self.lex_tag_operators(state);
                            }
                        }
                        else {
                            // `<` is the final character of the source.
                            self.lex_tag_operators(state);
                        }
                    }
                    // `/>` and `>` are tag operators; a lone `/` falls back
                    // to plain text.
                    '/' | '>' => {
                        if self.lex_tag_operators(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '&' => {
                        self.lex_entity_reference(state);
                    }
                    '"' | '\'' => {
                        self.lex_string_literal(state);
                    }
                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
                        self.lex_identifier(state);
                    }
                    '=' => {
                        self.lex_single_char_tokens(state);
                    }
                    _ => {
                        // Anything else: try a text run; if even that fails,
                        // consume one char and record an Error token.
                        if self.lex_text(state) {
                            continue;
                        }

                        state.advance(ch.len_utf8());
                        state.add_token(HtmlSyntaxKind::Error, safe_point, state.get_position());
                    }
                }
            }

            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }
111
112 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113 let start = state.get_position();
114 let bytes = state.rest_bytes();
115 let mut i = 0;
116 let len = bytes.len();
117 const LANES: usize = 32;
118
119 while i + LANES <= len {
120 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
121 let is_le_space = chunk.simd_le(Simd::splat(32));
122
123 if !is_le_space.all() {
124 let not_space = !is_le_space;
125 let idx = not_space.first_set().unwrap();
126 i += idx;
127 state.advance(i);
128 state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
129 return true;
130 }
131 i += LANES;
132 }
133
134 while i < len {
135 if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
136 break;
137 }
138 i += 1;
139 }
140
141 if i > 0 {
142 state.advance(i);
143 state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
144 true
145 }
146 else {
147 false
148 }
149 }
150
151 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
152 if !state.starts_with("<!--") {
153 return false;
154 }
155
156 let start = state.get_position();
157 let len = {
158 let rest = state.rest();
159 match rest.find("-->") {
160 Some(end_at) => end_at + "-->".len(),
161 None => rest.len(),
162 }
163 };
164 state.advance(len);
165 state.add_token(HtmlSyntaxKind::Comment, start, state.get_position());
166 true
167 }
168
169 fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
170 let start_pos = state.get_position();
171
172 if let Some('<') = state.peek() {
173 if let Some('!') = state.peek_next_n(1) {
174 if let Some('D') = state.peek_next_n(2) {
175 let doctype_start = "DOCTYPE";
176 let mut matches = true;
177
178 for (i, expected_ch) in doctype_start.chars().enumerate() {
179 if let Some(actual_ch) = state.peek_next_n(2 + i) {
180 if actual_ch.to_ascii_uppercase() != expected_ch {
181 matches = false;
182 break;
183 }
184 }
185 else {
186 matches = false;
187 break;
188 }
189 }
190
191 if matches {
192 state.advance(2 + doctype_start.len()); while state.not_at_end() {
196 if let Some('>') = state.peek() {
197 state.advance(1); state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
199 return true;
200 }
201 if let Some(ch) = state.peek() {
202 state.advance(ch.len_utf8());
203 }
204 else {
205 break;
206 }
207 }
208
209 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
211 return true;
212 }
213 }
214 }
215 }
216
217 false
218 }
219
220 fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
221 let start_pos = state.get_position();
222
223 if let Some('<') = state.peek() {
224 if let Some('!') = state.peek_next_n(1) {
225 if let Some('[') = state.peek_next_n(2) {
226 let cdata_start = "CDATA[";
227 let mut matches = true;
228
229 for (i, expected_ch) in cdata_start.chars().enumerate() {
230 if let Some(actual_ch) = state.peek_next_n(3 + i) {
231 if actual_ch != expected_ch {
232 matches = false;
233 break;
234 }
235 }
236 else {
237 matches = false;
238 break;
239 }
240 }
241
242 if matches {
243 state.advance(3 + cdata_start.len()); while state.not_at_end() {
247 if let Some(']') = state.peek() {
248 if let Some(']') = state.peek_next_n(1) {
249 if let Some('>') = state.peek_next_n(2) {
250 state.advance(3); state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
252 return true;
253 }
254 }
255 }
256 if let Some(ch) = state.peek() {
257 state.advance(ch.len_utf8());
258 }
259 else {
260 break;
261 }
262 }
263
264 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
266 return true;
267 }
268 }
269 }
270 }
271
272 false
273 }
274
275 fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276 let start_pos = state.get_position();
277
278 if let Some('<') = state.peek() {
279 if let Some('?') = state.peek_next_n(1) {
280 state.advance(2); while state.not_at_end() {
284 if let Some('?') = state.peek() {
285 if let Some('>') = state.peek_next_n(1) {
286 state.advance(2); state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
288 return true;
289 }
290 }
291 if let Some(ch) = state.peek() {
292 state.advance(ch.len_utf8());
293 }
294 else {
295 break;
296 }
297 }
298
299 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
301 return true;
302 }
303 }
304
305 false
306 }
307
308 fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
309 let start_pos = state.get_position();
310
311 match state.peek() {
312 Some('<') => {
313 if let Some('/') = state.peek_next_n(1) {
314 state.advance(2);
315 state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
316 true
317 }
318 else {
319 state.advance(1);
320 state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
321 true
322 }
323 }
324 Some('/') => {
325 if let Some('>') = state.peek_next_n(1) {
326 state.advance(2);
327 state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
328 true
329 }
330 else {
331 false
332 }
333 }
334 Some('>') => {
335 state.advance(1);
336 state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
337 true
338 }
339 _ => false,
340 }
341 }
342
343 fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
344 let start_pos = state.get_position();
345
346 if let Some('&') = state.peek() {
347 state.advance(1);
348
349 if let Some('#') = state.peek() {
350 state.advance(1);
351
352 if let Some('x') = state.peek() {
354 state.advance(1);
355 let mut has_digits = false;
357 while let Some(ch) = state.peek() {
358 if ch.is_ascii_hexdigit() {
359 state.advance(1);
360 has_digits = true;
361 }
362 else {
363 break;
364 }
365 }
366
367 if has_digits && state.peek() == Some(';') {
368 state.advance(1);
369 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
370 return true;
371 }
372 }
373 else {
374 let mut has_digits = false;
376 while let Some(ch) = state.peek() {
377 if ch.is_ascii_digit() {
378 state.advance(1);
379 has_digits = true;
380 }
381 else {
382 break;
383 }
384 }
385
386 if has_digits && state.peek() == Some(';') {
387 state.advance(1);
388 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
389 return true;
390 }
391 }
392 }
393 else {
394 let mut has_name = false;
396 while let Some(ch) = state.peek() {
397 if ch.is_ascii_alphanumeric() {
398 state.advance(1);
399 has_name = true;
400 }
401 else {
402 break;
403 }
404 }
405
406 if has_name && state.peek() == Some(';') {
407 state.advance(1);
408 state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
409 return true;
410 }
411 }
412
413 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
415 return true;
416 }
417
418 false
419 }
420
    /// Lexes a quoted attribute value using the shared `HTML_STRING`
    /// configuration (`"` or `'` delimiters, no escape character),
    /// emitting an `AttributeValue` token. Returns whatever the shared
    /// scanner reports.
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        HTML_STRING.scan(state, HtmlSyntaxKind::AttributeValue)
    }
424
425 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
426 let start_pos = state.get_position();
427
428 if let Some(ch) = state.peek() {
429 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
430 state.advance(ch.len_utf8());
431
432 while let Some(ch) = state.peek() {
433 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
434 state.advance(ch.len_utf8());
435 }
436 else {
437 break;
438 }
439 }
440
441 state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
442 return true;
443 }
444 }
445
446 false
447 }
448
449 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
450 let start_pos = state.get_position();
451
452 let kind = match state.peek() {
453 Some('=') => HtmlSyntaxKind::Equal,
454 Some('"') => HtmlSyntaxKind::Quote,
455 Some('\'') => HtmlSyntaxKind::Quote,
456 Some('!') => return false, Some('?') => return false, Some('&') => return false, Some(';') => return false, _ => return false,
461 };
462
463 if let Some(ch) = state.peek() {
464 state.advance(ch.len_utf8());
465 state.add_token(kind, start_pos, state.get_position());
466 true
467 }
468 else {
469 false
470 }
471 }
472
473 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
474 let start_pos = state.get_position();
475 let bytes = state.rest_bytes();
476 let mut i = 0;
477 let len = bytes.len();
478 const LANES: usize = 32;
479
480 while i + LANES <= len {
481 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
482
483 let is_lt = chunk.simd_eq(Simd::splat(b'<'));
484 let is_amp = chunk.simd_eq(Simd::splat(b'&'));
485 let is_le_space = chunk.simd_le(Simd::splat(32));
486
487 let stop = is_lt | is_amp | is_le_space;
488
489 if stop.any() {
490 let idx = stop.first_set().unwrap();
491 i += idx;
492 state.advance(i);
493 state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
494 return true;
495 }
496 i += LANES;
497 }
498
499 while i < len {
500 let ch = unsafe { *bytes.get_unchecked(i) };
501 if ch == b'<' || ch == b'&' || ch.is_ascii_whitespace() {
502 break;
503 }
504 i += 1;
505 }
506
507 if i > 0 {
508 state.advance(i);
509 state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
510 true
511 }
512 else {
513 false
514 }
515 }
516}