1use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{LexOutput, StringConfig},
5 source::{Source, TextEdit},
6};
7use std::{simd::prelude::*, sync::LazyLock};
8
/// Shorthand for the generic lexer state specialised to the HTML language.
type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

/// Shared string-scanning configuration: HTML attribute values may be quoted
/// with either `"` or `'`, and HTML strings have no escape character.
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
14
/// Hand-written HTML lexer that produces `HtmlSyntaxKind` tokens.
#[derive(Clone)]
pub struct HtmlLexer<'config> {
    // Borrowed language configuration. Currently unused by the lexing logic
    // itself; kept so the lexer's lifetime is tied to its configuration.
    _config: &'config HtmlLanguage,
}
19
impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    /// Tokenises `source` from scratch and finalises the result through `cache`.
    ///
    /// NOTE(review): `_edits` is ignored — incremental relexing is not
    /// implemented here; every call performs a full lex of `source`.
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        let mut state = LexerState::new(source);
        let result = self.run(&mut state);
        state.finish_with_cache(result, cache)
    }
}
27
28impl<'config> HtmlLexer<'config> {
29 pub fn new(config: &'config HtmlLanguage) -> Self {
30 Self { _config: config }
31 }
32
33 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35 while state.not_at_end() {
36 let safe_point = state.get_position();
37
38 if let Some(ch) = state.peek() {
39 match ch {
40 ' ' | '\t' | '\n' | '\r' => {
41 self.skip_whitespace(state);
42 }
43 '<' => {
44 if let Some(next) = state.peek_next_n(1) {
45 if next == '!' {
46 if state.starts_with("<!--") {
47 self.lex_comment(state);
48 }
49 else if state.starts_with("<![CDATA[") {
50 self.lex_cdata(state);
51 }
52 else {
53 if !self.lex_doctype(state) {
55 self.lex_tag_operators(state);
59 }
60 }
61 }
62 else if next == '?' {
63 self.lex_processing_instruction(state);
64 }
65 else {
66 self.lex_tag_operators(state);
67 }
68 }
69 else {
70 self.lex_tag_operators(state);
71 }
72 }
73 '/' | '>' => {
74 if self.lex_tag_operators(state) {
75 continue;
76 }
77 self.lex_text(state);
78 }
79 '&' => {
80 self.lex_entity_reference(state);
81 }
82 '"' | '\'' => {
83 self.lex_string_literal(state);
84 }
85 'a'..='z' | 'A'..='Z' | '_' | ':' => {
86 self.lex_identifier(state);
87 }
88 '=' => {
89 self.lex_single_char_tokens(state);
90 }
91 _ => {
92 if self.lex_text(state) {
93 continue;
94 }
95
96 state.advance(ch.len_utf8());
98 state.add_token(HtmlSyntaxKind::Error, safe_point, state.get_position());
99 }
100 }
101 }
102
103 state.advance_if_dead_lock(safe_point);
104 }
105
106 Ok(())
107 }
108
109 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 let start = state.get_position();
111 let bytes = state.rest_bytes();
112 let mut i = 0;
113 let len = bytes.len();
114 const LANES: usize = 32;
115
116 while i + LANES <= len {
117 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
118 let is_le_space = chunk.simd_le(Simd::splat(32));
119
120 if !is_le_space.all() {
121 let not_space = !is_le_space;
122 let idx = not_space.first_set().unwrap();
123 i += idx;
124 state.advance(i);
125 state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
126 return true;
127 }
128 i += LANES;
129 }
130
131 while i < len {
132 if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
133 break;
134 }
135 i += 1;
136 }
137
138 if i > 0 {
139 state.advance(i);
140 state.add_token(HtmlSyntaxKind::Whitespace, start, state.get_position());
141 true
142 }
143 else {
144 false
145 }
146 }
147
148 fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
149 if !state.starts_with("<!--") {
150 return false;
151 }
152
153 let start = state.get_position();
154 let len = {
155 let rest = state.rest();
156 match rest.find("-->") {
157 Some(end_at) => end_at + "-->".len(),
158 None => rest.len(),
159 }
160 };
161 state.advance(len);
162 state.add_token(HtmlSyntaxKind::Comment, start, state.get_position());
163 true
164 }
165
166 fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
167 let start_pos = state.get_position();
168
169 if let Some('<') = state.peek() {
170 if let Some('!') = state.peek_next_n(1) {
171 if let Some('D') = state.peek_next_n(2) {
172 let doctype_start = "DOCTYPE";
173 let mut matches = true;
174
175 for (i, expected_ch) in doctype_start.chars().enumerate() {
176 if let Some(actual_ch) = state.peek_next_n(2 + i) {
177 if actual_ch.to_ascii_uppercase() != expected_ch {
178 matches = false;
179 break;
180 }
181 }
182 else {
183 matches = false;
184 break;
185 }
186 }
187
188 if matches {
189 state.advance(2 + doctype_start.len()); while state.not_at_end() {
193 if let Some('>') = state.peek() {
194 state.advance(1); state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
196 return true;
197 }
198 if let Some(ch) = state.peek() {
199 state.advance(ch.len_utf8());
200 }
201 else {
202 break;
203 }
204 }
205
206 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
208 return true;
209 }
210 }
211 }
212 }
213
214 false
215 }
216
217 fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
218 let start_pos = state.get_position();
219
220 if let Some('<') = state.peek() {
221 if let Some('!') = state.peek_next_n(1) {
222 if let Some('[') = state.peek_next_n(2) {
223 let cdata_start = "CDATA[";
224 let mut matches = true;
225
226 for (i, expected_ch) in cdata_start.chars().enumerate() {
227 if let Some(actual_ch) = state.peek_next_n(3 + i) {
228 if actual_ch != expected_ch {
229 matches = false;
230 break;
231 }
232 }
233 else {
234 matches = false;
235 break;
236 }
237 }
238
239 if matches {
240 state.advance(3 + cdata_start.len()); while state.not_at_end() {
244 if let Some(']') = state.peek() {
245 if let Some(']') = state.peek_next_n(1) {
246 if let Some('>') = state.peek_next_n(2) {
247 state.advance(3); state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
249 return true;
250 }
251 }
252 }
253 if let Some(ch) = state.peek() {
254 state.advance(ch.len_utf8());
255 }
256 else {
257 break;
258 }
259 }
260
261 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
263 return true;
264 }
265 }
266 }
267 }
268
269 false
270 }
271
272 fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
273 let start_pos = state.get_position();
274
275 if let Some('<') = state.peek() {
276 if let Some('?') = state.peek_next_n(1) {
277 state.advance(2); while state.not_at_end() {
281 if let Some('?') = state.peek() {
282 if let Some('>') = state.peek_next_n(1) {
283 state.advance(2); state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
285 return true;
286 }
287 }
288 if let Some(ch) = state.peek() {
289 state.advance(ch.len_utf8());
290 }
291 else {
292 break;
293 }
294 }
295
296 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
298 return true;
299 }
300 }
301
302 false
303 }
304
305 fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
306 let start_pos = state.get_position();
307
308 match state.peek() {
309 Some('<') => {
310 if let Some('/') = state.peek_next_n(1) {
311 state.advance(2);
312 state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
313 true
314 }
315 else {
316 state.advance(1);
317 state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
318 true
319 }
320 }
321 Some('/') => {
322 if let Some('>') = state.peek_next_n(1) {
323 state.advance(2);
324 state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
325 true
326 }
327 else {
328 false
329 }
330 }
331 Some('>') => {
332 state.advance(1);
333 state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
334 true
335 }
336 _ => false,
337 }
338 }
339
340 fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
341 let start_pos = state.get_position();
342
343 if let Some('&') = state.peek() {
344 state.advance(1);
345
346 if let Some('#') = state.peek() {
347 state.advance(1);
348
349 if let Some('x') = state.peek() {
351 state.advance(1);
352 let mut has_digits = false;
354 while let Some(ch) = state.peek() {
355 if ch.is_ascii_hexdigit() {
356 state.advance(1);
357 has_digits = true;
358 }
359 else {
360 break;
361 }
362 }
363
364 if has_digits && state.peek() == Some(';') {
365 state.advance(1);
366 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
367 return true;
368 }
369 }
370 else {
371 let mut has_digits = false;
373 while let Some(ch) = state.peek() {
374 if ch.is_ascii_digit() {
375 state.advance(1);
376 has_digits = true;
377 }
378 else {
379 break;
380 }
381 }
382
383 if has_digits && state.peek() == Some(';') {
384 state.advance(1);
385 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
386 return true;
387 }
388 }
389 }
390 else {
391 let mut has_name = false;
393 while let Some(ch) = state.peek() {
394 if ch.is_ascii_alphanumeric() {
395 state.advance(1);
396 has_name = true;
397 }
398 else {
399 break;
400 }
401 }
402
403 if has_name && state.peek() == Some(';') {
404 state.advance(1);
405 state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
406 return true;
407 }
408 }
409
410 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
412 return true;
413 }
414
415 false
416 }
417
418 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
419 HTML_STRING.scan(state, HtmlSyntaxKind::AttributeValue)
420 }
421
422 fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
423 let start_pos = state.get_position();
424
425 if let Some(ch) = state.peek() {
426 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
427 state.advance(ch.len_utf8());
428
429 while let Some(ch) = state.peek() {
430 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
431 state.advance(ch.len_utf8());
432 }
433 else {
434 break;
435 }
436 }
437
438 state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
439 return true;
440 }
441 }
442
443 false
444 }
445
446 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
447 let start_pos = state.get_position();
448
449 let kind = match state.peek() {
450 Some('=') => HtmlSyntaxKind::Equal,
451 Some('"') => HtmlSyntaxKind::Quote,
452 Some('\'') => HtmlSyntaxKind::Quote,
453 Some('!') => return false, Some('?') => return false, Some('&') => return false, Some(';') => return false, _ => return false,
458 };
459
460 if let Some(ch) = state.peek() {
461 state.advance(ch.len_utf8());
462 state.add_token(kind, start_pos, state.get_position());
463 true
464 }
465 else {
466 false
467 }
468 }
469
470 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
471 let start_pos = state.get_position();
472 let bytes = state.rest_bytes();
473 let mut i = 0;
474 let len = bytes.len();
475 const LANES: usize = 32;
476
477 while i + LANES <= len {
478 let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
479
480 let is_lt = chunk.simd_eq(Simd::splat(b'<'));
481 let is_amp = chunk.simd_eq(Simd::splat(b'&'));
482 let is_le_space = chunk.simd_le(Simd::splat(32));
483
484 let stop = is_lt | is_amp | is_le_space;
485
486 if stop.any() {
487 let idx = stop.first_set().unwrap();
488 i += idx;
489 state.advance(i);
490 state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
491 return true;
492 }
493 i += LANES;
494 }
495
496 while i < len {
497 let ch = unsafe { *bytes.get_unchecked(i) };
498 if ch == b'<' || ch == b'&' || ch.is_ascii_whitespace() {
499 break;
500 }
501 i += 1;
502 }
503
504 if i > 0 {
505 state.advance(i);
506 state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
507 true
508 }
509 else {
510 false
511 }
512 }
513}