#![doc = include_str!("readme.md")]
pub mod token_type;

use crate::{language::HtmlLanguage, lexer::token_type::HtmlTokenType};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{LexOutput, StringConfig},
    source::{Source, TextEdit},
};
use std::{simd::prelude::*, sync::LazyLock};

pub(crate) type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

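/// Shared string-literal scanner for quoted attribute values: accepts `"` or `'`
/// as the quote character, with no escape handling.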
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });

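/// Hand-written HTML lexer, parameterized by an [`HtmlLanguage`] configuration.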
#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    config: &'config HtmlLanguage,
}

impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        // Edits are currently ignored: every call re-lexes the source from offset 0.
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> HtmlLexer<'config> {
    pub fn new(config: &'config HtmlLanguage) -> Self {
        Self { config }
    }

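    /// Main dispatch loop: peeks at the next character and routes to the
    /// matching sub-lexer until the end of input is reached.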
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' | '\n' | '\r' => {
                        self.skip_whitespace(state);
                    }
                    '<' => {
                        if let Some(next) = state.peek_next_n(1) {
                            if next == '!' {
                                if state.starts_with("<!--") {
                                    self.lex_comment(state);
                                }
                                else if state.starts_with("<![CDATA[") {
                                    self.lex_cdata(state);
                                }
                                else if !self.lex_doctype(state) {
                                    self.lex_tag_operators(state);
                                }
                            }
                            else if next == '?' {
                                self.lex_processing_instruction(state);
                            }
                            else {
                                self.lex_tag_operators(state);
                            }
                        }
                        else {
                            self.lex_tag_operators(state);
                        }
                    }
                    '/' | '>' => {
                        if self.lex_tag_operators(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '&' => {
                        self.lex_entity_reference(state);
                    }
                    '"' | '\'' => {
                        self.lex_string_literal(state);
                    }
                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
                        self.lex_identifier(state);
                    }
                    '=' => {
                        self.lex_single_char_tokens(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }

                        // Nothing matched: emit a one-character error token so lexing can continue.
                        state.advance(ch.len_utf8());
                        state.add_token(HtmlTokenType::Error, safe_point, state.get_position());
                    }
                }
            }

            // Guard against infinite loops: force progress if no rule consumed input.
            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }

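    /// Consumes a run of whitespace and emits a single `Whitespace` token.
    /// Uses a 32-lane SIMD scan over the raw bytes with a scalar fallback for the tail.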
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        // Fast path: scan 32 bytes at a time, treating any byte <= 0x20 as whitespace.
        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
            let is_le_space = chunk.simd_le(Simd::splat(32));

            if !is_le_space.all() {
                // Stop at the first non-whitespace byte in this chunk.
                let not_space = !is_le_space;
                let idx = not_space.first_set().unwrap();
                i += idx;
                state.advance(i);
                state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
                return true;
            }
            i += LANES;
        }

        // Scalar fallback for the remaining tail.
        while i < len {
            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
            true
        }
        else {
            false
        }
    }

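    /// Lexes a `<!-- ... -->` comment; an unterminated comment consumes the rest of the input.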
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        if !state.starts_with("<!--") {
            return false;
        }

        let start = state.get_position();
        let len = {
            let rest = state.rest();
            match rest.find("-->") {
                Some(end_at) => end_at + "-->".len(),
                // Unterminated comment: take everything up to the end of input.
                None => rest.len(),
            }
        };
        state.advance(len);
        state.add_token(HtmlTokenType::Comment, start, state.get_position());
        true
    }

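    /// Lexes a `<!DOCTYPE ... >` declaration (keyword matched case-insensitively).
    /// A declaration with no closing `>` is emitted as an `Error` token.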
    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('D' | 'd') = state.peek_next_n(2) {
                    let doctype_start = "DOCTYPE";
                    let mut matches = true;

                    // Match the DOCTYPE keyword case-insensitively, starting after "<!".
                    for (i, expected_ch) in doctype_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
                            if actual_ch.to_ascii_uppercase() != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        // Consume "<!DOCTYPE", then everything up to and including the closing '>'.
                        state.advance(2 + doctype_start.len());
                        while state.not_at_end() {
                            if let Some('>') = state.peek() {
                                state.advance(1);
                                state.add_token(HtmlTokenType::Doctype, start_pos, state.get_position());
                                return true;
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // No closing '>' before end of input.
                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

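    /// Lexes a `<![CDATA[ ... ]]>` section; a section with no closing `]]>` is emitted as an `Error` token.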
    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('[') = state.peek_next_n(2) {
                    let cdata_start = "CDATA[";
                    let mut matches = true;

                    // Match the "CDATA[" keyword, starting after "<![".
                    for (i, expected_ch) in cdata_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
                            if actual_ch != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        // Consume "<![CDATA[", then scan for the closing "]]>".
                        state.advance(3 + cdata_start.len());
                        while state.not_at_end() {
                            if let Some(']') = state.peek() {
                                if let Some(']') = state.peek_next_n(1) {
                                    if let Some('>') = state.peek_next_n(2) {
                                        state.advance(3);
                                        state.add_token(HtmlTokenType::CData, start_pos, state.get_position());
                                        return true;
                                    }
                                }
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // No closing "]]>" before end of input.
                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

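    /// Lexes a `<? ... ?>` processing instruction; one with no closing `?>` is emitted as an `Error` token.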
    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('?') = state.peek_next_n(1) {
                // Consume "<?", then scan for the closing "?>".
                state.advance(2);
                while state.not_at_end() {
                    if let Some('?') = state.peek() {
                        if let Some('>') = state.peek_next_n(1) {
                            state.advance(2);
                            state.add_token(HtmlTokenType::ProcessingInstruction, start_pos, state.get_position());
                            return true;
                        }
                    }
                    if let Some(ch) = state.peek() {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                // No closing "?>" before end of input.
                state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

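    /// Lexes the tag punctuation tokens: `<`, `</`, `/>`, and `>`.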
    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        match state.peek() {
            Some('<') => {
                if let Some('/') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlTokenType::TagSlashOpen, start_pos, state.get_position());
                    true
                }
                else {
                    state.advance(1);
                    state.add_token(HtmlTokenType::TagOpen, start_pos, state.get_position());
                    true
                }
            }
            Some('/') => {
                if let Some('>') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlTokenType::TagSelfClose, start_pos, state.get_position());
                    true
                }
                else {
                    false
                }
            }
            Some('>') => {
                state.advance(1);
                state.add_token(HtmlTokenType::TagClose, start_pos, state.get_position());
                true
            }
            _ => false,
        }
    }

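    /// Lexes named (`&name;`), decimal (`&#123;`), and hexadecimal (`&#x7B;`)
    /// references. A bare or malformed `&` sequence is emitted as an `Error` token.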
    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('&') = state.peek() {
            state.advance(1);

            if let Some('#') = state.peek() {
                state.advance(1);

                if let Some('x') = state.peek() {
                    // Hexadecimal character reference: &#x...;
                    state.advance(1);
                    let mut has_digits = false;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
                else {
                    // Decimal character reference: &#...;
                    let mut has_digits = false;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_digit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            else {
                // Named entity reference: &name;
                let mut has_name = false;
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() {
                        state.advance(1);
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }

                if has_name && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(HtmlTokenType::EntityRef, start_pos, state.get_position());
                    return true;
                }
            }

            // Malformed reference: everything consumed so far becomes an error token.
            state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
            return true;
        }

        false
    }

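    /// Lexes a quoted attribute value using the shared [`HTML_STRING`] configuration.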
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        HTML_STRING.scan(state, HtmlTokenType::AttributeValue)
    }

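    /// Lexes a tag or attribute name: an ASCII letter, `_`, or `:` followed by
    /// letters, digits, `_`, `-`, `.`, or `:`.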
    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
                state.advance(ch.len_utf8());

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(HtmlTokenType::TagName, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

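    /// Lexes single-character tokens such as `=` and the quote characters.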
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        let kind = match state.peek() {
            Some('=') => HtmlTokenType::Equal,
            Some('"') => HtmlTokenType::Quote,
            Some('\'') => HtmlTokenType::Quote,
            Some('!') => return false,
            Some('?') => return false,
            Some('&') => return false,
            Some(';') => return false,
            _ => return false,
        };

        if let Some(ch) = state.peek() {
            state.advance(ch.len_utf8());
            state.add_token(kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

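    /// Lexes a run of text content, stopping at `<`, `&`, or whitespace.
    /// Uses a 32-lane SIMD scan over the raw bytes with a scalar fallback for the tail.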
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        // Fast path: scan 32 bytes at a time, stopping at '<', '&', or any byte <= 0x20.
        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });

            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
            let is_le_space = chunk.simd_le(Simd::splat(32));

            let stop = is_lt | is_amp | is_le_space;

            if stop.any() {
                let idx = stop.first_set().unwrap();
                i += idx;
                state.advance(i);
                state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
                return true;
            }
            i += LANES;
        }

        // Scalar fallback for the remaining tail.
        while i < len {
            let ch = unsafe { *bytes.get_unchecked(i) };
            if ch == b'<' || ch == b'&' || ch <= 32 {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}