#![doc = include_str!("readme.md")]
pub mod token_type;

use crate::{language::HtmlLanguage, lexer::token_type::HtmlTokenType};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{LexOutput, StringConfig},
    source::{Source, TextEdit},
};
use std::{simd::prelude::*, sync::LazyLock};

type State<'a, S> = LexerState<'a, S, HtmlLanguage>;

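/// Shared string-scanning configuration for attribute values: either `"` or `'`
/// may quote the value and no escape character is recognised.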
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });

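/// Hand-written lexer for HTML that produces [`HtmlTokenType`] tokens.
///
/// A minimal construction sketch (illustrative only; it assumes `HtmlLanguage`
/// provides a `Default` implementation, which is not shown in this file):
///
/// ```ignore
/// let language = HtmlLanguage::default(); // assumed constructor
/// let lexer = HtmlLexer::new(&language);
/// // `lexer` is then driven through the `Lexer::lex` entry point below.
/// ```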
#[derive(Clone, Debug)]
pub struct HtmlLexer<'config> {
    _config: &'config HtmlLanguage,
}

impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HtmlLanguage>) -> LexOutput<HtmlLanguage> {
        let mut state = State::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> HtmlLexer<'config> {
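    /// Creates a lexer that borrows the given language configuration.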
    pub fn new(config: &'config HtmlLanguage) -> Self {
        Self { _config: config }
    }

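    /// Main scanning loop: dispatches on the next character, delegating to the
    /// specialised `lex_*` helpers, and relies on `advance_if_dead_lock` so the
    /// lexer always makes forward progress even on unexpected input.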
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' | '\n' | '\r' => {
                        self.skip_whitespace(state);
                    }
                    '<' => {
                        if let Some(next) = state.peek_next_n(1) {
                            if next == '!' {
                                if state.starts_with("<!--") {
                                    self.lex_comment(state);
                                }
                                else if state.starts_with("<![CDATA[") {
                                    self.lex_cdata(state);
                                }
                                else if !self.lex_doctype(state) {
                                    self.lex_tag_operators(state);
                                }
                            }
                            else if next == '?' {
                                self.lex_processing_instruction(state);
                            }
                            else {
                                self.lex_tag_operators(state);
                            }
                        }
                        else {
                            self.lex_tag_operators(state);
                        }
                    }
                    '/' | '>' => {
                        if self.lex_tag_operators(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '&' => {
                        self.lex_entity_reference(state);
                    }
                    '"' | '\'' => {
                        self.lex_string_literal(state);
                    }
                    'a'..='z' | 'A'..='Z' | '_' | ':' => {
                        self.lex_identifier(state);
                    }
                    '=' => {
                        self.lex_single_char_tokens(state);
                    }
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }

                        state.advance(ch.len_utf8());
                        state.add_token(HtmlTokenType::Error, safe_point, state.get_position());
                    }
                }
            }

            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }

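    /// Skips a run of ASCII whitespace, scanning 32 bytes at a time with
    /// portable SIMD (any byte <= 0x20 counts) and finishing with a scalar loop
    /// for the remaining tail.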
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });
            let is_le_space = chunk.simd_le(Simd::splat(32));

            if !is_le_space.all() {
                let not_space = !is_le_space;
                let idx = not_space.first_set().unwrap();
                i += idx;
                state.advance(i);
                state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
                return true;
            }
            i += LANES;
        }

        while i < len {
            if !unsafe { *bytes.get_unchecked(i) }.is_ascii_whitespace() {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlTokenType::Whitespace, start, state.get_position());
            true
        }
        else {
            false
        }
    }

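    /// Lexes a `<!-- ... -->` comment; an unterminated comment consumes the
    /// rest of the input.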
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        if !state.starts_with("<!--") {
            return false;
        }

        let start = state.get_position();
        let len = {
            let rest = state.rest();
            match rest.find("-->") {
                Some(end_at) => end_at + "-->".len(),
                None => rest.len(),
            }
        };
        state.advance(len);
        state.add_token(HtmlTokenType::Comment, start, state.get_position());
        true
    }

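    /// Lexes `<!DOCTYPE ...>` (keyword matched case-insensitively); returns
    /// `false` when the keyword is absent so the caller can fall back to plain
    /// tag operators.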
    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('D' | 'd') = state.peek_next_n(2) {
                    let doctype_start = "DOCTYPE";
                    let mut matches = true;

                    for (i, expected_ch) in doctype_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(2 + i) {
                            if actual_ch.to_ascii_uppercase() != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(2 + doctype_start.len());

                        while state.not_at_end() {
                            if let Some('>') = state.peek() {
                                state.advance(1);
                                state.add_token(HtmlTokenType::Doctype, start_pos, state.get_position());
                                return true;
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

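    /// Lexes a `<![CDATA[ ... ]]>` section; emits an `Error` token when the
    /// closing `]]>` is missing.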
    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('[') = state.peek_next_n(2) {
                    let cdata_start = "CDATA[";
                    let mut matches = true;

                    for (i, expected_ch) in cdata_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
                            if actual_ch != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(3 + cdata_start.len());

                        while state.not_at_end() {
                            if let Some(']') = state.peek() {
                                if let Some(']') = state.peek_next_n(1) {
                                    if let Some('>') = state.peek_next_n(2) {
                                        state.advance(3);
                                        state.add_token(HtmlTokenType::CData, start_pos, state.get_position());
                                        return true;
                                    }
                                }
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

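    /// Lexes a `<? ... ?>` processing instruction; emits an `Error` token when
    /// the closing `?>` is missing.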
    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('?') = state.peek_next_n(1) {
                state.advance(2);

                while state.not_at_end() {
                    if let Some('?') = state.peek() {
                        if let Some('>') = state.peek_next_n(1) {
                            state.advance(2);
                            state.add_token(HtmlTokenType::ProcessingInstruction, start_pos, state.get_position());
                            return true;
                        }
                    }
                    if let Some(ch) = state.peek() {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

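    /// Lexes the tag punctuation tokens `<`, `</`, `/>`, and `>`.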
    fn lex_tag_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        match state.peek() {
            Some('<') => {
                if let Some('/') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlTokenType::TagSlashOpen, start_pos, state.get_position());
                    true
                }
                else {
                    state.advance(1);
                    state.add_token(HtmlTokenType::TagOpen, start_pos, state.get_position());
                    true
                }
            }
            Some('/') => {
                if let Some('>') = state.peek_next_n(1) {
                    state.advance(2);
                    state.add_token(HtmlTokenType::TagSelfClose, start_pos, state.get_position());
                    true
                }
                else {
                    false
                }
            }
            Some('>') => {
                state.advance(1);
                state.add_token(HtmlTokenType::TagClose, start_pos, state.get_position());
                true
            }
            _ => false,
        }
    }

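    /// Lexes `&name;`, `&#123;`, and `&#x1F;` style references; a bare or
    /// malformed `&` still produces a token, marked as `Error`.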
    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('&') = state.peek() {
            state.advance(1);

            if let Some('#') = state.peek() {
                state.advance(1);

                if let Some('x') = state.peek() {
                    state.advance(1);
                    let mut has_digits = false;

                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
                else {
                    let mut has_digits = false;

                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_digit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits && state.peek() == Some(';') {
                        state.advance(1);
                        state.add_token(HtmlTokenType::CharRef, start_pos, state.get_position());
                        return true;
                    }
                }
            }
            else {
                let mut has_name = false;

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() {
                        state.advance(1);
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }

                if has_name && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(HtmlTokenType::EntityRef, start_pos, state.get_position());
                    return true;
                }
            }

            state.add_token(HtmlTokenType::Error, start_pos, state.get_position());
            return true;
        }

        false
    }

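    /// Lexes a quoted attribute value using the shared [`HTML_STRING`]
    /// configuration.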
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        HTML_STRING.scan(state, HtmlTokenType::AttributeValue)
    }

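    /// Lexes a tag or attribute name: an ASCII letter, `_`, or `:` followed by
    /// alphanumerics, `_`, `-`, `.`, or `:`.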
    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
                state.advance(ch.len_utf8());

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(HtmlTokenType::TagName, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

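    /// Lexes single-character tokens such as `=` and the quote characters;
    /// other punctuation is left for the more specific helpers.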
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        let kind = match state.peek() {
            Some('=') => HtmlTokenType::Equal,
            Some('"') => HtmlTokenType::Quote,
            Some('\'') => HtmlTokenType::Quote,
            Some('!') => return false,
            Some('?') => return false,
            Some('&') => return false,
            Some(';') => return false,
            _ => return false,
        };

        if let Some(ch) = state.peek() {
            state.advance(ch.len_utf8());
            state.add_token(kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

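    /// Lexes a run of text, stopping at `<`, `&`, or any byte <= 0x20, using
    /// the same 32-lane SIMD scan as `skip_whitespace`.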
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();
        let bytes = state.rest_bytes();
        let mut i = 0;
        let len = bytes.len();
        const LANES: usize = 32;

        while i + LANES <= len {
            let chunk = Simd::<u8, LANES>::from_slice(unsafe { bytes.get_unchecked(i..i + LANES) });

            let is_lt = chunk.simd_eq(Simd::splat(b'<'));
            let is_amp = chunk.simd_eq(Simd::splat(b'&'));
            let is_le_space = chunk.simd_le(Simd::splat(32));

            let stop = is_lt | is_amp | is_le_space;

            if stop.any() {
                // Stop at the first delimiter in this chunk and fall through to the
                // shared emit path below, so a delimiter at offset 0 never produces a
                // zero-length `Text` token.
                i += stop.first_set().unwrap();
                break;
            }
            i += LANES;
        }

        while i < len {
            let ch = unsafe { *bytes.get_unchecked(i) };
            if ch == b'<' || ch == b'&' || ch <= 32 {
                break;
            }
            i += 1;
        }

        if i > 0 {
            state.advance(i);
            state.add_token(HtmlTokenType::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}