1use crate::{kind::HtmlSyntaxKind, language::HtmlLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentBlock, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialized to HTML over an arbitrary `Source`.
type State<S> = LexerState<S, HtmlLanguage>;
10
/// Whitespace scanner config: Unicode whitespace is consumed as trivia.
static HTML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Comment scanner config: HTML `<!-- ... -->` comments, which do not nest.
static HTML_COMMENT: LazyLock<CommentBlock> =
    LazyLock::new(|| CommentBlock { block_markers: &[("<!--", "-->")], nested_blocks: false });

/// Attribute-value string config: single- or double-quoted, no escape character.
static HTML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
18
/// HTML lexer; borrows its language configuration for the `'config` lifetime.
#[derive(Clone)]
pub struct HtmlLexer<'config> {
    // Language configuration supplied at construction time.
    config: &'config HtmlLanguage,
}
23
impl<'config> Lexer<HtmlLanguage> for HtmlLexer<'config> {
    /// Lex `source` incrementally: seed a lexer state from the previous run's
    /// `cache` and the `changed` offset, run the main token loop, and fold
    /// the result into the final `LexOutput`.
    fn lex_incremental(
        &self,
        source: impl Source,
        changed: usize,
        cache: IncrementalCache<HtmlLanguage>,
    ) -> LexOutput<HtmlLanguage> {
        let mut state = LexerState::new_with_cache(source, changed, cache);
        let result = self.run(&mut state);
        state.finish(result)
    }
}
36
37impl<'config> HtmlLexer<'config> {
38 pub fn new(config: &'config HtmlLanguage) -> Self {
39 Self { config }
40 }
41
42 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
44 while state.not_at_end() {
45 let safe_point = state.get_position();
46
47 if self.skip_whitespace(state) {
48 continue;
49 }
50
51 if self.lex_comment(state) {
52 continue;
53 }
54
55 if self.lex_doctype(state) {
56 continue;
57 }
58
59 if self.lex_cdata(state) {
60 continue;
61 }
62
63 if self.lex_processing_instruction(state) {
64 continue;
65 }
66
67 if self.lex_tag_operators(state) {
68 continue;
69 }
70
71 if self.lex_entity_reference(state) {
72 continue;
73 }
74
75 if self.lex_string_literal(state) {
76 continue;
77 }
78
79 if self.lex_identifier(state) {
80 continue;
81 }
82
83 if self.lex_single_char_tokens(state) {
84 continue;
85 }
86
87 if self.lex_text(state) {
88 continue;
89 }
90
91 state.safe_check(safe_point);
93 }
94
95 Ok(())
96 }
97
98 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
99 match HTML_WHITESPACE.scan(state.rest(), state.get_position(), HtmlSyntaxKind::Whitespace) {
100 Some(token) => {
101 state.advance_with(token);
102 true
103 }
104 None => false,
105 }
106 }
107
108 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
109 match HTML_COMMENT.scan(state.rest(), state.get_position(), HtmlSyntaxKind::Comment) {
110 Some(token) => {
111 state.advance_with(token);
112 true
113 }
114 None => false,
115 }
116 }
117
118 fn lex_doctype<S: Source>(&self, state: &mut State<S>) -> bool {
119 let start_pos = state.get_position();
120
121 if let Some('<') = state.peek() {
122 if let Some('!') = state.peek_next_n(1) {
123 if let Some('D') = state.peek_next_n(2) {
124 let doctype_start = "DOCTYPE";
125 let mut matches = true;
126
127 for (i, expected_ch) in doctype_start.chars().enumerate() {
128 if let Some(actual_ch) = state.peek_next_n(2 + i) {
129 if actual_ch.to_ascii_uppercase() != expected_ch {
130 matches = false;
131 break;
132 }
133 }
134 else {
135 matches = false;
136 break;
137 }
138 }
139
140 if matches {
141 state.advance(2 + doctype_start.len()); while state.not_at_end() {
145 if let Some('>') = state.peek() {
146 state.advance(1); state.add_token(HtmlSyntaxKind::Doctype, start_pos, state.get_position());
148 return true;
149 }
150 if let Some(ch) = state.peek() {
151 state.advance(ch.len_utf8());
152 }
153 else {
154 break;
155 }
156 }
157
158 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
160 return true;
161 }
162 }
163 }
164 }
165
166 false
167 }
168
169 fn lex_cdata<S: Source>(&self, state: &mut State<S>) -> bool {
170 let start_pos = state.get_position();
171
172 if let Some('<') = state.peek() {
173 if let Some('!') = state.peek_next_n(1) {
174 if let Some('[') = state.peek_next_n(2) {
175 let cdata_start = "CDATA[";
176 let mut matches = true;
177
178 for (i, expected_ch) in cdata_start.chars().enumerate() {
179 if let Some(actual_ch) = state.peek_next_n(3 + i) {
180 if actual_ch != expected_ch {
181 matches = false;
182 break;
183 }
184 }
185 else {
186 matches = false;
187 break;
188 }
189 }
190
191 if matches {
192 state.advance(3 + cdata_start.len()); while state.not_at_end() {
196 if let Some(']') = state.peek() {
197 if let Some(']') = state.peek_next_n(1) {
198 if let Some('>') = state.peek_next_n(2) {
199 state.advance(3); state.add_token(HtmlSyntaxKind::CData, start_pos, state.get_position());
201 return true;
202 }
203 }
204 }
205 if let Some(ch) = state.peek() {
206 state.advance(ch.len_utf8());
207 }
208 else {
209 break;
210 }
211 }
212
213 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
215 return true;
216 }
217 }
218 }
219 }
220
221 false
222 }
223
224 fn lex_processing_instruction<S: Source>(&self, state: &mut State<S>) -> bool {
225 let start_pos = state.get_position();
226
227 if let Some('<') = state.peek() {
228 if let Some('?') = state.peek_next_n(1) {
229 state.advance(2); while state.not_at_end() {
233 if let Some('?') = state.peek() {
234 if let Some('>') = state.peek_next_n(1) {
235 state.advance(2); state.add_token(HtmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
237 return true;
238 }
239 }
240 if let Some(ch) = state.peek() {
241 state.advance(ch.len_utf8());
242 }
243 else {
244 break;
245 }
246 }
247
248 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
250 return true;
251 }
252 }
253
254 false
255 }
256
257 fn lex_tag_operators<S: Source>(&self, state: &mut State<S>) -> bool {
258 let start_pos = state.get_position();
259
260 match state.peek() {
261 Some('<') => {
262 if let Some('/') = state.peek_next_n(1) {
263 state.advance(2);
264 state.add_token(HtmlSyntaxKind::TagSlashOpen, start_pos, state.get_position());
265 true
266 }
267 else {
268 state.advance(1);
269 state.add_token(HtmlSyntaxKind::TagOpen, start_pos, state.get_position());
270 true
271 }
272 }
273 Some('/') => {
274 if let Some('>') = state.peek_next_n(1) {
275 state.advance(2);
276 state.add_token(HtmlSyntaxKind::TagSelfClose, start_pos, state.get_position());
277 true
278 }
279 else {
280 false
281 }
282 }
283 Some('>') => {
284 state.advance(1);
285 state.add_token(HtmlSyntaxKind::TagClose, start_pos, state.get_position());
286 true
287 }
288 _ => false,
289 }
290 }
291
292 fn lex_entity_reference<S: Source>(&self, state: &mut State<S>) -> bool {
293 let start_pos = state.get_position();
294
295 if let Some('&') = state.peek() {
296 state.advance(1);
297
298 if let Some('#') = state.peek() {
299 state.advance(1);
300
301 if let Some('x') = state.peek() {
303 state.advance(1);
304 let mut has_digits = false;
306 while let Some(ch) = state.peek() {
307 if ch.is_ascii_hexdigit() {
308 state.advance(1);
309 has_digits = true;
310 }
311 else {
312 break;
313 }
314 }
315
316 if has_digits && state.peek() == Some(';') {
317 state.advance(1);
318 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
319 return true;
320 }
321 }
322 else {
323 let mut has_digits = false;
325 while let Some(ch) = state.peek() {
326 if ch.is_ascii_digit() {
327 state.advance(1);
328 has_digits = true;
329 }
330 else {
331 break;
332 }
333 }
334
335 if has_digits && state.peek() == Some(';') {
336 state.advance(1);
337 state.add_token(HtmlSyntaxKind::CharRef, start_pos, state.get_position());
338 return true;
339 }
340 }
341 }
342 else {
343 let mut has_name = false;
345 while let Some(ch) = state.peek() {
346 if ch.is_ascii_alphanumeric() {
347 state.advance(1);
348 has_name = true;
349 }
350 else {
351 break;
352 }
353 }
354
355 if has_name && state.peek() == Some(';') {
356 state.advance(1);
357 state.add_token(HtmlSyntaxKind::EntityRef, start_pos, state.get_position());
358 return true;
359 }
360 }
361
362 state.add_token(HtmlSyntaxKind::Error, start_pos, state.get_position());
364 return true;
365 }
366
367 false
368 }
369
370 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
371 match HTML_STRING.scan(state.rest(), 0, HtmlSyntaxKind::AttributeValue) {
372 Some(mut token) => {
373 token.span.start += state.get_position();
375 token.span.end += state.get_position();
376 state.advance_with(token);
377 true
378 }
379 None => false,
380 }
381 }
382
383 fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
384 let start_pos = state.get_position();
385
386 if let Some(ch) = state.peek() {
387 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
388 state.advance(ch.len_utf8());
389
390 while let Some(ch) = state.peek() {
391 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
392 state.advance(ch.len_utf8());
393 }
394 else {
395 break;
396 }
397 }
398
399 state.add_token(HtmlSyntaxKind::TagName, start_pos, state.get_position());
400 return true;
401 }
402 }
403
404 false
405 }
406
407 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
408 let start_pos = state.get_position();
409
410 let kind = match state.peek() {
411 Some('=') => HtmlSyntaxKind::Equal,
412 Some('"') => HtmlSyntaxKind::Quote,
413 Some('\'') => HtmlSyntaxKind::Quote,
414 Some('!') => return false, Some('?') => return false, Some('&') => return false, Some(';') => return false, _ => return false,
419 };
420
421 if let Some(ch) = state.peek() {
422 state.advance(ch.len_utf8());
423 state.add_token(kind, start_pos, state.get_position());
424 true
425 }
426 else {
427 false
428 }
429 }
430
431 fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
432 let start_pos = state.get_position();
433 let mut has_text = false;
434
435 while let Some(ch) = state.peek() {
436 match ch {
437 '<' | '&' => break,
438 _ if ch.is_whitespace() => break,
439 _ => {
440 state.advance(ch.len_utf8());
441 has_text = true;
442 }
443 }
444 }
445
446 if has_text {
447 state.add_token(HtmlSyntaxKind::Text, start_pos, state.get_position());
448 true
449 }
450 else {
451 false
452 }
453 }
454}