1use crate::{kind::XmlSyntaxKind, language::XmlLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentBlock, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, XmlLanguage>;
10
11static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
13
14static XML_COMMENT: LazyLock<CommentBlock> =
15 LazyLock::new(|| CommentBlock { block_markers: &[("<!--", "-->")], nested_blocks: false });
16
17static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
18
19impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
20 fn lex_incremental(
21 &self,
22 source: impl Source,
23 changed: usize,
24 cache: IncrementalCache<XmlLanguage>,
25 ) -> LexOutput<XmlLanguage> {
26 let mut state = LexerState::new_with_cache(source, changed, cache);
27 let result = self.run(&mut state);
28 state.finish(result)
29 }
30}
31
32#[derive(Clone)]
33pub struct XmlLexer<'config> {
34 config: &'config XmlLanguage,
35}
36
37impl<'config> XmlLexer<'config> {
38 pub fn new(config: &'config XmlLanguage) -> Self {
39 Self { config }
40 }
41
42 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
44 while state.not_at_end() {
45 let safe_point = state.get_position();
46
47 if self.skip_whitespace(state) {
48 continue;
49 }
50
51 if self.lex_comment(state) {
52 continue;
53 }
54
55 if self.lex_doctype(state) {
56 continue;
57 }
58
59 if self.lex_cdata(state) {
60 continue;
61 }
62
63 if self.lex_processing_instruction(state) {
64 continue;
65 }
66
67 if self.lex_tag_operators(state) {
68 continue;
69 }
70
71 if self.lex_entity_reference(state) {
72 continue;
73 }
74
75 if self.lex_string_literal(state) {
76 continue;
77 }
78
79 if self.lex_identifier(state) {
80 continue;
81 }
82
83 if self.lex_single_char_tokens(state) {
84 continue;
85 }
86
87 if self.lex_text(state) {
88 continue;
89 }
90
91 state.safe_check(safe_point);
92 }
93
94 let eof_pos = state.get_position();
96 state.add_token(XmlSyntaxKind::Eof, eof_pos, eof_pos);
97 Ok(())
98 }
99
100 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
102 match XML_WHITESPACE.scan(state.rest(), state.get_position(), XmlSyntaxKind::Whitespace) {
103 Some(token) => {
104 state.advance_with(token);
105 true
106 }
107 None => false,
108 }
109 }
110
111 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
113 match XML_COMMENT.scan(state.rest(), state.get_position(), XmlSyntaxKind::Comment) {
114 Some(token) => {
115 state.advance_with(token);
116 true
117 }
118 None => false,
119 }
120 }
121
122 fn lex_doctype<S: Source>(&self, state: &mut State<S>) -> bool {
123 let start_pos = state.get_position();
124
125 if let Some('<') = state.peek() {
126 if let Some('!') = state.peek_next_n(1) {
127 let doctype_keyword = "DOCTYPE";
129 let mut matches = true;
130 for (i, expected_ch) in doctype_keyword.chars().enumerate() {
131 if let Some(actual_ch) = state.peek_next_n(2 + i) {
132 if actual_ch.to_ascii_uppercase() != expected_ch {
133 matches = false;
134 break;
135 }
136 }
137 else {
138 matches = false;
139 break;
140 }
141 }
142
143 if matches {
144 state.advance(2 + doctype_keyword.len()); let mut bracket_depth = 0;
147 while state.not_at_end() {
149 match state.peek() {
150 Some('[') => {
151 bracket_depth += 1;
152 state.advance(1);
153 }
154 Some(']') => {
155 bracket_depth -= 1;
156 state.advance(1);
157 }
158 Some('>') => {
159 if bracket_depth == 0 {
160 state.advance(1); state.add_token(XmlSyntaxKind::DoctypeDeclaration, start_pos, state.get_position());
162 return true;
163 }
164 else {
165 state.advance(1);
166 }
167 }
168 Some(ch) => {
169 state.advance(ch.len_utf8());
170 }
171 None => break,
172 }
173 }
174
175 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
177 return true;
178 }
179 }
180 }
181
182 false
183 }
184
185 fn lex_cdata<S: Source>(&self, state: &mut State<S>) -> bool {
186 let start_pos = state.get_position();
187
188 if let Some('<') = state.peek() {
189 if let Some('!') = state.peek_next_n(1) {
190 if let Some('[') = state.peek_next_n(2) {
191 let cdata_start = "CDATA[";
193 let mut matches = true;
194 for (i, expected_ch) in cdata_start.chars().enumerate() {
195 if let Some(actual_ch) = state.peek_next_n(3 + i) {
196 if actual_ch != expected_ch {
197 matches = false;
198 break;
199 }
200 }
201 else {
202 matches = false;
203 break;
204 }
205 }
206
207 if matches {
208 state.advance(3 + cdata_start.len()); while state.not_at_end() {
212 if let Some(']') = state.peek() {
213 if let Some(']') = state.peek_next_n(1) {
214 if let Some('>') = state.peek_next_n(2) {
215 state.advance(3); state.add_token(XmlSyntaxKind::CData, start_pos, state.get_position());
217 return true;
218 }
219 }
220 }
221 if let Some(ch) = state.peek() {
222 state.advance(ch.len_utf8());
223 }
224 else {
225 break;
226 }
227 }
228
229 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
231 return true;
232 }
233 }
234 }
235 }
236
237 false
238 }
239
240 fn lex_processing_instruction<S: Source>(&self, state: &mut State<S>) -> bool {
241 let start_pos = state.get_position();
242
243 if let Some('<') = state.peek() {
244 if let Some('?') = state.peek_next_n(1) {
245 state.advance(2); while state.not_at_end() {
249 if let Some('?') = state.peek() {
250 if let Some('>') = state.peek_next_n(1) {
251 state.advance(2); state.add_token(XmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
253 return true;
254 }
255 }
256 if let Some(ch) = state.peek() {
257 state.advance(ch.len_utf8());
258 }
259 else {
260 break;
261 }
262 }
263
264 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
266 return true;
267 }
268 }
269
270 false
271 }
272
273 fn lex_tag_operators<S: Source>(&self, state: &mut State<S>) -> bool {
274 let start_pos = state.get_position();
275
276 match state.peek() {
277 Some('<') => {
278 state.advance(1);
279 if state.peek() == Some('/') {
280 state.advance(1);
281 state.add_token(XmlSyntaxKind::LeftAngleSlash, start_pos, state.get_position());
282 }
283 else {
284 state.add_token(XmlSyntaxKind::LeftAngle, start_pos, state.get_position());
285 }
286 true
287 }
288 Some('/') => {
289 if state.peek_next_n(1) == Some('>') {
290 state.advance(2);
291 state.add_token(XmlSyntaxKind::SlashRightAngle, start_pos, state.get_position());
292 true
293 }
294 else {
295 false
296 }
297 }
298 Some('>') => {
299 state.advance(1);
300 state.add_token(XmlSyntaxKind::RightAngle, start_pos, state.get_position());
301 true
302 }
303 Some('=') => {
304 state.advance(1);
305 state.add_token(XmlSyntaxKind::Equals, start_pos, state.get_position());
306 true
307 }
308 _ => false,
309 }
310 }
311
312 fn lex_entity_reference<S: Source>(&self, state: &mut State<S>) -> bool {
313 let start_pos = state.get_position();
314
315 if state.peek() == Some('&') {
316 state.advance(1);
317
318 if state.peek() == Some('#') {
320 state.advance(1);
321 let mut has_digits = false;
322
323 if state.peek() == Some('x') {
325 state.advance(1);
326 while let Some(ch) = state.peek() {
327 if ch.is_ascii_hexdigit() {
328 state.advance(1);
329 has_digits = true;
330 }
331 else {
332 break;
333 }
334 }
335 }
336 else {
337 while let Some(ch) = state.peek() {
339 if ch.is_ascii_digit() {
340 state.advance(1);
341 has_digits = true;
342 }
343 else {
344 break;
345 }
346 }
347 }
348
349 if has_digits && state.peek() == Some(';') {
350 state.advance(1);
351 state.add_token(XmlSyntaxKind::CharacterReference, start_pos, state.get_position());
352 return true;
353 }
354 }
355 else {
356 let mut has_name = false;
358 while let Some(ch) = state.peek() {
359 if ch.is_ascii_alphanumeric() {
360 state.advance(1);
361 has_name = true;
362 }
363 else {
364 break;
365 }
366 }
367
368 if has_name && state.peek() == Some(';') {
369 state.advance(1);
370 state.add_token(XmlSyntaxKind::EntityReference, start_pos, state.get_position());
371 return true;
372 }
373 }
374
375 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
377 return true;
378 }
379
380 false
381 }
382
383 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
384 match XML_STRING.scan(state.rest(), 0, XmlSyntaxKind::StringLiteral) {
385 Some(mut token) => {
386 token.span.start += state.get_position();
388 token.span.end += state.get_position();
389 state.advance_with(token);
390 true
391 }
392 None => false,
393 }
394 }
395
396 fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
397 let start_pos = state.get_position();
398
399 if let Some(ch) = state.peek() {
400 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
401 state.advance(ch.len_utf8());
402
403 while let Some(ch) = state.peek() {
404 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
405 state.advance(ch.len_utf8());
406 }
407 else {
408 break;
409 }
410 }
411
412 state.add_token(XmlSyntaxKind::Identifier, start_pos, state.get_position());
413 return true;
414 }
415 }
416
417 false
418 }
419
420 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
421 let start_pos = state.get_position();
422
423 match state.peek() {
424 Some('"') => {
425 state.advance(1);
426 state.add_token(XmlSyntaxKind::Quote, start_pos, state.get_position());
427 true
428 }
429 Some('\'') => {
430 state.advance(1);
431 state.add_token(XmlSyntaxKind::SingleQuote, start_pos, state.get_position());
432 true
433 }
434 Some('!') => {
435 state.advance(1);
436 state.add_token(XmlSyntaxKind::Exclamation, start_pos, state.get_position());
437 true
438 }
439 Some('?') => {
440 state.advance(1);
441 state.add_token(XmlSyntaxKind::Question, start_pos, state.get_position());
442 true
443 }
444 Some('&') => {
445 state.advance(1);
446 state.add_token(XmlSyntaxKind::Ampersand, start_pos, state.get_position());
447 true
448 }
449 Some(';') => {
450 state.advance(1);
451 state.add_token(XmlSyntaxKind::Semicolon, start_pos, state.get_position());
452 true
453 }
454 _ => false,
455 }
456 }
457
458 fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
459 let start_pos = state.get_position();
460
461 while let Some(ch) = state.peek() {
462 match ch {
464 ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
465 _ => {
466 state.advance(ch.len_utf8());
467 }
468 }
469 }
470
471 if state.get_position() > start_pos {
472 state.add_token(XmlSyntaxKind::Text, start_pos, state.get_position());
473 true
474 }
475 else {
476 false
477 }
478 }
479}