1use crate::{kind::XmlSyntaxKind, language::XmlLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
/// Shorthand for the lexer state specialised to [`XmlLanguage`].
type State<'a, S> = LexerState<'a, S, XmlLanguage>;

/// Whitespace scanner configuration, built lazily on first access; `unicode_whitespace` is enabled.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Comment scanner configuration: empty line marker, `<!--` / `-->` block delimiters, `nested_blocks` disabled.
static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });

/// String scanner configuration: `"` and `'` are the quote characters and no escape character is set.
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
17
18impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
19 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
20 let mut state = LexerState::new(source);
21 let result = self.run(&mut state);
22 if result.is_ok() {
23 state.add_eof();
24 }
25 state.finish_with_cache(result, cache)
26 }
27}
28
/// Lexer for XML source text.
///
/// Borrows an [`XmlLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    /// Language configuration passed to `XmlLexer::new`. The underscore prefix marks it as
    /// intentionally unused; no method visible in this part of the file reads it.
    _config: &'config XmlLanguage,
}
33
34impl<'config> XmlLexer<'config> {
35 pub fn new(config: &'config XmlLanguage) -> Self {
36 Self { _config: config }
37 }
38
39 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
41 while state.not_at_end() {
42 let safe_point = state.get_position();
43
44 if self.skip_whitespace(state) {
45 continue;
46 }
47
48 if self.lex_comment(state) {
49 continue;
50 }
51
52 if self.lex_doctype(state) {
53 continue;
54 }
55
56 if self.lex_cdata(state) {
57 continue;
58 }
59
60 if self.lex_processing_instruction(state) {
61 continue;
62 }
63
64 if self.lex_tag_start(state) {
65 continue;
66 }
67
68 if self.lex_entity_reference(state) {
69 continue;
70 }
71
72 if self.lex_string_literal(state) {
73 continue;
74 }
75
76 if self.lex_identifier_or_tag_name(state) {
77 continue;
78 }
79
80 if self.lex_single_char_tokens(state) {
81 continue;
82 }
83
84 if self.lex_text(state) {
85 continue;
86 }
87
88 state.advance_if_dead_lock(safe_point);
89 }
90
91 Ok(())
92 }
93
    /// Delegates to `XML_WHITESPACE.scan`, passing `XmlSyntaxKind::Whitespace` as the token kind.
    ///
    /// Returns the scanner's boolean result; `run` treats `true` as "this round is done".
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_WHITESPACE.scan(state, XmlSyntaxKind::Whitespace)
    }
97
    /// Delegates to `XML_COMMENT.scan`, passing `XmlSyntaxKind::Comment` for both kind arguments
    /// (presumably the line-comment and block-comment kinds; confirm against `CommentConfig::scan`).
    ///
    /// Returns the scanner's boolean result; `run` treats `true` as "this round is done".
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_COMMENT.scan(state, XmlSyntaxKind::Comment, XmlSyntaxKind::Comment)
    }
101
102 fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103 let start_pos = state.get_position();
104
105 if let Some('<') = state.peek() {
106 if let Some('!') = state.peek_next_n(1) {
107 let doctype_keyword = "DOCTYPE";
109 let mut matches = true;
110 for (i, expected_ch) in doctype_keyword.chars().enumerate() {
111 if let Some(actual_ch) = state.peek_next_n(2 + i) {
112 if actual_ch.to_ascii_uppercase() != expected_ch {
113 matches = false;
114 break;
115 }
116 }
117 else {
118 matches = false;
119 break;
120 }
121 }
122
123 if matches {
124 state.advance(2 + doctype_keyword.len()); let mut bracket_depth = 0;
127 while state.not_at_end() {
129 match state.peek() {
130 Some('[') => {
131 bracket_depth += 1;
132 state.advance(1);
133 }
134 Some(']') => {
135 bracket_depth -= 1;
136 state.advance(1);
137 }
138 Some('>') => {
139 if bracket_depth == 0 {
140 state.advance(1); state.add_token(XmlSyntaxKind::DoctypeDeclaration, start_pos, state.get_position());
142 return true;
143 }
144 else {
145 state.advance(1);
146 }
147 }
148 Some(ch) => {
149 state.advance(ch.len_utf8());
150 }
151 None => break,
152 }
153 }
154
155 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
157 return true;
158 }
159 }
160 }
161
162 false
163 }
164
165 fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
166 let start_pos = state.get_position();
167
168 if let Some('<') = state.peek() {
169 if let Some('!') = state.peek_next_n(1) {
170 if let Some('[') = state.peek_next_n(2) {
171 let cdata_start = "CDATA[";
173 let mut matches = true;
174 for (i, expected_ch) in cdata_start.chars().enumerate() {
175 if let Some(actual_ch) = state.peek_next_n(3 + i) {
176 if actual_ch != expected_ch {
177 matches = false;
178 break;
179 }
180 }
181 else {
182 matches = false;
183 break;
184 }
185 }
186
187 if matches {
188 state.advance(3 + cdata_start.len()); while state.not_at_end() {
192 if let Some(']') = state.peek() {
193 if let Some(']') = state.peek_next_n(1) {
194 if let Some('>') = state.peek_next_n(2) {
195 state.advance(3); state.add_token(XmlSyntaxKind::CData, start_pos, state.get_position());
197 return true;
198 }
199 }
200 }
201 if let Some(ch) = state.peek() {
202 state.advance(ch.len_utf8());
203 }
204 else {
205 break;
206 }
207 }
208
209 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
211 return true;
212 }
213 }
214 }
215 }
216
217 false
218 }
219
220 fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
221 let start_pos = state.get_position();
222
223 if let Some('<') = state.peek() {
224 if let Some('?') = state.peek_next_n(1) {
225 state.advance(2); while state.not_at_end() {
229 if let Some('?') = state.peek() {
230 if let Some('>') = state.peek_next_n(1) {
231 state.advance(2); state.add_token(XmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
233 return true;
234 }
235 }
236 if let Some(ch) = state.peek() {
237 state.advance(ch.len_utf8());
238 }
239 else {
240 break;
241 }
242 }
243
244 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
246 return true;
247 }
248 }
249
250 false
251 }
252
253 fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
254 let start_pos = state.get_position();
255
256 match state.peek() {
257 Some('<') => {
258 state.advance(1);
259 if state.peek() == Some('/') {
260 state.advance(1);
261 state.add_token(XmlSyntaxKind::LeftAngleSlash, start_pos, state.get_position());
262 }
263 else {
264 state.add_token(XmlSyntaxKind::LeftAngle, start_pos, state.get_position());
265 }
266 true
267 }
268 Some('/') => {
269 if state.peek_next_n(1) == Some('>') {
270 state.advance(2);
271 state.add_token(XmlSyntaxKind::SlashRightAngle, start_pos, state.get_position());
272 true
273 }
274 else {
275 false
276 }
277 }
278 Some('>') => {
279 state.advance(1);
280 state.add_token(XmlSyntaxKind::RightAngle, start_pos, state.get_position());
281 true
282 }
283 Some('=') => {
284 state.advance(1);
285 state.add_token(XmlSyntaxKind::Equals, start_pos, state.get_position());
286 true
287 }
288 _ => false,
289 }
290 }
291
292 fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
293 let start_pos = state.get_position();
294
295 if state.peek() == Some('&') {
296 state.advance(1);
297
298 if state.peek() == Some('#') {
300 state.advance(1);
301 let mut has_digits = false;
302
303 if state.peek() == Some('x') {
305 state.advance(1);
306 while let Some(ch) = state.peek() {
307 if ch.is_ascii_hexdigit() {
308 state.advance(1);
309 has_digits = true;
310 }
311 else {
312 break;
313 }
314 }
315 }
316 else {
317 while let Some(ch) = state.peek() {
319 if ch.is_ascii_digit() {
320 state.advance(1);
321 has_digits = true;
322 }
323 else {
324 break;
325 }
326 }
327 }
328
329 if has_digits && state.peek() == Some(';') {
330 state.advance(1);
331 state.add_token(XmlSyntaxKind::CharacterReference, start_pos, state.get_position());
332 return true;
333 }
334 }
335 else {
336 let mut has_name = false;
338 while let Some(ch) = state.peek() {
339 if ch.is_ascii_alphanumeric() {
340 state.advance(1);
341 has_name = true;
342 }
343 else {
344 break;
345 }
346 }
347
348 if has_name && state.peek() == Some(';') {
349 state.advance(1);
350 state.add_token(XmlSyntaxKind::EntityReference, start_pos, state.get_position());
351 return true;
352 }
353 }
354
355 state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
357 return true;
358 }
359
360 false
361 }
362
    /// Delegates to `XML_STRING.scan`, passing `XmlSyntaxKind::StringLiteral` as the token kind.
    ///
    /// Returns the scanner's boolean result; `run` treats `true` as "this round is done".
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_STRING.scan(state, XmlSyntaxKind::StringLiteral)
    }
366
367 fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
368 let start_pos = state.get_position();
369
370 if let Some(ch) = state.peek() {
371 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
372 state.advance(ch.len_utf8());
373
374 while let Some(ch) = state.peek() {
375 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
376 state.advance(ch.len_utf8());
377 }
378 else {
379 break;
380 }
381 }
382
383 state.add_token(XmlSyntaxKind::Identifier, start_pos, state.get_position());
384 return true;
385 }
386 }
387
388 false
389 }
390
391 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
392 let start_pos = state.get_position();
393
394 match state.peek() {
395 Some('"') => {
396 state.advance(1);
397 state.add_token(XmlSyntaxKind::Quote, start_pos, state.get_position());
398 true
399 }
400 Some('\'') => {
401 state.advance(1);
402 state.add_token(XmlSyntaxKind::SingleQuote, start_pos, state.get_position());
403 true
404 }
405 Some('!') => {
406 state.advance(1);
407 state.add_token(XmlSyntaxKind::Exclamation, start_pos, state.get_position());
408 true
409 }
410 Some('?') => {
411 state.advance(1);
412 state.add_token(XmlSyntaxKind::Question, start_pos, state.get_position());
413 true
414 }
415 Some('&') => {
416 state.advance(1);
417 state.add_token(XmlSyntaxKind::Ampersand, start_pos, state.get_position());
418 true
419 }
420 Some(';') => {
421 state.advance(1);
422 state.add_token(XmlSyntaxKind::Semicolon, start_pos, state.get_position());
423 true
424 }
425 _ => false,
426 }
427 }
428
429 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
430 let start_pos = state.get_position();
431
432 while let Some(ch) = state.peek() {
433 match ch {
435 ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
436 _ => {
437 state.advance(ch.len_utf8());
438 }
439 }
440 }
441
442 if state.get_position() > start_pos {
443 state.add_token(XmlSyntaxKind::Text, start_pos, state.get_position());
444 true
445 }
446 else {
447 false
448 }
449 }
450}