1#![doc = include_str!("readme.md")]
2
3pub mod token_type;
5
6use crate::{language::XmlLanguage, lexer::token_type::XmlTokenType};
7use oak_core::{
8 Lexer, LexerCache, LexerState, OakError,
9 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
10 source::Source,
11};
12use std::sync::LazyLock;
13
/// Convenience alias for the shared lexer state, specialized to XML.
pub(crate) type State<'a, S> = LexerState<'a, S, XmlLanguage>;

/// Whitespace scanner config: XML treats all Unicode whitespace as skippable.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Comment scanner config: XML only has `<!-- ... -->` block comments — no
/// line-comment marker and no nesting.
static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });

/// String scanner config: attribute values quoted with `"` or `'`; XML has no
/// backslash-style escape character (entities are handled separately).
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
22
23impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
25 let mut state = LexerState::new(source);
26 let result = self.run(&mut state);
27 if result.is_ok() {
28 state.add_eof();
29 }
30 state.finish_with_cache(result, cache)
31 }
32}
33
/// XML lexer parameterized by a borrowed [`XmlLanguage`] configuration.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    // Language configuration supplied at construction. NOTE(review): not yet
    // consulted by any scanning method in this file — presumably reserved for
    // config-driven lexing; confirm before removing.
    config: &'config XmlLanguage,
}
39
40impl<'config> XmlLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config XmlLanguage) -> Self {
        Self { config }
    }
45
46 fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
48 while state.not_at_end() {
49 let safe_point = state.get_position();
50
51 if self.skip_whitespace(state) {
52 continue;
53 }
54
55 if self.lex_comment(state) {
56 continue;
57 }
58
59 if self.lex_doctype(state) {
60 continue;
61 }
62
63 if self.lex_cdata(state) {
64 continue;
65 }
66
67 if self.lex_processing_instruction(state) {
68 continue;
69 }
70
71 if self.lex_tag_start(state) {
72 continue;
73 }
74
75 if self.lex_entity_reference(state) {
76 continue;
77 }
78
79 if self.lex_string_literal(state) {
80 continue;
81 }
82
83 if self.lex_identifier_or_tag_name(state) {
84 continue;
85 }
86
87 if self.lex_single_char_tokens(state) {
88 continue;
89 }
90
91 if self.lex_text(state) {
92 continue;
93 }
94
95 state.advance_if_dead_lock(safe_point);
96 }
97
98 Ok(())
99 }
100
    /// Consumes a run of whitespace into a single `Whitespace` token via the
    /// shared whitespace config. Returns false if no whitespace is present.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_WHITESPACE.scan(state, XmlTokenType::Whitespace)
    }
104
    /// Consumes a `<!-- ... -->` block comment into a `Comment` token (the same
    /// token type is used for both the line and block slots of the config).
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_COMMENT.scan(state, XmlTokenType::Comment, XmlTokenType::Comment)
    }
108
109 fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110 let start_pos = state.get_position();
111
112 if let Some('<') = state.peek() {
113 if let Some('!') = state.peek_next_n(1) {
114 let doctype_keyword = "DOCTYPE";
116 let mut matches = true;
117 for (i, expected_ch) in doctype_keyword.chars().enumerate() {
118 if let Some(actual_ch) = state.peek_next_n(2 + i) {
119 if actual_ch.to_ascii_uppercase() != expected_ch {
120 matches = false;
121 break;
122 }
123 }
124 else {
125 matches = false;
126 break;
127 }
128 }
129
130 if matches {
131 state.advance(2 + doctype_keyword.len()); let mut bracket_depth = 0;
134 while state.not_at_end() {
136 match state.peek() {
137 Some('[') => {
138 bracket_depth += 1;
139 state.advance(1);
140 }
141 Some(']') => {
142 bracket_depth -= 1;
143 state.advance(1);
144 }
145 Some('>') => {
146 if bracket_depth == 0 {
147 state.advance(1); state.add_token(XmlTokenType::DoctypeDeclaration, start_pos, state.get_position());
149 return true;
150 }
151 else {
152 state.advance(1);
153 }
154 }
155 Some(ch) => {
156 state.advance(ch.len_utf8());
157 }
158 None => break,
159 }
160 }
161
162 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
164 return true;
165 }
166 }
167 }
168
169 false
170 }
171
172 fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
173 let start_pos = state.get_position();
174
175 if let Some('<') = state.peek() {
176 if let Some('!') = state.peek_next_n(1) {
177 if let Some('[') = state.peek_next_n(2) {
178 let cdata_start = "CDATA[";
180 let mut matches = true;
181 for (i, expected_ch) in cdata_start.chars().enumerate() {
182 if let Some(actual_ch) = state.peek_next_n(3 + i) {
183 if actual_ch != expected_ch {
184 matches = false;
185 break;
186 }
187 }
188 else {
189 matches = false;
190 break;
191 }
192 }
193
194 if matches {
195 state.advance(3 + cdata_start.len()); while state.not_at_end() {
199 if let Some(']') = state.peek() {
200 if let Some(']') = state.peek_next_n(1) {
201 if let Some('>') = state.peek_next_n(2) {
202 state.advance(3); state.add_token(XmlTokenType::CData, start_pos, state.get_position());
204 return true;
205 }
206 }
207 }
208 if let Some(ch) = state.peek() {
209 state.advance(ch.len_utf8());
210 }
211 else {
212 break;
213 }
214 }
215
216 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
218 return true;
219 }
220 }
221 }
222 }
223
224 false
225 }
226
227 fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228 let start_pos = state.get_position();
229
230 if let Some('<') = state.peek() {
231 if let Some('?') = state.peek_next_n(1) {
232 state.advance(2); while state.not_at_end() {
236 if let Some('?') = state.peek() {
237 if let Some('>') = state.peek_next_n(1) {
238 state.advance(2); state.add_token(XmlTokenType::ProcessingInstruction, start_pos, state.get_position());
240 return true;
241 }
242 }
243 if let Some(ch) = state.peek() {
244 state.advance(ch.len_utf8());
245 }
246 else {
247 break;
248 }
249 }
250
251 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
253 return true;
254 }
255 }
256
257 false
258 }
259
260 fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
261 let start_pos = state.get_position();
262
263 match state.peek() {
264 Some('<') => {
265 state.advance(1);
266 if state.peek() == Some('/') {
267 state.advance(1);
268 state.add_token(XmlTokenType::LeftAngleSlash, start_pos, state.get_position());
269 }
270 else {
271 state.add_token(XmlTokenType::LeftAngle, start_pos, state.get_position());
272 }
273 true
274 }
275 Some('/') => {
276 if state.peek_next_n(1) == Some('>') {
277 state.advance(2);
278 state.add_token(XmlTokenType::SlashRightAngle, start_pos, state.get_position());
279 true
280 }
281 else {
282 false
283 }
284 }
285 Some('>') => {
286 state.advance(1);
287 state.add_token(XmlTokenType::RightAngle, start_pos, state.get_position());
288 true
289 }
290 Some('=') => {
291 state.advance(1);
292 state.add_token(XmlTokenType::Equals, start_pos, state.get_position());
293 true
294 }
295 _ => false,
296 }
297 }
298
299 fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300 let start_pos = state.get_position();
301
302 if state.peek() == Some('&') {
303 state.advance(1);
304
305 if state.peek() == Some('#') {
307 state.advance(1);
308 let mut has_digits = false;
309
310 if state.peek() == Some('x') {
312 state.advance(1);
313 while let Some(ch) = state.peek() {
314 if ch.is_ascii_hexdigit() {
315 state.advance(1);
316 has_digits = true;
317 }
318 else {
319 break;
320 }
321 }
322 }
323 else {
324 while let Some(ch) = state.peek() {
326 if ch.is_ascii_digit() {
327 state.advance(1);
328 has_digits = true;
329 }
330 else {
331 break;
332 }
333 }
334 }
335
336 if has_digits && state.peek() == Some(';') {
337 state.advance(1);
338 state.add_token(XmlTokenType::CharacterReference, start_pos, state.get_position());
339 return true;
340 }
341 }
342 else {
343 let mut has_name = false;
345 while let Some(ch) = state.peek() {
346 if ch.is_ascii_alphanumeric() {
347 state.advance(1);
348 has_name = true;
349 }
350 else {
351 break;
352 }
353 }
354
355 if has_name && state.peek() == Some(';') {
356 state.advance(1);
357 state.add_token(XmlTokenType::EntityReference, start_pos, state.get_position());
358 return true;
359 }
360 }
361
362 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
364 return true;
365 }
366
367 false
368 }
369
    /// Consumes a quoted attribute value (`"..."` or `'...'`) into a
    /// `StringLiteral` token via the shared string config (no escape handling).
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_STRING.scan(state, XmlTokenType::StringLiteral)
    }
373
374 fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
375 let start_pos = state.get_position();
376
377 if let Some(ch) = state.peek() {
378 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
379 state.advance(ch.len_utf8());
380
381 while let Some(ch) = state.peek() {
382 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
383 state.advance(ch.len_utf8());
384 }
385 else {
386 break;
387 }
388 }
389
390 state.add_token(XmlTokenType::Identifier, start_pos, state.get_position());
391 return true;
392 }
393 }
394
395 false
396 }
397
398 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
399 let start_pos = state.get_position();
400
401 match state.peek() {
402 Some('"') => {
403 state.advance(1);
404 state.add_token(XmlTokenType::Quote, start_pos, state.get_position());
405 true
406 }
407 Some('\'') => {
408 state.advance(1);
409 state.add_token(XmlTokenType::SingleQuote, start_pos, state.get_position());
410 true
411 }
412 Some('!') => {
413 state.advance(1);
414 state.add_token(XmlTokenType::Exclamation, start_pos, state.get_position());
415 true
416 }
417 Some('?') => {
418 state.advance(1);
419 state.add_token(XmlTokenType::Question, start_pos, state.get_position());
420 true
421 }
422 Some('&') => {
423 state.advance(1);
424 state.add_token(XmlTokenType::Ampersand, start_pos, state.get_position());
425 true
426 }
427 Some(';') => {
428 state.advance(1);
429 state.add_token(XmlTokenType::Semicolon, start_pos, state.get_position());
430 true
431 }
432 _ => false,
433 }
434 }
435
436 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
437 let start_pos = state.get_position();
438
439 while let Some(ch) = state.peek() {
440 match ch {
442 ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
443 _ => {
444 state.advance(ch.len_utf8());
445 }
446 }
447 }
448
449 if state.get_position() > start_pos {
450 state.add_token(XmlTokenType::Text, start_pos, state.get_position());
451 true
452 }
453 else {
454 false
455 }
456 }
457}