1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::XmlLanguage, lexer::token_type::XmlTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError, TextEdit,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8 source::Source,
9};
10use std::sync::LazyLock;
11
/// Lexer state specialised to [`XmlLanguage`]; `S` is the source type being scanned.
type State<'a, S> = LexerState<'a, S, XmlLanguage>;

/// Whitespace scanner configuration with `unicode_whitespace` enabled.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

// NOTE(review): the empty `line_marker` presumably disables line comments (XML has none) —
// confirm that `CommentConfig::scan` treats an empty marker as "no line comments".
/// Comment scanner configuration: `<!--` ... `-->` blocks, no nesting.
static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });

/// String scanner configuration: double- or single-quoted literals with no escape character.
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
20
21impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
22 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
23 let mut state = LexerState::new(source);
24 let result = self.run(&mut state);
25 if result.is_ok() {
26 state.add_eof();
27 }
28 state.finish_with_cache(result, cache)
29 }
30}
31
/// Lexer that splits XML source text into `XmlTokenType` tokens.
///
/// It borrows an [`XmlLanguage`] configuration for the `'config` lifetime. The configuration
/// is stored but not read by any of the lexing code shown in this file.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    /// Language configuration the lexer was constructed with.
    _config: &'config XmlLanguage,
}
36
37impl<'config> XmlLexer<'config> {
38 pub fn new(config: &'config XmlLanguage) -> Self {
39 Self { _config: config }
40 }
41
42 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
44 while state.not_at_end() {
45 let safe_point = state.get_position();
46
47 if self.skip_whitespace(state) {
48 continue;
49 }
50
51 if self.lex_comment(state) {
52 continue;
53 }
54
55 if self.lex_doctype(state) {
56 continue;
57 }
58
59 if self.lex_cdata(state) {
60 continue;
61 }
62
63 if self.lex_processing_instruction(state) {
64 continue;
65 }
66
67 if self.lex_tag_start(state) {
68 continue;
69 }
70
71 if self.lex_entity_reference(state) {
72 continue;
73 }
74
75 if self.lex_string_literal(state) {
76 continue;
77 }
78
79 if self.lex_identifier_or_tag_name(state) {
80 continue;
81 }
82
83 if self.lex_single_char_tokens(state) {
84 continue;
85 }
86
87 if self.lex_text(state) {
88 continue;
89 }
90
91 state.advance_if_dead_lock(safe_point);
92 }
93
94 Ok(())
95 }
96
    /// Delegates to the shared `XML_WHITESPACE` scanner, tagging what it scans as
    /// `XmlTokenType::Whitespace`. Returns the scanner's result.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_WHITESPACE.scan(state, XmlTokenType::Whitespace)
    }
100
    /// Delegates to the shared `XML_COMMENT` scanner. Returns the scanner's result.
    ///
    /// `XmlTokenType::Comment` is passed for both token-kind arguments.
    // NOTE(review): the two kinds are presumably "line comment" and "block comment" — confirm
    // against `CommentConfig::scan`.
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_COMMENT.scan(state, XmlTokenType::Comment, XmlTokenType::Comment)
    }
104
105 fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106 let start_pos = state.get_position();
107
108 if let Some('<') = state.peek() {
109 if let Some('!') = state.peek_next_n(1) {
110 let doctype_keyword = "DOCTYPE";
112 let mut matches = true;
113 for (i, expected_ch) in doctype_keyword.chars().enumerate() {
114 if let Some(actual_ch) = state.peek_next_n(2 + i) {
115 if actual_ch.to_ascii_uppercase() != expected_ch {
116 matches = false;
117 break;
118 }
119 }
120 else {
121 matches = false;
122 break;
123 }
124 }
125
126 if matches {
127 state.advance(2 + doctype_keyword.len()); let mut bracket_depth = 0;
130 while state.not_at_end() {
132 match state.peek() {
133 Some('[') => {
134 bracket_depth += 1;
135 state.advance(1);
136 }
137 Some(']') => {
138 bracket_depth -= 1;
139 state.advance(1);
140 }
141 Some('>') => {
142 if bracket_depth == 0 {
143 state.advance(1); state.add_token(XmlTokenType::DoctypeDeclaration, start_pos, state.get_position());
145 return true;
146 }
147 else {
148 state.advance(1);
149 }
150 }
151 Some(ch) => {
152 state.advance(ch.len_utf8());
153 }
154 None => break,
155 }
156 }
157
158 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
160 return true;
161 }
162 }
163 }
164
165 false
166 }
167
168 fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
169 let start_pos = state.get_position();
170
171 if let Some('<') = state.peek() {
172 if let Some('!') = state.peek_next_n(1) {
173 if let Some('[') = state.peek_next_n(2) {
174 let cdata_start = "CDATA[";
176 let mut matches = true;
177 for (i, expected_ch) in cdata_start.chars().enumerate() {
178 if let Some(actual_ch) = state.peek_next_n(3 + i) {
179 if actual_ch != expected_ch {
180 matches = false;
181 break;
182 }
183 }
184 else {
185 matches = false;
186 break;
187 }
188 }
189
190 if matches {
191 state.advance(3 + cdata_start.len()); while state.not_at_end() {
195 if let Some(']') = state.peek() {
196 if let Some(']') = state.peek_next_n(1) {
197 if let Some('>') = state.peek_next_n(2) {
198 state.advance(3); state.add_token(XmlTokenType::CData, start_pos, state.get_position());
200 return true;
201 }
202 }
203 }
204 if let Some(ch) = state.peek() {
205 state.advance(ch.len_utf8());
206 }
207 else {
208 break;
209 }
210 }
211
212 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
214 return true;
215 }
216 }
217 }
218 }
219
220 false
221 }
222
223 fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
224 let start_pos = state.get_position();
225
226 if let Some('<') = state.peek() {
227 if let Some('?') = state.peek_next_n(1) {
228 state.advance(2); while state.not_at_end() {
232 if let Some('?') = state.peek() {
233 if let Some('>') = state.peek_next_n(1) {
234 state.advance(2); state.add_token(XmlTokenType::ProcessingInstruction, start_pos, state.get_position());
236 return true;
237 }
238 }
239 if let Some(ch) = state.peek() {
240 state.advance(ch.len_utf8());
241 }
242 else {
243 break;
244 }
245 }
246
247 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
249 return true;
250 }
251 }
252
253 false
254 }
255
256 fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
257 let start_pos = state.get_position();
258
259 match state.peek() {
260 Some('<') => {
261 state.advance(1);
262 if state.peek() == Some('/') {
263 state.advance(1);
264 state.add_token(XmlTokenType::LeftAngleSlash, start_pos, state.get_position());
265 }
266 else {
267 state.add_token(XmlTokenType::LeftAngle, start_pos, state.get_position());
268 }
269 true
270 }
271 Some('/') => {
272 if state.peek_next_n(1) == Some('>') {
273 state.advance(2);
274 state.add_token(XmlTokenType::SlashRightAngle, start_pos, state.get_position());
275 true
276 }
277 else {
278 false
279 }
280 }
281 Some('>') => {
282 state.advance(1);
283 state.add_token(XmlTokenType::RightAngle, start_pos, state.get_position());
284 true
285 }
286 Some('=') => {
287 state.advance(1);
288 state.add_token(XmlTokenType::Equals, start_pos, state.get_position());
289 true
290 }
291 _ => false,
292 }
293 }
294
295 fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
296 let start_pos = state.get_position();
297
298 if state.peek() == Some('&') {
299 state.advance(1);
300
301 if state.peek() == Some('#') {
303 state.advance(1);
304 let mut has_digits = false;
305
306 if state.peek() == Some('x') {
308 state.advance(1);
309 while let Some(ch) = state.peek() {
310 if ch.is_ascii_hexdigit() {
311 state.advance(1);
312 has_digits = true;
313 }
314 else {
315 break;
316 }
317 }
318 }
319 else {
320 while let Some(ch) = state.peek() {
322 if ch.is_ascii_digit() {
323 state.advance(1);
324 has_digits = true;
325 }
326 else {
327 break;
328 }
329 }
330 }
331
332 if has_digits && state.peek() == Some(';') {
333 state.advance(1);
334 state.add_token(XmlTokenType::CharacterReference, start_pos, state.get_position());
335 return true;
336 }
337 }
338 else {
339 let mut has_name = false;
341 while let Some(ch) = state.peek() {
342 if ch.is_ascii_alphanumeric() {
343 state.advance(1);
344 has_name = true;
345 }
346 else {
347 break;
348 }
349 }
350
351 if has_name && state.peek() == Some(';') {
352 state.advance(1);
353 state.add_token(XmlTokenType::EntityReference, start_pos, state.get_position());
354 return true;
355 }
356 }
357
358 state.add_token(XmlTokenType::Error, start_pos, state.get_position());
360 return true;
361 }
362
363 false
364 }
365
    /// Delegates to the shared `XML_STRING` scanner, tagging what it scans as
    /// `XmlTokenType::StringLiteral`. Returns the scanner's result.
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_STRING.scan(state, XmlTokenType::StringLiteral)
    }
369
370 fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371 let start_pos = state.get_position();
372
373 if let Some(ch) = state.peek() {
374 if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
375 state.advance(ch.len_utf8());
376
377 while let Some(ch) = state.peek() {
378 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
379 state.advance(ch.len_utf8());
380 }
381 else {
382 break;
383 }
384 }
385
386 state.add_token(XmlTokenType::Identifier, start_pos, state.get_position());
387 return true;
388 }
389 }
390
391 false
392 }
393
394 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
395 let start_pos = state.get_position();
396
397 match state.peek() {
398 Some('"') => {
399 state.advance(1);
400 state.add_token(XmlTokenType::Quote, start_pos, state.get_position());
401 true
402 }
403 Some('\'') => {
404 state.advance(1);
405 state.add_token(XmlTokenType::SingleQuote, start_pos, state.get_position());
406 true
407 }
408 Some('!') => {
409 state.advance(1);
410 state.add_token(XmlTokenType::Exclamation, start_pos, state.get_position());
411 true
412 }
413 Some('?') => {
414 state.advance(1);
415 state.add_token(XmlTokenType::Question, start_pos, state.get_position());
416 true
417 }
418 Some('&') => {
419 state.advance(1);
420 state.add_token(XmlTokenType::Ampersand, start_pos, state.get_position());
421 true
422 }
423 Some(';') => {
424 state.advance(1);
425 state.add_token(XmlTokenType::Semicolon, start_pos, state.get_position());
426 true
427 }
428 _ => false,
429 }
430 }
431
432 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
433 let start_pos = state.get_position();
434
435 while let Some(ch) = state.peek() {
436 match ch {
438 ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
439 _ => {
440 state.advance(ch.len_utf8());
441 }
442 }
443 }
444
445 if state.get_position() > start_pos {
446 state.add_token(XmlTokenType::Text, start_pos, state.get_position());
447 true
448 }
449 else {
450 false
451 }
452 }
453}