// oak_xml/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2
3/// XML token types.
4pub mod token_type;
5
6use crate::{language::XmlLanguage, lexer::token_type::XmlTokenType};
7use oak_core::{
8    Lexer, LexerCache, LexerState, OakError,
9    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
10    source::Source,
11};
12use std::sync::LazyLock;
13
/// Shorthand for the lexer state specialized to [`XmlLanguage`].
pub(crate) type State<'a, S> = LexerState<'a, S, XmlLanguage>;

// Shared, lazily-initialized scanner configurations for XML.

/// Whitespace scanner configuration; `unicode_whitespace: true` makes it
/// accept Unicode whitespace characters, not just ASCII.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Comment scanner configuration: only `<!-- ... -->` block comments, no line
/// comments (empty `line_marker`), and no nesting.
static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });

/// String scanner configuration: values quoted with `"` or `'`, and no escape
/// character (`escape: None`).
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
22
23impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
25        let mut state = LexerState::new(source);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
/// XML lexer.
///
/// Borrows the language configuration it was created with; the lexing entry
/// point is the [`Lexer::lex`] implementation above.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    // NOTE(review): not read by any routine in this module — presumably kept
    // for API parity with other language lexers; verify before removing.
    config: &'config XmlLanguage,
}
39
40impl<'config> XmlLexer<'config> {
    /// Creates a new `XmlLexer` borrowing the given language configuration.
    pub fn new(config: &'config XmlLanguage) -> Self {
        Self { config }
    }
45
46    /// Main lexer loop.
47    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
48        while state.not_at_end() {
49            let safe_point = state.get_position();
50
51            if self.skip_whitespace(state) {
52                continue;
53            }
54
55            if self.lex_comment(state) {
56                continue;
57            }
58
59            if self.lex_doctype(state) {
60                continue;
61            }
62
63            if self.lex_cdata(state) {
64                continue;
65            }
66
67            if self.lex_processing_instruction(state) {
68                continue;
69            }
70
71            if self.lex_tag_start(state) {
72                continue;
73            }
74
75            if self.lex_entity_reference(state) {
76                continue;
77            }
78
79            if self.lex_string_literal(state) {
80                continue;
81            }
82
83            if self.lex_identifier_or_tag_name(state) {
84                continue;
85            }
86
87            if self.lex_single_char_tokens(state) {
88                continue;
89            }
90
91            if self.lex_text(state) {
92                continue;
93            }
94
95            state.advance_if_dead_lock(safe_point);
96        }
97
98        Ok(())
99    }
100
    /// Delegates to the shared whitespace scanner, tagging matches as
    /// `Whitespace`; the scanner's return value signals whether input was consumed.
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_WHITESPACE.scan(state, XmlTokenType::Whitespace)
    }
104
    /// Delegates to the shared comment scanner (`<!-- ... -->`); `Comment` is
    /// passed for both token-type slots the scanner expects.
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_COMMENT.scan(state, XmlTokenType::Comment, XmlTokenType::Comment)
    }
108
109    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        let start_pos = state.get_position();
111
112        if let Some('<') = state.peek() {
113            if let Some('!') = state.peek_next_n(1) {
114                // Check for DOCTYPE keyword
115                let doctype_keyword = "DOCTYPE";
116                let mut matches = true;
117                for (i, expected_ch) in doctype_keyword.chars().enumerate() {
118                    if let Some(actual_ch) = state.peek_next_n(2 + i) {
119                        if actual_ch.to_ascii_uppercase() != expected_ch {
120                            matches = false;
121                            break;
122                        }
123                    }
124                    else {
125                        matches = false;
126                        break;
127                    }
128                }
129
130                if matches {
131                    state.advance(2 + doctype_keyword.len()); // Skip <!DOCTYPE
132
133                    let mut bracket_depth = 0;
134                    // Find DOCTYPE end
135                    while state.not_at_end() {
136                        match state.peek() {
137                            Some('[') => {
138                                bracket_depth += 1;
139                                state.advance(1);
140                            }
141                            Some(']') => {
142                                bracket_depth -= 1;
143                                state.advance(1);
144                            }
145                            Some('>') => {
146                                if bracket_depth == 0 {
147                                    state.advance(1); // Skip >
148                                    state.add_token(XmlTokenType::DoctypeDeclaration, start_pos, state.get_position());
149                                    return true;
150                                }
151                                else {
152                                    state.advance(1);
153                                }
154                            }
155                            Some(ch) => {
156                                state.advance(ch.len_utf8());
157                            }
158                            None => break,
159                        }
160                    }
161
162                    // Unclosed DOCTYPE
163                    state.add_token(XmlTokenType::Error, start_pos, state.get_position());
164                    return true;
165                }
166            }
167        }
168
169        false
170    }
171
172    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
173        let start_pos = state.get_position();
174
175        if let Some('<') = state.peek() {
176            if let Some('!') = state.peek_next_n(1) {
177                if let Some('[') = state.peek_next_n(2) {
178                    // Check CDATA start tag
179                    let cdata_start = "CDATA[";
180                    let mut matches = true;
181                    for (i, expected_ch) in cdata_start.chars().enumerate() {
182                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
183                            if actual_ch != expected_ch {
184                                matches = false;
185                                break;
186                            }
187                        }
188                        else {
189                            matches = false;
190                            break;
191                        }
192                    }
193
194                    if matches {
195                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
196
197                        // Find CDATA end ]]>
198                        while state.not_at_end() {
199                            if let Some(']') = state.peek() {
200                                if let Some(']') = state.peek_next_n(1) {
201                                    if let Some('>') = state.peek_next_n(2) {
202                                        state.advance(3); // Skip ]]>
203                                        state.add_token(XmlTokenType::CData, start_pos, state.get_position());
204                                        return true;
205                                    }
206                                }
207                            }
208                            if let Some(ch) = state.peek() {
209                                state.advance(ch.len_utf8());
210                            }
211                            else {
212                                break;
213                            }
214                        }
215
216                        // Unclosed CDATA
217                        state.add_token(XmlTokenType::Error, start_pos, state.get_position());
218                        return true;
219                    }
220                }
221            }
222        }
223
224        false
225    }
226
227    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
228        let start_pos = state.get_position();
229
230        if let Some('<') = state.peek() {
231            if let Some('?') = state.peek_next_n(1) {
232                state.advance(2); // Skip <?
233
234                // Find processing instruction end ?>
235                while state.not_at_end() {
236                    if let Some('?') = state.peek() {
237                        if let Some('>') = state.peek_next_n(1) {
238                            state.advance(2); // Skip ?>
239                            state.add_token(XmlTokenType::ProcessingInstruction, start_pos, state.get_position());
240                            return true;
241                        }
242                    }
243                    if let Some(ch) = state.peek() {
244                        state.advance(ch.len_utf8());
245                    }
246                    else {
247                        break;
248                    }
249                }
250
251                // Unclosed processing instruction
252                state.add_token(XmlTokenType::Error, start_pos, state.get_position());
253                return true;
254            }
255        }
256
257        false
258    }
259
260    fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
261        let start_pos = state.get_position();
262
263        match state.peek() {
264            Some('<') => {
265                state.advance(1);
266                if state.peek() == Some('/') {
267                    state.advance(1);
268                    state.add_token(XmlTokenType::LeftAngleSlash, start_pos, state.get_position());
269                }
270                else {
271                    state.add_token(XmlTokenType::LeftAngle, start_pos, state.get_position());
272                }
273                true
274            }
275            Some('/') => {
276                if state.peek_next_n(1) == Some('>') {
277                    state.advance(2);
278                    state.add_token(XmlTokenType::SlashRightAngle, start_pos, state.get_position());
279                    true
280                }
281                else {
282                    false
283                }
284            }
285            Some('>') => {
286                state.advance(1);
287                state.add_token(XmlTokenType::RightAngle, start_pos, state.get_position());
288                true
289            }
290            Some('=') => {
291                state.advance(1);
292                state.add_token(XmlTokenType::Equals, start_pos, state.get_position());
293                true
294            }
295            _ => false,
296        }
297    }
298
299    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
300        let start_pos = state.get_position();
301
302        if state.peek() == Some('&') {
303            state.advance(1);
304
305            // Check for character reference &#...;
306            if state.peek() == Some('#') {
307                state.advance(1);
308                let mut has_digits = false;
309
310                // Hexadecimal character reference &#x...;
311                if state.peek() == Some('x') {
312                    state.advance(1);
313                    while let Some(ch) = state.peek() {
314                        if ch.is_ascii_hexdigit() {
315                            state.advance(1);
316                            has_digits = true;
317                        }
318                        else {
319                            break;
320                        }
321                    }
322                }
323                else {
324                    // Decimal character reference &#...;
325                    while let Some(ch) = state.peek() {
326                        if ch.is_ascii_digit() {
327                            state.advance(1);
328                            has_digits = true;
329                        }
330                        else {
331                            break;
332                        }
333                    }
334                }
335
336                if has_digits && state.peek() == Some(';') {
337                    state.advance(1);
338                    state.add_token(XmlTokenType::CharacterReference, start_pos, state.get_position());
339                    return true;
340                }
341            }
342            else {
343                // Named entity reference &name;
344                let mut has_name = false;
345                while let Some(ch) = state.peek() {
346                    if ch.is_ascii_alphanumeric() {
347                        state.advance(1);
348                        has_name = true;
349                    }
350                    else {
351                        break;
352                    }
353                }
354
355                if has_name && state.peek() == Some(';') {
356                    state.advance(1);
357                    state.add_token(XmlTokenType::EntityReference, start_pos, state.get_position());
358                    return true;
359                }
360            }
361
362            // Invalid entity reference
363            state.add_token(XmlTokenType::Error, start_pos, state.get_position());
364            return true;
365        }
366
367        false
368    }
369
    /// Delegates to the shared string scanner (single- or double-quoted
    /// values), tagging matches as `StringLiteral`.
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_STRING.scan(state, XmlTokenType::StringLiteral)
    }
373
374    fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
375        let start_pos = state.get_position();
376
377        if let Some(ch) = state.peek() {
378            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
379                state.advance(ch.len_utf8());
380
381                while let Some(ch) = state.peek() {
382                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
383                        state.advance(ch.len_utf8());
384                    }
385                    else {
386                        break;
387                    }
388                }
389
390                state.add_token(XmlTokenType::Identifier, start_pos, state.get_position());
391                return true;
392            }
393        }
394
395        false
396    }
397
398    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
399        let start_pos = state.get_position();
400
401        match state.peek() {
402            Some('"') => {
403                state.advance(1);
404                state.add_token(XmlTokenType::Quote, start_pos, state.get_position());
405                true
406            }
407            Some('\'') => {
408                state.advance(1);
409                state.add_token(XmlTokenType::SingleQuote, start_pos, state.get_position());
410                true
411            }
412            Some('!') => {
413                state.advance(1);
414                state.add_token(XmlTokenType::Exclamation, start_pos, state.get_position());
415                true
416            }
417            Some('?') => {
418                state.advance(1);
419                state.add_token(XmlTokenType::Question, start_pos, state.get_position());
420                true
421            }
422            Some('&') => {
423                state.advance(1);
424                state.add_token(XmlTokenType::Ampersand, start_pos, state.get_position());
425                true
426            }
427            Some(';') => {
428                state.advance(1);
429                state.add_token(XmlTokenType::Semicolon, start_pos, state.get_position());
430                true
431            }
432            _ => false,
433        }
434    }
435
436    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
437        let start_pos = state.get_position();
438
439        while let Some(ch) = state.peek() {
440            // Stop at special characters
441            match ch {
442                ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
443                _ => {
444                    state.advance(ch.len_utf8());
445                }
446            }
447        }
448
449        if state.get_position() > start_pos {
450            state.add_token(XmlTokenType::Text, start_pos, state.get_position());
451            true
452        }
453        else {
454            false
455        }
456    }
457}