oak_xml/lexer/mod.rs

use crate::{kind::XmlSyntaxKind, language::XmlLanguage};
use oak_core::{
    Lexer, LexerCache, LexerState, OakError,
    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
    source::Source,
};
use std::sync::LazyLock;

type State<'a, S> = LexerState<'a, S, XmlLanguage>;

// Static XML lexer configuration (whitespace, comments, strings).
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });

static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });

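// `Lexer` trait entry point. The incremental `_edits` are currently ignored:
// the whole source is relexed and the result is finalized through the provided cache.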
impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
        let mut state = LexerState::new(source);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}

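/// XML lexer that borrows a shared [`XmlLanguage`] configuration.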
#[derive(Clone)]
pub struct XmlLexer<'config> {
    _config: &'config XmlLanguage,
}

impl<'config> XmlLexer<'config> {
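    /// Creates a lexer that borrows the given language configuration.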
    pub fn new(config: &'config XmlLanguage) -> Self {
        Self { _config: config }
    }

    /// Main lexing loop: tries each sub-lexer in order until one consumes input.
    /// Constructs that start with `<` (comments, DOCTYPE, CDATA, processing
    /// instructions) are checked before generic tag punctuation, and a dead-lock
    /// guard advances past any character that no sub-lexer accepts.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            let safe_point = state.get_position();

            if self.skip_whitespace(state) {
                continue;
            }

            if self.lex_comment(state) {
                continue;
            }

            if self.lex_doctype(state) {
                continue;
            }

            if self.lex_cdata(state) {
                continue;
            }

            if self.lex_processing_instruction(state) {
                continue;
            }

            if self.lex_tag_start(state) {
                continue;
            }

            if self.lex_entity_reference(state) {
                continue;
            }

            if self.lex_string_literal(state) {
                continue;
            }

            if self.lex_identifier_or_tag_name(state) {
                continue;
            }

            if self.lex_single_char_tokens(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            state.advance_if_dead_lock(safe_point);
        }

        Ok(())
    }

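    /// Skips a run of whitespace (including Unicode whitespace) and emits a
    /// `Whitespace` token; returns `true` if anything was consumed.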
    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_WHITESPACE.scan(state, XmlSyntaxKind::Whitespace)
    }

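    /// Lexes an XML comment delimited by `<!--` and `-->` using the shared
    /// comment configuration.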
    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_COMMENT.scan(state, XmlSyntaxKind::Comment, XmlSyntaxKind::Comment)
    }

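    /// Lexes a `<!DOCTYPE ...>` declaration, tracking `[` / `]` nesting so an
    /// internal subset does not terminate the declaration early. Emits an
    /// `Error` token if the declaration is unclosed.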
    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                // Check for DOCTYPE keyword
                let doctype_keyword = "DOCTYPE";
                let mut matches = true;
                for (i, expected_ch) in doctype_keyword.chars().enumerate() {
                    if let Some(actual_ch) = state.peek_next_n(2 + i) {
                        if actual_ch.to_ascii_uppercase() != expected_ch {
                            matches = false;
                            break;
                        }
                    }
                    else {
                        matches = false;
                        break;
                    }
                }

                if matches {
                    state.advance(2 + doctype_keyword.len()); // Skip <!DOCTYPE

                    let mut bracket_depth = 0;
                    // Find DOCTYPE end
                    while state.not_at_end() {
                        match state.peek() {
                            Some('[') => {
                                bracket_depth += 1;
                                state.advance(1);
                            }
                            Some(']') => {
                                bracket_depth -= 1;
                                state.advance(1);
                            }
                            Some('>') => {
                                if bracket_depth == 0 {
                                    state.advance(1); // Skip >
                                    state.add_token(XmlSyntaxKind::DoctypeDeclaration, start_pos, state.get_position());
                                    return true;
                                }
                                else {
                                    state.advance(1);
                                }
                            }
                            Some(ch) => {
                                state.advance(ch.len_utf8());
                            }
                            None => break,
                        }
                    }

                    // Unclosed DOCTYPE
                    state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
                    return true;
                }
            }
        }

        false
    }

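    /// Lexes a `<![CDATA[ ... ]]>` section. Emits an `Error` token if the
    /// section is not terminated by `]]>`.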
    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('[') = state.peek_next_n(2) {
                    // Check CDATA start tag
                    let cdata_start = "CDATA[";
                    let mut matches = true;
                    for (i, expected_ch) in cdata_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
                            if actual_ch != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[

                        // Find CDATA end ]]>
                        while state.not_at_end() {
                            if let Some(']') = state.peek() {
                                if let Some(']') = state.peek_next_n(1) {
                                    if let Some('>') = state.peek_next_n(2) {
                                        state.advance(3); // Skip ]]>
                                        state.add_token(XmlSyntaxKind::CData, start_pos, state.get_position());
                                        return true;
                                    }
                                }
                            }
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // Unclosed CDATA
                        state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }

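    /// Lexes a processing instruction `<? ... ?>`, including the XML
    /// declaration. Emits an `Error` token if the closing `?>` is missing.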
    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('?') = state.peek_next_n(1) {
                state.advance(2); // Skip <?

                // Find processing instruction end ?>
                while state.not_at_end() {
                    if let Some('?') = state.peek() {
                        if let Some('>') = state.peek_next_n(1) {
                            state.advance(2); // Skip ?>
                            state.add_token(XmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
                            return true;
                        }
                    }
                    if let Some(ch) = state.peek() {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                // Unclosed processing instruction
                state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

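    /// Lexes tag punctuation: `<`, `</`, `/>`, `>` and `=`.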
    fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        match state.peek() {
            Some('<') => {
                state.advance(1);
                if state.peek() == Some('/') {
                    state.advance(1);
                    state.add_token(XmlSyntaxKind::LeftAngleSlash, start_pos, state.get_position());
                }
                else {
                    state.add_token(XmlSyntaxKind::LeftAngle, start_pos, state.get_position());
                }
                true
            }
            Some('/') => {
                if state.peek_next_n(1) == Some('>') {
                    state.advance(2);
                    state.add_token(XmlSyntaxKind::SlashRightAngle, start_pos, state.get_position());
                    true
                }
                else {
                    false
                }
            }
            Some('>') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::RightAngle, start_pos, state.get_position());
                true
            }
            Some('=') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Equals, start_pos, state.get_position());
                true
            }
            _ => false,
        }
    }

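    /// Lexes a named entity reference (`&name;`) or a character reference
    /// (decimal `&#...;` or hexadecimal `&#x...;`). An `&` that does not form a
    /// valid reference is emitted as an `Error` token.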
    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if state.peek() == Some('&') {
            state.advance(1);

            // Check for character reference &#...;
            if state.peek() == Some('#') {
                state.advance(1);
                let mut has_digits = false;

                // Hexadecimal character reference &#x...;
                if state.peek() == Some('x') {
                    state.advance(1);
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                }
                else {
                    // Decimal character reference &#...;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_digit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                }

                if has_digits && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(XmlSyntaxKind::CharacterReference, start_pos, state.get_position());
                    return true;
                }
            }
            else {
                // Named entity reference &name;
                let mut has_name = false;
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() {
                        state.advance(1);
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }

                if has_name && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(XmlSyntaxKind::EntityReference, start_pos, state.get_position());
                    return true;
                }
            }

            // Invalid entity reference
            state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
            return true;
        }

        false
    }

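    /// Lexes a single- or double-quoted attribute value using the shared
    /// string configuration (no escape sequences).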
    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        XML_STRING.scan(state, XmlSyntaxKind::StringLiteral)
    }

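    /// Lexes a tag or attribute name: starts with an ASCII letter, `_` or `:`,
    /// then continues with ASCII letters, digits, `_`, `-`, `.` or `:`.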
    fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
                state.advance(ch.len_utf8());

                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(XmlSyntaxKind::Identifier, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

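    /// Lexes single-character punctuation not handled by the earlier sub-lexers:
    /// quotes, `!`, `?`, `&` and `;`.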
    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        match state.peek() {
            Some('"') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Quote, start_pos, state.get_position());
                true
            }
            Some('\'') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::SingleQuote, start_pos, state.get_position());
                true
            }
            Some('!') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Exclamation, start_pos, state.get_position());
                true
            }
            Some('?') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Question, start_pos, state.get_position());
                true
            }
            Some('&') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Ampersand, start_pos, state.get_position());
                true
            }
            Some(';') => {
                state.advance(1);
                state.add_token(XmlSyntaxKind::Semicolon, start_pos, state.get_position());
                true
            }
            _ => false,
        }
    }

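    /// Lexes a run of plain text content, stopping at whitespace and the markup
    /// characters handled by the other sub-lexers; returns `false` if nothing
    /// was consumed.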
    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            // Stop at special characters
            match ch {
                ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
                _ => {
                    state.advance(ch.len_utf8());
                }
            }
        }

        if state.get_position() > start_pos {
            state.add_token(XmlSyntaxKind::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}