oak_xml/lexer/
mod.rs

1use crate::{kind::XmlSyntaxKind, language::XmlLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentBlock, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialized for the XML language.
type State<S> = LexerState<S, XmlLanguage>;

// Static, lazily-initialized scanning configuration shared by all lexer instances.

/// Whitespace scanning: treat Unicode whitespace (not just ASCII) as whitespace.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });

/// Comment scanning: XML comments are delimited by `<!--` / `-->` and do not nest.
static XML_COMMENT: LazyLock<CommentBlock> =
    LazyLock::new(|| CommentBlock { block_markers: &[("<!--", "-->")], nested_blocks: false });

/// String scanning: values quoted with `"` or `'`; XML has no backslash-style escape.
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
18
impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
    /// Re-lexes `source` after an edit at offset `changed`.
    ///
    /// Builds a lexer state seeded with `cache`, runs the full scan loop, and
    /// folds the result (tokens plus any error from `run`) into a `LexOutput`.
    /// NOTE(review): reuse of cached tokens presumably happens inside
    /// `LexerState::new_with_cache` / `finish` — confirm in oak_core.
    fn lex_incremental(
        &self,
        source: impl Source,
        changed: usize,
        cache: IncrementalCache<XmlLanguage>,
    ) -> LexOutput<XmlLanguage> {
        let mut state = LexerState::new_with_cache(source, changed, cache);
        let result = self.run(&mut state);
        state.finish(result)
    }
}
31
/// Tokenizer for XML documents.
///
/// Holds a borrowed reference to the language configuration, so cloning is cheap.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    // Language configuration; not read by any scanning method visible in this file.
    config: &'config XmlLanguage,
}
36
37impl<'config> XmlLexer<'config> {
38    pub fn new(config: &'config XmlLanguage) -> Self {
39        Self { config }
40    }
41
42    /// 主要的词法分析循环
43    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
44        while state.not_at_end() {
45            let safe_point = state.get_position();
46
47            if self.skip_whitespace(state) {
48                continue;
49            }
50
51            if self.lex_comment(state) {
52                continue;
53            }
54
55            if self.lex_doctype(state) {
56                continue;
57            }
58
59            if self.lex_cdata(state) {
60                continue;
61            }
62
63            if self.lex_processing_instruction(state) {
64                continue;
65            }
66
67            if self.lex_tag_operators(state) {
68                continue;
69            }
70
71            if self.lex_entity_reference(state) {
72                continue;
73            }
74
75            if self.lex_string_literal(state) {
76                continue;
77            }
78
79            if self.lex_identifier(state) {
80                continue;
81            }
82
83            if self.lex_single_char_tokens(state) {
84                continue;
85            }
86
87            if self.lex_text(state) {
88                continue;
89            }
90
91            state.safe_check(safe_point);
92        }
93
94        // 添加 EOF token
95        let eof_pos = state.get_position();
96        state.add_token(XmlSyntaxKind::Eof, eof_pos, eof_pos);
97        Ok(())
98    }
99
100    /// 跳过空白字符
101    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
102        match XML_WHITESPACE.scan(state.rest(), state.get_position(), XmlSyntaxKind::Whitespace) {
103            Some(token) => {
104                state.advance_with(token);
105                true
106            }
107            None => false,
108        }
109    }
110
111    /// 解析XML注释 <!-- ... -->
112    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
113        match XML_COMMENT.scan(state.rest(), state.get_position(), XmlSyntaxKind::Comment) {
114            Some(token) => {
115                state.advance_with(token);
116                true
117            }
118            None => false,
119        }
120    }
121
    /// Scans a `<!DOCTYPE ...>` declaration.
    ///
    /// The keyword is matched case-insensitively (so `<!doctype` is also
    /// accepted). `[`/`]` nesting is tracked so that a `>` inside an internal
    /// subset does not terminate the declaration. Emits `DoctypeDeclaration`
    /// on success, `Error` when the declaration is unterminated, and returns
    /// `false` without consuming input when the lookahead is not a DOCTYPE.
    fn lex_doctype<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                // Check for DOCTYPE keyword (offset 2 skips the "<!" prefix).
                let doctype_keyword = "DOCTYPE";
                let mut matches = true;
                for (i, expected_ch) in doctype_keyword.chars().enumerate() {
                    if let Some(actual_ch) = state.peek_next_n(2 + i) {
                        // Uppercase the source char so the comparison is case-insensitive.
                        if actual_ch.to_ascii_uppercase() != expected_ch {
                            matches = false;
                            break;
                        }
                    }
                    else {
                        // Source ended mid-keyword.
                        matches = false;
                        break;
                    }
                }

                if matches {
                    state.advance(2 + doctype_keyword.len()); // Skip <!DOCTYPE

                    let mut bracket_depth = 0;
                    // Find DOCTYPE end: the `>` that is outside any [...] internal subset.
                    // NOTE(review): a stray ']' drives bracket_depth negative, so a later
                    // '>' will not close the declaration — confirm this is intended.
                    while state.not_at_end() {
                        match state.peek() {
                            Some('[') => {
                                bracket_depth += 1;
                                state.advance(1);
                            }
                            Some(']') => {
                                bracket_depth -= 1;
                                state.advance(1);
                            }
                            Some('>') => {
                                if bracket_depth == 0 {
                                    state.advance(1); // Skip >
                                    state.add_token(XmlSyntaxKind::DoctypeDeclaration, start_pos, state.get_position());
                                    return true;
                                }
                                else {
                                    state.advance(1);
                                }
                            }
                            Some(ch) => {
                                // Any other character is part of the declaration body.
                                state.advance(ch.len_utf8());
                            }
                            None => break,
                        }
                    }

                    // Unclosed DOCTYPE: emit an Error token covering what was consumed.
                    state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
                    return true;
                }
            }
        }

        false
    }
184
    /// Scans a CDATA section `<![CDATA[ ... ]]>`.
    ///
    /// The `CDATA[` keyword is matched case-sensitively, per the XML spec.
    /// Emits `CData` on success, `Error` when the `]]>` terminator is missing,
    /// and returns `false` without consuming input otherwise.
    fn lex_cdata<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('!') = state.peek_next_n(1) {
                if let Some('[') = state.peek_next_n(2) {
                    // Check CDATA start tag (offset 3 skips the "<![" prefix).
                    let cdata_start = "CDATA[";
                    let mut matches = true;
                    for (i, expected_ch) in cdata_start.chars().enumerate() {
                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
                            if actual_ch != expected_ch {
                                matches = false;
                                break;
                            }
                        }
                        else {
                            // Source ended mid-keyword.
                            matches = false;
                            break;
                        }
                    }

                    if matches {
                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[

                        // Find CDATA end ]]>
                        while state.not_at_end() {
                            if let Some(']') = state.peek() {
                                if let Some(']') = state.peek_next_n(1) {
                                    if let Some('>') = state.peek_next_n(2) {
                                        state.advance(3); // Skip ]]>
                                        state.add_token(XmlSyntaxKind::CData, start_pos, state.get_position());
                                        return true;
                                    }
                                }
                            }
                            // Not at the terminator: consume one character of content.
                            if let Some(ch) = state.peek() {
                                state.advance(ch.len_utf8());
                            }
                            else {
                                break;
                            }
                        }

                        // Unclosed CDATA: emit an Error token covering what was consumed.
                        state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
                        return true;
                    }
                }
            }
        }

        false
    }
239
240    fn lex_processing_instruction<S: Source>(&self, state: &mut State<S>) -> bool {
241        let start_pos = state.get_position();
242
243        if let Some('<') = state.peek() {
244            if let Some('?') = state.peek_next_n(1) {
245                state.advance(2); // Skip <?
246
247                // Find processing instruction end ?>
248                while state.not_at_end() {
249                    if let Some('?') = state.peek() {
250                        if let Some('>') = state.peek_next_n(1) {
251                            state.advance(2); // Skip ?>
252                            state.add_token(XmlSyntaxKind::ProcessingInstruction, start_pos, state.get_position());
253                            return true;
254                        }
255                    }
256                    if let Some(ch) = state.peek() {
257                        state.advance(ch.len_utf8());
258                    }
259                    else {
260                        break;
261                    }
262                }
263
264                // Unclosed processing instruction
265                state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
266                return true;
267            }
268        }
269
270        false
271    }
272
273    fn lex_tag_operators<S: Source>(&self, state: &mut State<S>) -> bool {
274        let start_pos = state.get_position();
275
276        match state.peek() {
277            Some('<') => {
278                state.advance(1);
279                if state.peek() == Some('/') {
280                    state.advance(1);
281                    state.add_token(XmlSyntaxKind::LeftAngleSlash, start_pos, state.get_position());
282                }
283                else {
284                    state.add_token(XmlSyntaxKind::LeftAngle, start_pos, state.get_position());
285                }
286                true
287            }
288            Some('/') => {
289                if state.peek_next_n(1) == Some('>') {
290                    state.advance(2);
291                    state.add_token(XmlSyntaxKind::SlashRightAngle, start_pos, state.get_position());
292                    true
293                }
294                else {
295                    false
296                }
297            }
298            Some('>') => {
299                state.advance(1);
300                state.add_token(XmlSyntaxKind::RightAngle, start_pos, state.get_position());
301                true
302            }
303            Some('=') => {
304                state.advance(1);
305                state.add_token(XmlSyntaxKind::Equals, start_pos, state.get_position());
306                true
307            }
308            _ => false,
309        }
310    }
311
    /// Scans an entity or character reference starting with `&`.
    ///
    /// Recognizes decimal (`&#65;`) and hexadecimal (`&#x41;`) character
    /// references and named entity references (`&amp;`). Anything that starts
    /// with `&` but does not form a complete reference — including a missing
    /// trailing `;` — is emitted as an `Error` token covering everything
    /// consumed so far. Returns `false` only when the lookahead is not `&`.
    fn lex_entity_reference<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if state.peek() == Some('&') {
            state.advance(1);

            // Check for character reference &#...;
            if state.peek() == Some('#') {
                state.advance(1);
                let mut has_digits = false;

                // Hexadecimal character reference &#x...;
                // (lowercase 'x' only, matching the XML CharRef production)
                if state.peek() == Some('x') {
                    state.advance(1);
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                }
                else {
                    // Decimal character reference &#...;
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_digit() {
                            state.advance(1);
                            has_digits = true;
                        }
                        else {
                            break;
                        }
                    }
                }

                // Valid only if at least one digit was seen and ';' closes it.
                if has_digits && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(XmlSyntaxKind::CharacterReference, start_pos, state.get_position());
                    return true;
                }
            }
            else {
                // Named entity reference &name;
                // NOTE(review): this accepts names starting with a digit and
                // rejects '_'/':'/'-'/'.' in names, which is looser/stricter
                // than the XML Name production — confirm intended.
                let mut has_name = false;
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_alphanumeric() {
                        state.advance(1);
                        has_name = true;
                    }
                    else {
                        break;
                    }
                }

                if has_name && state.peek() == Some(';') {
                    state.advance(1);
                    state.add_token(XmlSyntaxKind::EntityReference, start_pos, state.get_position());
                    return true;
                }
            }

            // Invalid entity reference: everything consumed becomes an Error token.
            state.add_token(XmlSyntaxKind::Error, start_pos, state.get_position());
            return true;
        }

        false
    }
382
383    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
384        match XML_STRING.scan(state.rest(), 0, XmlSyntaxKind::StringLiteral) {
385            Some(mut token) => {
386                // Adjust token span to absolute position
387                token.span.start += state.get_position();
388                token.span.end += state.get_position();
389                state.advance_with(token);
390                true
391            }
392            None => false,
393        }
394    }
395
396    fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
397        let start_pos = state.get_position();
398
399        if let Some(ch) = state.peek() {
400            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
401                state.advance(ch.len_utf8());
402
403                while let Some(ch) = state.peek() {
404                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
405                        state.advance(ch.len_utf8());
406                    }
407                    else {
408                        break;
409                    }
410                }
411
412                state.add_token(XmlSyntaxKind::Identifier, start_pos, state.get_position());
413                return true;
414            }
415        }
416
417        false
418    }
419
420    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
421        let start_pos = state.get_position();
422
423        match state.peek() {
424            Some('"') => {
425                state.advance(1);
426                state.add_token(XmlSyntaxKind::Quote, start_pos, state.get_position());
427                true
428            }
429            Some('\'') => {
430                state.advance(1);
431                state.add_token(XmlSyntaxKind::SingleQuote, start_pos, state.get_position());
432                true
433            }
434            Some('!') => {
435                state.advance(1);
436                state.add_token(XmlSyntaxKind::Exclamation, start_pos, state.get_position());
437                true
438            }
439            Some('?') => {
440                state.advance(1);
441                state.add_token(XmlSyntaxKind::Question, start_pos, state.get_position());
442                true
443            }
444            Some('&') => {
445                state.advance(1);
446                state.add_token(XmlSyntaxKind::Ampersand, start_pos, state.get_position());
447                true
448            }
449            Some(';') => {
450                state.advance(1);
451                state.add_token(XmlSyntaxKind::Semicolon, start_pos, state.get_position());
452                true
453            }
454            _ => false,
455        }
456    }
457
458    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
459        let start_pos = state.get_position();
460
461        while let Some(ch) = state.peek() {
462            // Stop at special characters
463            match ch {
464                ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
465                _ => {
466                    state.advance(ch.len_utf8());
467                }
468            }
469        }
470
471        if state.get_position() > start_pos {
472            state.add_token(XmlSyntaxKind::Text, start_pos, state.get_position());
473            true
474        }
475        else {
476            false
477        }
478    }
479}