// oak_xml/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::XmlLanguage, lexer::token_type::XmlTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError, TextEdit,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8    source::Source,
9};
10use std::sync::LazyLock;
11
/// Convenience alias: the shared lexer state specialized to the XML language.
type State<'a, S> = LexerState<'a, S, XmlLanguage>;
13
// Static XML scanning configurations, built lazily on first use.

/// Whitespace scanner config: `unicode_whitespace: true` opts into treating
/// Unicode whitespace characters as whitespace, not just ASCII.
static XML_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
16
/// Comment scanner config for `<!-- ... -->` blocks; XML comments do not nest.
/// NOTE(review): `line_marker: ""` presumably means "no line comments" —
/// confirm that `CommentConfig` treats an empty marker as disabled.
static XML_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "<!--", block_end: "-->", nested_blocks: false });
18
/// String scanner config: values quoted with `"` or `'`, with no escape
/// character (`escape: None`).
static XML_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"', '\''], escape: None });
20
21impl<'config> Lexer<XmlLanguage> for XmlLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<XmlLanguage>) -> LexOutput<XmlLanguage> {
23        let mut state = LexerState::new(source);
24        let result = self.run(&mut state);
25        if result.is_ok() {
26            state.add_eof();
27        }
28        state.finish_with_cache(result, cache)
29    }
30}
31
/// XML lexer holding a borrow of the language configuration.
#[derive(Clone)]
pub struct XmlLexer<'config> {
    // Currently unused (hence the leading underscore); presumably reserved
    // for configuration-driven lexing in the future.
    _config: &'config XmlLanguage,
}
36
37impl<'config> XmlLexer<'config> {
38    pub fn new(config: &'config XmlLanguage) -> Self {
39        Self { _config: config }
40    }
41
42    /// 主要的词法分析循环
43    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
44        while state.not_at_end() {
45            let safe_point = state.get_position();
46
47            if self.skip_whitespace(state) {
48                continue;
49            }
50
51            if self.lex_comment(state) {
52                continue;
53            }
54
55            if self.lex_doctype(state) {
56                continue;
57            }
58
59            if self.lex_cdata(state) {
60                continue;
61            }
62
63            if self.lex_processing_instruction(state) {
64                continue;
65            }
66
67            if self.lex_tag_start(state) {
68                continue;
69            }
70
71            if self.lex_entity_reference(state) {
72                continue;
73            }
74
75            if self.lex_string_literal(state) {
76                continue;
77            }
78
79            if self.lex_identifier_or_tag_name(state) {
80                continue;
81            }
82
83            if self.lex_single_char_tokens(state) {
84                continue;
85            }
86
87            if self.lex_text(state) {
88                continue;
89            }
90
91            state.advance_if_dead_lock(safe_point);
92        }
93
94        Ok(())
95    }
96
97    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
98        XML_WHITESPACE.scan(state, XmlTokenType::Whitespace)
99    }
100
101    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102        XML_COMMENT.scan(state, XmlTokenType::Comment, XmlTokenType::Comment)
103    }
104
105    fn lex_doctype<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
106        let start_pos = state.get_position();
107
108        if let Some('<') = state.peek() {
109            if let Some('!') = state.peek_next_n(1) {
110                // Check for DOCTYPE keyword
111                let doctype_keyword = "DOCTYPE";
112                let mut matches = true;
113                for (i, expected_ch) in doctype_keyword.chars().enumerate() {
114                    if let Some(actual_ch) = state.peek_next_n(2 + i) {
115                        if actual_ch.to_ascii_uppercase() != expected_ch {
116                            matches = false;
117                            break;
118                        }
119                    }
120                    else {
121                        matches = false;
122                        break;
123                    }
124                }
125
126                if matches {
127                    state.advance(2 + doctype_keyword.len()); // Skip <!DOCTYPE
128
129                    let mut bracket_depth = 0;
130                    // Find DOCTYPE end
131                    while state.not_at_end() {
132                        match state.peek() {
133                            Some('[') => {
134                                bracket_depth += 1;
135                                state.advance(1);
136                            }
137                            Some(']') => {
138                                bracket_depth -= 1;
139                                state.advance(1);
140                            }
141                            Some('>') => {
142                                if bracket_depth == 0 {
143                                    state.advance(1); // Skip >
144                                    state.add_token(XmlTokenType::DoctypeDeclaration, start_pos, state.get_position());
145                                    return true;
146                                }
147                                else {
148                                    state.advance(1);
149                                }
150                            }
151                            Some(ch) => {
152                                state.advance(ch.len_utf8());
153                            }
154                            None => break,
155                        }
156                    }
157
158                    // Unclosed DOCTYPE
159                    state.add_token(XmlTokenType::Error, start_pos, state.get_position());
160                    return true;
161                }
162            }
163        }
164
165        false
166    }
167
168    fn lex_cdata<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
169        let start_pos = state.get_position();
170
171        if let Some('<') = state.peek() {
172            if let Some('!') = state.peek_next_n(1) {
173                if let Some('[') = state.peek_next_n(2) {
174                    // Check CDATA start tag
175                    let cdata_start = "CDATA[";
176                    let mut matches = true;
177                    for (i, expected_ch) in cdata_start.chars().enumerate() {
178                        if let Some(actual_ch) = state.peek_next_n(3 + i) {
179                            if actual_ch != expected_ch {
180                                matches = false;
181                                break;
182                            }
183                        }
184                        else {
185                            matches = false;
186                            break;
187                        }
188                    }
189
190                    if matches {
191                        state.advance(3 + cdata_start.len()); // Skip <![CDATA[
192
193                        // Find CDATA end ]]>
194                        while state.not_at_end() {
195                            if let Some(']') = state.peek() {
196                                if let Some(']') = state.peek_next_n(1) {
197                                    if let Some('>') = state.peek_next_n(2) {
198                                        state.advance(3); // Skip ]]>
199                                        state.add_token(XmlTokenType::CData, start_pos, state.get_position());
200                                        return true;
201                                    }
202                                }
203                            }
204                            if let Some(ch) = state.peek() {
205                                state.advance(ch.len_utf8());
206                            }
207                            else {
208                                break;
209                            }
210                        }
211
212                        // Unclosed CDATA
213                        state.add_token(XmlTokenType::Error, start_pos, state.get_position());
214                        return true;
215                    }
216                }
217            }
218        }
219
220        false
221    }
222
223    fn lex_processing_instruction<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
224        let start_pos = state.get_position();
225
226        if let Some('<') = state.peek() {
227            if let Some('?') = state.peek_next_n(1) {
228                state.advance(2); // Skip <?
229
230                // Find processing instruction end ?>
231                while state.not_at_end() {
232                    if let Some('?') = state.peek() {
233                        if let Some('>') = state.peek_next_n(1) {
234                            state.advance(2); // Skip ?>
235                            state.add_token(XmlTokenType::ProcessingInstruction, start_pos, state.get_position());
236                            return true;
237                        }
238                    }
239                    if let Some(ch) = state.peek() {
240                        state.advance(ch.len_utf8());
241                    }
242                    else {
243                        break;
244                    }
245                }
246
247                // Unclosed processing instruction
248                state.add_token(XmlTokenType::Error, start_pos, state.get_position());
249                return true;
250            }
251        }
252
253        false
254    }
255
256    fn lex_tag_start<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
257        let start_pos = state.get_position();
258
259        match state.peek() {
260            Some('<') => {
261                state.advance(1);
262                if state.peek() == Some('/') {
263                    state.advance(1);
264                    state.add_token(XmlTokenType::LeftAngleSlash, start_pos, state.get_position());
265                }
266                else {
267                    state.add_token(XmlTokenType::LeftAngle, start_pos, state.get_position());
268                }
269                true
270            }
271            Some('/') => {
272                if state.peek_next_n(1) == Some('>') {
273                    state.advance(2);
274                    state.add_token(XmlTokenType::SlashRightAngle, start_pos, state.get_position());
275                    true
276                }
277                else {
278                    false
279                }
280            }
281            Some('>') => {
282                state.advance(1);
283                state.add_token(XmlTokenType::RightAngle, start_pos, state.get_position());
284                true
285            }
286            Some('=') => {
287                state.advance(1);
288                state.add_token(XmlTokenType::Equals, start_pos, state.get_position());
289                true
290            }
291            _ => false,
292        }
293    }
294
295    fn lex_entity_reference<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
296        let start_pos = state.get_position();
297
298        if state.peek() == Some('&') {
299            state.advance(1);
300
301            // Check for character reference &#...;
302            if state.peek() == Some('#') {
303                state.advance(1);
304                let mut has_digits = false;
305
306                // Hexadecimal character reference &#x...;
307                if state.peek() == Some('x') {
308                    state.advance(1);
309                    while let Some(ch) = state.peek() {
310                        if ch.is_ascii_hexdigit() {
311                            state.advance(1);
312                            has_digits = true;
313                        }
314                        else {
315                            break;
316                        }
317                    }
318                }
319                else {
320                    // Decimal character reference &#...;
321                    while let Some(ch) = state.peek() {
322                        if ch.is_ascii_digit() {
323                            state.advance(1);
324                            has_digits = true;
325                        }
326                        else {
327                            break;
328                        }
329                    }
330                }
331
332                if has_digits && state.peek() == Some(';') {
333                    state.advance(1);
334                    state.add_token(XmlTokenType::CharacterReference, start_pos, state.get_position());
335                    return true;
336                }
337            }
338            else {
339                // Named entity reference &name;
340                let mut has_name = false;
341                while let Some(ch) = state.peek() {
342                    if ch.is_ascii_alphanumeric() {
343                        state.advance(1);
344                        has_name = true;
345                    }
346                    else {
347                        break;
348                    }
349                }
350
351                if has_name && state.peek() == Some(';') {
352                    state.advance(1);
353                    state.add_token(XmlTokenType::EntityReference, start_pos, state.get_position());
354                    return true;
355                }
356            }
357
358            // Invalid entity reference
359            state.add_token(XmlTokenType::Error, start_pos, state.get_position());
360            return true;
361        }
362
363        false
364    }
365
366    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
367        XML_STRING.scan(state, XmlTokenType::StringLiteral)
368    }
369
370    fn lex_identifier_or_tag_name<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
371        let start_pos = state.get_position();
372
373        if let Some(ch) = state.peek() {
374            if ch.is_ascii_alphabetic() || ch == '_' || ch == ':' {
375                state.advance(ch.len_utf8());
376
377                while let Some(ch) = state.peek() {
378                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' || ch == '.' || ch == ':' {
379                        state.advance(ch.len_utf8());
380                    }
381                    else {
382                        break;
383                    }
384                }
385
386                state.add_token(XmlTokenType::Identifier, start_pos, state.get_position());
387                return true;
388            }
389        }
390
391        false
392    }
393
394    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
395        let start_pos = state.get_position();
396
397        match state.peek() {
398            Some('"') => {
399                state.advance(1);
400                state.add_token(XmlTokenType::Quote, start_pos, state.get_position());
401                true
402            }
403            Some('\'') => {
404                state.advance(1);
405                state.add_token(XmlTokenType::SingleQuote, start_pos, state.get_position());
406                true
407            }
408            Some('!') => {
409                state.advance(1);
410                state.add_token(XmlTokenType::Exclamation, start_pos, state.get_position());
411                true
412            }
413            Some('?') => {
414                state.advance(1);
415                state.add_token(XmlTokenType::Question, start_pos, state.get_position());
416                true
417            }
418            Some('&') => {
419                state.advance(1);
420                state.add_token(XmlTokenType::Ampersand, start_pos, state.get_position());
421                true
422            }
423            Some(';') => {
424                state.advance(1);
425                state.add_token(XmlTokenType::Semicolon, start_pos, state.get_position());
426                true
427            }
428            _ => false,
429        }
430    }
431
432    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
433        let start_pos = state.get_position();
434
435        while let Some(ch) = state.peek() {
436            // Stop at special characters
437            match ch {
438                ' ' | '\t' | '\n' | '\r' | '<' | '>' | '=' | '"' | '\'' | '!' | '?' | '&' | ';' => break,
439                _ => {
440                    state.advance(ch.len_utf8());
441                }
442            }
443        }
444
445        if state.get_position() > start_pos {
446            state.add_token(XmlTokenType::Text, start_pos, state.get_position());
447            true
448        }
449        else {
450            false
451        }
452    }
453}