Skip to main content

oak_yaml/lexer/
mod.rs

1use crate::{kind::YamlSyntaxKind, language::YamlLanguage};
2use oak_core::{
3    Lexer, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, LexerCache, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7
8static YAML_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: false };
9
10static YAML_COMMENT: CommentConfig = CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false };
11
12static YAML_STRING: StringConfig = StringConfig { quotes: &['"'], escape: Some('\\') };
13
14type State<'s, S> = LexerState<'s, S, YamlLanguage>;
15
16#[derive(Clone)]
17pub struct YamlLexer<'config> {
18    _config: &'config YamlLanguage,
19}
20
21impl<'config> YamlLexer<'config> {
22    pub fn new(config: &'config YamlLanguage) -> Self {
23        Self { _config: config }
24    }
25
26    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
27        while state.not_at_end() {
28            let safe_point = state.get_position();
29
30            if let Some(ch) = state.peek() {
31                match ch {
32                    ' ' | '\t' => {
33                        self.lex_whitespace(state);
34                    }
35                    '#' => {
36                        self.lex_comment(state);
37                    }
38                    '\n' | '\r' => {
39                        self.lex_newline(state);
40                    }
41                    '"' => {
42                        self.lex_string_literal(state)?;
43                    }
44                    '0'..='9' | '+' => {
45                        if self.lex_number_literal(state)? {
46                            continue;
47                        }
48                        if self.lex_single_char_tokens(state) {
49                            continue;
50                        }
51                    }
52                    '-' => {
53                        // Could be number, document start (---), or dash
54                        if self.lex_number_literal(state)? {
55                            continue;
56                        }
57                        if self.lex_multi_char_operators(state) {
58                            continue;
59                        }
60                        if self.lex_single_char_tokens(state) {
61                            continue;
62                        }
63                    }
64                    '.' => {
65                        // Could be document end (...)
66                        if self.lex_multi_char_operators(state) {
67                            continue;
68                        }
69                        // Fallback to error/unknown if not handled
70                        if self.lex_single_char_tokens(state) {
71                            continue;
72                        }
73                        // If we reach here, we have an unexpected character (handled below)
74                        state.advance(ch.len_utf8());
75                        state.add_token(YamlSyntaxKind::Error, safe_point, state.get_position());
76                    }
77                    'a'..='z' | 'A'..='Z' | '_' => {
78                        self.lex_identifier_or_keyword(state)?;
79                    }
80                    _ => {
81                        if self.lex_single_char_tokens(state) {
82                            continue;
83                        }
84
85                        // If we reach here, we have an unexpected character
86                        state.advance(ch.len_utf8());
87                        state.add_token(YamlSyntaxKind::Error, safe_point, state.get_position());
88                    }
89                }
90            }
91
92            state.advance_if_dead_lock(safe_point);
93        }
94
95        state.add_eof();
96        Ok(())
97    }
98}
99
100impl<'config> Lexer<YamlLanguage> for YamlLexer<'config> {
101    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<YamlLanguage>) -> LexOutput<YamlLanguage> {
102        let mut state = State::new_with_cache(source, 0, cache);
103        let result = self.run(&mut state);
104        state.finish_with_cache(result, cache)
105    }
106}
107
108impl YamlLexer<'_> {
109    fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
110        YAML_WHITESPACE.scan(state, YamlSyntaxKind::Whitespace)
111    }
112
113    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
114        YAML_COMMENT.scan(state, YamlSyntaxKind::Comment, YamlSyntaxKind::Comment)
115    }
116
117    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
118        if let Some(ch) = state.current() {
119            if ch == '\n' {
120                let start = state.get_position();
121                state.advance(1);
122                state.add_token(YamlSyntaxKind::Newline, start, state.get_position());
123                return true;
124            }
125            else if ch == '\r' {
126                let start = state.get_position();
127                state.advance(1);
128                if state.current() == Some('\n') {
129                    state.advance(1);
130                }
131                state.add_token(YamlSyntaxKind::Newline, start, state.get_position());
132                return true;
133            }
134        }
135        false
136    }
137
138    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
139        Ok(YAML_STRING.scan(state, YamlSyntaxKind::StringLiteral))
140    }
141
142    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
143        let start = state.get_position();
144
145        if let Some(ch) = state.peek() {
146            if ch.is_ascii_digit() || (ch == '-' || ch == '+') {
147                if ch == '-' || ch == '+' {
148                    state.advance(1);
149                    if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
150                        // Not a number, backtrack
151                        state.set_position(start);
152                        return Ok(false);
153                    }
154                }
155
156                // Integer part
157                while let Some(ch) = state.peek() {
158                    if ch.is_ascii_digit() || ch == '_' {
159                        state.advance(ch.len_utf8());
160                    }
161                    else {
162                        break;
163                    }
164                }
165
166                // Decimal part
167                if state.peek() == Some('.') {
168                    state.advance(1);
169                    while let Some(ch) = state.peek() {
170                        if ch.is_ascii_digit() || ch == '_' {
171                            state.advance(ch.len_utf8());
172                        }
173                        else {
174                            break;
175                        }
176                    }
177                }
178
179                // Exponent part
180                if state.peek() == Some('e') || state.peek() == Some('E') {
181                    state.advance(1);
182                    if state.peek() == Some('+') || state.peek() == Some('-') {
183                        state.advance(1);
184                    }
185                    while let Some(ch) = state.peek() {
186                        if ch.is_ascii_digit() || ch == '_' {
187                            state.advance(ch.len_utf8());
188                        }
189                        else {
190                            break;
191                        }
192                    }
193                }
194
195                state.add_token(YamlSyntaxKind::NumberLiteral, start, state.get_position());
196                Ok(true)
197            }
198            else {
199                Ok(false)
200            }
201        }
202        else {
203            Ok(false)
204        }
205    }
206
207    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
208        let start = state.get_position();
209
210        if let Some(ch) = state.peek() {
211            if ch.is_alphabetic() || ch == '_' {
212                state.advance(ch.len_utf8());
213
214                while let Some(ch) = state.peek() {
215                    if ch.is_alphanumeric() || ch == '_' || ch == '-' {
216                        state.advance(ch.len_utf8());
217                    }
218                    else {
219                        break;
220                    }
221                }
222
223                let end = state.get_position();
224                let text = state.source().get_text_in((start..end).into());
225                let kind = self.keyword_kind(text.as_ref()).unwrap_or(YamlSyntaxKind::Identifier);
226                state.add_token(kind, start, end);
227                Ok(true)
228            }
229            else {
230                Ok(false)
231            }
232        }
233        else {
234            Ok(false)
235        }
236    }
237
238    fn lex_multi_char_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
239        let start = state.get_position();
240
241        // Document start: ---
242        if state.peek() == Some('-') && state.peek_next_n(1) == Some('-') && state.peek_next_n(2) == Some('-') {
243            state.advance(3);
244            state.add_token(YamlSyntaxKind::DocumentStart, start, state.get_position());
245            return true;
246        }
247
248        // Document end: ...
249        if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') && state.peek_next_n(2) == Some('.') {
250            state.advance(3);
251            state.add_token(YamlSyntaxKind::DocumentEnd, start, state.get_position());
252            return true;
253        }
254
255        false
256    }
257
258    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
259        if let Some(ch) = state.peek() {
260            let start = state.get_position();
261
262            if let Some(kind) = self.single_char_kind(ch) {
263                state.advance(ch.len_utf8());
264                state.add_token(kind, start, state.get_position());
265                return true;
266            }
267        }
268        false
269    }
270
271    fn keyword_kind(&self, text: &str) -> Option<YamlSyntaxKind> {
272        match text {
273            "true" | "True" | "TRUE" | "false" | "False" | "FALSE" => Some(YamlSyntaxKind::BooleanLiteral),
274            "null" | "Null" | "NULL" | "~" => Some(YamlSyntaxKind::NullLiteral),
275            _ => None,
276        }
277    }
278
279    fn single_char_kind(&self, ch: char) -> Option<YamlSyntaxKind> {
280        match ch {
281            ':' => Some(YamlSyntaxKind::Colon),
282            '-' => Some(YamlSyntaxKind::Dash),
283            '|' => Some(YamlSyntaxKind::Pipe),
284            '>' => Some(YamlSyntaxKind::GreaterThan),
285            '?' => Some(YamlSyntaxKind::Question),
286            '&' => Some(YamlSyntaxKind::Ampersand),
287            '*' => Some(YamlSyntaxKind::Asterisk),
288            '!' => Some(YamlSyntaxKind::Exclamation),
289            '[' => Some(YamlSyntaxKind::LeftBracket),
290            ']' => Some(YamlSyntaxKind::RightBracket),
291            '{' => Some(YamlSyntaxKind::LeftBrace),
292            '}' => Some(YamlSyntaxKind::RightBrace),
293            _ => None,
294        }
295    }
296}