1use crate::{kind::YamlSyntaxKind, language::YamlLanguage};
2use oak_core::{
3 Lexer, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, LexerCache, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7
8static YAML_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: false };
9
10static YAML_COMMENT: CommentConfig = CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false };
11
12static YAML_STRING: StringConfig = StringConfig { quotes: &['"'], escape: Some('\\') };
13
14type State<'s, S> = LexerState<'s, S, YamlLanguage>;
15
16#[derive(Clone)]
17pub struct YamlLexer<'config> {
18 _config: &'config YamlLanguage,
19}
20
21impl<'config> YamlLexer<'config> {
22 pub fn new(config: &'config YamlLanguage) -> Self {
23 Self { _config: config }
24 }
25
26 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
27 while state.not_at_end() {
28 let safe_point = state.get_position();
29
30 if let Some(ch) = state.peek() {
31 match ch {
32 ' ' | '\t' => {
33 self.lex_whitespace(state);
34 }
35 '#' => {
36 self.lex_comment(state);
37 }
38 '\n' | '\r' => {
39 self.lex_newline(state);
40 }
41 '"' => {
42 self.lex_string_literal(state)?;
43 }
44 '0'..='9' | '+' => {
45 if self.lex_number_literal(state)? {
46 continue;
47 }
48 if self.lex_single_char_tokens(state) {
49 continue;
50 }
51 }
52 '-' => {
53 if self.lex_number_literal(state)? {
55 continue;
56 }
57 if self.lex_multi_char_operators(state) {
58 continue;
59 }
60 if self.lex_single_char_tokens(state) {
61 continue;
62 }
63 }
64 '.' => {
65 if self.lex_multi_char_operators(state) {
67 continue;
68 }
69 if self.lex_single_char_tokens(state) {
71 continue;
72 }
73 state.advance(ch.len_utf8());
75 state.add_token(YamlSyntaxKind::Error, safe_point, state.get_position());
76 }
77 'a'..='z' | 'A'..='Z' | '_' => {
78 self.lex_identifier_or_keyword(state)?;
79 }
80 _ => {
81 if self.lex_single_char_tokens(state) {
82 continue;
83 }
84
85 state.advance(ch.len_utf8());
87 state.add_token(YamlSyntaxKind::Error, safe_point, state.get_position());
88 }
89 }
90 }
91
92 state.advance_if_dead_lock(safe_point);
93 }
94
95 state.add_eof();
96 Ok(())
97 }
98}
99
100impl<'config> Lexer<YamlLanguage> for YamlLexer<'config> {
101 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<YamlLanguage>) -> LexOutput<YamlLanguage> {
102 let mut state = State::new_with_cache(source, 0, cache);
103 let result = self.run(&mut state);
104 state.finish_with_cache(result, cache)
105 }
106}
107
108impl YamlLexer<'_> {
109 fn lex_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
110 YAML_WHITESPACE.scan(state, YamlSyntaxKind::Whitespace)
111 }
112
113 fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
114 YAML_COMMENT.scan(state, YamlSyntaxKind::Comment, YamlSyntaxKind::Comment)
115 }
116
117 fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
118 if let Some(ch) = state.current() {
119 if ch == '\n' {
120 let start = state.get_position();
121 state.advance(1);
122 state.add_token(YamlSyntaxKind::Newline, start, state.get_position());
123 return true;
124 }
125 else if ch == '\r' {
126 let start = state.get_position();
127 state.advance(1);
128 if state.current() == Some('\n') {
129 state.advance(1);
130 }
131 state.add_token(YamlSyntaxKind::Newline, start, state.get_position());
132 return true;
133 }
134 }
135 false
136 }
137
138 fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
139 Ok(YAML_STRING.scan(state, YamlSyntaxKind::StringLiteral))
140 }
141
142 fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
143 let start = state.get_position();
144
145 if let Some(ch) = state.peek() {
146 if ch.is_ascii_digit() || (ch == '-' || ch == '+') {
147 if ch == '-' || ch == '+' {
148 state.advance(1);
149 if !state.peek().map_or(false, |c| c.is_ascii_digit()) {
150 state.set_position(start);
152 return Ok(false);
153 }
154 }
155
156 while let Some(ch) = state.peek() {
158 if ch.is_ascii_digit() || ch == '_' {
159 state.advance(ch.len_utf8());
160 }
161 else {
162 break;
163 }
164 }
165
166 if state.peek() == Some('.') {
168 state.advance(1);
169 while let Some(ch) = state.peek() {
170 if ch.is_ascii_digit() || ch == '_' {
171 state.advance(ch.len_utf8());
172 }
173 else {
174 break;
175 }
176 }
177 }
178
179 if state.peek() == Some('e') || state.peek() == Some('E') {
181 state.advance(1);
182 if state.peek() == Some('+') || state.peek() == Some('-') {
183 state.advance(1);
184 }
185 while let Some(ch) = state.peek() {
186 if ch.is_ascii_digit() || ch == '_' {
187 state.advance(ch.len_utf8());
188 }
189 else {
190 break;
191 }
192 }
193 }
194
195 state.add_token(YamlSyntaxKind::NumberLiteral, start, state.get_position());
196 Ok(true)
197 }
198 else {
199 Ok(false)
200 }
201 }
202 else {
203 Ok(false)
204 }
205 }
206
207 fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<bool, OakError> {
208 let start = state.get_position();
209
210 if let Some(ch) = state.peek() {
211 if ch.is_alphabetic() || ch == '_' {
212 state.advance(ch.len_utf8());
213
214 while let Some(ch) = state.peek() {
215 if ch.is_alphanumeric() || ch == '_' || ch == '-' {
216 state.advance(ch.len_utf8());
217 }
218 else {
219 break;
220 }
221 }
222
223 let end = state.get_position();
224 let text = state.source().get_text_in((start..end).into());
225 let kind = self.keyword_kind(text.as_ref()).unwrap_or(YamlSyntaxKind::Identifier);
226 state.add_token(kind, start, end);
227 Ok(true)
228 }
229 else {
230 Ok(false)
231 }
232 }
233 else {
234 Ok(false)
235 }
236 }
237
238 fn lex_multi_char_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
239 let start = state.get_position();
240
241 if state.peek() == Some('-') && state.peek_next_n(1) == Some('-') && state.peek_next_n(2) == Some('-') {
243 state.advance(3);
244 state.add_token(YamlSyntaxKind::DocumentStart, start, state.get_position());
245 return true;
246 }
247
248 if state.peek() == Some('.') && state.peek_next_n(1) == Some('.') && state.peek_next_n(2) == Some('.') {
250 state.advance(3);
251 state.add_token(YamlSyntaxKind::DocumentEnd, start, state.get_position());
252 return true;
253 }
254
255 false
256 }
257
258 fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
259 if let Some(ch) = state.peek() {
260 let start = state.get_position();
261
262 if let Some(kind) = self.single_char_kind(ch) {
263 state.advance(ch.len_utf8());
264 state.add_token(kind, start, state.get_position());
265 return true;
266 }
267 }
268 false
269 }
270
271 fn keyword_kind(&self, text: &str) -> Option<YamlSyntaxKind> {
272 match text {
273 "true" | "True" | "TRUE" | "false" | "False" | "FALSE" => Some(YamlSyntaxKind::BooleanLiteral),
274 "null" | "Null" | "NULL" | "~" => Some(YamlSyntaxKind::NullLiteral),
275 _ => None,
276 }
277 }
278
279 fn single_char_kind(&self, ch: char) -> Option<YamlSyntaxKind> {
280 match ch {
281 ':' => Some(YamlSyntaxKind::Colon),
282 '-' => Some(YamlSyntaxKind::Dash),
283 '|' => Some(YamlSyntaxKind::Pipe),
284 '>' => Some(YamlSyntaxKind::GreaterThan),
285 '?' => Some(YamlSyntaxKind::Question),
286 '&' => Some(YamlSyntaxKind::Ampersand),
287 '*' => Some(YamlSyntaxKind::Asterisk),
288 '!' => Some(YamlSyntaxKind::Exclamation),
289 '[' => Some(YamlSyntaxKind::LeftBracket),
290 ']' => Some(YamlSyntaxKind::RightBracket),
291 '{' => Some(YamlSyntaxKind::LeftBrace),
292 '}' => Some(YamlSyntaxKind::RightBrace),
293 _ => None,
294 }
295 }
296}