use crate::{kind::BashSyntaxKind, language::BashLanguage};
use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
use std::sync::LazyLock;

type State<S> = LexerState<S, BashLanguage>;

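/// A hand-written lexer for Bash source text, parameterized by the
/// `BashLanguage` configuration it was created from.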
#[derive(Clone)]
pub struct BashLexer<'config> {
    config: &'config BashLanguage,
}

impl<'config> BashLexer<'config> {
    pub fn new(config: &'config BashLanguage) -> Self {
        Self { config }
    }

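    /// Main lexing loop: tries each token rule in priority order until the end
    /// of the source is reached. If no rule matches, a single character is
    /// skipped so the loop always makes progress.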
    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.lex_string(state) {
                continue;
            }

            if self.lex_variable(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_keyword_or_identifier(state) {
                continue;
            }

            if self.lex_operator_or_delimiter(state) {
                continue;
            }

            if self.lex_heredoc(state) {
                continue;
            }

            if self.lex_glob_pattern(state) {
                continue;
            }

            if self.lex_special_char(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            state.advance(1);
        }
        Ok(())
    }

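    /// Consumes a run of spaces and tabs and emits a single `Whitespace` token.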
    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > start_pos {
            state.add_token(BashSyntaxKind::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

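    /// Lexes a `#` comment running up to (but not including) the end of the line.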
    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('#') = state.peek() {
            state.advance(1);
            while let Some(ch) = state.peek() {
                if ch == '\n' || ch == '\r' {
                    break;
                }
                state.advance(ch.len_utf8());
            }
            state.add_token(BashSyntaxKind::Comment, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

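    /// Lexes a line terminator, treating `\n`, `\r`, and `\r\n` each as a single `Newline` token.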
    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(BashSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(BashSyntaxKind::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

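    /// Lexes a single- or double-quoted string literal. Backslash escapes are
    /// honored in both quote styles, which is a simplification: real Bash
    /// treats backslashes inside single quotes literally.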
    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(quote) = state.peek() {
            if quote == '"' || quote == '\'' {
                state.advance(1);
                let mut escaped = false;

                while let Some(ch) = state.peek() {
                    if escaped {
                        escaped = false;
                        state.advance(ch.len_utf8());
                        continue;
                    }

                    if ch == '\\' {
                        escaped = true;
                        state.advance(1);
                        continue;
                    }

                    if ch == quote {
                        state.advance(1);
                        break;
                    }

                    state.advance(ch.len_utf8());
                }

                state.add_token(BashSyntaxKind::StringLiteral, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

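    /// Lexes a `$`-prefixed variable reference: special parameters such as `$?`,
    /// `$#`, and positional `$1`, braced expansions like `${NAME}`, and plain
    /// names like `$FOO`. A bare `$` with no valid follow-up is rolled back.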
    fn lex_variable<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('$') = state.peek() {
            state.advance(1);

            if let Some(ch) = state.peek() {
                if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '@' || ch == '*' {
                    state.advance(1);
                    state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
                    return true;
                }
            }

            if let Some('{') = state.peek() {
                state.advance(1);
                while let Some(ch) = state.peek() {
                    if ch == '}' {
                        state.advance(1);
                        break;
                    }
                    state.advance(ch.len_utf8());
                }
                state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
                return true;
            }

            if let Some(ch) = state.peek() {
                if ch.is_alphabetic() || ch == '_' {
                    state.advance(ch.len_utf8());
                    while let Some(ch) = state.peek() {
                        if ch.is_alphanumeric() || ch == '_' {
                            state.advance(ch.len_utf8());
                        }
                        else {
                            break;
                        }
                    }
                    state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
                    return true;
                }
            }

            state.set_position(start_pos);
        }

        false
    }

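    /// Lexes a run of ASCII digits as a `NumberLiteral` token.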
    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_ascii_digit() {
                state.advance(1);
                while let Some(ch) = state.peek() {
                    if ch.is_ascii_digit() {
                        state.advance(1);
                    }
                    else {
                        break;
                    }
                }
                state.add_token(BashSyntaxKind::NumberLiteral, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

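    /// Lexes a word starting with a letter or underscore and classifies it as a
    /// `Keyword` if it appears in `BASH_KEYWORDS`, otherwise as an `Identifier`.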
    fn lex_keyword_or_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch.is_alphabetic() || ch == '_' {
                state.advance(ch.len_utf8());
                while let Some(ch) = state.peek() {
                    if ch.is_alphanumeric() || ch == '_' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                let text = state.get_text_in((start_pos..state.get_position()).into());
                let kind = if BASH_KEYWORDS.contains(&text) { BashSyntaxKind::Keyword } else { BashSyntaxKind::Identifier };

                state.add_token(kind, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

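    /// Lexes two-character operators first (e.g. `&&`, `>>`), then single-character
    /// operators and delimiters.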
    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };

            if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
                state.advance(2);
                state.add_token(BashSyntaxKind::Operator, start_pos, state.get_position());
                return true;
            }

            let ch_str = ch.to_string();
            if BASH_OPERATORS.contains(&ch_str.as_str()) {
                state.advance(1);
                state.add_token(BashSyntaxKind::Operator, start_pos, state.get_position());
                return true;
            }

            if BASH_DELIMITERS.contains(&ch_str.as_str()) {
                state.advance(1);
                state.add_token(BashSyntaxKind::Delimiter, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

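    /// Lexes a heredoc introducer (`<<` or `<<-`) together with its delimiter word.
    /// Note: `<<` is also listed in `BASH_TWO_CHAR_OPERATORS`, and the operator rule
    /// runs earlier in `run`, so this rule is shadowed in the current rule order.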
    fn lex_heredoc<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            if let Some('<') = state.peek_next_n(1) {
                state.advance(2);

                if let Some('-') = state.peek() {
                    state.advance(1);
                }

                while let Some(ch) = state.peek() {
                    if ch.is_alphanumeric() || ch == '_' {
                        state.advance(ch.len_utf8());
                    }
                    else {
                        break;
                    }
                }

                state.add_token(BashSyntaxKind::Heredoc, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

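    /// Lexes `*`, `?`, and `[...]` glob patterns, including negated `[!...]` classes.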
    fn lex_glob_pattern<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if ch == '*' || ch == '?' || ch == '[' {
                state.advance(1);

                if ch == '[' {
                    if let Some('!') = state.peek() {
                        state.advance(1);
                    }
                    while let Some(ch) = state.peek() {
                        if ch == ']' {
                            state.advance(1);
                            break;
                        }
                        state.advance(ch.len_utf8());
                    }
                }

                state.add_token(BashSyntaxKind::GlobPattern, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

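    /// Lexes any single character listed in `BASH_SPECIAL_CHARS` as a `SpecialChar` token.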
    fn lex_special_char<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if BASH_SPECIAL_CHARS.contains(&ch) {
                state.advance(1);
                state.add_token(BashSyntaxKind::SpecialChar, start_pos, state.get_position());
                return true;
            }
        }

        false
    }

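    /// Fallback rule: lexes a single non-whitespace, non-special character as `Text`.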
    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
                state.advance(ch.len_utf8());
                state.add_token(BashSyntaxKind::Text, start_pos, state.get_position());
                return true;
            }
        }

        false
    }
}

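// Incremental lexing entry point: runs the full rule set over `source`,
// appends an `Eof` token on success, and packages the result via `state.finish`.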
impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
    fn lex_incremental(
        &self,
        source: impl Source,
        changed: usize,
        cache: IncrementalCache<BashLanguage>,
    ) -> LexOutput<BashLanguage> {
        let mut state = LexerState::new_with_cache(source, changed, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            let eof_pos = state.get_position();
            state.add_token(BashSyntaxKind::Eof, eof_pos, eof_pos);
        }
        state.finish(result)
    }
}

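/// Shell reserved words and common builtins that are highlighted as keywords.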
static BASH_KEYWORDS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        "if", "then", "else", "elif", "fi", "case", "esac", "for", "while", "until", "do", "done", "function", "return",
        "break", "continue", "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit", "source", ".",
        "eval", "exec", "trap", "wait", "jobs", "bg", "fg", "disown", "suspend", "alias", "unalias", "history", "fc", "let",
        "test", "[", "[[", "]]", "time", "coproc", "select", "in",
    ]
});

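/// Single-character operators.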
static BASH_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["+", "-", "*", "/", "%", "=", "!", "<", ">", "&", "|", "^", "~"]);

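/// Compound operators, tried before the single-character set. Matching uses a
/// two-character lookahead, so the three-character entries never currently match.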
static BASH_TWO_CHAR_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        "==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=",
        ">>=", "**",
    ]
});

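/// Single-character delimiters.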
static BASH_DELIMITERS: LazyLock<&[&str]> = LazyLock::new(|| &["(", ")", "{", "}", "[", "]", ";", ",", ":", "."]);

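/// Characters treated as special by the `lex_special_char` and `lex_text` fallback rules.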
static BASH_SPECIAL_CHARS: LazyLock<&[char]> = LazyLock::new(|| {
    &[
        '\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', ':', ';',
        '"', '\'', '<', '>', ',', '.', '?', '/', '!',
    ]
});