1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::BashTokenType;
5
6use crate::language::BashLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8use std::sync::LazyLock;
9
/// Shorthand for [`LexerState`] specialized to [`BashLanguage`].
type State<'a, S> = LexerState<'a, S, BashLanguage>;
11
/// Lexer for Bash scripts, driven by a shared [`BashLanguage`] configuration.
#[derive(Clone)]
pub struct BashLexer<'config> {
    /// Borrowed language configuration. Currently unused by the lexing
    /// routines (hence the leading underscore); kept for future use.
    _config: &'config BashLanguage,
}
16
impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
    /// Lex `source` into a token stream.
    ///
    /// `_edits` is accepted for interface compatibility but ignored here:
    /// lexing always restarts from offset 0; any reuse of prior work comes
    /// only from the supplied `cache`.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<BashLanguage>) -> LexOutput<BashLanguage> {
        let mut state = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only a clean run gets the EOF sentinel; an error is carried in `result`.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
27
28impl<'config> BashLexer<'config> {
    /// Create a lexer borrowing the given language configuration.
    pub fn new(config: &'config BashLanguage) -> Self {
        Self { _config: config }
    }
32
    /// Main lexing loop: tries each sub-lexer in priority order until the end
    /// of input, emitting an `Error` token for any character no rule claims.
    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Remember where this pass started so a non-advancing pass can be
            // detected (and forced forward) at the bottom of the loop.
            let safe_point = state.get_position();
            if self.skip_whitespace(state) {
                continue;
            }

            if self.skip_comment(state) {
                continue;
            }

            if self.lex_newline(state) {
                continue;
            }

            if self.lex_string(state) {
                continue;
            }

            if self.lex_variable(state) {
                continue;
            }

            if self.lex_number(state) {
                continue;
            }

            if self.lex_keyword_or_identifier(state) {
                continue;
            }

            if self.lex_operator_or_delimiter(state) {
                continue;
            }

            // NOTE(review): `lex_operator_or_delimiter` above already consumes
            // "<<" (two-char operator) as well as '*' / '?' (operators) and
            // '[' (delimiter), so the heredoc and glob branches below appear
            // unreachable — confirm the intended priority order.
            if self.lex_heredoc(state) {
                continue;
            }

            if self.lex_glob_pattern(state) {
                continue;
            }

            if self.lex_special_char(state) {
                continue;
            }

            if self.lex_text(state) {
                continue;
            }

            // Fallback: consume exactly one unrecognized character as Error.
            let start_pos = state.get_position();
            if let Some(ch) = state.peek() {
                state.advance(ch.len_utf8());
                state.add_token(BashTokenType::Error, start_pos, state.get_position())
            }

            // Safety net: force progress if nothing advanced this pass.
            state.advance_if_dead_lock(safe_point)
        }
        Ok(())
    }
95
96 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
97 let start_pos = state.get_position();
98
99 while let Some(ch) = state.peek() {
100 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
101 }
102
103 if state.get_position() > start_pos {
104 state.add_token(BashTokenType::Whitespace, start_pos, state.get_position());
105 true
106 }
107 else {
108 false
109 }
110 }
111
112 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113 let start_pos = state.get_position();
114
115 if let Some('#') = state.peek() {
116 state.advance(1);
117 while let Some(ch) = state.peek() {
118 if ch == '\n' || ch == '\r' {
119 break;
120 }
121 state.advance(ch.len_utf8())
122 }
123 state.add_token(BashTokenType::Comment, start_pos, state.get_position());
124 true
125 }
126 else {
127 false
128 }
129 }
130
131 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
132 let start_pos = state.get_position();
133
134 if let Some('\n') = state.peek() {
135 state.advance(1);
136 state.add_token(BashTokenType::Newline, start_pos, state.get_position());
137 true
138 }
139 else if let Some('\r') = state.peek() {
140 state.advance(1);
141 if let Some('\n') = state.peek() {
142 state.advance(1)
143 }
144 state.add_token(BashTokenType::Newline, start_pos, state.get_position());
145 true
146 }
147 else {
148 false
149 }
150 }
151
152 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
153 let start_pos = state.get_position();
154
155 if let Some(quote) = state.peek() {
156 if quote == '"' || quote == '\'' {
157 state.advance(1);
158 let mut escaped = false;
159
160 while let Some(ch) = state.peek() {
161 if escaped {
162 escaped = false;
163 state.advance(ch.len_utf8());
164 continue;
165 }
166
167 if ch == '\\' {
168 escaped = true;
169 state.advance(1);
170 continue;
171 }
172
173 if ch == quote {
174 state.advance(1);
175 break;
176 }
177
178 state.advance(ch.len_utf8())
179 }
180
181 state.add_token(BashTokenType::StringLiteral, start_pos, state.get_position());
182 return true;
183 }
184 }
185
186 false
187 }
188
189 fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
190 let start_pos = state.get_position();
191
192 if let Some('$') = state.peek() {
193 state.advance(1);
194
195 if let Some(ch) = state.peek() {
197 if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '↯' || ch == '*' {
198 state.advance(1);
199 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
200 return true;
201 }
202 }
203
204 if let Some('{') = state.peek() {
206 state.advance(1);
207 while let Some(ch) = state.peek() {
208 if ch == '}' {
209 state.advance(1);
210 break;
211 }
212 state.advance(ch.len_utf8())
213 }
214 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
215 return true;
216 }
217
218 if let Some(ch) = state.peek() {
220 if ch.is_alphabetic() || ch == '_' {
221 state.advance(ch.len_utf8());
222 while let Some(ch) = state.peek() {
223 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
224 }
225 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
226 return true;
227 }
228 }
229
230 state.set_position(start_pos);
232 }
233
234 false
235 }
236
237 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
238 let start_pos = state.get_position();
239
240 if let Some(ch) = state.peek() {
241 if ch.is_ascii_digit() {
242 state.advance(1);
243 while let Some(ch) = state.peek() {
244 if ch.is_ascii_digit() { state.advance(1) } else { break }
245 }
246 state.add_token(BashTokenType::NumberLiteral, start_pos, state.get_position());
247 return true;
248 }
249 }
250
251 false
252 }
253
254 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
255 let start_pos = state.get_position();
256
257 if let Some(ch) = state.peek() {
258 if ch.is_ascii_alphabetic() || ch == '_' {
259 state.advance(ch.len_utf8());
260 while let Some(ch) = state.peek() {
261 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
262 }
263
264 let text = state.get_text_in((start_pos..state.get_position()).into());
265 let kind = if BASH_KEYWORDS.contains(&text.as_ref()) { BashTokenType::Keyword } else { BashTokenType::Identifier };
266
267 state.add_token(kind, start_pos, state.get_position());
268 return true;
269 }
270 }
271
272 false
273 }
274
275 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276 let start_pos = state.get_position();
277
278 if let Some(ch) = state.peek() {
279 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
280
281 if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
283 state.advance(2);
284 state.add_token(BashTokenType::Operator, start_pos, state.get_position());
285 return true;
286 }
287
288 let ch_str = ch.to_string();
290 if BASH_OPERATORS.contains(&ch_str.as_str()) {
291 state.advance(1);
292 state.add_token(BashTokenType::Operator, start_pos, state.get_position());
293 return true;
294 }
295
296 if BASH_DELIMITERS.contains(&ch_str.as_str()) {
297 state.advance(1);
298 state.add_token(BashTokenType::Delimiter, start_pos, state.get_position());
299 return true;
300 }
301 }
302
303 false
304 }
305
306 fn lex_heredoc<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
307 let start_pos = state.get_position();
308
309 if let Some('<') = state.peek() {
311 if let Some('<') = state.peek_next_n(1) {
312 state.advance(2);
313
314 if let Some('-') = state.peek() {
316 state.advance(1)
317 }
318
319 while let Some(ch) = state.peek() {
321 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
322 }
323
324 state.add_token(BashTokenType::Heredoc, start_pos, state.get_position());
325 return true;
326 }
327 }
328
329 false
330 }
331
332 fn lex_glob_pattern<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
333 let start_pos = state.get_position();
334
335 if let Some(ch) = state.peek() {
336 if ch == '*' || ch == '?' || ch == '[' {
337 state.advance(1);
338
339 if ch == '[' {
340 if let Some('!') = state.peek() {
342 state.advance(1)
343 }
344 while let Some(ch) = state.peek() {
345 if ch == ']' {
346 state.advance(1);
347 break;
348 }
349 state.advance(ch.len_utf8())
350 }
351 }
352
353 state.add_token(BashTokenType::GlobPattern, start_pos, state.get_position());
354 return true;
355 }
356 }
357
358 false
359 }
360
361 fn lex_special_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
362 let start_pos = state.get_position();
363
364 if let Some(ch) = state.peek() {
365 if BASH_SPECIAL_CHARS.contains(&ch) {
366 state.advance(1);
367 state.add_token(BashTokenType::SpecialChar, start_pos, state.get_position());
368 return true;
369 }
370 }
371
372 false
373 }
374
375 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
376 let start_pos = state.get_position();
377
378 if let Some(ch) = state.peek() {
379 if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
380 state.advance(ch.len_utf8());
381 state.add_token(BashTokenType::Text, start_pos, state.get_position());
382 return true;
383 }
384 }
385
386 false
387 }
388}
389
// Words classified as `Keyword` when matched by `lex_keyword_or_identifier`.
// NOTE(review): the punctuation entries (".", "[", "[[", "]]") look
// unreachable — the keyword lexer only compares word-shaped spans
// (ASCII alphanumerics/underscore), so they can never equal a lexed
// identifier. Confirm whether they belong in an operator/delimiter table.
static BASH_KEYWORDS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        "if", "then", "else", "elif", "fi", "case", "esac", "for", "while", "until", "do", "done", "function", "return", "break", "continue", "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit", "source", ".", "eval", "exec",
        "trap", "wait", "jobs", "bg", "fg", "disown", "suspend", "alias", "unalias", "history", "fc", "let", "test", "[", "[[", "]]", "time", "coproc", "select", "in",
    ]
});
396
// Single-character operators, checked after the two-character table.
static BASH_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["+", "-", "*", "/", "%", "=", "!", "<", ">", "&", "|", "^", "~"]);
398
// Two-character operators, checked before the single-character tables.
// Every entry must be exactly two characters: `lex_operator_or_delimiter`
// probes this table with a two-character string, so the three-character
// entries previously listed here ("<<=", ">>=") could never match and have
// been removed (no behavior change — they were dead data).
static BASH_TWO_CHAR_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "**"]);
400
// Single-character delimiters, checked after BASH_OPERATORS.
static BASH_DELIMITERS: LazyLock<&[&str]> = LazyLock::new(|| &["(", ")", "{", "}", "[", "]", ";", ",", ":", "."]);
402
// Characters lexed as `SpecialChar` when no earlier rule claims them; also
// the stop-set for `lex_text`. Fixes over the original table: the mojibake
// character '↯' is replaced by the intended '@' (matching the keyboard row
// `~ @ # $ % ^ &` the list follows, and the `$@` special parameter), and the
// duplicate '\\' and '`' entries are dropped.
static BASH_SPECIAL_CHARS: LazyLock<&[char]> = LazyLock::new(|| &['\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', ':', ';', '"', '\'', '<', '>', ',', '.', '?', '/', '!']);