1#![doc = include_str!("readme.md")]
2pub mod token_type;
4
5pub use token_type::BashTokenType;
6
7use crate::language::BashLanguage;
8use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
9use std::sync::LazyLock;
10
11pub(crate) type State<'a, S> = LexerState<'a, S, BashLanguage>;
12
/// A lexer for Bash source code.
///
/// Borrows its [`BashLanguage`] configuration for the `'config` lifetime.
#[derive(Clone)]
pub struct BashLexer<'config> {
    /// Borrowed language configuration. Not consulted by any routine in
    /// this file — presumably reserved for configurable lexing behavior;
    /// TODO confirm whether other code reads it.
    config: &'config BashLanguage,
}
18
impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
    /// Lexes the entire `source` into a [`LexOutput`].
    ///
    /// `_edits` is accepted but ignored: this lexer always re-lexes the
    /// whole input rather than lexing incrementally.
    ///
    /// On a successful run an EOF token is appended before the state is
    /// finalized; on error, the tokens produced so far are finalized as-is.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<BashLanguage>) -> LexOutput<BashLanguage> {
        let mut state = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
29
30impl<'config> BashLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config BashLanguage) -> Self {
        Self { config }
    }
35
36 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 let safe_point = state.get_position();
39 if self.skip_whitespace(state) {
40 continue;
41 }
42
43 if self.skip_comment(state) {
44 continue;
45 }
46
47 if self.lex_newline(state) {
48 continue;
49 }
50
51 if self.lex_string(state) {
52 continue;
53 }
54
55 if self.lex_variable(state) {
56 continue;
57 }
58
59 if self.lex_number(state) {
60 continue;
61 }
62
63 if self.lex_keyword_or_identifier(state) {
64 continue;
65 }
66
67 if self.lex_operator_or_delimiter(state) {
68 continue;
69 }
70
71 if self.lex_heredoc(state) {
72 continue;
73 }
74
75 if self.lex_glob_pattern(state) {
76 continue;
77 }
78
79 if self.lex_special_char(state) {
80 continue;
81 }
82
83 if self.lex_text(state) {
84 continue;
85 }
86
87 let start_pos = state.get_position();
89 if let Some(ch) = state.peek() {
90 state.advance(ch.len_utf8());
91 state.add_token(BashTokenType::Error, start_pos, state.get_position())
92 }
93
94 state.advance_if_dead_lock(safe_point)
95 }
96 Ok(())
97 }
98
99 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100 let start_pos = state.get_position();
101
102 while let Some(ch) = state.peek() {
103 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
104 }
105
106 if state.get_position() > start_pos {
107 state.add_token(BashTokenType::Whitespace, start_pos, state.get_position());
108 true
109 }
110 else {
111 false
112 }
113 }
114
115 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116 let start_pos = state.get_position();
117
118 if let Some('#') = state.peek() {
119 state.advance(1);
120 while let Some(ch) = state.peek() {
121 if ch == '\n' || ch == '\r' {
122 break;
123 }
124 state.advance(ch.len_utf8())
125 }
126 state.add_token(BashTokenType::Comment, start_pos, state.get_position());
127 true
128 }
129 else {
130 false
131 }
132 }
133
134 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
135 let start_pos = state.get_position();
136
137 if let Some('\n') = state.peek() {
138 state.advance(1);
139 state.add_token(BashTokenType::Newline, start_pos, state.get_position());
140 true
141 }
142 else if let Some('\r') = state.peek() {
143 state.advance(1);
144 if let Some('\n') = state.peek() {
145 state.advance(1)
146 }
147 state.add_token(BashTokenType::Newline, start_pos, state.get_position());
148 true
149 }
150 else {
151 false
152 }
153 }
154
155 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156 let start_pos = state.get_position();
157
158 if let Some(quote) = state.peek() {
159 if quote == '"' || quote == '\'' {
160 state.advance(1);
161 let mut escaped = false;
162
163 while let Some(ch) = state.peek() {
164 if escaped {
165 escaped = false;
166 state.advance(ch.len_utf8());
167 continue;
168 }
169
170 if ch == '\\' {
171 escaped = true;
172 state.advance(1);
173 continue;
174 }
175
176 if ch == quote {
177 state.advance(1);
178 break;
179 }
180
181 state.advance(ch.len_utf8())
182 }
183
184 state.add_token(BashTokenType::StringLiteral, start_pos, state.get_position());
185 return true;
186 }
187 }
188
189 false
190 }
191
192 fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
193 let start_pos = state.get_position();
194
195 if let Some('$') = state.peek() {
196 state.advance(1);
197
198 if let Some(ch) = state.peek() {
200 if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '@' || ch == '*' {
201 state.advance(1);
202 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
203 return true;
204 }
205 }
206
207 if let Some('{') = state.peek() {
209 state.advance(1);
210 while let Some(ch) = state.peek() {
211 if ch == '}' {
212 state.advance(1);
213 break;
214 }
215 state.advance(ch.len_utf8())
216 }
217 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
218 return true;
219 }
220
221 if let Some(ch) = state.peek() {
223 if ch.is_alphabetic() || ch == '_' {
224 state.advance(ch.len_utf8());
225 while let Some(ch) = state.peek() {
226 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
227 }
228 state.add_token(BashTokenType::Variable, start_pos, state.get_position());
229 return true;
230 }
231 }
232
233 state.set_position(start_pos);
235 }
236
237 false
238 }
239
240 fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
241 let start_pos = state.get_position();
242
243 if let Some(ch) = state.peek() {
244 if ch.is_ascii_digit() {
245 state.advance(1);
246 while let Some(ch) = state.peek() {
247 if ch.is_ascii_digit() { state.advance(1) } else { break }
248 }
249 state.add_token(BashTokenType::NumberLiteral, start_pos, state.get_position());
250 return true;
251 }
252 }
253
254 false
255 }
256
257 fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
258 let start_pos = state.get_position();
259
260 if let Some(ch) = state.peek() {
261 if ch.is_ascii_alphabetic() || ch == '_' {
262 state.advance(ch.len_utf8());
263 while let Some(ch) = state.peek() {
264 if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
265 }
266
267 let text = state.get_text_in((start_pos..state.get_position()).into());
268 let kind = if BASH_KEYWORDS.contains(&text.as_ref()) { BashTokenType::Keyword } else { BashTokenType::Identifier };
269
270 state.add_token(kind, start_pos, state.get_position());
271 return true;
272 }
273 }
274
275 false
276 }
277
278 fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
279 let start_pos = state.get_position();
280
281 if let Some(ch) = state.peek() {
282 let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
283
284 if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
286 state.advance(2);
287 state.add_token(BashTokenType::Operator, start_pos, state.get_position());
288 return true;
289 }
290
291 let ch_str = ch.to_string();
293 if BASH_OPERATORS.contains(&ch_str.as_str()) {
294 state.advance(1);
295 state.add_token(BashTokenType::Operator, start_pos, state.get_position());
296 return true;
297 }
298
299 if BASH_DELIMITERS.contains(&ch_str.as_str()) {
300 state.advance(1);
301 state.add_token(BashTokenType::Delimiter, start_pos, state.get_position());
302 return true;
303 }
304 }
305
306 false
307 }
308
309 fn lex_heredoc<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
310 let start_pos = state.get_position();
311
312 if let Some('<') = state.peek() {
314 if let Some('<') = state.peek_next_n(1) {
315 state.advance(2);
316
317 if let Some('-') = state.peek() {
319 state.advance(1)
320 }
321
322 while let Some(ch) = state.peek() {
324 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
325 }
326
327 state.add_token(BashTokenType::Heredoc, start_pos, state.get_position());
328 return true;
329 }
330 }
331
332 false
333 }
334
335 fn lex_glob_pattern<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
336 let start_pos = state.get_position();
337
338 if let Some(ch) = state.peek() {
339 if ch == '*' || ch == '?' || ch == '[' {
340 state.advance(1);
341
342 if ch == '[' {
343 if let Some('!') = state.peek() {
345 state.advance(1)
346 }
347 while let Some(ch) = state.peek() {
348 if ch == ']' {
349 state.advance(1);
350 break;
351 }
352 state.advance(ch.len_utf8())
353 }
354 }
355
356 state.add_token(BashTokenType::GlobPattern, start_pos, state.get_position());
357 return true;
358 }
359 }
360
361 false
362 }
363
364 fn lex_special_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
365 let start_pos = state.get_position();
366
367 if let Some(ch) = state.peek() {
368 if BASH_SPECIAL_CHARS.contains(&ch) {
369 state.advance(1);
370 state.add_token(BashTokenType::SpecialChar, start_pos, state.get_position());
371 return true;
372 }
373 }
374
375 false
376 }
377
378 fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
379 let start_pos = state.get_position();
380
381 if let Some(ch) = state.peek() {
382 if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
383 state.advance(ch.len_utf8());
384 state.add_token(BashTokenType::Text, start_pos, state.get_position());
385 return true;
386 }
387 }
388
389 false
390 }
391}
392
// Lookup tables for the lexer. These are plain `&'static` slices:
// `LazyLock` adds no value for data that is fully known at compile time.

/// Reserved words and common builtins classified as `Keyword` tokens.
static BASH_KEYWORDS: &[&str] = &[
    "if", "then", "else", "elif", "fi", "case", "esac", "for", "while", "until", "do", "done", "function", "return", "break", "continue", "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit", "source", ".", "eval", "exec",
    "trap", "wait", "jobs", "bg", "fg", "disown", "suspend", "alias", "unalias", "history", "fc", "let", "test", "[", "[[", "]]", "time", "coproc", "select", "in",
];

/// Single-character operators.
static BASH_OPERATORS: &[&str] = &["+", "-", "*", "/", "%", "=", "!", "<", ">", "&", "|", "^", "~"];

/// Multi-character operators. The three-character compound assignments
/// `<<=` / `>>=` are consumed by the operator lexer's longest-match
/// extension of `<<` / `>>`.
static BASH_TWO_CHAR_OPERATORS: &[&str] = &["==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "**"];

/// Single-character delimiters.
static BASH_DELIMITERS: &[&str] = &["(", ")", "{", "}", "[", "]", ";", ",", ":", "."];

/// Characters with special meaning that terminate plain-text runs.
/// (Duplicate `'\\'` and '`' entries in the original list removed —
/// membership checks are unaffected.)
static BASH_SPECIAL_CHARS: &[char] = &['\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', ':', ';', '"', '\'', '<', '>', ',', '.', '?', '/', '!'];