1#![allow(dead_code)]
3
4use std::str::Chars;
5
6use shebling_ast::{ControlOp, RedirOp, Span, Spanned};
7
8#[derive(Debug, PartialEq)]
11pub enum Token {
12 Comment(String),
13 ControlOp(ControlOp),
14 LParen,
15 RedirOp(RedirOp),
16 RParen,
17 Word(Vec<Spanned<WordSgmt>>),
18}
19
20#[derive(Debug, PartialEq)]
21pub enum WordSgmt {
22 CmdSub {
23 tokens: Vec<Spanned<Token>>,
24 closed: bool,
25 },
26 DoubleQuoted {
27 sgmts: Vec<Spanned<WordSgmt>>,
28 closed: bool,
29 },
30 Lit(String),
31 ParamExpansion(Vec<Spanned<WordSgmt>>),
32 SingleQuoted {
33 string: String,
34 closed: bool,
35 },
36}
37
38#[derive(Debug, thiserror::Error, miette::Diagnostic)]
39pub enum LexerDiagnostic {
40 #[error("CRLF line ending!")]
41 #[diagnostic(
42 code(shebling::cr_lf),
43 help("Try running the script through tr -d '\\r'.")
44 )]
45 CrLf(#[label("literal carriage return")] usize),
46
47 #[error("unclosed {1}!")]
48 #[diagnostic(code(shebling::unclosed))]
49 Unclosed(#[label("missing closing '{2}'")] usize, &'static str, char),
50}
51
52pub struct Lexer<'a> {
53 chars: Chars<'a>,
54 source_len: usize,
55 diags: Vec<LexerDiagnostic>,
56}
57
58impl<'a> Lexer<'a> {
59 pub fn new(source: &'a str) -> Self {
60 Lexer {
61 chars: source.chars(),
62 source_len: source.len(),
63 diags: Vec::new(),
64 }
65 }
66
67 pub fn tokenize(mut self) -> (Vec<Spanned<Token>>, Vec<LexerDiagnostic>) {
68 let mut tokens = Vec::new();
69
70 self.blanks();
72
73 while let Some(token) = self.token() {
74 tokens.push(token);
75
76 self.blanks();
78 }
79
80 assert!(self.chars.next().is_none());
82
83 (tokens, self.diags)
84 }
85
86 fn blanks(&mut self) {
88 self.eat_while(|c| matches!(c, ' ' | '\t'));
89 }
90
91 fn cmd_sub_or_arith(&mut self) -> WordSgmt {
92 assert!(self.bump().is_some_and(|c| c == '('));
93
94 let mut tokens = Vec::new();
97 while let Some(token) = self.token() {
98 if *token.token() == Token::RParen {
99 return WordSgmt::CmdSub {
100 tokens,
101 closed: true,
102 };
103 } else {
104 tokens.push(token);
105 self.blanks();
106 }
107 }
108
109 self.diags.push(LexerDiagnostic::Unclosed(
111 self.position(),
112 "command substitution",
113 ')',
114 ));
115
116 WordSgmt::CmdSub {
117 tokens,
118 closed: false,
119 }
120 }
121
122 fn double_quoted(&mut self) -> WordSgmt {
123 assert!(self.bump().is_some_and(|c| c == '"'));
124
125 let mut sgmts = Vec::new();
126
127 while let Some(c) = self.peek() {
128 let sgmt_start = self.position();
129
130 let sgmt = if let Some(lit) = self.lit("\\\"$`", "$`\"") {
131 WordSgmt::Lit(lit)
132 } else {
133 match c {
134 '"' => break,
136 '$' => {
137 self.bump();
138
139 match self.peek() {
140 None => WordSgmt::Lit('$'.into()),
143 _ => todo!(),
144 }
145 }
146 _ => todo!(),
147 }
148 };
149
150 sgmts.push(Spanned::new(sgmt, self.capture_span(sgmt_start)));
151 }
152
153 WordSgmt::DoubleQuoted {
154 sgmts,
155 closed: if let Some(c) = self.bump() {
156 assert!(c == '"');
157 true
158 } else {
159 self.diags.push(LexerDiagnostic::Unclosed(
160 self.position(),
161 "double quoted",
162 '"',
163 ));
164 false
165 },
166 }
167 }
168
169 fn lit(&mut self, can_escape: &str, stop_with: &str) -> Option<String> {
170 let mut lit = String::new();
171
172 while let Some(c) = self.peek_bump(|c| !stop_with.contains(c)) {
173 match c {
174 '\\' => match self.bump() {
175 Some('\n') | None => {
176 }
179 Some(c) => {
180 if !can_escape.contains(c) {
181 lit.push('\\');
182 }
183 lit.push(c);
184 }
185 },
186 c => lit.push(c),
187 };
188 }
189
190 if lit.is_empty() {
191 None
192 } else {
193 Some(lit)
194 }
195 }
196
197 fn param_expansion(&mut self) -> WordSgmt {
198 todo!()
199 }
200
201 fn single_quoted(&mut self) -> WordSgmt {
202 assert!(self.bump().is_some_and(|c| c == '\''));
203
204 let string = self.eat_while(|c| c != '\'');
205
206 WordSgmt::SingleQuoted {
207 string,
208 closed: if let Some(c) = self.bump() {
209 assert!(c == '\'');
210 true
211 } else {
212 self.diags.push(LexerDiagnostic::Unclosed(
213 self.position(),
214 "single quoted",
215 '\'',
216 ));
217 false
218 },
219 }
220 }
221
222 fn token(&mut self) -> Option<Spanned<Token>> {
223 if let Some(c) = self.peek() {
224 let mut start = self.position();
225
226 let token = match c {
227 '&' => Token::ControlOp(if self.peek_bump(|c| c == '&').is_some() {
229 ControlOp::AndIf
230 } else {
231 ControlOp::And
232 }),
233 ';' => Token::ControlOp(if self.peek_bump(|c| c == ';').is_some() {
234 ControlOp::DSemi
235 } else {
236 ControlOp::Semi
237 }),
238 '|' => Token::ControlOp(match self.peek_bump(|c| matches!(c, '&' | '|')) {
239 Some('&') => ControlOp::OrAnd,
240 Some('|') => ControlOp::OrIf,
241 _ => ControlOp::Or,
242 }),
243 '\n' => Token::ControlOp(ControlOp::Newline),
244 '\r' if self.peek2().is_some_and(|c| c == '\n') => {
245 self.diags.push(LexerDiagnostic::CrLf(start));
248
249 self.bump();
250 start = self.position();
251
252 Token::ControlOp(ControlOp::Newline)
253 }
254 '<' => Token::RedirOp(match self.peek_bump(|c| matches!(c, '<' | '&' | '>')) {
256 Some('<') => match self.peek_bump(|c| matches!(c, '_' | '<')) {
257 Some('-') => RedirOp::DLessDash,
258 Some('<') => RedirOp::TLess,
259 _ => RedirOp::DLess,
260 },
261 Some('&') => RedirOp::LessAnd,
262 Some('>') => RedirOp::LessGreat,
263 _ => RedirOp::Less,
264 }),
265 '>' => Token::RedirOp(match self.peek_bump(|c| matches!(c, '|' | '>' | '&')) {
266 Some('|') => RedirOp::Clobber,
267 Some('>') => RedirOp::DGreat,
268 Some('&') => RedirOp::GreatAnd,
269 _ => RedirOp::Great,
270 }),
271 '#' => Token::Comment(self.eat_while(|c| c != '\n')),
272 '(' => {
273 self.bump();
274 Token::LParen
275 }
276 ')' => {
277 self.bump();
278 Token::RParen
279 }
280 _ => {
281 if let Some(word) = self.word() {
282 word
283 } else {
284 assert!(start < self.position());
287
288 return None;
289 }
290 }
291 };
292
293 Some(Spanned::new(token, self.capture_span(start)))
294 } else {
295 None
296 }
297 }
298
299 fn word(&mut self) -> Option<Token> {
300 let mut word = Vec::new();
301
302 while let Some(c) = self.peek() {
303 let sgmt_start = self.position();
304
305 let sgmt = match c {
306 '"' => self.double_quoted(),
307 '\'' => self.single_quoted(),
308 '$' => {
309 self.bump();
310
311 match self.peek() {
312 Some('"') => self.double_quoted(),
313 Some('\'') => self.single_quoted(),
314 Some('(') => self.cmd_sub_or_arith(),
315 Some(_) => self.param_expansion(),
316 None => {
317 WordSgmt::Lit('$'.into())
320 }
321 }
322 }
323 _ => {
324 if let Some(lit) = self.lit("|&;<>()$`\\\"' \t\n", "#|&;<>()$`\"' \t\r\n") {
325 WordSgmt::Lit(lit)
326 } else {
327 break;
328 }
329 }
330 };
331
332 word.push(Spanned::new(sgmt, self.capture_span(sgmt_start)));
333 }
334
335 if word.is_empty() {
336 None
337 } else {
338 Some(Token::Word(word))
339 }
340 }
341 fn bump(&mut self) -> Option<char> {
345 self.chars.next()
346 }
347
348 fn eat_while(&mut self, condition: impl Fn(char) -> bool) -> String {
349 let mut eaten = String::new();
350
351 while let Some(c) = self.peek_bump(&condition) {
352 eaten.push(c);
353 }
354
355 eaten
356 }
357
358 fn peek(&self) -> Option<char> {
359 self.chars.clone().next()
360 }
361
362 fn peek2(&self) -> Option<char> {
363 let mut chars = self.chars.clone();
364 chars.next();
365 chars.next()
366 }
367
368 fn peek_bump(&mut self, condition: impl Fn(char) -> bool) -> Option<char> {
369 let c = self.peek()?;
370
371 if condition(c) {
372 self.bump()
373 } else {
374 None
375 }
376 }
377
378 fn position(&self) -> usize {
379 self.source_len - self.chars.as_str().len()
380 }
381
382 fn capture_span(&self, start: usize) -> Span {
383 assert!(start < self.position());
384
385 Span::new(start, self.position())
386 }
387 }