shebling_lexer/
lib.rs

1// TODO: Remove this after development.
2#![allow(dead_code)]
3
4use std::str::Chars;
5
6use shebling_ast::{ControlOp, RedirOp, Span, Spanned};
7
8// TODO: Use itertools?
9// TODO: Document types and function logic.
/// A single lexical unit produced by [`Lexer::tokenize`].
#[derive(Debug, PartialEq)]
pub enum Token {
    /// A comment. The stored text starts at the `#` and runs up to (but not
    /// including) the next newline.
    Comment(String),
    /// A control operator (`&`, `&&`, `;`, `;;`, `|`, `||`, `|&`, or a newline).
    ControlOp(ControlOp),
    /// An opening parenthesis, `(`.
    LParen,
    /// A redirection operator (the `<` and `>` families).
    RedirOp(RedirOp),
    /// A closing parenthesis, `)`.
    RParen,
    /// A word, stored as the sequence of segments it is made of, each with
    /// its own span.
    Word(Vec<Spanned<WordSgmt>>),
}
19
/// One piece of a [`Token::Word`].
#[derive(Debug, PartialEq)]
pub enum WordSgmt {
    /// A `$(...)` command substitution.
    CmdSub {
        /// Tokens read between the parentheses (the closing `)` is not included).
        tokens: Vec<Spanned<Token>>,
        /// `false` when the input ended before the closing `)` was found.
        closed: bool,
    },
    /// A `"..."` string.
    DoubleQuoted {
        /// Segments found between the quotes.
        sgmts: Vec<Spanned<WordSgmt>>,
        /// `false` when the input ended before the closing `"` was found.
        closed: bool,
    },
    /// Literal text, with backslash escapes already processed by the lexer.
    Lit(String),
    /// A parameter expansion.
    // NOTE(review): nothing in this file constructs this variant yet; the
    // meaning of the inner segments is still to be defined.
    ParamExpansion(Vec<Spanned<WordSgmt>>),
    /// A `'...'` string.
    SingleQuoted {
        /// The raw text between the quotes.
        string: String,
        /// `false` when the input ended before the closing `'` was found.
        closed: bool,
    },
}
37
/// Problems the lexer reports while still producing tokens.
#[derive(Debug, thiserror::Error, miette::Diagnostic)]
pub enum LexerDiagnostic {
    /// A `\r\n` line ending was read as a plain newline. The field is the
    /// byte offset of the carriage return.
    #[error("CRLF line ending!")]
    #[diagnostic(
        code(shebling::cr_lf),
        help("Try running the script through tr -d '\\r'.")
    )]
    CrLf(#[label("literal carriage return")] usize),

    /// A delimited construct was still open when the input ended. The fields
    /// are: the byte offset where the closing delimiter was expected, a
    /// description of the construct, and the missing delimiter.
    #[error("unclosed {1}!")]
    #[diagnostic(code(shebling::unclosed))]
    Unclosed(#[label("missing closing '{2}'")] usize, &'static str, char),
}
51
/// Cursor-based lexer over a source string.
pub struct Lexer<'a> {
    /// The characters that have not been consumed yet.
    chars: Chars<'a>,
    /// Byte length of the whole source; used to turn the remaining input
    /// length into an absolute byte offset.
    source_len: usize,
    /// Diagnostics collected so far.
    diags: Vec<LexerDiagnostic>,
}
57
58impl<'a> Lexer<'a> {
59    pub fn new(source: &'a str) -> Self {
60        Lexer {
61            chars: source.chars(),
62            source_len: source.len(),
63            diags: Vec::new(),
64        }
65    }
66
67    pub fn tokenize(mut self) -> (Vec<Spanned<Token>>, Vec<LexerDiagnostic>) {
68        let mut tokens = Vec::new();
69
70        // Eat any starting whitespace.
71        self.blanks();
72
73        while let Some(token) = self.token() {
74            tokens.push(token);
75
76            // Consume trailing whitespace after each token.
77            self.blanks();
78        }
79
80        // Make sure we read everything.
81        assert!(self.chars.next().is_none());
82
83        (tokens, self.diags)
84    }
85
86    // region: Individual tokenizers.
87    fn blanks(&mut self) {
88        self.eat_while(|c| matches!(c, ' ' | '\t'));
89    }
90
91    fn cmd_sub_or_arith(&mut self) -> WordSgmt {
92        assert!(self.bump().is_some_and(|c| c == '('));
93
94        // TODO: First try to read an arithmetic expression.
95
96        let mut tokens = Vec::new();
97        while let Some(token) = self.token() {
98            if *token.token() == Token::RParen {
99                return WordSgmt::CmdSub {
100                    tokens,
101                    closed: true,
102                };
103            } else {
104                tokens.push(token);
105                self.blanks();
106            }
107        }
108
109        // If we get here, we didn't find the closing paren.
110        self.diags.push(LexerDiagnostic::Unclosed(
111            self.position(),
112            "command substitution",
113            ')',
114        ));
115
116        WordSgmt::CmdSub {
117            tokens,
118            closed: false,
119        }
120    }
121
122    fn double_quoted(&mut self) -> WordSgmt {
123        assert!(self.bump().is_some_and(|c| c == '"'));
124
125        let mut sgmts = Vec::new();
126
127        while let Some(c) = self.peek() {
128            let sgmt_start = self.position();
129
130            let sgmt = if let Some(lit) = self.lit("\\\"$`", "$`\"") {
131                WordSgmt::Lit(lit)
132            } else {
133                match c {
134                    // Reached the end of the string.
135                    '"' => break,
136                    '$' => {
137                        self.bump();
138
139                        match self.peek() {
140                            // This means that the string is unclosed so we don't know
141                            // what the dollar is supposed to be. Anyway...
142                            None => WordSgmt::Lit('$'.into()),
143                            _ => todo!(),
144                        }
145                    }
146                    _ => todo!(),
147                }
148            };
149
150            sgmts.push(Spanned::new(sgmt, self.capture_span(sgmt_start)));
151        }
152
153        WordSgmt::DoubleQuoted {
154            sgmts,
155            closed: if let Some(c) = self.bump() {
156                assert!(c == '"');
157                true
158            } else {
159                self.diags.push(LexerDiagnostic::Unclosed(
160                    self.position(),
161                    "double quoted",
162                    '"',
163                ));
164                false
165            },
166        }
167    }
168
169    fn lit(&mut self, can_escape: &str, stop_with: &str) -> Option<String> {
170        let mut lit = String::new();
171
172        while let Some(c) = self.peek_bump(|c| !stop_with.contains(c)) {
173            match c {
174                '\\' => match self.bump() {
175                    Some('\n') | None => {
176                        // Either a line continuation or a lonely backslash. Either way,
177                        // don't add anything to the literal.
178                    }
179                    Some(c) => {
180                        if !can_escape.contains(c) {
181                            lit.push('\\');
182                        }
183                        lit.push(c);
184                    }
185                },
186                c => lit.push(c),
187            };
188        }
189
190        if lit.is_empty() {
191            None
192        } else {
193            Some(lit)
194        }
195    }
196
    /// Lexes a parameter expansion. Not implemented yet.
    ///
    /// # Panics
    ///
    /// Always panics via `todo!()`.
    fn param_expansion(&mut self) -> WordSgmt {
        todo!()
    }
200
201    fn single_quoted(&mut self) -> WordSgmt {
202        assert!(self.bump().is_some_and(|c| c == '\''));
203
204        let string = self.eat_while(|c| c != '\'');
205
206        WordSgmt::SingleQuoted {
207            string,
208            closed: if let Some(c) = self.bump() {
209                assert!(c == '\'');
210                true
211            } else {
212                self.diags.push(LexerDiagnostic::Unclosed(
213                    self.position(),
214                    "single quoted",
215                    '\'',
216                ));
217                false
218            },
219        }
220    }
221
222    fn token(&mut self) -> Option<Spanned<Token>> {
223        if let Some(c) = self.peek() {
224            let mut start = self.position();
225
226            let token = match c {
227                // Control operators:
228                '&' => Token::ControlOp(if self.peek_bump(|c| c == '&').is_some() {
229                    ControlOp::AndIf
230                } else {
231                    ControlOp::And
232                }),
233                ';' => Token::ControlOp(if self.peek_bump(|c| c == ';').is_some() {
234                    ControlOp::DSemi
235                } else {
236                    ControlOp::Semi
237                }),
238                '|' => Token::ControlOp(match self.peek_bump(|c| matches!(c, '&' | '|')) {
239                    Some('&') => ControlOp::OrAnd,
240                    Some('|') => ControlOp::OrIf,
241                    _ => ControlOp::Or,
242                }),
243                '\n' => Token::ControlOp(ControlOp::Newline),
244                '\r' if self.peek2().is_some_and(|c| c == '\n') => {
245                    // Special case for CRLF line endings, which we
246                    // leniently read as new lines.
247                    self.diags.push(LexerDiagnostic::CrLf(start));
248
249                    self.bump();
250                    start = self.position();
251
252                    Token::ControlOp(ControlOp::Newline)
253                }
254                // Redirection operators:
255                '<' => Token::RedirOp(match self.peek_bump(|c| matches!(c, '<' | '&' | '>')) {
256                    Some('<') => match self.peek_bump(|c| matches!(c, '_' | '<')) {
257                        Some('-') => RedirOp::DLessDash,
258                        Some('<') => RedirOp::TLess,
259                        _ => RedirOp::DLess,
260                    },
261                    Some('&') => RedirOp::LessAnd,
262                    Some('>') => RedirOp::LessGreat,
263                    _ => RedirOp::Less,
264                }),
265                '>' => Token::RedirOp(match self.peek_bump(|c| matches!(c, '|' | '>' | '&')) {
266                    Some('|') => RedirOp::Clobber,
267                    Some('>') => RedirOp::DGreat,
268                    Some('&') => RedirOp::GreatAnd,
269                    _ => RedirOp::Great,
270                }),
271                '#' => Token::Comment(self.eat_while(|c| c != '\n')),
272                '(' => {
273                    self.bump();
274                    Token::LParen
275                }
276                ')' => {
277                    self.bump();
278                    Token::RParen
279                }
280                _ => {
281                    if let Some(word) = self.word() {
282                        word
283                    } else {
284                        // This can happen if the word is just a line continuation.
285                        // Do check that we bumped the cursor.
286                        assert!(start < self.position());
287
288                        return None;
289                    }
290                }
291            };
292
293            Some(Spanned::new(token, self.capture_span(start)))
294        } else {
295            None
296        }
297    }
298
299    fn word(&mut self) -> Option<Token> {
300        let mut word = Vec::new();
301
302        while let Some(c) = self.peek() {
303            let sgmt_start = self.position();
304
305            let sgmt = match c {
306                '"' => self.double_quoted(),
307                '\'' => self.single_quoted(),
308                '$' => {
309                    self.bump();
310
311                    match self.peek() {
312                        Some('"') => self.double_quoted(),
313                        Some('\'') => self.single_quoted(),
314                        Some('(') => self.cmd_sub_or_arith(),
315                        Some(_) => self.param_expansion(),
316                        None => {
317                            // This is technically undefined behavior, but we'll just treat it as
318                            // a literal dollar.
319                            WordSgmt::Lit('$'.into())
320                        }
321                    }
322                }
323                _ => {
324                    if let Some(lit) = self.lit("|&;<>()$`\\\"' \t\n", "#|&;<>()$`\"' \t\r\n") {
325                        WordSgmt::Lit(lit)
326                    } else {
327                        break;
328                    }
329                }
330            };
331
332            word.push(Spanned::new(sgmt, self.capture_span(sgmt_start)));
333        }
334
335        if word.is_empty() {
336            None
337        } else {
338            Some(Token::Word(word))
339        }
340    }
341    // endregion
342
343    // region: "Cursor" utilities.
    /// Consumes and returns the next character, or `None` at the end of the
    /// input.
    fn bump(&mut self) -> Option<char> {
        self.chars.next()
    }
347
348    fn eat_while(&mut self, condition: impl Fn(char) -> bool) -> String {
349        let mut eaten = String::new();
350
351        while let Some(c) = self.peek_bump(&condition) {
352            eaten.push(c);
353        }
354
355        eaten
356    }
357
358    fn peek(&self) -> Option<char> {
359        self.chars.clone().next()
360    }
361
362    fn peek2(&self) -> Option<char> {
363        let mut chars = self.chars.clone();
364        chars.next();
365        chars.next()
366    }
367
368    fn peek_bump(&mut self, condition: impl Fn(char) -> bool) -> Option<char> {
369        let c = self.peek()?;
370
371        if condition(c) {
372            self.bump()
373        } else {
374            None
375        }
376    }
377
378    fn position(&self) -> usize {
379        self.source_len - self.chars.as_str().len()
380    }
381
382    fn capture_span(&self, start: usize) -> Span {
383        assert!(start < self.position());
384
385        Span::new(start, self.position())
386    }
387    // endregion
388}