conch_parser/
lexer.rs

1//! This module defines a lexer to recognize tokens of the shell language.
2
3use self::TokenOrLiteral::*;
4use std::iter::{Fuse, Peekable};
5use super::token::{Positional, Token};
6use super::token::Token::*;
7
8#[derive(PartialEq, Eq, Debug, Clone)]
9enum TokenOrLiteral {
10    Tok(Token),
11    Escaped(Option<Token>),
12    Lit(char),
13}
14
15/// Converts raw characters into shell tokens.
16#[must_use = "`Lexer` is lazy and does nothing unless consumed"]
17#[derive(Clone, Debug)]
18pub struct Lexer<I: Iterator<Item = char>> {
19    inner: Peekable<Fuse<I>>,
20    peeked: Option<TokenOrLiteral>,
21}
22
23impl<I: Iterator<Item = char>> Lexer<I> {
24    /// Creates a new Lexer from any char iterator.
25    pub fn new(iter: I) -> Lexer<I> {
26        Lexer {
27            inner: iter.fuse().peekable(),
28            peeked: None,
29        }
30    }
31
32    #[inline]
33    fn next_is(&mut self, c: char) -> bool {
34        let is = self.inner.peek() == Some(&c);
35        if is { self.inner.next(); }
36        is
37    }
38
39    fn next_internal(&mut self) -> Option<TokenOrLiteral> {
40        if self.peeked.is_some() {
41            return self.peeked.take();
42        }
43
44        let cur = match self.inner.next() {
45            Some(c) => c,
46            None => return None,
47        };
48
49        let tok = match cur {
50            '\n' => Newline,
51            '!' => Bang,
52            '~' => Tilde,
53            '#' => Pound,
54            '*' => Star,
55            '?' => Question,
56            '%' => Percent,
57            '-' => Dash,
58            '=' => Equals,
59            '+' => Plus,
60            ':' => Colon,
61            '@' => At,
62            '^' => Caret,
63            '/' => Slash,
64            ',' => Comma,
65
66            // Make sure that we treat the next token as a single character,
67            // preventing multi-char tokens from being recognized. This is
68            // important because something like `\&&` would mean that the
69            // first & is a literal while the second retains its properties.
70            // We will let the parser deal with what actually becomes a literal.
71            '\\' => return Some(Escaped(self.inner.next().and_then(|c| {
72                Lexer::new(::std::iter::once(c)).next()
73            }))),
74
75            '\'' => SingleQuote,
76            '"' => DoubleQuote,
77            '`' => Backtick,
78
79            ';' => if self.next_is(';') { DSemi } else { Semi },
80            '&' => if self.next_is('&') { AndIf } else { Amp  },
81            '|' => if self.next_is('|') { OrIf  } else { Pipe },
82
83            '(' => ParenOpen,
84            ')' => ParenClose,
85            '{' => CurlyOpen,
86            '}' => CurlyClose,
87            '[' => SquareOpen,
88            ']' => SquareClose,
89
90            '$' => {
91                // Positional parameters are 0-9, so we only
92                // need to check a single digit ahead.
93                let positional = match self.inner.peek() {
94                    Some(&'0') => Some(Positional::Zero),
95                    Some(&'1') => Some(Positional::One),
96                    Some(&'2') => Some(Positional::Two),
97                    Some(&'3') => Some(Positional::Three),
98                    Some(&'4') => Some(Positional::Four),
99                    Some(&'5') => Some(Positional::Five),
100                    Some(&'6') => Some(Positional::Six),
101                    Some(&'7') => Some(Positional::Seven),
102                    Some(&'8') => Some(Positional::Eight),
103                    Some(&'9') => Some(Positional::Nine),
104                    _ => None,
105                };
106
107                match positional {
108                    Some(p) => {
109                        self.inner.next(); // Consume the character we just peeked
110                        ParamPositional(p)
111                    },
112                    None => Dollar,
113                }
114            },
115
116            '<' => if self.next_is('<') {
117                if self.next_is('-') { DLessDash } else { DLess }
118            } else if self.next_is('&') {
119                LessAnd
120            } else if self.next_is('>') {
121                LessGreat
122            } else {
123                Less
124            },
125
126            '>' => if self.next_is('&') {
127                GreatAnd
128            } else if self.next_is('>') {
129                DGreat
130            } else if self.next_is('|') {
131                Clobber
132            } else {
133                Great
134            },
135
136            // Newlines are valid whitespace, however, we want to tokenize them separately!
137            c if c.is_whitespace() => {
138                let mut buf = String::new();
139                buf.push(c);
140
141                // NB: Can't use filter here because it will advance the iterator too far.
142                while let Some(&c) = self.inner.peek() {
143                    if c.is_whitespace() && c != '\n' {
144                        self.inner.next();
145                        buf.push(c);
146                    } else {
147                        break
148                    }
149                }
150
151                Whitespace(buf)
152            },
153
154            c => return Some(Lit(c)),
155        };
156
157        Some(Tok(tok))
158    }
159}
160
161impl<I: Iterator<Item = char>> Iterator for Lexer<I> {
162    type Item = Token;
163
164    fn next(&mut self) -> Option<Token> {
165        fn name_start_char(c: char) -> bool {
166            c == '_' || c.is_alphabetic()
167        }
168
169        fn is_digit(c: char) -> bool {
170            c.is_digit(10)
171        }
172
173        fn name_char(c: char) -> bool {
174            is_digit(c) || name_start_char(c)
175        }
176
177        match self.next_internal() {
178            None => None,
179            Some(Tok(t)) => Some(t),
180            Some(Escaped(t)) => {
181                debug_assert_eq!(self.peeked, None);
182                self.peeked = t.map(Tok);
183                Some(Backslash)
184            },
185
186            Some(Lit(c)) => {
187                let is_name = name_start_char(c);
188                let mut word = String::new();
189                word.push(c);
190
191                loop {
192                    match self.next_internal() {
193                        // If we hit a token, delimit the current word w/o losing the token
194                        Some(tok@Tok(_)) |
195                        Some(tok@Escaped(_)) => {
196                            debug_assert_eq!(self.peeked, None);
197                            self.peeked = Some(tok);
198                            break;
199                        },
200
201                        // Make sure we delimit valid names whenever a non-name char comes along
202                        Some(Lit(c)) if is_name && !name_char(c) => {
203                            debug_assert_eq!(self.peeked, None);
204                            self.peeked = Some(Lit(c));
205                            return Some(Name(word));
206                        },
207
208                        // Otherwise, keep consuming characters for the literal
209                        Some(Lit(c)) => word.push(c),
210
211                        None => break,
212                    }
213                }
214
215                if is_name {
216                    Some(Name(word))
217                } else {
218                    Some(Literal(word))
219                }
220            },
221        }
222    }
223
224    fn size_hint(&self) -> (usize, Option<usize>) {
225        // The number of actual tokens we yield will never exceed
226        // the amount of characters we are processing. In practice
227        // the caller will probably see a lot fewer tokens than
228        // number of characters processed, however, they can prepare
229        // themselves for the worst possible case. A high estimate
230        // is better than no estimate.
231        let (_, hi) = self.inner.size_hint();
232        let low = if self.peeked.is_some() { 1 } else { 0 };
233        (low, hi)
234    }
235}