potenv/tokenizer/
mod.rs

1use std::{collections::VecDeque, path::PathBuf};
2
3use self::{
4    err::{ErrorKind, SyntaxError},
5    pos::Position,
6    token::*,
7};
8
9pub mod err;
10pub mod pos;
11#[cfg(test)]
12mod tests;
13pub mod token;
14
/// States of the tokenizer's state machine; `Tokenizer::run` dispatches on the
/// current one for every step.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between assignments: whitespace and newlines are skipped, `#` opens a
    /// comment and an identifier-start character begins an assignment name.
    AssignmentList,
    /// Inside a `#` comment, which runs up to the next newline.
    Comment,
    /// Accumulating the name of an assignment, up to the `=` sign.
    AssignmentName,
    /// Inside the (unquoted part of the) value that follows `=`.
    AssignmentValue,
    /// Directly after a backslash inside an unquoted assignment value.
    AssignmentValueEscape,
    /// Inside a `'...'` string.
    SingleQuoted,
    /// Inside a `"..."` string.
    DoubleQuoted,
    /// Directly after a backslash inside a double-quoted string.
    DoubleQuotedEscape,
    /// Directly after a `$`, before it is known what kind of expansion follows.
    Dollar,
    /// Accumulating the name of a `$NAME` expansion.
    SimpleExpansion,
    /// Directly after `${`, expecting the first character of the name.
    ComplexExpansionStart,
    /// Accumulating the name inside `${...}`.
    ComplexExpansion,
    /// After the `:` of a two-character expansion operator, expecting the
    /// operator character itself.
    ExpansionOperator,
    /// Inside the operand that follows an expansion operator, up to `}`.
    ExpansionValue,
    /// Directly after a backslash inside an expansion operand.
    ExpansionValueEscape,
}
33
/// Item type yielded by the [`Tokenizer`] iterator: either the next [`Token`]
/// or the [`SyntaxError`] raised while trying to produce it.
pub type TokenizerResult = Result<Token, SyntaxError>;
35
/// Returns `true` for the characters treated as blank by the tokenizer:
/// space, horizontal tab and line feed.
#[inline(always)]
fn is_wsnl(ch: char) -> bool {
    ch == ' ' || ch == '\t' || ch == '\n'
}
40
/// Returns `true` if `ch` may begin an identifier: an ASCII letter or `_`.
#[inline(always)]
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
45
/// Returns `true` if `ch` may appear inside an identifier: an ASCII letter,
/// an ASCII digit or `_`.
#[inline(always)]
fn is_identifier_char(ch: char) -> bool {
    ch == '_' || ch.is_ascii_digit() || ch.is_ascii_alphabetic()
}
50
/// Returns `true` for the shell control characters `|`, `&`, `;`, `<`, `>`,
/// `(` and `)`.
#[inline(always)]
fn is_shell_special_char(ch: char) -> bool {
    "|&;<>()".contains(ch)
}
55
/// Returns `true` if `ch` names a shell special or positional parameter:
/// an ASCII digit or one of `@`, `*`, `#`, `?`, `$`, `!`, `-`.
#[inline(always)]
fn is_shell_special_param(ch: char) -> bool {
    matches!(ch, '0'..='9' | '@' | '*' | '#' | '?' | '$' | '!' | '-')
}
60
/// Returns `true` if `ch` is one of the characters recognised after a
/// backslash as an escape: `"`, `$`, a backtick or `\`.
#[inline(always)]
fn is_dq_escape(ch: char) -> bool {
    ['"', '$', '`', '\\'].contains(&ch)
}
65
/// Returns `true` if `ch` is a parameter-expansion operator character:
/// `-`, `=`, `+` or `?`.
#[inline(always)]
fn is_operator(ch: char) -> bool {
    ['-', '=', '+', '?'].iter().any(|&op| op == ch)
}
70
/// Character-driven tokenizer, used as an iterator of [`TokenizerResult`]s.
///
/// Characters are pulled from `input` one at a time and fed through a state
/// machine (see `State`); finished tokens are queued and handed out by the
/// `Iterator` implementation.
#[derive(Debug)]
pub struct Tokenizer<I>
where
    I: Iterator<Item = char>,
{
    /// Source of characters.
    input: I,
    /// Optional file name, attached to every `SyntaxError` that is created.
    filename: Option<PathBuf>,
    /// Set once the end-of-file token has been queued.
    done: bool,
    /// Current state of the state machine.
    state: State,
    /// Stack of states to go back to when a nested construct (quoted string,
    /// expansion) ends.
    return_states: VecDeque<State>,
    /// Tokens that have been produced but not yet yielded.
    queue: VecDeque<Token>,
    /// Text accumulated for the token currently being built.
    buf: String,
    /// Position recorded when the first character was pushed into `buf`.
    buf_pos: Position,
    /// Most recently consumed character (`None` once the input is exhausted).
    cc: Option<char>,
    /// When set, the next consume step returns `cc` again instead of
    /// advancing the input.
    reconsume: bool,
    /// Current line; starts at 1 and is incremented on every `'\n'`.
    line: usize,
    /// Current column; reset to 0 on `'\n'`, incremented for any other
    /// character.
    column: usize,
    /// Position of the opening quote of the single-quoted string being read,
    /// used to report an unterminated string.
    single_quote_pos: Position,
    /// Positions of the opening quotes of the double-quoted strings that are
    /// currently open.
    quoting_stack: VecDeque<Position>,
    /// Positions recorded for the `${` expansions that are currently open.
    expansion_stack: VecDeque<Position>,
}
92
93impl<I> Iterator for Tokenizer<I>
94where
95    I: Iterator<Item = char>,
96{
97    type Item = TokenizerResult;
98
99    fn next(&mut self) -> Option<Self::Item> {
100        if self.done {
101            if !self.queue.is_empty() {
102                Some(Ok(self.queue.pop_front().unwrap()))
103            } else {
104                None
105            }
106        } else {
107            while self.queue.is_empty() {
108                if let Err(e) = self.run() {
109                    return Some(Err(e));
110                }
111            }
112            Some(Ok(self.queue.pop_front().unwrap()))
113        }
114    }
115}
116
117impl<I> Tokenizer<I>
118where
119    I: Iterator<Item = char>,
120{
121    pub fn new(input: I, filename: Option<PathBuf>) -> Self {
122        Self {
123            input,
124            filename,
125            done: false,
126            state: State::AssignmentList,
127            return_states: VecDeque::with_capacity(16),
128            queue: VecDeque::with_capacity(4),
129            buf: String::with_capacity(64),
130            buf_pos: Position::new(0, 0),
131            reconsume: false,
132            cc: None,
133            line: 1,
134            column: 0,
135            single_quote_pos: Position::new(0, 0),
136            quoting_stack: VecDeque::with_capacity(8),
137            expansion_stack: VecDeque::with_capacity(8),
138        }
139    }
140
    /// Performs one step of the state machine for the current state.
    ///
    /// Most states consume a single character per call; `Comment`,
    /// `SingleQuoted` and `DoubleQuoted` loop internally until they leave the
    /// state. A step may queue zero or more tokens, change `self.state`, or
    /// return a `SyntaxError`.
    // `unit_arg` is allowed because the arms deliberately wrap unit-returning
    // helper calls, e.g. `Ok(self.emit_eof())`.
    #[allow(clippy::unit_arg)]
    fn run(&mut self) -> Result<(), SyntaxError> {
        match self.state {
            // Between assignments: skip blanks, start a comment or a name.
            State::AssignmentList => match self.consume_the_next_character() {
                None => Ok(self.emit_eof()),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_wsnl(c) => Ok(()),
                Some('#') => Ok(self.switch_to(State::Comment)),
                Some(c) if is_identifier_start(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::AssignmentName))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Discard everything up to the end of the line.
            State::Comment => loop {
                match self.consume_the_next_character() {
                    None => return Ok(self.emit_eof()),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('\n') => return Ok(self.switch_to(State::AssignmentList)),
                    Some(_) => (),
                };
            },
            // Collect the name; `=` emits it as an `Assign` token.
            State::AssignmentName => match self.consume_the_next_character() {
                None => self.err_eof(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('=') => {
                    self.flush_buffer(TokenKind::Assign);
                    Ok(self.switch_to(State::AssignmentValue))
                }
                Some(c) if is_identifier_char(c) => {
                    self.buffer(c);
                    Ok(())
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Unquoted value text. Quotes and `$` push this state so that the
            // nested construct can return here; a blank ends the value.
            State::AssignmentValue => match self.consume_the_next_character() {
                None => {
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.emit_eof())
                }
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_wsnl(c) => {
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.switch_to(State::AssignmentList))
                }
                Some('\\') => Ok(self.switch_to(State::AssignmentValueEscape)),
                Some('\'') => {
                    self.single_quote_pos = self.cur_pos();
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::SingleQuoted))
                }
                Some('"') => {
                    self.quoting_stack.push_back(self.cur_pos());
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                Some('$') => {
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::Dollar))
                }
                Some('`') => self.err(ErrorKind::UnsupportedCommandExpansion),
                Some(c) if is_shell_special_char(c) => {
                    self.err(ErrorKind::UnescapedSpecialCharacter(c))
                }
                Some(c) => Ok(self.buffer(c)),
            },
            // After `\` in an unquoted value: backslash-newline is dropped,
            // a trailing backslash at end of input is kept literally, any
            // other character is taken literally.
            State::AssignmentValueEscape => match self.consume_the_next_character() {
                None => {
                    self.buffer('\\');
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.emit_eof())
                }
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('\n') => Ok(self.switch_to(State::AssignmentValue)),
                Some(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::AssignmentValue))
                }
            },
            // Everything up to the closing `'` is literal.
            State::SingleQuoted => loop {
                match self.consume_the_next_character() {
                    None => return self.unterminated_single_quote(),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('\'') => return Ok(self.switch_to_return_state()),
                    Some(c) => self.buffer(c),
                };
            },
            // Literal text, except for `\`, `$`, a backtick and the closing `"`.
            State::DoubleQuoted => loop {
                match self.consume_the_next_character() {
                    None => return self.unterminated_double_quote(),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('`') => return self.err(ErrorKind::UnsupportedCommandExpansion),
                    Some('"') => {
                        self.quoting_stack.pop_back();
                        return Ok(self.switch_to_return_state());
                    }
                    Some('\\') => return Ok(self.switch_to(State::DoubleQuotedEscape)),
                    Some('$') => {
                        self.return_states.push_back(self.state);
                        return Ok(self.switch_to(State::Dollar));
                    }
                    Some(c) => self.buffer(c),
                };
            },
            // After `\` inside double quotes: backslash-newline is dropped;
            // the backslash is removed only before an escapable character,
            // otherwise both the backslash and the character are kept.
            State::DoubleQuotedEscape => match self.consume_the_next_character() {
                None => self.unterminated_double_quote(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('\n') => Ok(self.switch_to(State::DoubleQuoted)),
                Some(c) if is_dq_escape(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                Some(c) => {
                    self.buffer('\\');
                    self.buffer(c);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
            },
            // After `$`: decide between `${...}`, `$NAME`, an unsupported
            // form, or a literal dollar sign.
            State::Dollar => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_shell_special_param(c) => {
                    self.err(ErrorKind::UnsupportedShellParameter(format!("${}", c)))
                }
                Some('(') => self.err(ErrorKind::UnsupportedCommandOrArithmeticExpansion),
                Some('{') => {
                    self.expansion_stack.push_back(self.cur_pos());
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.switch_to(State::ComplexExpansionStart))
                }
                Some(c) if is_identifier_char(c) => {
                    self.flush_buffer(TokenKind::Characters);
                    self.buffer(c);
                    Ok(self.switch_to(State::SimpleExpansion))
                }
                // Not an expansion: keep the `$` as text and let the
                // enclosing state handle the character just read.
                Some(_) | None => {
                    self.buffer('$');
                    Ok(self.reconsume_in_return_state())
                }
            },
            // `$NAME`: the first non-identifier character (or end of input)
            // ends the name and is handed back to the enclosing state.
            State::SimpleExpansion => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_identifier_char(c) => Ok(self.buffer(c)),
                _ => {
                    self.flush_buffer(TokenKind::SimpleExpansion);
                    Ok(self.reconsume_in_return_state())
                }
            },
            // First character after `${` must start an identifier.
            State::ComplexExpansionStart => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_shell_special_param(c) => {
                    self.err(ErrorKind::UnsupportedShellParameter(format!("${{{}}}", c)))
                }
                Some(c) if is_identifier_start(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::ComplexExpansion))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
                None => self.err_eof(),
            },
            // Name inside `${...}`: `}` makes it a simple expansion, `:` or
            // an operator character starts an expansion with an operand.
            State::ComplexExpansion => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('}') => {
                    self.expansion_stack.pop_back();
                    self.flush_buffer(TokenKind::SimpleExpansion);
                    Ok(self.switch_to_return_state())
                }
                Some(c) if is_identifier_char(c) => Ok(self.buffer(c)),
                Some(':') => {
                    self.flush_buffer(TokenKind::StartExpansion);
                    // The `:` becomes the first character of the operator token.
                    self.buffer(':');
                    Ok(self.switch_to(State::ExpansionOperator))
                }
                Some(c) if is_operator(c) => {
                    self.flush_buffer(TokenKind::StartExpansion);
                    self.emit(TokenKind::ExpansionOperator, c.to_string());
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Second character of a `:`-prefixed operator.
            State::ExpansionOperator => match self.consume_the_next_character() {
                None => self.err_eof(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_operator(c) => {
                    self.buffer(c);
                    self.flush_buffer(TokenKind::ExpansionOperator);
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Operand of an expansion, up to the closing `}`.
            State::ExpansionValue => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('`') => self.err(ErrorKind::UnsupportedCommandExpansion),
                Some('}') => {
                    self.expansion_stack.pop_back();
                    self.flush_buffer(TokenKind::Characters);
                    self.emit(TokenKind::EndExpansion, "}".to_string());
                    Ok(self.switch_to_return_state())
                }
                Some('\\') => Ok(self.switch_to(State::ExpansionValueEscape)),
                Some('$') => {
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::Dollar))
                }
                Some('"') => {
                    self.quoting_stack.push_back(self.cur_pos());
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                Some('\'') => {
                    // While a double-quoted string is open, `'` is an
                    // ordinary character; otherwise it opens a single-quoted
                    // string.
                    if !self.quoting_stack.is_empty() {
                        self.buffer('\'');
                        Ok(())
                    } else {
                        self.single_quote_pos = self.cur_pos();
                        self.return_states.push_back(self.state);
                        Ok(self.switch_to(State::SingleQuoted))
                    }
                }
                Some(c) => Ok(self.buffer(c)),
            },
            // After `\` inside an expansion operand.
            State::ExpansionValueEscape => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('\n') => Ok(self.switch_to(State::ExpansionValue)),
                Some(c) if is_dq_escape(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => {
                    // The backslash is kept only while a double-quoted
                    // string is open.
                    if !self.quoting_stack.is_empty() {
                        self.buffer('\\');
                    }
                    self.buffer(c);
                    Ok(self.switch_to(State::ExpansionValue))
                }
            },
        }
    }
381
382    fn switch_to(&mut self, state: State) {
383        self.state = state;
384    }
385
386    fn switch_to_return_state(&mut self) {
387        self.state = self.return_states.pop_back().unwrap();
388    }
389
390    fn reconsume_in(&mut self, state: State) {
391        self.reconsume = true;
392        self.state = state;
393    }
394
395    fn reconsume_in_return_state(&mut self) {
396        let state = self.return_states.pop_back().unwrap();
397        self.reconsume_in(state);
398    }
399
400    fn consume_the_next_character(&mut self) -> Option<char> {
401        if self.reconsume {
402            self.reconsume = false;
403        } else {
404            self.cc = self.input.next().map(|c| {
405                if c == '\n' {
406                    self.line += 1;
407                    self.column = 0;
408                } else {
409                    self.column += 1;
410                }
411                c
412            });
413        }
414        self.cc
415    }
416
417    fn emit(&mut self, kind: TokenKind, value: String) {
418        self.queue
419            .push_back(Token::new(kind, value, self.cur_pos()))
420    }
421
422    fn emit_eof(&mut self) {
423        let pos = Position::new(self.line, self.column + 1);
424        self.queue
425            .push_back(Token::new(TokenKind::Eof, "".to_string(), pos));
426        self.done = true;
427    }
428
429    fn flush_buffer(&mut self, kind: TokenKind) {
430        if !self.buf.is_empty() {
431            self.queue
432                .push_back(Token::new(kind, self.buf.clone(), self.buf_pos));
433            self.buf.clear();
434        }
435    }
436
437    fn buffer(&mut self, c: char) {
438        if self.buf.is_empty() {
439            self.buf_pos = self.cur_pos();
440        }
441        self.buf.push(c);
442    }
443
444    fn cur_pos(&self) -> Position {
445        Position::new(self.line, self.column)
446    }
447
448    fn err<T>(&self, kind: ErrorKind) -> Result<T, SyntaxError> {
449        Err(SyntaxError::new(
450            kind,
451            self.cur_pos(),
452            self.filename.clone(),
453        ))
454    }
455
456    fn err_at(&self, kind: ErrorKind, pos: Position) -> Result<(), SyntaxError> {
457        Err(SyntaxError::new(kind, pos, self.filename.clone()))
458    }
459
460    fn err_eof(&self) -> Result<(), SyntaxError> {
461        self.err_at(ErrorKind::Eof, Position::new(self.line, self.column + 1))
462    }
463
464    fn unterminated_single_quote(&mut self) -> Result<(), SyntaxError> {
465        self.err_at(
466            ErrorKind::UnterminatedSingleQuotedString,
467            self.single_quote_pos,
468        )
469    }
470
471    fn unterminated_double_quote(&mut self) -> Result<(), SyntaxError> {
472        let pos = self.quoting_stack.pop_back().unwrap();
473        self.err_at(ErrorKind::UnterminatedDoubleQuotedString, pos)
474    }
475
476    fn unterminated_expansion(&mut self) -> Result<(), SyntaxError> {
477        let pos = self.expansion_stack.pop_back().unwrap();
478        self.err_at(ErrorKind::UnterminatedExpansion, pos)
479    }
480}