1use std::{collections::VecDeque, path::PathBuf};
2
3use self::{
4 err::{ErrorKind, SyntaxError},
5 pos::Position,
6 token::*,
7};
8
9pub mod err;
10pub mod pos;
11#[cfg(test)]
12mod tests;
13pub mod token;
14
/// States of the tokenizer's state machine.
///
/// `Tokenizer::run` dispatches on the current state; the tokenizer starts in
/// [`State::AssignmentList`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between assignments: whitespace/newlines are skipped, `#` starts a comment and an
    /// identifier start character begins an assignment name.
    AssignmentList,
    /// Inside a `#` comment; everything up to the next newline is discarded.
    Comment,
    /// Reading the name on the left-hand side of `=`.
    AssignmentName,
    /// Reading the unquoted part of an assignment value; whitespace or a newline ends it.
    AssignmentValue,
    /// Just after a backslash in an unquoted value.
    AssignmentValueEscape,
    /// Inside `'...'`; every character is taken literally until the closing quote.
    SingleQuoted,
    /// Inside `"..."`; backslash escapes and `$` expansions are recognised.
    DoubleQuoted,
    /// Just after a backslash inside a double-quoted string.
    DoubleQuotedEscape,
    /// Just after a `$`; decides between `$NAME`, `${...}`, an unsupported form or a
    /// literal dollar sign.
    Dollar,
    /// Reading the name of a `$NAME` expansion.
    SimpleExpansion,
    /// Just after `${`; expects the first character of the parameter name.
    ComplexExpansionStart,
    /// Reading the parameter name inside `${NAME...`.
    ComplexExpansion,
    /// Just after the `:` in `${NAME:`; expects the operator character.
    ExpansionOperator,
    /// Reading the word that follows the expansion operator, up to the closing `}`.
    ExpansionValue,
    /// Just after a backslash inside an expansion value.
    ExpansionValueEscape,
}
33
/// Item type yielded by the tokenizer's `Iterator` implementation: a token or a syntax error.
pub type TokenizerResult = Result<Token, SyntaxError>;
35
/// Returns `true` for the characters treated as whitespace or newline: space, tab and
/// line feed.
#[inline(always)]
fn is_wsnl(ch: char) -> bool {
    ch == ' ' || ch == '\t' || ch == '\n'
}
40
/// Returns `true` if `ch` may begin an identifier: an ASCII letter or an underscore.
#[inline(always)]
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
}
45
/// Returns `true` if `ch` may appear inside an identifier: an ASCII letter, an ASCII
/// digit or an underscore.
#[inline(always)]
fn is_identifier_char(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
}
50
/// Returns `true` for the shell control characters `|`, `&`, `;`, `<`, `>`, `(` and `)`.
#[inline(always)]
fn is_shell_special_char(ch: char) -> bool {
    "|&;<>()".contains(ch)
}
55
/// Returns `true` if `ch` names a shell special or positional parameter: an ASCII digit
/// or one of `@`, `*`, `#`, `?`, `$`, `!`, `-`.
#[inline(always)]
fn is_shell_special_param(ch: char) -> bool {
    "@*#?$!-".contains(ch) || ch.is_ascii_digit()
}
60
/// Returns `true` if a backslash before `ch` forms an escape sequence that yields `ch`
/// alone: `"`, `$`, a backtick or another backslash.
#[inline(always)]
fn is_dq_escape(ch: char) -> bool {
    ['"', '$', '`', '\\'].contains(&ch)
}
65
/// Returns `true` if `ch` is one of the parameter-expansion operator characters
/// `-`, `=`, `+` or `?`.
#[inline(always)]
fn is_operator(ch: char) -> bool {
    "-=+?".contains(ch)
}
70
/// Streaming tokenizer that turns a sequence of characters into [`Token`]s.
///
/// Tokens are obtained through the `Iterator` implementation, which yields
/// [`TokenizerResult`] items.
#[derive(Debug)]
pub struct Tokenizer<I>
where
    I: Iterator<Item = char>,
{
    /// Source of characters.
    input: I,
    /// Optional file name, attached to every syntax error that is built.
    filename: Option<PathBuf>,
    /// Set once the end-of-file token has been queued.
    done: bool,
    /// Current state of the state machine.
    state: State,
    /// Stack of states to resume once a nested construct (quote, expansion) ends.
    return_states: VecDeque<State>,
    /// Tokens already produced but not yet handed out by `next`.
    queue: VecDeque<Token>,
    /// Text accumulated for the token currently being built.
    buf: String,
    /// Position recorded when the first character was pushed into an empty `buf`.
    buf_pos: Position,
    /// Most recently consumed character; `None` once the input is exhausted.
    cc: Option<char>,
    /// When set, the next consume returns `cc` again instead of advancing the input.
    reconsume: bool,
    /// Current line; starts at 1 and is incremented for every `'\n'` consumed.
    line: usize,
    /// Current column; starts at 0, is incremented for every other character consumed
    /// and is reset to 0 by a newline.
    column: usize,
    /// Position recorded when a single-quoted string is entered; used to report an
    /// unterminated one.
    single_quote_pos: Position,
    /// Positions recorded for the double quotes that are currently open.
    quoting_stack: VecDeque<Position>,
    /// Positions recorded for the `${` expansions that are currently open.
    expansion_stack: VecDeque<Position>,
}
92
93impl<I> Iterator for Tokenizer<I>
94where
95 I: Iterator<Item = char>,
96{
97 type Item = TokenizerResult;
98
99 fn next(&mut self) -> Option<Self::Item> {
100 if self.done {
101 if !self.queue.is_empty() {
102 Some(Ok(self.queue.pop_front().unwrap()))
103 } else {
104 None
105 }
106 } else {
107 while self.queue.is_empty() {
108 if let Err(e) = self.run() {
109 return Some(Err(e));
110 }
111 }
112 Some(Ok(self.queue.pop_front().unwrap()))
113 }
114 }
115}
116
117impl<I> Tokenizer<I>
118where
119 I: Iterator<Item = char>,
120{
121 pub fn new(input: I, filename: Option<PathBuf>) -> Self {
122 Self {
123 input,
124 filename,
125 done: false,
126 state: State::AssignmentList,
127 return_states: VecDeque::with_capacity(16),
128 queue: VecDeque::with_capacity(4),
129 buf: String::with_capacity(64),
130 buf_pos: Position::new(0, 0),
131 reconsume: false,
132 cc: None,
133 line: 1,
134 column: 0,
135 single_quote_pos: Position::new(0, 0),
136 quoting_stack: VecDeque::with_capacity(8),
137 expansion_stack: VecDeque::with_capacity(8),
138 }
139 }
140
    /// Performs one step of the state machine.
    ///
    /// Depending on the current state this consumes one character (or, for the comment
    /// and quoted-string states, as many as it takes to leave the state), buffers text,
    /// queues tokens and switches state.
    ///
    /// # Errors
    ///
    /// Returns a `SyntaxError` when the input is invalid in the current state, e.g. a NUL
    /// character, an unsupported shell construct, an unexpected character or an
    /// unterminated quote/expansion.
    // Arms wrap calls to unit-returning helpers in `Ok(...)`, which is what clippy's
    // `unit_arg` lint complains about.
    #[allow(clippy::unit_arg)]
    fn run(&mut self) -> Result<(), SyntaxError> {
        match self.state {
            // Between assignments: skip blanks, start a comment or an assignment name.
            State::AssignmentList => match self.consume_the_next_character() {
                None => Ok(self.emit_eof()),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_wsnl(c) => Ok(()),
                Some('#') => Ok(self.switch_to(State::Comment)),
                Some(c) if is_identifier_start(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::AssignmentName))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Discard everything up to and including the end of the line.
            State::Comment => loop {
                match self.consume_the_next_character() {
                    None => return Ok(self.emit_eof()),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('\n') => return Ok(self.switch_to(State::AssignmentList)),
                    Some(_) => (),
                };
            },
            // Collect the name; `=` completes it and emits the `Assign` token.
            State::AssignmentName => match self.consume_the_next_character() {
                None => self.err_eof(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('=') => {
                    self.flush_buffer(TokenKind::Assign);
                    Ok(self.switch_to(State::AssignmentValue))
                }
                Some(c) if is_identifier_char(c) => {
                    self.buffer(c);
                    Ok(())
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Unquoted value text. Quotes and `$` push this state as the return state
            // so that the value continues once the nested construct is closed.
            State::AssignmentValue => match self.consume_the_next_character() {
                None => {
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.emit_eof())
                }
                Some('\0') => self.err(ErrorKind::NullCharacter),
                // Whitespace or a newline ends the value.
                Some(c) if is_wsnl(c) => {
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.switch_to(State::AssignmentList))
                }
                Some('\\') => Ok(self.switch_to(State::AssignmentValueEscape)),
                Some('\'') => {
                    self.single_quote_pos = self.cur_pos();
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::SingleQuoted))
                }
                Some('"') => {
                    self.quoting_stack.push_back(self.cur_pos());
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                Some('$') => {
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::Dollar))
                }
                Some('`') => self.err(ErrorKind::UnsupportedCommandExpansion),
                Some(c) if is_shell_special_char(c) => {
                    self.err(ErrorKind::UnescapedSpecialCharacter(c))
                }
                Some(c) => Ok(self.buffer(c)),
            },
            // Character following a backslash in an unquoted value.
            State::AssignmentValueEscape => match self.consume_the_next_character() {
                // A trailing backslash at end of input is kept literally.
                None => {
                    self.buffer('\\');
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.emit_eof())
                }
                Some('\0') => self.err(ErrorKind::NullCharacter),
                // Backslash-newline: both characters are dropped.
                Some('\n') => Ok(self.switch_to(State::AssignmentValue)),
                // Any other escaped character is taken literally, without the backslash.
                Some(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::AssignmentValue))
                }
            },
            // Everything up to the closing `'` is literal text.
            State::SingleQuoted => loop {
                match self.consume_the_next_character() {
                    None => return self.unterminated_single_quote(),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('\'') => return Ok(self.switch_to_return_state()),
                    Some(c) => self.buffer(c),
                };
            },
            // Literal text until the closing `"`, a backslash or a `$`.
            State::DoubleQuoted => loop {
                match self.consume_the_next_character() {
                    None => return self.unterminated_double_quote(),
                    Some('\0') => return self.err(ErrorKind::NullCharacter),
                    Some('`') => return self.err(ErrorKind::UnsupportedCommandExpansion),
                    Some('"') => {
                        self.quoting_stack.pop_back();
                        return Ok(self.switch_to_return_state());
                    }
                    Some('\\') => return Ok(self.switch_to(State::DoubleQuotedEscape)),
                    Some('$') => {
                        self.return_states.push_back(self.state);
                        return Ok(self.switch_to(State::Dollar));
                    }
                    Some(c) => self.buffer(c),
                };
            },
            // Character following a backslash inside double quotes.
            State::DoubleQuotedEscape => match self.consume_the_next_character() {
                None => self.unterminated_double_quote(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                // Backslash-newline: both characters are dropped.
                Some('\n') => Ok(self.switch_to(State::DoubleQuoted)),
                // Recognised escape: keep only the escaped character.
                Some(c) if is_dq_escape(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                // Not an escape sequence: the backslash itself is kept.
                Some(c) => {
                    self.buffer('\\');
                    self.buffer(c);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
            },
            // Character following `$`.
            State::Dollar => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_shell_special_param(c) => {
                    self.err(ErrorKind::UnsupportedShellParameter(format!("${}", c)))
                }
                Some('(') => self.err(ErrorKind::UnsupportedCommandOrArithmeticExpansion),
                // `${`: flush the text collected so far and start a complex expansion.
                Some('{') => {
                    self.expansion_stack.push_back(self.cur_pos());
                    self.flush_buffer(TokenKind::Characters);
                    Ok(self.switch_to(State::ComplexExpansionStart))
                }
                // `$NAME`: flush the text collected so far and start collecting the name.
                Some(c) if is_identifier_char(c) => {
                    self.flush_buffer(TokenKind::Characters);
                    self.buffer(c);
                    Ok(self.switch_to(State::SimpleExpansion))
                }
                // Not an expansion: keep the `$` as text and let the return state handle
                // the character (or end of input) that followed it.
                Some(_) | None => {
                    self.buffer('$');
                    Ok(self.reconsume_in_return_state())
                }
            },
            // Name of a `$NAME` expansion; the first non-identifier character (or end of
            // input) ends it and is reconsumed by the return state.
            State::SimpleExpansion => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_identifier_char(c) => Ok(self.buffer(c)),
                _ => {
                    self.flush_buffer(TokenKind::SimpleExpansion);
                    Ok(self.reconsume_in_return_state())
                }
            },
            // First character after `${`.
            State::ComplexExpansionStart => match self.consume_the_next_character() {
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_shell_special_param(c) => {
                    self.err(ErrorKind::UnsupportedShellParameter(format!("${{{}}}", c)))
                }
                Some(c) if is_identifier_start(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::ComplexExpansion))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
                None => self.err_eof(),
            },
            // Parameter name inside `${NAME...`.
            State::ComplexExpansion => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                // `${NAME}` is emitted as a simple expansion.
                Some('}') => {
                    self.expansion_stack.pop_back();
                    self.flush_buffer(TokenKind::SimpleExpansion);
                    Ok(self.switch_to_return_state())
                }
                Some(c) if is_identifier_char(c) => Ok(self.buffer(c)),
                // `:` begins a two-character operator; it is buffered so that the
                // operator token contains both characters.
                Some(':') => {
                    self.flush_buffer(TokenKind::StartExpansion);
                    self.buffer(':');
                    Ok(self.switch_to(State::ExpansionOperator))
                }
                // Single-character operator.
                Some(c) if is_operator(c) => {
                    self.flush_buffer(TokenKind::StartExpansion);
                    self.emit(TokenKind::ExpansionOperator, c.to_string());
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Second character of a `:`-prefixed operator.
            State::ExpansionOperator => match self.consume_the_next_character() {
                None => self.err_eof(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some(c) if is_operator(c) => {
                    self.buffer(c);
                    self.flush_buffer(TokenKind::ExpansionOperator);
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => self.err(ErrorKind::InvalidCharacter(c)),
            },
            // Word following the operator, up to the closing `}`.
            State::ExpansionValue => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                Some('`') => self.err(ErrorKind::UnsupportedCommandExpansion),
                Some('}') => {
                    self.expansion_stack.pop_back();
                    self.flush_buffer(TokenKind::Characters);
                    self.emit(TokenKind::EndExpansion, "}".to_string());
                    Ok(self.switch_to_return_state())
                }
                Some('\\') => Ok(self.switch_to(State::ExpansionValueEscape)),
                Some('$') => {
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::Dollar))
                }
                Some('"') => {
                    self.quoting_stack.push_back(self.cur_pos());
                    self.return_states.push_back(self.state);
                    Ok(self.switch_to(State::DoubleQuoted))
                }
                Some('\'') => {
                    // While a double quote is open, a single quote is plain text.
                    if !self.quoting_stack.is_empty() {
                        self.buffer('\'');
                        Ok(())
                    } else {
                        self.single_quote_pos = self.cur_pos();
                        self.return_states.push_back(self.state);
                        Ok(self.switch_to(State::SingleQuoted))
                    }
                }
                Some(c) => Ok(self.buffer(c)),
            },
            // Character following a backslash inside an expansion value.
            State::ExpansionValueEscape => match self.consume_the_next_character() {
                None => self.unterminated_expansion(),
                Some('\0') => self.err(ErrorKind::NullCharacter),
                // Backslash-newline: both characters are dropped.
                Some('\n') => Ok(self.switch_to(State::ExpansionValue)),
                Some(c) if is_dq_escape(c) => {
                    self.buffer(c);
                    Ok(self.switch_to(State::ExpansionValue))
                }
                Some(c) => {
                    // The backslash is only kept while a double quote is open.
                    if !self.quoting_stack.is_empty() {
                        self.buffer('\\');
                    }
                    self.buffer(c);
                    Ok(self.switch_to(State::ExpansionValue))
                }
            },
        }
    }
381
    /// Makes `state` the current state; the next step of the machine runs in it.
    fn switch_to(&mut self, state: State) {
        self.state = state;
    }
385
386 fn switch_to_return_state(&mut self) {
387 self.state = self.return_states.pop_back().unwrap();
388 }
389
390 fn reconsume_in(&mut self, state: State) {
391 self.reconsume = true;
392 self.state = state;
393 }
394
395 fn reconsume_in_return_state(&mut self) {
396 let state = self.return_states.pop_back().unwrap();
397 self.reconsume_in(state);
398 }
399
400 fn consume_the_next_character(&mut self) -> Option<char> {
401 if self.reconsume {
402 self.reconsume = false;
403 } else {
404 self.cc = self.input.next().map(|c| {
405 if c == '\n' {
406 self.line += 1;
407 self.column = 0;
408 } else {
409 self.column += 1;
410 }
411 c
412 });
413 }
414 self.cc
415 }
416
417 fn emit(&mut self, kind: TokenKind, value: String) {
418 self.queue
419 .push_back(Token::new(kind, value, self.cur_pos()))
420 }
421
422 fn emit_eof(&mut self) {
423 let pos = Position::new(self.line, self.column + 1);
424 self.queue
425 .push_back(Token::new(TokenKind::Eof, "".to_string(), pos));
426 self.done = true;
427 }
428
429 fn flush_buffer(&mut self, kind: TokenKind) {
430 if !self.buf.is_empty() {
431 self.queue
432 .push_back(Token::new(kind, self.buf.clone(), self.buf_pos));
433 self.buf.clear();
434 }
435 }
436
437 fn buffer(&mut self, c: char) {
438 if self.buf.is_empty() {
439 self.buf_pos = self.cur_pos();
440 }
441 self.buf.push(c);
442 }
443
    /// Returns the current line and column as a `Position`.
    fn cur_pos(&self) -> Position {
        Position::new(self.line, self.column)
    }
447
448 fn err<T>(&self, kind: ErrorKind) -> Result<T, SyntaxError> {
449 Err(SyntaxError::new(
450 kind,
451 self.cur_pos(),
452 self.filename.clone(),
453 ))
454 }
455
456 fn err_at(&self, kind: ErrorKind, pos: Position) -> Result<(), SyntaxError> {
457 Err(SyntaxError::new(kind, pos, self.filename.clone()))
458 }
459
460 fn err_eof(&self) -> Result<(), SyntaxError> {
461 self.err_at(ErrorKind::Eof, Position::new(self.line, self.column + 1))
462 }
463
464 fn unterminated_single_quote(&mut self) -> Result<(), SyntaxError> {
465 self.err_at(
466 ErrorKind::UnterminatedSingleQuotedString,
467 self.single_quote_pos,
468 )
469 }
470
471 fn unterminated_double_quote(&mut self) -> Result<(), SyntaxError> {
472 let pos = self.quoting_stack.pop_back().unwrap();
473 self.err_at(ErrorKind::UnterminatedDoubleQuotedString, pos)
474 }
475
476 fn unterminated_expansion(&mut self) -> Result<(), SyntaxError> {
477 let pos = self.expansion_stack.pop_back().unwrap();
478 self.err_at(ErrorKind::UnterminatedExpansion, pos)
479 }
480}