1use std::borrow::Cow;
2use std::fmt::Display;
3use utf8_chars::BufReadCharsExt;
4
/// Reason why the tokenizer stopped producing the current token; some
/// variants mark boundaries that carry no token of their own (e.g. the
/// here-document body markers).
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The terminating character requested by the caller was reached.
    SpecifiedTerminatingChar,
    /// A blank (space or tab) ended the token.
    NonNewLineBlank,
    /// Marks the start of a here-document body.
    HereDocumentBodyStart,
    /// Marks the end of a here-document body.
    HereDocumentBodyEnd,
    /// Marks the here-document end tag token.
    HereDocumentEndTag,
    /// A new operator started, ending the current token.
    OperatorStart,
    /// The operator being accumulated ended.
    OperatorEnd,
    /// Some other reason.
    Other,
}
29
/// A position in the source input.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// 0-based index of the next character, counted in decoded characters.
    pub index: i32,
    /// 1-based line number.
    pub line: i32,
    /// 1-based column number.
    pub column: i32,
}
41
42impl Display for SourcePosition {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 f.write_fmt(format_args!("line {} col {}", self.line, self.column))
45 }
46}
47
/// The location of a token in its source input, as a start/end position pair.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// The position where the token starts.
    pub start: SourcePosition,
    /// The position just past where the token ends.
    pub end: SourcePosition,
}
57
/// A single token produced by the tokenizer.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token (e.g. `>>`, `|`, `\n`), with its source location.
    Operator(String, TokenLocation),
    /// A word token, with its source location.
    Word(String, TokenLocation),
}
67
68impl Token {
69 pub fn to_str(&self) -> &str {
71 match self {
72 Token::Operator(s, _) => s,
73 Token::Word(s, _) => s,
74 }
75 }
76
77 pub fn location(&self) -> &TokenLocation {
79 match self {
80 Token::Operator(_, l) => l,
81 Token::Word(_, l) => l,
82 }
83 }
84}
85
/// The result of a single tokenization step: why scanning stopped, plus the
/// token produced (if one was completed).
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Why the tokenizer stopped here.
    pub reason: TokenEndReason,
    /// The completed token, if any.
    pub token: Option<Token>,
}
94
/// An error arising during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// The input ended in the middle of an escape sequence.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// A single-quoted string (opened at the given position) was not closed.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// A double-quoted string (opened at the given position) was not closed.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// A backquoted command substitution was not closed.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An extended glob pattern (e.g. `@(...)`) was not closed.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// A `${...}` variable expression was not closed.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A `$(...)` command substitution was not closed.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input contained bytes that could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was seen without any pending here tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// A here-document operator was not followed by a tag.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// The input ended before all pending here-document bodies completed.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading the input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
147
148impl TokenizerError {
149 pub fn is_incomplete(&self) -> bool {
152 matches!(
153 self,
154 Self::UnterminatedEscapeSequence
155 | Self::UnterminatedSingleQuote(..)
156 | Self::UnterminatedDoubleQuote(..)
157 | Self::UnterminatedBackquote(..)
158 | Self::UnterminatedCommandSubstitution
159 | Self::UnterminatedVariable
160 | Self::UnterminatedExtendedGlob(..)
161 | Self::UnterminatedHereDocuments(..)
162 )
163 }
164}
165
/// A borrowed sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens themselves.
    pub tokens: &'a [Token],
}
172
/// The quoting mode in effect while scanning a token. The quoted variants
/// carry the position of the opening quote, used for error reporting.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not inside any quotes.
    None,
    /// Inside single quotes opened at the given position.
    Single(SourcePosition),
    /// Inside double quotes opened at the given position.
    Double(SourcePosition),
}
179
/// State machine tracking here-document progress across token boundaries.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is pending.
    #[default]
    None,
    /// The *next* token will be a here tag (a `<<` or `<<-` operator just ended).
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being accumulated is the here tag.
    CurrentTokenIsHereTag {
        /// Whether leading tabs in the body should be removed (`<<-`).
        remove_tabs: bool,
        /// The completed redirection-operator token, held so it can be
        /// re-queued ahead of the body later.
        operator_token_result: TokenizeResult,
    },
    /// The here-document body begins after the next unescaped newline.
    NextLineIsHereDoc,
    /// Currently consuming one or more here-document bodies.
    InHereDocs,
}
199
/// A pending here-document tag, plus the tokens queued around its body.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, stored with a trailing newline.
    tag: String,
    /// Whether the tag contained quoting characters; if so, the body's end
    /// is matched against the unquoted form of the tag.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs are stripped from body lines (`<<-`).
    remove_tabs: bool,
    /// Source position recorded for this tag (used in error messages).
    position: SourcePosition,
    /// Tokens to re-queue before the body (the operator and the tag itself).
    tokens: Vec<TokenizeResult>,
    /// Tokens seen after the tag but before the body; re-queued after it.
    pending_tokens_after: Vec<TokenizeResult>,
}
209
/// Tokenizer state that must persist across individual token parses.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the source input.
    cursor: SourcePosition,
    /// Current here-document tracking state.
    here_state: HereState,
    /// Here tags whose bodies have not yet been fully consumed.
    current_here_tags: Vec<HereTag>,
    /// Already-formed results awaiting delivery (filled by here-doc handling).
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are inside an arithmetic expansion `$(( ... ))`.
    arithmetic_expansion: bool,
}
223
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns such as `@(...)` are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to tokenize in POSIX-compliant mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to tokenize in sh emulation mode; when set, some extra
    /// operators (e.g. `<<<`, `|&`) are not recognized.
    pub sh_mode: bool,
}
235
236impl Default for TokenizerOptions {
237 fn default() -> Self {
238 Self {
239 enable_extended_globbing: true,
240 posix_mode: false,
241 sh_mode: false,
242 }
243 }
244}
245
/// A tokenizer that decodes UTF-8 characters from a buffered reader and
/// produces shell tokens.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable stream of decoded characters from the input reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State carried across token boundaries (cursor, here-docs, queue).
    cross_state: CrossTokenParseState,
    /// Options in effect for this tokenizer.
    options: TokenizerOptions,
}
252
/// Mutable state accumulated while parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Position at which the current token started.
    pub start_position: SourcePosition,
    /// Text accumulated so far for the current token.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator (vs. a word).
    pub token_is_operator: bool,
    /// Whether the previously seen character was an unconsumed escape (`\`).
    pub in_escape: bool,
    /// The quoting mode currently in effect.
    pub quote_mode: QuoteMode,
}
262
impl TokenParseState {
    /// Creates a fresh parse state starting at the given source position.
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a `Token` ending at
    /// `end_position`, and resets this state to start accumulating the next
    /// token from that position.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        // Taking the flag/text resets them for the next token.
        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        // The next token starts where this one ended.
        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Whether any text has been accumulated for the current token.
    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the current token.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the current token.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Whether we are currently outside any quoting or escaping.
    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// The text accumulated so far.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Whether the current token is exactly the given operator.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Whether an operator token is being accumulated.
    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Whether the current token is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated text wholesale (used to substitute the
    /// here-document body once its end tag is stripped off).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Completes the current token for the given `reason`, driving the
    /// here-document state machine in `cross_token_state`.
    ///
    /// Returns `Ok(Some(result))` when a result should be delivered to the
    /// caller immediately, and `Ok(None)` when the token was captured or
    /// queued for later delivery by here-document processing.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // With no accumulated text there's no token — except that an empty
        // here-document body still needs to be delimited.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Take the here state so each arm can install the successor state.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // This token is the `<<`/`<<-` operator itself; hold on to
                // it so it can be re-queued ahead of the body later.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A newline here means the operator had no tag at all.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Store the tag with a trailing newline so body matching can
                // be done with a simple suffix check at each end of line.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                // The body begins once a newline token goes by; until then,
                // buffer tokens to replay after the body.
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The current token is the completed body of the frontmost
                // pending here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // Replay the tokens seen before the body (`<<` and the tag).
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // The body itself.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Synthesize the end-tag token (without its stored newline).
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Finally, replay tokens that followed the tag on its line.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                // More pending tags mean more bodies follow immediately.
                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Ordinary case: deliver the token directly.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
467}
468
469pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
475 tokenize_str_with_options(input, &TokenizerOptions::default())
476}
477
478pub fn tokenize_str_with_options(
485 input: &str,
486 options: &TokenizerOptions,
487) -> Result<Vec<Token>, TokenizerError> {
488 uncached_tokenize_string(input.to_owned(), options.to_owned())
489}
490
/// Memoizing wrapper around tokenization: caches up to 64 recent
/// `(input, options)` results in `TOKENIZE_CACHE`.
/// NOTE(review): despite the `uncached_` name, this function *is* the cached
/// entry point — the macro wraps it; consider renaming.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}
498
499pub fn uncached_tokenize_str(
506 input: &str,
507 options: &TokenizerOptions,
508) -> Result<Vec<Token>, TokenizerError> {
509 let mut reader = std::io::BufReader::new(input.as_bytes());
510 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
511
512 let mut tokens = vec![];
513 loop {
514 match tokenizer.next_token()? {
515 TokenizeResult {
516 token: Some(token), ..
517 } => tokens.push(token),
518 TokenizeResult {
519 reason: TokenEndReason::EndOfInput,
520 ..
521 } => break,
522 _ => (),
523 }
524 }
525
526 Ok(tokens)
527}
528
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a tokenizer over the given reader with the given options.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                // Lines and columns are 1-based; the char index is 0-based.
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input.
    #[allow(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Reads the next character, advancing the cursor accordingly.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Reads and discards the next character (still advances the cursor).
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it or moving the cursor.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Produces the next tokenization result from the input.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }

    /// Core scanning loop: produces the next tokenization result, optionally
    /// stopping when `terminating_char` is seen unquoted (used to scan the
    /// interiors of `$(...)` and `${...}`).
    #[allow(clippy::if_same_then_else)]
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    #[allow(clippy::panic_in_result_fn)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Results queued by here-document processing take priority.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            // Placeholder char for the end-of-input case below.
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: any still-open construct is an error.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // Pending here-documents at EOF are unterminated.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // The caller's terminator was reached outside quotes; leave
                // it unconsumed for the caller to handle.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // Consuming a here-document body, character by character.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // `<<-`: drop tabs at the start of each body line.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At each end of line, check whether the body now ends
                    // with the tag (on a line of its own).
                    if c == '\n' {
                        let next_here_tag = &self.cross_state.current_here_tags[0];
                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
                            unquote_str(next_here_tag.tag.as_str()).into()
                        } else {
                            next_here_tag.tag.as_str().into()
                        };

                        if let Some(current_token_without_here_tag) =
                            state.current_token().strip_suffix(tag_str.as_ref())
                        {
                            if current_token_without_here_tag.is_empty()
                                || current_token_without_here_tag.ends_with('\n')
                            {
                                // Strip the tag off the body and delimit it.
                                state.replace_with_here_doc(
                                    current_token_without_here_tag.to_owned(),
                                );

                                result = state.delimit_current_token(
                                    TokenEndReason::HereDocumentBodyEnd,
                                    &mut self.cross_state,
                                )?;
                            }
                        }
                    }
                }
            } else if state.in_operator() {
                // See whether extending the operator with `c` still yields a
                // valid operator; if so keep accumulating, otherwise end it.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // Inside `$(( ... ))`, `<<`/`<<-` are shift
                        // operators, not here-document introducers.
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Line continuation: drop both the backslash and the
                        // newline entirely.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Closing quote characters end their respective quote modes.
            else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Single(_))
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(_))
                && c == '\"'
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is consumed literally.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Expansion sublanguages: `$(...)`, `$((...))`, `${...}`,
                // and backquoted command substitution. Their contents are
                // captured verbatim into the current word token.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // One `(` seen; a second makes it arithmetic.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            // Here-doc marker results produced inside the
                            // substitution must be deferred until after the
                            // newline that starts the body.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some(')'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Nested `(` raises the number of `)`
                                    // needed to close the substitution.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Found `)`; only stop when all
                                        // opened parens are balanced.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume and append the final `)`.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // Same deferral dance as for `$(...)` above.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some('}'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume and append the closing `}`.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable)
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // A lone `$` (e.g. `$x`): keep it and continue
                            // scanning normally.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: copy verbatim up to
                    // the matching unescaped backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob: `@( ! ( ? ( + ( * (` followed by `(...)`.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                // Copy verbatim until the parens balance out.
                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                // An operator character ends any in-progress word; otherwise
                // it begins a new operator token.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                // Blanks delimit tokens; leading blanks just move the start.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // Continue an in-progress word (the `Some('}')` case keeps `#`
            // and friends literal inside `${...}`).
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // `#` at the start of a token begins a comment; skip through
                // the end of the line (the newline itself is not consumed).
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // First character of a fresh word token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Whether `c` may end the prefix of an extended glob (e.g. `@(`).
    fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Whether `c` can begin an operator token.
    fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Whether `s` is a complete operator under the current options.
    fn is_operator(&self, s: &str) -> bool {
        // These extra operators are only recognized outside sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1178
1179impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1180 type Item = Result<TokenizeResult, TokenizerError>;
1181
1182 fn next(&mut self) -> Option<Self::Item> {
1183 match self.next_token() {
1184 #[allow(clippy::manual_map)]
1185 Ok(result) => match result.token {
1186 Some(_) => Some(Ok(result)),
1187 None => None,
1188 },
1189 Err(e) => Some(Err(e)),
1190 }
1191 }
1192}
1193
/// Returns whether `c` is a non-newline blank (space or tab).
fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1197
1198fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1199 if state.in_escape {
1201 return false;
1202 }
1203
1204 match state.quote_mode {
1205 QuoteMode::Double(_) => {
1207 if c == '\\' {
1208 true
1210 } else {
1211 false
1212 }
1213 }
1214 QuoteMode::Single(_) => false,
1216 QuoteMode::None => is_quoting_char(c),
1219 }
1220}
1221
/// Returns whether `c` can introduce quoting: backslash, single quote, or
/// double quote.
fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '\"'
}
1225
/// Returns a new string with quoting characters removed: unescaped single
/// and double quotes are dropped, and `\x` sequences are replaced by the
/// escaped character itself. A trailing lone backslash is dropped.
pub fn unquote_str(s: &str) -> String {
    let mut out = String::with_capacity(s.len());

    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            // Whatever follows a backslash is kept literally; a backslash
            // at the very end of the string produces nothing.
            if let Some(escaped) = chars.next() {
                out.push(escaped);
            }
        } else if !matches!(c, '\'' | '\"') {
            out.push(c);
        }
    }

    out
}
1249
1250#[cfg(test)]
1251#[allow(clippy::panic_in_result_fn)]
1252mod tests {
1253
1254 use super::*;
1255 use anyhow::Result;
1256 use pretty_assertions::{assert_eq, assert_matches};
1258
1259 #[test]
1260 fn tokenize_empty() -> Result<()> {
1261 let tokens = tokenize_str("")?;
1262 assert_eq!(tokens.len(), 0);
1263 Ok(())
1264 }
1265
1266 #[test]
1267 fn tokenize_line_continuation() -> Result<()> {
1268 let tokens = tokenize_str(
1269 r"a\
1270bc",
1271 )?;
1272 assert_matches!(
1273 &tokens[..],
1274 [t1 @ Token::Word(..)] if t1.to_str() == "abc"
1275 );
1276 Ok(())
1277 }
1278
1279 #[test]
1280 fn tokenize_operators() -> Result<()> {
1281 assert_matches!(
1282 &tokenize_str("a>>b")?[..],
1283 [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
1284 t1.to_str() == "a" &&
1285 t2.to_str() == ">>" &&
1286 t3.to_str() == "b"
1287 );
1288 Ok(())
1289 }
1290
1291 #[test]
1292 fn tokenize_comment() -> Result<()> {
1293 let tokens = tokenize_str(
1294 r"a #comment
1295",
1296 )?;
1297 assert_matches!(
1298 &tokens[..],
1299 [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
1300 t1.to_str() == "a" &&
1301 t2.to_str() == "\n"
1302 );
1303 Ok(())
1304 }
1305
1306 #[test]
1307 fn tokenize_comment_at_eof() -> Result<()> {
1308 assert_matches!(
1309 &tokenize_str(r"a #comment")?[..],
1310 [t1 @ Token::Word(..)] if t1.to_str() == "a"
1311 );
1312 Ok(())
1313 }
1314
1315 #[test]
1316 fn tokenize_empty_here_doc() -> Result<()> {
1317 let tokens = tokenize_str(
1318 r"cat <<HERE
1319HERE
1320",
1321 )?;
1322 assert_matches!(
1323 &tokens[..],
1324 [t1 @ Token::Word(..),
1325 t2 @ Token::Operator(..),
1326 t3 @ Token::Word(..),
1327 t4 @ Token::Word(..),
1328 t5 @ Token::Word(..),
1329 t6 @ Token::Operator(..)] if
1330 t1.to_str() == "cat" &&
1331 t2.to_str() == "<<" &&
1332 t3.to_str() == "HERE" &&
1333 t4.to_str() == "" &&
1334 t5.to_str() == "HERE" &&
1335 t6.to_str() == "\n"
1336 );
1337 Ok(())
1338 }
1339
1340 #[test]
1341 fn tokenize_here_doc() -> Result<()> {
1342 let tokens = tokenize_str(
1343 r"cat <<HERE
1344SOMETHING
1345HERE
1346echo after
1347",
1348 )?;
1349 assert_matches!(
1350 &tokens[..],
1351 [t1 @ Token::Word(..),
1352 t2 @ Token::Operator(..),
1353 t3 @ Token::Word(..),
1354 t4 @ Token::Word(..),
1355 t5 @ Token::Word(..),
1356 t6 @ Token::Operator(..),
1357 t7 @ Token::Word(..),
1358 t8 @ Token::Word(..),
1359 t9 @ Token::Operator(..)] if
1360 t1.to_str() == "cat" &&
1361 t2.to_str() == "<<" &&
1362 t3.to_str() == "HERE" &&
1363 t4.to_str() == "SOMETHING\n" &&
1364 t5.to_str() == "HERE" &&
1365 t6.to_str() == "\n" &&
1366 t7.to_str() == "echo" &&
1367 t8.to_str() == "after" &&
1368 t9.to_str() == "\n"
1369 );
1370 Ok(())
1371 }
1372
1373 #[test]
1374 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1375 let tokens = tokenize_str(
1376 r"cat <<-HERE
1377 SOMETHING
1378 HERE
1379",
1380 )?;
1381 assert_matches!(
1382 &tokens[..],
1383 [t1 @ Token::Word(..),
1384 t2 @ Token::Operator(..),
1385 t3 @ Token::Word(..),
1386 t4 @ Token::Word(..),
1387 t5 @ Token::Word(..),
1388 t6 @ Token::Operator(..)] if
1389 t1.to_str() == "cat" &&
1390 t2.to_str() == "<<-" &&
1391 t3.to_str() == "HERE" &&
1392 t4.to_str() == "SOMETHING\n" &&
1393 t5.to_str() == "HERE" &&
1394 t6.to_str() == "\n"
1395 );
1396 Ok(())
1397 }
1398
1399 #[test]
1400 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1401 let tokens = tokenize_str(
1402 r"cat <<EOF | wc -l
1403A B C
14041 2 3
1405D E F
1406EOF
1407",
1408 )?;
1409 assert_matches!(
1410 &tokens[..],
1411 [t1 @ Token::Word(..),
1412 t2 @ Token::Operator(..),
1413 t3 @ Token::Word(..),
1414 t4 @ Token::Word(..),
1415 t5 @ Token::Word(..),
1416 t6 @ Token::Operator(..),
1417 t7 @ Token::Word(..),
1418 t8 @ Token::Word(..),
1419 t9 @ Token::Operator(..)] if
1420 t1.to_str() == "cat" &&
1421 t2.to_str() == "<<" &&
1422 t3.to_str() == "EOF" &&
1423 t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
1424 t5.to_str() == "EOF" &&
1425 t6.to_str() == "|" &&
1426 t7.to_str() == "wc" &&
1427 t8.to_str() == "-l" &&
1428 t9.to_str() == "\n"
1429 );
1430
1431 Ok(())
1432 }
1433
1434 #[test]
1435 fn tokenize_multiple_here_docs() -> Result<()> {
1436 let tokens = tokenize_str(
1437 r"cat <<HERE1 <<HERE2
1438SOMETHING
1439HERE1
1440OTHER
1441HERE2
1442echo after
1443",
1444 )?;
1445 assert_matches!(
1446 &tokens[..],
1447 [t1 @ Token::Word(..),
1448 t2 @ Token::Operator(..),
1449 t3 @ Token::Word(..),
1450 t4 @ Token::Word(..),
1451 t5 @ Token::Word(..),
1452 t6 @ Token::Operator(..),
1453 t7 @ Token::Word(..),
1454 t8 @ Token::Word(..),
1455 t9 @ Token::Word(..),
1456 t10 @ Token::Operator(..),
1457 t11 @ Token::Word(..),
1458 t12 @ Token::Word(..),
1459 t13 @ Token::Operator(..)] if
1460 t1.to_str() == "cat" &&
1461 t2.to_str() == "<<" &&
1462 t3.to_str() == "HERE1" &&
1463 t4.to_str() == "SOMETHING\n" &&
1464 t5.to_str() == "HERE1" &&
1465 t6.to_str() == "<<" &&
1466 t7.to_str() == "HERE2" &&
1467 t8.to_str() == "OTHER\n" &&
1468 t9.to_str() == "HERE2" &&
1469 t10.to_str() == "\n" &&
1470 t11.to_str() == "echo" &&
1471 t12.to_str() == "after" &&
1472 t13.to_str() == "\n"
1473 );
1474 Ok(())
1475 }
1476
1477 #[test]
1478 fn tokenize_unterminated_here_doc() {
1479 let result = tokenize_str(
1480 r"cat <<HERE
1481SOMETHING
1482",
1483 );
1484 assert!(result.is_err());
1485 }
1486
1487 #[test]
1488 fn tokenize_missing_here_tag() {
1489 let result = tokenize_str(
1490 r"cat <<
1491",
1492 );
1493 assert!(result.is_err());
1494 }
1495
1496 #[test]
1497 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1498 let tokens = tokenize_str(
1499 r"echo $(cat <<HERE
1500TEXT
1501HERE
1502)",
1503 )?;
1504 assert_matches!(
1505 &tokens[..],
1506 [t1 @ Token::Word(..),
1507 t2 @ Token::Word(..)] if
1508 t1.to_str() == "echo" &&
1509 t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
1510 );
1511 Ok(())
1512 }
1513
1514 #[test]
1515 fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
1516 let tokens = tokenize_str(
1517 r"echo $(cat <<HERE1 <<HERE2 | wc -l
1518TEXT
1519HERE1
1520OTHER
1521HERE2
1522)",
1523 )?;
1524 assert_matches!(
1525 &tokens[..],
1526 [t1 @ Token::Word(..),
1527 t2 @ Token::Word(..)] if
1528 t1.to_str() == "echo" &&
1529 t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
1530 );
1531 Ok(())
1532 }
1533
1534 #[test]
1535 fn tokenize_simple_backquote() -> Result<()> {
1536 assert_matches!(
1537 &tokenize_str(r"echo `echo hi`")?[..],
1538 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1539 t1.to_str() == "echo" &&
1540 t2.to_str() == "`echo hi`"
1541 );
1542 Ok(())
1543 }
1544
1545 #[test]
1546 fn tokenize_backquote_with_escape() -> Result<()> {
1547 assert_matches!(
1548 &tokenize_str(r"echo `echo\`hi`")?[..],
1549 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1550 t1.to_str() == "echo" &&
1551 t2.to_str() == r"`echo\`hi`"
1552 );
1553 Ok(())
1554 }
1555
1556 #[test]
1557 fn tokenize_unterminated_backquote() {
1558 assert_matches!(
1559 tokenize_str("`"),
1560 Err(TokenizerError::UnterminatedBackquote(_))
1561 );
1562 }
1563
1564 #[test]
1565 fn tokenize_unterminated_command_substitution() {
1566 assert_matches!(
1567 tokenize_str("$("),
1568 Err(TokenizerError::UnterminatedCommandSubstitution)
1569 );
1570 }
1571
1572 #[test]
1573 fn tokenize_command_substitution() -> Result<()> {
1574 assert_matches!(
1575 &tokenize_str("a$(echo hi)b c")?[..],
1576 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1577 t1.to_str() == "a$(echo hi)b" &&
1578 t2.to_str() == "c"
1579 );
1580 Ok(())
1581 }
1582
1583 #[test]
1584 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1585 assert_matches!(
1586 &tokenize_str("echo $(echo !(x))")?[..],
1587 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1588 t1.to_str() == "echo" &&
1589 t2.to_str() == "$(echo !(x))"
1590 );
1591 Ok(())
1592 }
1593
1594 #[test]
1595 fn tokenize_arithmetic_expression() -> Result<()> {
1596 assert_matches!(
1597 &tokenize_str("a$((1+2))b c")?[..],
1598 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1599 t1.to_str() == "a$((1+2))b" &&
1600 t2.to_str() == "c"
1601 );
1602 Ok(())
1603 }
1604
1605 #[test]
1606 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1607 assert_matches!(
1610 &tokenize_str("$(( 1 ))")?[..],
1611 [t1 @ Token::Word(..)] if
1612 t1.to_str() == "$((1 ))"
1613 );
1614 Ok(())
1615 }
1616 #[test]
1617 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1618 assert_matches!(
1619 &tokenize_str("$(( (0) ))")?[..],
1620 [t1 @ Token::Word(..)] if
1621 t1.to_str() == "$(((0)))"
1622 );
1623 Ok(())
1624 }
1625
1626 #[test]
1627 fn tokenize_special_parameters() -> Result<()> {
1628 assert_matches!(
1629 &tokenize_str("$$")?[..],
1630 [t1 @ Token::Word(..)] if t1.to_str() == "$$"
1631 );
1632 assert_matches!(
1633 &tokenize_str("$@")?[..],
1634 [t1 @ Token::Word(..)] if t1.to_str() == "$@"
1635 );
1636 assert_matches!(
1637 &tokenize_str("$!")?[..],
1638 [t1 @ Token::Word(..)] if t1.to_str() == "$!"
1639 );
1640 assert_matches!(
1641 &tokenize_str("$?")?[..],
1642 [t1 @ Token::Word(..)] if t1.to_str() == "$?"
1643 );
1644 assert_matches!(
1645 &tokenize_str("$*")?[..],
1646 [t1 @ Token::Word(..)] if t1.to_str() == "$*"
1647 );
1648 Ok(())
1649 }
1650
1651 #[test]
1652 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1653 assert_matches!(
1654 &tokenize_str("$x")?[..],
1655 [t1 @ Token::Word(..)] if t1.to_str() == "$x"
1656 );
1657 assert_matches!(
1658 &tokenize_str("a$x")?[..],
1659 [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
1660 );
1661 Ok(())
1662 }
1663
1664 #[test]
1665 fn tokenize_unterminated_parameter_expansion() {
1666 assert_matches!(
1667 tokenize_str("${x"),
1668 Err(TokenizerError::UnterminatedVariable)
1669 );
1670 }
1671
1672 #[test]
1673 fn tokenize_braced_parameter_expansion() -> Result<()> {
1674 assert_matches!(
1675 &tokenize_str("${x}")?[..],
1676 [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
1677 );
1678 assert_matches!(
1679 &tokenize_str("a${x}b")?[..],
1680 [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
1681 );
1682 Ok(())
1683 }
1684
1685 #[test]
1686 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1687 assert_matches!(
1688 &tokenize_str(r"a${x\}}b")?[..],
1689 [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
1690 );
1691 Ok(())
1692 }
1693
1694 #[test]
1695 fn tokenize_whitespace() -> Result<()> {
1696 assert_matches!(
1697 &tokenize_str("1 2 3")?[..],
1698 [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
1699 t1.to_str() == "1" &&
1700 t2.to_str() == "2" &&
1701 t3.to_str() == "3"
1702 );
1703 Ok(())
1704 }
1705
1706 #[test]
1707 fn tokenize_escaped_whitespace() -> Result<()> {
1708 assert_matches!(
1709 &tokenize_str(r"1\ 2 3")?[..],
1710 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1711 t1.to_str() == r"1\ 2" &&
1712 t2.to_str() == "3"
1713 );
1714 Ok(())
1715 }
1716
1717 #[test]
1718 fn tokenize_single_quote() -> Result<()> {
1719 assert_matches!(
1720 &tokenize_str(r"x'a b'y")?[..],
1721 [t1 @ Token::Word(..)] if
1722 t1.to_str() == r"x'a b'y"
1723 );
1724 Ok(())
1725 }
1726
1727 #[test]
1728 fn tokenize_double_quote() -> Result<()> {
1729 assert_matches!(
1730 &tokenize_str(r#"x"a b"y"#)?[..],
1731 [t1 @ Token::Word(..)] if
1732 t1.to_str() == r#"x"a b"y"#
1733 );
1734 Ok(())
1735 }
1736
1737 #[test]
1738 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1739 assert_matches!(
1740 &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
1741 [t1 @ Token::Word(..)] if
1742 t1.to_str() == r#"x"$(echo hi)"y"#
1743 );
1744 Ok(())
1745 }
1746
1747 #[test]
1748 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1749 assert_matches!(
1750 &tokenize_str(r#"x"$((1+2))"y"#)?[..],
1751 [t1 @ Token::Word(..)] if
1752 t1.to_str() == r#"x"$((1+2))"y"#
1753 );
1754 Ok(())
1755 }
1756
1757 #[test]
1758 fn test_quote_removal() {
1759 assert_eq!(unquote_str(r#""hello""#), "hello");
1760 assert_eq!(unquote_str(r"'hello'"), "hello");
1761 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1762 assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
1763 }
1764}