1use std::borrow::Cow;
2use std::fmt::Display;
3use utf8_chars::BufReadCharsExt;
4
/// The reason why the tokenizer stopped accumulating the current token.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped, unquoted newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character other than newline (space or tab) ended the token.
    NonNewLineBlank,
    /// The body of a here-document starts immediately after this token.
    HereDocumentBodyStart,
    /// The body of a here-document ends here.
    HereDocumentBodyEnd,
    /// This token is the end tag of a here-document.
    HereDocumentEndTag,
    /// A character that can start an operator ended the preceding token.
    OperatorStart,
    /// The operator token being accumulated was completed.
    OperatorEnd,
    /// Some other condition ended the token.
    Other,
}
29
/// A position within the source text being tokenized.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// 0-based character offset from the start of the input.
    pub index: i32,
    /// 1-based line number.
    pub line: i32,
    /// 1-based column number.
    pub column: i32,
}
41
42impl Display for SourcePosition {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 f.write_fmt(format_args!("line {} col {}", self.line, self.column))
45 }
46}
47
/// The source span covered by a token.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// Position of the first character of the token.
    pub start: SourcePosition,
    /// Position just past the last character of the token.
    pub end: SourcePosition,
}
57
/// A single token produced by the tokenizer.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token (e.g., `>>`, `|`, `;`), with its source location.
    Operator(String, TokenLocation),
    /// A word token (anything that is not an operator), with its source location.
    Word(String, TokenLocation),
}
67
68impl Token {
69 pub fn to_str(&self) -> &str {
71 match self {
72 Token::Operator(s, _) => s,
73 Token::Word(s, _) => s,
74 }
75 }
76
77 pub fn location(&self) -> &TokenLocation {
79 match self {
80 Token::Operator(_, l) => l,
81 Token::Word(_, l) => l,
82 }
83 }
84}
85
/// The outcome of a single tokenization step.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Why the tokenizer stopped at this point.
    pub reason: TokenEndReason,
    /// The token produced, if any (e.g., delimiter-only results carry no token).
    pub token: Option<Token>,
}
94
/// Errors that can arise during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// The input ended in the middle of an escape sequence.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// A single-quoted string opened at the given position was never closed.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// A double-quoted string opened at the given position was never closed.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// A backquoted command substitution was never closed.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An extended glob pattern (e.g., `@(...)`) was never closed.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// A `${...}` variable expression was never closed.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A `$(...)` command substitution was never closed.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was found without a preceding tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The expected here-document tag was not found.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// The input ended before all pending here-document bodies were closed.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An underlying I/O error occurred while reading input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
147
148impl TokenizerError {
149 pub fn is_incomplete(&self) -> bool {
150 matches!(
151 self,
152 Self::UnterminatedEscapeSequence
153 | Self::UnterminatedSingleQuote(..)
154 | Self::UnterminatedDoubleQuote(..)
155 | Self::UnterminatedBackquote(..)
156 | Self::UnterminatedCommandSubstitution
157 | Self::UnterminatedVariable
158 | Self::UnterminatedExtendedGlob(..)
159 | Self::UnterminatedHereDocuments(..)
160 )
161 }
162}
163
/// A borrowed sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The underlying token slice.
    pub tokens: &'a [Token],
}
170
/// Tracks whether tokenization is currently inside a quoted region.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not in a quoted region.
    None,
    /// Inside single quotes opened at the recorded position.
    Single(SourcePosition),
    /// Inside double quotes opened at the recorded position.
    Double(SourcePosition),
}
177
/// State machine tracking here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is in progress.
    #[default]
    None,
    /// A `<<` or `<<-` operator was just seen; the *next* token is the tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being accumulated is the here-document tag.
    CurrentTokenIsHereTag {
        // Whether `<<-` was used (leading tabs are stripped from body lines).
        remove_tabs: bool,
        // The already-delimited redirection operator token, held back until
        // the here-document completes.
        operator_token_result: TokenizeResult,
    },
    /// The tag has been read; the here-document body starts on the next line.
    NextLineIsHereDoc,
    /// Currently consuming here-document body text.
    InHereDocs,
}
197
/// A pending here-document tag awaiting its body.
#[derive(Clone, Debug)]
struct HereTag {
    // Tag text, stored with a trailing newline so the terminator line can be
    // matched via suffix stripping.
    tag: String,
    // Whether the tag contained quoting characters; if so, it is unquoted
    // before matching against the terminator line.
    tag_was_escaped_or_quoted: bool,
    // Whether `<<-` was used (strip leading tabs from body lines).
    remove_tabs: bool,
    // Position in the source where the tag was found.
    position: SourcePosition,
    // Token results (operator + tag) held back until the body is complete.
    tokens: Vec<TokenizeResult>,
    // Token results seen after this tag but before the body (e.g., the rest of
    // the command line); replayed after the body is emitted.
    pending_tokens_after: Vec<TokenizeResult>,
}
207
/// Tokenizer state that persists across individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the input.
    cursor: SourcePosition,
    /// Current here-document processing state.
    here_state: HereState,
    /// Here-document tags whose bodies have not yet been consumed.
    current_here_tags: Vec<HereTag>,
    /// Token results queued for emission before reading further input
    /// (used to replay tokens held back during here-document processing).
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are currently inside a `$((...))` arithmetic expansion
    /// (suppresses here-document operator recognition).
    arithmetic_expansion: bool,
}
221
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns (e.g., `@(...)`) are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX-compliant mode. Currently unused here.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to restrict to sh-compatible behavior (disables recognition of
    /// bash-specific operators such as `<<<` and `|&`).
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235 fn default() -> Self {
236 Self {
237 enable_extended_globbing: true,
238 posix_mode: false,
239 sh_mode: false,
240 }
241 }
242}
243
/// A tokenizer that incrementally reads characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    // Peekable stream of decoded UTF-8 characters from the reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    // State shared across tokens (cursor, here-doc machine, queued results).
    cross_state: CrossTokenParseState,
    // Behavior options supplied at construction.
    options: TokenizerOptions,
}
250
/// Per-token accumulation state.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Position where the current token started.
    pub start_position: SourcePosition,
    /// Text accumulated so far for the current token.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator (vs. a word).
    pub token_is_operator: bool,
    /// Whether the previous character was an unquoted backslash.
    pub in_escape: bool,
    /// Current quoting mode (none/single/double).
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates an empty parse state whose token starts at `start_position`.
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a `Token` spanning
    /// `start_position..end_position`, and resets this state so accumulation
    /// of the next token can begin at `end_position`.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        // `std::mem::take` simultaneously reads and resets the fields.
        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any text has been accumulated for the current token.
    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the current token.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the current token.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether we are outside any escape or quoting context.
    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the accumulated text is exactly the given operator.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether an operator token is being accumulated.
    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the accumulated text is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated text wholesale (used when stripping the
    /// terminating tag off a here-document body).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Completes the current token for the given `reason`, advancing the
    /// here-document state machine as needed.
    ///
    /// Returns `Ok(Some(result))` when a result should be emitted now, or
    /// `Ok(None)` when the token was captured into here-document bookkeeping
    /// (it will be replayed later via `queued_tokens`).
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // An empty token yields a token-less result, except that an empty
        // here-document body must still flow through the machinery below.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Take the state; each arm below installs the successor state.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // This token is the redirection operator itself; hold it back
                // until the here-document completes.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A newline here means no tag followed the operator.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Store the tag with a trailing newline so the terminator
                // line can be matched via suffix stripping.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                // The body starts only after the physical line ends; until
                // then, tokens belong to the command line after the tag.
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    // Held back; replayed after the here-doc body is emitted.
                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The body for the frontmost tag just completed; flush all
                // held-back tokens in their logical order.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // First: the redirection operator and tag tokens.
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // Then: the body itself (the text accumulated in this state).
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Then: the end tag (reconstructed without its stored newline).
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Finally: tokens that appeared after the tag on its line.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Ordinary (non-here-doc) delimiting: emit the token directly.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
466
467pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
473 tokenize_str_with_options(input, &TokenizerOptions::default())
474}
475
476pub fn tokenize_str_with_options(
483 input: &str,
484 options: &TokenizerOptions,
485) -> Result<Vec<Token>, TokenizerError> {
486 uncached_tokenize_string(input.to_owned(), options.to_owned())
487}
488
/// Memoizing wrapper over [`uncached_tokenize_str`]; caches results for up to
/// 64 recently seen `(input, options)` pairs.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}
496
497pub fn uncached_tokenize_str(
504 input: &str,
505 options: &TokenizerOptions,
506) -> Result<Vec<Token>, TokenizerError> {
507 let mut reader = std::io::BufReader::new(input.as_bytes());
508 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
509
510 let mut tokens = vec![];
511 loop {
512 match tokenizer.next_token()? {
513 TokenizeResult {
514 token: Some(token), ..
515 } => tokens.push(token),
516 TokenizeResult {
517 reason: TokenEndReason::EndOfInput,
518 ..
519 } => break,
520 _ => (),
521 }
522 }
523
524 Ok(tokens)
525}
526
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a tokenizer over the given reader with the given options.
    /// The cursor starts at line 1, column 1, index 0.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the current cursor position. Always returns `Some`; the
    /// `Option` appears to exist for interface compatibility — NOTE(review):
    /// confirm against callers.
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Reads the next character, advancing the cursor (newlines bump the line
    /// counter and reset the column; the index always advances by one).
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Reads and discards the next character (cursor still advances).
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it. Decode failures are
    /// reported as `FailedDecoding`.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Tokenizes and returns the next token result from the input.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }

    /// Core tokenization loop. Accumulates characters into a token until a
    /// delimiting condition fires; `terminating_char`, when provided, also
    /// ends the token (used for nested `$(...)` / `${...}` scans).
    #[allow(clippy::if_same_then_else)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Tokens held back during here-document processing are replayed
            // before any further input is read.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: unterminated constructs become errors.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // Pending here-documents at EOF are an error too.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // The caller's terminator (')' or '}') ends the token; the
                // character itself is left unconsumed for the caller.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // Inside a here-document body: text is taken verbatim.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // `<<-`: drop tabs at the start of each body line.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At each newline, check whether the line just completed
                    // is the terminating tag.
                    if c == '\n' {
                        let next_here_tag = &self.cross_state.current_here_tags[0];
                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
                            unquote_str(next_here_tag.tag.as_str()).into()
                        } else {
                            next_here_tag.tag.as_str().into()
                        };

                        if let Some(current_token_without_here_tag) =
                            state.current_token().strip_suffix(tag_str.as_ref())
                        {
                            // The tag only terminates if it occupies a whole line.
                            if current_token_without_here_tag.is_empty()
                                || current_token_without_here_tag.ends_with('\n')
                            {
                                state.replace_with_here_doc(
                                    current_token_without_here_tag.to_owned(),
                                );

                                result = state.delimit_current_token(
                                    TokenEndReason::HereDocumentBodyEnd,
                                    &mut self.cross_state,
                                )?;
                            }
                        }
                    }
                }
            } else if state.in_operator() {
                // Extend the operator greedily while the text remains a
                // valid operator prefix.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    // `<<` / `<<-` kick off here-document handling, except
                    // inside arithmetic expansion (where `<<` is a shift).
                    if self.cross_state.arithmetic_expansion {
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    // Backslash-newline is a line continuation: drop both.
                    if matches!(self.peek_char()?, Some('\n')) {
                        self.consume_char()?;

                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Closing single quote.
            else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Single(_))
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(_))
                && c == '\"'
            {
                // Closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // A character following a backslash is taken literally.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Expansions: `$(...)`, `$((...))`, `${...}`, `` `...` ``.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution `$(...)`, possibly
                            // arithmetic expansion `$((...))`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            // Here-doc related results produced inside the
                            // substitution are buffered and re-sequenced at
                            // the next unescaped newline.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    // Recursively tokenize up to ')'.
                                    let cur_token = self.next_token_until(Some(')'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested unquoted '(' operators so
                                    // the matching ')' count stays balanced.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Consume the inner ')' and continue.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume and append the final ')'.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Parameter expansion `${...}`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // Same here-doc re-sequencing as for `$(...)`.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    // Recursively tokenize up to '}'.
                                    let cur_token = self.next_token_until(Some('}'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str())
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume and append the closing '}'.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable)
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Plain '$' (e.g., `$var`); keep it and continue.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: copy verbatim through
                    // the matching unescaped backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob: a '(' following @ ! ? + * opens a pattern that
            // is copied verbatim through balanced parentheses.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| self.can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && self.can_start_operator(c) {
                // Operator start: end any word in progress first.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                // Blank: delimits a token in progress; otherwise just moves
                // the pending token's start position forward.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // A token already in progress absorbs ordinary characters (also
            // when scanning toward a caller-specified terminator).
            else if !state.token_is_operator
                && (state.started_token() || terminating_char.is_some())
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Comment: skip through end of line (newline not consumed).
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }

                continue;
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // First character of a fresh word token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Returns whether `c` may immediately precede '(' to form an extglob.
    fn can_start_extglob(&self, c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether `c` can begin an operator token.
    fn can_start_operator(&self, c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether `s` is a complete operator. Outside sh mode, some
    /// additional bash-specific operators are accepted.
    fn is_operator(&self, s: &str) -> bool {
        // Bash-only operators, recognized only when not in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1176
1177impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1178 type Item = Result<TokenizeResult, TokenizerError>;
1179
1180 fn next(&mut self) -> Option<Self::Item> {
1181 match self.next_token() {
1182 #[allow(clippy::manual_map)]
1183 Ok(result) => match result.token {
1184 Some(_) => Some(Ok(result)),
1185 None => None,
1186 },
1187 Err(e) => Some(Err(e)),
1188 }
1189 }
1190}
1191
/// Returns whether `c` is a non-newline blank (space or tab).
fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1195
1196fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1197 if state.in_escape {
1199 return false;
1200 }
1201
1202 match state.quote_mode {
1203 QuoteMode::Double(_) => {
1205 if c == '\\' {
1206 true
1208 } else {
1209 false
1210 }
1211 }
1212 QuoteMode::Single(_) => false,
1214 QuoteMode::None => is_quoting_char(c),
1217 }
1218}
1219
/// Returns whether `c` is a shell quoting character: backslash, single
/// quote, or double quote.
fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '"'
}
1223
/// Returns a copy of `s` with shell quoting removed: backslash escapes are
/// resolved to the escaped character, and bare quote characters are dropped.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());

    let mut escaped = false;
    for c in s.chars() {
        if escaped {
            // The character after a backslash is kept literally.
            unquoted.push(c);
            escaped = false;
        } else if c == '\\' {
            escaped = true;
        } else if c == '\'' || c == '"' {
            // Unescaped quote characters are removed.
        } else {
            unquoted.push(c);
        }
    }

    unquoted
}
1247
1248#[cfg(test)]
1249mod tests {
1250 use super::*;
1251 use anyhow::Result;
1252 use pretty_assertions::{assert_eq, assert_matches};
1254
1255 #[test]
1256 fn tokenize_empty() -> Result<()> {
1257 let tokens = tokenize_str("")?;
1258 assert_eq!(tokens.len(), 0);
1259 Ok(())
1260 }
1261
1262 #[test]
1263 fn tokenize_line_continuation() -> Result<()> {
1264 let tokens = tokenize_str(
1265 r"a\
1266bc",
1267 )?;
1268 assert_matches!(
1269 &tokens[..],
1270 [t1 @ Token::Word(..)] if t1.to_str() == "abc"
1271 );
1272 Ok(())
1273 }
1274
1275 #[test]
1276 fn tokenize_operators() -> Result<()> {
1277 assert_matches!(
1278 &tokenize_str("a>>b")?[..],
1279 [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
1280 t1.to_str() == "a" &&
1281 t2.to_str() == ">>" &&
1282 t3.to_str() == "b"
1283 );
1284 Ok(())
1285 }
1286
1287 #[test]
1288 fn tokenize_comment() -> Result<()> {
1289 let tokens = tokenize_str(
1290 r#"a #comment
1291"#,
1292 )?;
1293 assert_matches!(
1294 &tokens[..],
1295 [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
1296 t1.to_str() == "a" &&
1297 t2.to_str() == "\n"
1298 );
1299 Ok(())
1300 }
1301
1302 #[test]
1303 fn tokenize_comment_at_eof() -> Result<()> {
1304 assert_matches!(
1305 &tokenize_str(r#"a #comment"#)?[..],
1306 [t1 @ Token::Word(..)] if t1.to_str() == "a"
1307 );
1308 Ok(())
1309 }
1310
1311 #[test]
1312 fn tokenize_empty_here_doc() -> Result<()> {
1313 let tokens = tokenize_str(
1314 r#"cat <<HERE
1315HERE
1316"#,
1317 )?;
1318 assert_matches!(
1319 &tokens[..],
1320 [t1 @ Token::Word(..),
1321 t2 @ Token::Operator(..),
1322 t3 @ Token::Word(..),
1323 t4 @ Token::Word(..),
1324 t5 @ Token::Word(..),
1325 t6 @ Token::Operator(..)] if
1326 t1.to_str() == "cat" &&
1327 t2.to_str() == "<<" &&
1328 t3.to_str() == "HERE" &&
1329 t4.to_str() == "" &&
1330 t5.to_str() == "HERE" &&
1331 t6.to_str() == "\n"
1332 );
1333 Ok(())
1334 }
1335
1336 #[test]
1337 fn tokenize_here_doc() -> Result<()> {
1338 let tokens = tokenize_str(
1339 r#"cat <<HERE
1340SOMETHING
1341HERE
1342echo after
1343"#,
1344 )?;
1345 assert_matches!(
1346 &tokens[..],
1347 [t1 @ Token::Word(..),
1348 t2 @ Token::Operator(..),
1349 t3 @ Token::Word(..),
1350 t4 @ Token::Word(..),
1351 t5 @ Token::Word(..),
1352 t6 @ Token::Operator(..),
1353 t7 @ Token::Word(..),
1354 t8 @ Token::Word(..),
1355 t9 @ Token::Operator(..)] if
1356 t1.to_str() == "cat" &&
1357 t2.to_str() == "<<" &&
1358 t3.to_str() == "HERE" &&
1359 t4.to_str() == "SOMETHING\n" &&
1360 t5.to_str() == "HERE" &&
1361 t6.to_str() == "\n" &&
1362 t7.to_str() == "echo" &&
1363 t8.to_str() == "after" &&
1364 t9.to_str() == "\n"
1365 );
1366 Ok(())
1367 }
1368
1369 #[test]
1370 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1371 let tokens = tokenize_str(
1372 r#"cat <<-HERE
1373 SOMETHING
1374 HERE
1375"#,
1376 )?;
1377 assert_matches!(
1378 &tokens[..],
1379 [t1 @ Token::Word(..),
1380 t2 @ Token::Operator(..),
1381 t3 @ Token::Word(..),
1382 t4 @ Token::Word(..),
1383 t5 @ Token::Word(..),
1384 t6 @ Token::Operator(..)] if
1385 t1.to_str() == "cat" &&
1386 t2.to_str() == "<<-" &&
1387 t3.to_str() == "HERE" &&
1388 t4.to_str() == "SOMETHING\n" &&
1389 t5.to_str() == "HERE" &&
1390 t6.to_str() == "\n"
1391 );
1392 Ok(())
1393 }
1394
1395 #[test]
1396 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1397 let tokens = tokenize_str(
1398 r#"cat <<EOF | wc -l
1399A B C
14001 2 3
1401D E F
1402EOF
1403"#,
1404 )?;
1405 assert_matches!(
1406 &tokens[..],
1407 [t1 @ Token::Word(..),
1408 t2 @ Token::Operator(..),
1409 t3 @ Token::Word(..),
1410 t4 @ Token::Word(..),
1411 t5 @ Token::Word(..),
1412 t6 @ Token::Operator(..),
1413 t7 @ Token::Word(..),
1414 t8 @ Token::Word(..),
1415 t9 @ Token::Operator(..)] if
1416 t1.to_str() == "cat" &&
1417 t2.to_str() == "<<" &&
1418 t3.to_str() == "EOF" &&
1419 t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
1420 t5.to_str() == "EOF" &&
1421 t6.to_str() == "|" &&
1422 t7.to_str() == "wc" &&
1423 t8.to_str() == "-l" &&
1424 t9.to_str() == "\n"
1425 );
1426
1427 Ok(())
1428 }
1429
1430 #[test]
1431 fn tokenize_multiple_here_docs() -> Result<()> {
1432 let tokens = tokenize_str(
1433 r#"cat <<HERE1 <<HERE2
1434SOMETHING
1435HERE1
1436OTHER
1437HERE2
1438echo after
1439"#,
1440 )?;
1441 assert_matches!(
1442 &tokens[..],
1443 [t1 @ Token::Word(..),
1444 t2 @ Token::Operator(..),
1445 t3 @ Token::Word(..),
1446 t4 @ Token::Word(..),
1447 t5 @ Token::Word(..),
1448 t6 @ Token::Operator(..),
1449 t7 @ Token::Word(..),
1450 t8 @ Token::Word(..),
1451 t9 @ Token::Word(..),
1452 t10 @ Token::Operator(..),
1453 t11 @ Token::Word(..),
1454 t12 @ Token::Word(..),
1455 t13 @ Token::Operator(..)] if
1456 t1.to_str() == "cat" &&
1457 t2.to_str() == "<<" &&
1458 t3.to_str() == "HERE1" &&
1459 t4.to_str() == "SOMETHING\n" &&
1460 t5.to_str() == "HERE1" &&
1461 t6.to_str() == "<<" &&
1462 t7.to_str() == "HERE2" &&
1463 t8.to_str() == "OTHER\n" &&
1464 t9.to_str() == "HERE2" &&
1465 t10.to_str() == "\n" &&
1466 t11.to_str() == "echo" &&
1467 t12.to_str() == "after" &&
1468 t13.to_str() == "\n"
1469 );
1470 Ok(())
1471 }
1472
1473 #[test]
1474 fn tokenize_unterminated_here_doc() -> Result<()> {
1475 let result = tokenize_str(
1476 r#"cat <<HERE
1477SOMETHING
1478"#,
1479 );
1480 assert!(result.is_err());
1481 Ok(())
1482 }
1483
1484 #[test]
1485 fn tokenize_missing_here_tag() -> Result<()> {
1486 let result = tokenize_str(
1487 r"cat <<
1488",
1489 );
1490 assert!(result.is_err());
1491 Ok(())
1492 }
1493
1494 #[test]
1495 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1496 let tokens = tokenize_str(
1497 r#"echo $(cat <<HERE
1498TEXT
1499HERE
1500)"#,
1501 )?;
1502 assert_matches!(
1503 &tokens[..],
1504 [t1 @ Token::Word(..),
1505 t2 @ Token::Word(..)] if
1506 t1.to_str() == "echo" &&
1507 t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
1508 );
1509 Ok(())
1510 }
1511
    /// Two here-documents plus a pipeline inside `$( … )` all collapse into a
    /// single command-substitution word.
    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"#,
        )?;
        // NOTE(review): the expected re-serialized word has `|wc` with no
        // space after the pipe, even though the input had `| wc` — this pins
        // the tokenizer's current normalization of the inner operator spacing.
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
            t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
        );
        Ok(())
    }
1531
1532 #[test]
1533 fn tokenize_simple_backquote() -> Result<()> {
1534 assert_matches!(
1535 &tokenize_str(r#"echo `echo hi`"#)?[..],
1536 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1537 t1.to_str() == "echo" &&
1538 t2.to_str() == "`echo hi`"
1539 );
1540 Ok(())
1541 }
1542
1543 #[test]
1544 fn tokenize_backquote_with_escape() -> Result<()> {
1545 assert_matches!(
1546 &tokenize_str(r"echo `echo\`hi`")?[..],
1547 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1548 t1.to_str() == "echo" &&
1549 t2.to_str() == r"`echo\`hi`"
1550 );
1551 Ok(())
1552 }
1553
1554 #[test]
1555 fn tokenize_unterminated_backquote() {
1556 assert_matches!(
1557 tokenize_str("`"),
1558 Err(TokenizerError::UnterminatedBackquote(_))
1559 );
1560 }
1561
1562 #[test]
1563 fn tokenize_unterminated_command_substitution() {
1564 assert_matches!(
1565 tokenize_str("$("),
1566 Err(TokenizerError::UnterminatedCommandSubstitution)
1567 );
1568 }
1569
1570 #[test]
1571 fn tokenize_command_substitution() -> Result<()> {
1572 assert_matches!(
1573 &tokenize_str("a$(echo hi)b c")?[..],
1574 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1575 t1.to_str() == "a$(echo hi)b" &&
1576 t2.to_str() == "c"
1577 );
1578 Ok(())
1579 }
1580
1581 #[test]
1582 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1583 assert_matches!(
1584 &tokenize_str("echo $(echo !(x))")?[..],
1585 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1586 t1.to_str() == "echo" &&
1587 t2.to_str() == "$(echo !(x))"
1588 );
1589 Ok(())
1590 }
1591
1592 #[test]
1593 fn tokenize_arithmetic_expression() -> Result<()> {
1594 assert_matches!(
1595 &tokenize_str("a$((1+2))b c")?[..],
1596 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1597 t1.to_str() == "a$((1+2))b" &&
1598 t2.to_str() == "c"
1599 );
1600 Ok(())
1601 }
1602
1603 #[test]
1604 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1605 assert_matches!(
1608 &tokenize_str("$(( 1 ))")?[..],
1609 [t1 @ Token::Word(..)] if
1610 t1.to_str() == "$((1 ))"
1611 );
1612 Ok(())
1613 }
1614 #[test]
1615 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1616 assert_matches!(
1617 &tokenize_str("$(( (0) ))")?[..],
1618 [t1 @ Token::Word(..)] if
1619 t1.to_str() == "$(((0)))"
1620 );
1621 Ok(())
1622 }
1623
1624 #[test]
1625 fn tokenize_special_parameters() -> Result<()> {
1626 assert_matches!(
1627 &tokenize_str("$$")?[..],
1628 [t1 @ Token::Word(..)] if t1.to_str() == "$$"
1629 );
1630 assert_matches!(
1631 &tokenize_str("$@")?[..],
1632 [t1 @ Token::Word(..)] if t1.to_str() == "$@"
1633 );
1634 assert_matches!(
1635 &tokenize_str("$!")?[..],
1636 [t1 @ Token::Word(..)] if t1.to_str() == "$!"
1637 );
1638 assert_matches!(
1639 &tokenize_str("$?")?[..],
1640 [t1 @ Token::Word(..)] if t1.to_str() == "$?"
1641 );
1642 assert_matches!(
1643 &tokenize_str("$*")?[..],
1644 [t1 @ Token::Word(..)] if t1.to_str() == "$*"
1645 );
1646 Ok(())
1647 }
1648
1649 #[test]
1650 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1651 assert_matches!(
1652 &tokenize_str("$x")?[..],
1653 [t1 @ Token::Word(..)] if t1.to_str() == "$x"
1654 );
1655 assert_matches!(
1656 &tokenize_str("a$x")?[..],
1657 [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
1658 );
1659 Ok(())
1660 }
1661
1662 #[test]
1663 fn tokenize_unterminated_parameter_expansion() {
1664 assert_matches!(
1665 tokenize_str("${x"),
1666 Err(TokenizerError::UnterminatedVariable)
1667 );
1668 }
1669
1670 #[test]
1671 fn tokenize_braced_parameter_expansion() -> Result<()> {
1672 assert_matches!(
1673 &tokenize_str("${x}")?[..],
1674 [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
1675 );
1676 assert_matches!(
1677 &tokenize_str("a${x}b")?[..],
1678 [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
1679 );
1680 Ok(())
1681 }
1682
1683 #[test]
1684 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1685 assert_matches!(
1686 &tokenize_str(r"a${x\}}b")?[..],
1687 [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
1688 );
1689 Ok(())
1690 }
1691
1692 #[test]
1693 fn tokenize_whitespace() -> Result<()> {
1694 assert_matches!(
1695 &tokenize_str("1 2 3")?[..],
1696 [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
1697 t1.to_str() == "1" &&
1698 t2.to_str() == "2" &&
1699 t3.to_str() == "3"
1700 );
1701 Ok(())
1702 }
1703
1704 #[test]
1705 fn tokenize_escaped_whitespace() -> Result<()> {
1706 assert_matches!(
1707 &tokenize_str(r"1\ 2 3")?[..],
1708 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1709 t1.to_str() == r"1\ 2" &&
1710 t2.to_str() == "3"
1711 );
1712 Ok(())
1713 }
1714
1715 #[test]
1716 fn tokenize_single_quote() -> Result<()> {
1717 assert_matches!(
1718 &tokenize_str(r"x'a b'y")?[..],
1719 [t1 @ Token::Word(..)] if
1720 t1.to_str() == r"x'a b'y"
1721 );
1722 Ok(())
1723 }
1724
1725 #[test]
1726 fn tokenize_double_quote() -> Result<()> {
1727 assert_matches!(
1728 &tokenize_str(r#"x"a b"y"#)?[..],
1729 [t1 @ Token::Word(..)] if
1730 t1.to_str() == r#"x"a b"y"#
1731 );
1732 Ok(())
1733 }
1734
1735 #[test]
1736 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1737 assert_matches!(
1738 &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
1739 [t1 @ Token::Word(..)] if
1740 t1.to_str() == r#"x"$(echo hi)"y"#
1741 );
1742 Ok(())
1743 }
1744
1745 #[test]
1746 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1747 assert_matches!(
1748 &tokenize_str(r#"x"$((1+2))"y"#)?[..],
1749 [t1 @ Token::Word(..)] if
1750 t1.to_str() == r#"x"$((1+2))"y"#
1751 );
1752 Ok(())
1753 }
1754
1755 #[test]
1756 fn test_quote_removal() {
1757 assert_eq!(unquote_str(r#""hello""#), "hello");
1758 assert_eq!(unquote_str(r#"'hello'"#), "hello");
1759 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1760 assert_eq!(unquote_str(r#"'hel\'lo'"#), r#"hel'lo"#);
1761 }
1762}