use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

/// Represents the reason that tokenization stopped at a token boundary.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline was reached.
    UnescapedNewLine,
    /// The specified terminating character was reached.
    SpecifiedTerminatingChar,
    /// A non-newline blank character was reached.
    NonNewLineBlank,
    /// The body of a here document is starting.
    HereDocumentBodyStart,
    /// The body of a here document ended.
    HereDocumentBodyEnd,
    /// The end tag of a here document was reached.
    HereDocumentEndTag,
    /// A character that can start an operator was reached.
    OperatorStart,
    /// An operator was completed.
    OperatorEnd,
    /// Tokenization stopped for some other reason.
    Other,
}

/// Represents a position in a source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// The 0-based index of the character in the input stream.
    pub index: i32,
    /// The 1-based line number.
    pub line: i32,
    /// The 1-based column number.
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token.
    pub end: SourcePosition,
}

/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token, with its location.
    Operator(String, TokenLocation),
    /// A word token, with its location.
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the string value of the token.
    pub fn to_str(&self) -> &str {
        match self {
            Token::Operator(s, _) => s,
            Token::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source script.
    pub fn location(&self) -> &TokenLocation {
        match self {
            Token::Operator(_, l) => l,
            Token::Word(_, l) => l,
        }
    }
}
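
// Illustrative sketch (not part of the original module): consuming tokens
// produced by `tokenize_str` below; the input and printed output here are
// assumptions for exposition.
//
//     for token in tokenize_str("a && b")? {
//         match token {
//             Token::Operator(op, loc) => println!("operator {op} at {}", loc.start),
//             Token::Word(word, loc) => println!("word {word} at {}", loc.start),
//         }
//     }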

/// Encapsulates the result of a tokenization operation.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason tokenization stopped.
    pub reason: TokenEndReason,
    /// The token produced, if any.
    pub token: Option<Token>,
}

/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered at the end of the input stream.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered at the end of the input stream.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered at the end of the input stream.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated backquoted substring was encountered at the end of the input stream.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob pattern was encountered at the end of the input stream.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered at the end of the input stream.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered at the end of the input stream.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input stream could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was encountered without a preceding here tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The indicated here tag was expected but missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// One or more here documents were left unterminated at the end of the input stream.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// The input stream could not be read.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns whether this error indicates that the input was merely incomplete,
    /// i.e., that tokenization could succeed if more input were appended.
    pub fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}
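
// Illustrative sketch (an assumption, not original code): an interactive shell
// can use `is_incomplete` to decide between reporting a syntax error and
// prompting for a continuation line. `run` and `prompt_for_more_input` are
// hypothetical helpers.
//
//     match tokenize_str(&buffer) {
//         Ok(tokens) => run(tokens),
//         Err(e) if e.is_incomplete() => prompt_for_more_input(),
//         Err(e) => eprintln!("syntax error: {e}"),
//     }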

/// Encapsulates a sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}

/// Tracks the kind of quoting currently in effect.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting is active.
    None,
    /// A single-quoted substring began at the given position.
    Single(SourcePosition),
    /// A double-quoted substring began at the given position.
    Double(SourcePosition),
}

/// Tracks progress through tokenizing here documents.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here document is pending.
    #[default]
    None,
    /// The *next* token will be a here tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token *currently* being parsed is a here tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The body of a here document starts on the next line.
    NextLineIsHereDoc,
    /// We are in the body of one or more here documents.
    InHereDocs,
}
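
// Illustrative trace (an assumption for exposition, not original code): for the
// input `cat <<EOF`, a newline, a body line, and a terminating `EOF` line, the
// states visited are roughly:
//
//     None
//       -> NextTokenIsHereTag    (when the `<<` operator is delimited)
//       -> CurrentTokenIsHereTag (while `EOF` is the token in progress)
//       -> NextLineIsHereDoc     (once the tag token is delimited)
//       -> InHereDocs            (after the newline ending the command line)
//       -> None                  (after the terminating `EOF` tag line)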

/// Represents a here tag awaiting its here-document body.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag itself, including its trailing newline.
    tag: String,
    /// Whether the tag was escaped or quoted where it appeared.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tab characters should be stripped from the body.
    remove_tabs: bool,
    /// The position of the tag in the source script.
    position: SourcePosition,
    /// Tokens already produced for the redirection operator and the tag.
    tokens: Vec<TokenizeResult>,
    /// Tokens produced after the tag but before the here-document body.
    pending_tokens_after: Vec<TokenizeResult>,
}

/// Tokenization state that persists across individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Cursor within the overall token stream.
    cursor: SourcePosition,
    /// Current state of parsing here documents.
    here_state: HereState,
    /// Here tags whose bodies are still pending, in order of appearance.
    current_here_tags: Vec<HereTag>,
    /// Tokens already tokenized and queued for delivery before reading more input.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options controlling tokenization.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX-compliant mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to operate in sh emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}
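
// Illustrative sketch (assumed usage, not original code): tokenizing with
// non-default options, e.g. disabling extended globbing and enabling sh-style
// behavior.
//
//     let options = TokenizerOptions {
//         enable_extended_globbing: false,
//         posix_mode: true,
//         sh_mode: true,
//     };
//     let tokens = tokenize_str_with_options("echo hi", &options)?;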

/// A tokenizer for shell scripts, reading characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// Parse state for the token currently being built.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Takes the token accumulated so far, resets the state, and returns the token.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether the state is currently unquoted and unescaped.
    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the token accumulated so far, returning a result when a token is
    /// ready to be yielded; here-document bookkeeping may defer or queue tokens
    /// instead of returning them immediately.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // An empty token is reportable only when a here-document body just ended.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult { reason, token: None }));
        }

        // In some here-document states, we consume the token rather than yield it.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // The token just delimited is the redirection operator; hold onto it
                // until the here-document body has been consumed.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the newline in the here tag so it's easier to match against.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Defer this token until after the here-document body is tokenized.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // A here-document body just finished; queue the tokens held back for
                // its tag, followed by the body token and a synthesized end tag.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Synthesize a token for the end tag (without its trailing newline).
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

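/// Breaks the given input shell script string into tokens, returning them in
/// order of appearance.
///
/// # Examples
///
/// A minimal sketch of expected usage (illustrative; the token count assumes
/// the default `TokenizerOptions`):
///
/// ```ignore
/// let tokens = tokenize_str("echo hello | wc -l")?;
/// assert_eq!(tokens.len(), 5); // echo, hello, |, wc, -l
/// ```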
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Breaks the given input shell script string into tokens using the provided
/// tokenizer options, returning the tokens.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

// Despite its name, this function is the memoized entry point: the `cached`
// attribute caches results keyed on the owned input string and options.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Breaks the given input shell script string into tokens without consulting
/// the tokenization cache, returning the tokens.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Consumes and returns the next character, updating the cursor position.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character without returning it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it or moving the cursor.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token from the input.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }

    /// Returns the next token from the input, stopping early if the given
    /// terminating character is found unquoted at the top level.
    #[allow(clippy::if_same_then_else)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // First satisfy the request with any queued tokens (e.g., tokens held
            // back during here-document processing).
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We're at the end of the input; error out on any unterminated construct.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // We hit the terminating character requested by our caller.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're in the body of a here document. If tab removal was requested
                // and we're at the start of a line, drop leading tabs.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At the end of each line, check whether the line just completed
                    // was the terminating tag for the current here document.
                    if c == '\n' {
                        let next_here_tag = &self.cross_state.current_here_tags[0];
                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
                            unquote_str(next_here_tag.tag.as_str()).into()
                        } else {
                            next_here_tag.tag.as_str().into()
                        };

                        if let Some(current_token_without_here_tag) =
                            state.current_token().strip_suffix(tag_str.as_ref())
                        {
                            // The tag only terminates the body when it appears on a
                            // line of its own.
                            if current_token_without_here_tag.is_empty()
                                || current_token_without_here_tag.ends_with('\n')
                            {
                                state.replace_with_here_doc(
                                    current_token_without_here_tag.to_owned(),
                                );

                                result = state.delimit_current_token(
                                    TokenEndReason::HereDocumentBodyEnd,
                                    &mut self.cross_state,
                                )?;
                            }
                        }
                    }
                }
            } else if state.in_operator() {
                // Check whether the next character would extend the operator
                // accumulated so far.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // In an arithmetic expansion, `<<` is a shift operator rather
                        // than a here-document redirection; nothing to do here.
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                // This character changes the quoting state.
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // This is a line continuation; drop both characters.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Single(_))
                && c == '\''
            {
                // This character closes an open single quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(_))
                && c == '\"'
            {
                // This character closes an open double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if state.in_escape {
                // This character was escaped; take it literally.
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    self.consume_char()?;

                    // Look at the character after the `$` to see what sort of
                    // expression this starts.
                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // A command substitution or arithmetic expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            // A second `(` indicates an arithmetic expansion, which
                            // will require two closing parentheses.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some(')'))?;

                                    // Hold back here-document tokens until the newline
                                    // that precedes the body has been seen.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track parentheses opened within the substitution
                                    // so we can find its real end.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We found a `)`; it may close a nested paren
                                        // or finish the substitution.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final closing parenthesis.
                            state.append_char(self.next_char()?.unwrap());
                        }
                        Some('{') => {
                            // A braced parameter expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some('}'))?;

                                    // Hold back here-document tokens, as above.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str())
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing brace.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable)
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not a special expression; take the `$` literally and
                            // leave interpretation to later stages.
                            state.append_char('$');
                        }
                    }
                } else {
                    // We saw a backquote; consume it and scan for its unescaped
                    // terminating backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;
                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            } else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| self.can_start_extglob(x))
            {
                // An extended glob pattern such as `!(...)`; consume through the
                // matching closing parenthesis.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && self.can_start_operator(c) {
                // This character could start an operator; delimit any token already
                // in progress first.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else {
                    // Skip over leading whitespace, advancing the start position.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            } else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                // Keep accumulating the current word token.
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment: consume up to (but not including) the next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
                continue;
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // Start a new token with this character.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Returns whether the given character can end the prefix of an extended glob.
    fn can_start_extglob(&self, c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether the given character can start an operator.
    fn can_start_operator(&self, c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether the given string is a complete operator.
    fn is_operator(&self, s: &str) -> bool {
        // Handle non-POSIX operators that are supported unless in sh emulation mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
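
// Illustrative note (assumed behavior derived from `is_operator` above): with
// default options, `a |& b` tokenizes as `a`, `|&`, `b`, while in sh mode `|&`
// is not a recognized operator, so the same input yields `a`, `|`, `&`, `b`.
//
//     let sh = TokenizerOptions { sh_mode: true, ..Default::default() };
//     let bash_like = tokenize_str("a |& b")?;                  // a, |&, b
//     let sh_like = tokenize_str_with_options("a |& b", &sh)?;  // a, |, &, b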

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[allow(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}
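
// Illustrative sketch (assumed usage, not original code): since `Tokenizer`
// implements `Iterator`, a reader can be scanned with ordinary iterator
// constructs; iteration stops at the first result that carries no token.
//
//     let mut reader = std::io::BufReader::new("echo hi".as_bytes());
//     let mut tokenizer = Tokenizer::new(&mut reader, &TokenizerOptions::default());
//     for item in &mut tokenizer {
//         let result = item?;
//         // ... inspect result.token ...
//     }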

fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

/// Returns whether the given character changes the quoting state when seen
/// outside an escape sequence.
fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, then nothing changes the quoting state.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // Inside double quotes, only a backslash starts new quoting.
        QuoteMode::Double(_) => c == '\\',
        // Inside single quotes, nothing changes the quoting state.
        QuoteMode::Single(_) => false,
        // Otherwise, all quoting characters are significant.
        QuoteMode::None => is_quoting_char(c),
    }
}

fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

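/// Returns a copy of the given string with its quoting characters removed:
/// quote marks are dropped, and an escaped character is replaced by its
/// literal self. This is used above to compare here-document lines against
/// quoted here tags.
///
/// A small sketch of the behavior (the tests at the bottom of this file are
/// authoritative):
///
/// ```ignore
/// assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
/// ```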
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use pretty_assertions::{assert_eq, assert_matches};

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        let tokens = tokenize_str(
            r"a\
bc",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "abc"
        );
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_matches!(
            &tokenize_str("a>>b")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == ">>" &&
                t3.to_str() == "b"
        );
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        let tokens = tokenize_str(
            r#"a #comment
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"a #comment"#)?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a"
        );
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<HERE
HERE
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<HERE
SOMETHING
HERE
echo after
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n" &&
                t7.to_str() == "echo" &&
                t8.to_str() == "after" &&
                t9.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        // N.B. The body lines below are intentionally indented with tab
        // characters, which `<<-` is expected to strip.
        let tokens = tokenize_str(
            r#"cat <<-HERE
	SOMETHING
	HERE
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<-" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "EOF" &&
                t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
                t5.to_str() == "EOF" &&
                t6.to_str() == "|" &&
                t7.to_str() == "wc" &&
                t8.to_str() == "-l" &&
                t9.to_str() == "\n"
        );

        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Word(..),
             t10 @ Token::Operator(..),
             t11 @ Token::Word(..),
             t12 @ Token::Word(..),
             t13 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE1" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE1" &&
                t6.to_str() == "<<" &&
                t7.to_str() == "HERE2" &&
                t8.to_str() == "OTHER\n" &&
                t9.to_str() == "HERE2" &&
                t10.to_str() == "\n" &&
                t11.to_str() == "echo" &&
                t12.to_str() == "after" &&
                t13.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() -> Result<()> {
        let result = tokenize_str(
            r#"cat <<HERE
SOMETHING
"#,
        );
        assert!(result.is_err());
        Ok(())
    }

    #[test]
    fn tokenize_missing_here_tag() -> Result<()> {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE
TEXT
HERE
)"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
        );
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
        );
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"echo `echo hi`"#)?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "`echo hi`"
        );
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"echo `echo\`hi`")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == r"`echo\`hi`"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_matches!(
            &tokenize_str("a$(echo hi)b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$(echo hi)b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_matches!(
            &tokenize_str("echo $(echo !(x))")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(echo !(x))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_matches!(
            &tokenize_str("a$((1+2))b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$((1+2))b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // N.B. The spacing in the result is not identical to the input: blanks
        // before a token starts are skipped, while interior blanks are preserved
        // as single spaces.
        assert_matches!(
            &tokenize_str("$(( 1 ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$((1 ))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_matches!(
            &tokenize_str("$(( (0) ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$(((0)))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_matches!(
            &tokenize_str("$$")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$$"
        );
        assert_matches!(
            &tokenize_str("$@")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$@"
        );
        assert_matches!(
            &tokenize_str("$!")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$!"
        );
        assert_matches!(
            &tokenize_str("$?")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$?"
        );
        assert_matches!(
            &tokenize_str("$*")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$*"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_matches!(
            &tokenize_str("$x")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$x"
        );
        assert_matches!(
            &tokenize_str("a$x")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_matches!(
            &tokenize_str("${x}")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
        );
        assert_matches!(
            &tokenize_str("a${x}b")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
        );
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"a${x\}}b")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
        );
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_matches!(
            &tokenize_str("1 2 3")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
                t1.to_str() == "1" &&
                t2.to_str() == "2" &&
                t3.to_str() == "3"
        );
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"1\ 2 3")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == r"1\ 2" &&
                t2.to_str() == "3"
        );
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"x'a b'y")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r"x'a b'y"
        );
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"a b"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"a b"y"#
        );
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"$(echo hi)"y"#
        );
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"$((1+2))"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"$((1+2))"y"#
        );
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r#"'hello'"#), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r#"'hel\'lo'"#), r#"hel'lo"#);
    }
}
1761}