use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

/// The reason why a token ended during tokenization.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline character was encountered.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character (other than a newline) was encountered.
    NonNewLineBlank,
    /// The body of a here-document starts here.
    HereDocumentBodyStart,
    /// The body of a here-document ends here.
    HereDocumentBodyEnd,
    /// The end tag of a here-document was encountered.
    HereDocumentEndTag,
    /// An operator token starts here.
    OperatorStart,
    /// An operator token ended.
    OperatorEnd,
    /// The token ended for some other reason.
    Other,
}

/// Represents a position in source text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The 0-based index of the character in the input stream.
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: i32,
    /// The 1-based line number.
    pub line: i32,
    /// The 1-based column number.
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token.
    pub end: SourcePosition,
}

/// Represents a token extracted from source text.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source text.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

/// Encapsulates the result of tokenizing: the reason the token ended, along
/// with the token itself (if one was produced).
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token ended.
    pub reason: TokenEndReason,
    /// The token, if any.
    pub token: Option<Token>,
}

/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated ANSI C-quoted substring was encountered.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated backquoted command substitution was encountered.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob pattern was encountered.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was found without a preceding here tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// An expected here tag was missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// One or more here-documents were left unterminated.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading the input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns whether the error indicates that the input was merely
    /// incomplete (e.g., an unterminated quote or here-document), as opposed
    /// to being irrecoverably invalid.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

/// A list of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}

/// The active quoting mode, along with the position where the quoting began.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting is active.
    None,
    /// Inside an ANSI C quote (`$'...'`).
    AnsiC(SourcePosition),
    /// Inside a single-quoted substring.
    Single(SourcePosition),
    /// Inside a double-quoted substring.
    Double(SourcePosition),
}

/// The current state of here-document processing.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-documents are being processed.
    #[default]
    None,
    /// The next token will be the tag of a here-document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The current token is the tag of a here-document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The next line will start a here-document body.
    NextLineIsHereDoc,
    /// One or more here-document bodies are being consumed.
    InHereDocs,
}

/// Tracks a here-document tag whose body has not yet been fully consumed.
#[derive(Clone, Debug)]
struct HereTag {
    tag: String,
    tag_was_escaped_or_quoted: bool,
    remove_tabs: bool,
    position: SourcePosition,
    tokens: Vec<TokenizeResult>,
    pending_tokens_after: Vec<TokenizeResult>,
}

/// Tokenization state that spans multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The cursor's current position in the input stream.
    cursor: SourcePosition,
    /// The current state of here-document processing.
    here_state: HereState,
    /// Here tags whose bodies are still pending, in order of appearance.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced and queued for consumption.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options governing tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended globbing patterns (extglob) are enabled.
    pub enable_extended_globbing: bool,
    /// Whether to run in POSIX-compliant mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to run in `sh` emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

/// A tokenizer for shell input.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// State used while parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Takes the token accumulated so far, returning it and resetting the
    /// state for the next token.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token with the given reason, applying any pending
    /// here-document processing along the way.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If there's no token in progress, there's nothing to delimit; the one
        // exception is the end of a here-document body, which may
        // legitimately be empty.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // See if we're in the middle of processing a here-document.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // This token is the redirection operator itself; the *next*
                // token will be the here tag.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the trailing newline in the tag to make it easier
                // to match against the body.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Queue up the token we just finished so it can be yielded
                // after the here-document body.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We just finished a here-document body. First queue the
                // tokens that preceded it (the operator and the tag)...
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // ...then a marker for the start of the body...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // ...then the body itself...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...then the end tag...
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...and finally any tokens that were delimited between the
                // tag and the start of the body.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

/// Tokenizes a shell string, returning the tokens found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Tokenizes a shell string using the given options, returning the tokens
/// found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
/// * `options` - Options controlling the tokenizer's behavior.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

// Caching wrapper around `uncached_tokenize_str`; memoizes results keyed on
// the input string and the options used.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Tokenizes a shell string without consulting the tokenizer cache, returning
/// the tokens found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
/// * `options` - Options controlling the tokenizer's behavior.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input stream.
    #[allow(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Consumes and returns the next character, updating the cursor.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character without returning it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token from the input stream.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, /* include_space */ false)
    }

    /// Returns the next token from the input stream, optionally stopping
    /// early.
    ///
    /// # Arguments
    ///
    /// * `terminating_char` - If present, an unquoted occurrence of this
    ///   character delimits the token and ends scanning.
    /// * `include_space` - Whether leading blank characters should be retained
    ///   in the token instead of being skipped.
    #[allow(clippy::cognitive_complexity)]
    #[allow(clippy::if_same_then_else)]
    #[allow(clippy::panic_in_result_fn)]
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // First satisfy any token results already in the queue; only once
            // it's exhausted do we look at the input stream.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the
                // middle of an escape sequence or quoted substring.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    // See if a here-document body ends right at the end of
                    // the input.
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.position.to_string())
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're inside a here-document body; accumulate characters,
                // checking for the end tag whenever we complete a line.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop the leading tab (the tag used `<<-`).
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // See whether this character extends the current operator
                // into a longer one.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // While in an arithmetic expansion, we don't start
                        // here-document processing; we only watch for the
                        // closing `))`.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // This is a line continuation: consume the newline
                        // and don't include either character in the token.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    // A single quote immediately preceded by `$` starts an
                    // ANSI C quote instead of a plain single quote.
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // A single quote closes either a single-quoted or ANSI C-quoted
            // substring.
            else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is consumed as-is.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    // Consume the '$' and peek at what follows.
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            // If a second '(' follows, this is `$((`, i.e.,
                            // the start of an arithmetic expansion, and two
                            // closing parentheses will be required.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some(')'),
                                        /* include_space */ true,
                                    )?;

                                    // Here-document tokens get queued up until
                                    // the newline that starts their bodies.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parentheses so we know
                                    // how many closing ones to expect.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We hit a ')'; if it balances out the
                                        // last open parenthesis, we're done.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Otherwise, consume it and continue.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final closing parenthesis.
                            state.append_char(self.next_char()?.unwrap());
                        }
                        Some('{') => {
                            // Braced parameter expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        /* include_space */ false,
                                    )?;

                                    // Queue up here-document tokens, as above.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing '}'.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not a command substitution or braced expansion;
                            // append the '$' and let the following characters
                            // be tokenized normally.
                            state.append_char('$');
                        }
                    }
                } else {
                    // A backquoted command substitution: consume everything
                    // up to the closing backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;
                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            // Look for an unescaped closing backquote.
                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // An extended glob pattern such as `@(...)`: consume the whole
            // parenthesized pattern, tracking nesting depth.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank, but advance the start position so the
                    // next token doesn't include it.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // If we have a token in progress, most characters simply get
            // appended to it. (The same applies when scanning for a closing
            // '}', even if the token hasn't started yet.)
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment: consume characters up to (but not including) the
                // next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Checks whether the current token ends with the expected here-document
    /// end tag; if so, strips the tag from the token, delimits the body, and
    /// returns `true`.
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The end tag only counts if it appears at the start of a line.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                // Delimit the here-document body.
                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns whether the given character can start an extended glob pattern.
    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether the given character can start an operator.
    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether the given string is a recognized operator.
    fn is_operator(&self, s: &str) -> bool {
        // Handle operators that are only supported when not emulating `sh`.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[allow(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

/// Returns whether the given character is a blank (space or tab).
const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, then nothing changes quoting.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // In double quotes or ANSI C quotes, only a backslash starts an escape.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // In single quotes, nothing affects quoting.
        QuoteMode::Single(_) => false,
        // When unquoted, backslashes and quote characters affect quoting.
        QuoteMode::None => is_quoting_char(c),
    }
}

/// Returns whether the given character can affect quoting (i.e., is a
/// backslash or quote character).
const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

/// Returns the given string with one level of quoting removed: unescaped
/// quote characters are dropped and backslash escapes are unwrapped.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
#[allow(clippy::panic_in_result_fn)]
mod tests {

    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }
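
    // A quick non-snapshot sanity check of operator/word splitting: given the
    // blank- and operator-delimiting logic above, "a && b" should yield a
    // word, the `&&` operator, and another word.
    #[test]
    fn tokenize_operators_and_words() -> Result<()> {
        let tokens = tokenize_str("a && b")?;
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Token::Word(..)));
        assert_eq!(tokens[0].to_str(), "a");
        assert!(matches!(tokens[1], Token::Operator(..)));
        assert_eq!(tokens[1].to_str(), "&&");
        assert_eq!(tokens[2].to_str(), "b");
        Ok(())
    }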

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }
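
    // Unterminated constructs should be reported as merely "incomplete" via
    // `TokenizerError::is_incomplete`, letting interactive callers prompt for
    // more input rather than fail outright.
    #[test]
    fn tokenize_incomplete_input() {
        let err = tokenize_str("'unterminated").unwrap_err();
        assert_matches!(err, TokenizerError::UnterminatedSingleQuote(_));
        assert!(err.is_incomplete());
    }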

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }
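
    // The tokenizer can also be driven through its `Iterator` implementation;
    // iteration yields results until one arrives without a token (e.g., at
    // end of input).
    #[test]
    fn tokenize_via_iterator() -> Result<()> {
        let mut reader = std::io::BufReader::new("a b".as_bytes());
        let tokenizer = Tokenizer::new(&mut reader, &TokenizerOptions::default());

        let results = tokenizer.collect::<Result<Vec<_>, _>>()?;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].token.as_ref().map(Token::to_str), Some("a"));
        assert_eq!(results[1].token.as_ref().map(Token::to_str), Some("b"));
        Ok(())
    }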

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }
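
    // An ANSI C quoted span (`$'...'`) should be kept intact within a single
    // word token, with its escape sequences preserved verbatim.
    #[test]
    fn tokenize_ansi_c_quote() -> Result<()> {
        let tokens = tokenize_str(r"$'a\tb'")?;
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].to_str(), r"$'a\tb'");
        Ok(())
    }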

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}