use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

/// Reason why the tokenizer ended a token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A non-newline blank character (space or tab) ended the token.
    NonNewLineBlank,
    /// The body of a here-document is about to start.
    HereDocumentBodyStart,
    /// The body of a here-document ended.
    HereDocumentBodyEnd,
    /// The end tag of a here-document was reached.
    HereDocumentEndTag,
    /// An operator started.
    OperatorStart,
    /// An operator ended.
    OperatorEnd,
    /// Some other reason.
    Other,
}

/// Represents a position in source shell script text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The index of the character in the overall input (starting at 0).
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: i32,
    /// The line number (starting at 1).
    pub line: i32,
    /// The column number (starting at 1).
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token.
    pub end: SourcePosition,
}

/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source script.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

/// Encapsulates the result of tokenizing a single token.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token ended.
    pub reason: TokenEndReason,
    /// The token, if one was produced.
    pub token: Option<Token>,
}

#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    #[error("unterminated variable expression")]
    UnterminatedVariable,

    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns whether the error represents input that is merely incomplete (e.g., an
    /// unterminated quote, expansion, or here-document) rather than outright invalid.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

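// Illustrative sketch (not part of this module's API): a caller such as an interactive
// shell might use `is_incomplete()` to decide whether to prompt for a continuation line
// rather than reporting a hard error:
//
//     match tokenize_str(&buffer) {
//         Ok(tokens) => run(tokens),
//         Err(e) if e.is_incomplete() => prompt_for_more_input(),
//         Err(e) => report_error(e),
//     }
//
// `run`, `prompt_for_more_input`, and `report_error` are hypothetical helpers shown
// only for illustration.
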
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    pub tokens: &'a [Token],
}

#[derive(Clone, Debug)]
enum QuoteMode {
    None,
    AnsiC(SourcePosition),
    Single(SourcePosition),
    Double(SourcePosition),
}

/// Tracks here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// We are not currently tracking any here-documents.
    #[default]
    None,
    /// The *next* token will be the tag of a here-document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The *current* token is the tag of a here-document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The next line will start the body of a here-document.
    NextLineIsHereDoc,
    /// We are in the body of one or more here-documents.
    InHereDocs,
}

#[derive(Clone, Debug)]
struct HereTag {
    tag: String,
    tag_was_escaped_or_quoted: bool,
    remove_tabs: bool,
    position: SourcePosition,
    tokens: Vec<TokenizeResult>,
    pending_tokens_after: Vec<TokenizeResult>,
}

/// Tokenizer state that spans multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The current position in the source input.
    cursor: SourcePosition,
    /// The current here-document processing state.
    here_state: HereState,
    /// The here-document tags currently being tracked.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced but not yet returned to the caller.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options controlling how the tokenizer operates.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether to enable extended globbing (extglob) patterns.
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX compliance mode.
    pub posix_mode: bool,
    /// Whether to tokenize in `sh` compatibility mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

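// Illustrative sketch: constructing non-default options, e.g. for `sh`-style tokenization
// with extended globbing disabled (all names below are from this module):
//
//     let options = TokenizerOptions {
//         enable_extended_globbing: false,
//         posix_mode: true,
//         sh_mode: true,
//     };
//     let tokens = tokenize_str_with_options("echo hi", &options)?;
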
/// A tokenizer for shell script text.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// The state of parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If nothing has accumulated, don't produce a token -- unless we're ending a
        // here-document body, which may legitimately be empty.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Consult the current here-document state to see if this token needs special handling.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // The redirection operator just ended; the next token will be the here tag.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Keep a trailing newline on the tag to simplify matching the end of the body.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Hold on to any tokens seen after the tag until the here-document completes.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We just finished the body of the oldest outstanding here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // Queue the operator and tag tokens that introduced this here-document...
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // ...then mark the start of the here-document body...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // ...followed by the body itself...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...and the end tag.
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Finally, queue up any tokens that followed the tag on its original line.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

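/// Breaks the given input shell script string into tokens.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
///
/// # Example
///
/// Illustrative only; the exact crate path is assumed here:
///
/// ```ignore
/// let tokens = tokenize_str("echo hello | wc -l").unwrap();
/// assert!(matches!(&tokens[0], Token::Word(w, _) if w == "echo"));
/// ```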
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Breaks the given input shell script string into tokens, using the provided
/// tokenizer options.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - The options to use when tokenizing.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Breaks the given input shell script string into tokens without going through the
/// tokenization cache.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - The options to use when tokenizing.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, false)
    }

    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[expect(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Return any tokens already queued up (e.g., from completed here-documents)
            // before consuming more input.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the middle of
                // an escape sequence or an unterminated quoted string.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // If here-documents are still outstanding, see if the accumulated text
                // closes the next one; otherwise report them as unterminated.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // We've hit the terminating character requested by our caller.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're in the body of one or more here-documents.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop leading tabs when the here-document was started with `<<-`.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At the end of each line, check whether the body was just terminated
                    // by its end tag.
                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // We're accumulating an operator; see whether the next character would
                // extend it into a longer, still-valid operator.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // `))` closes the arithmetic expansion we were in.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // A backslash-newline pair is a line continuation; swallow it entirely.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                // A single quote ends a single-quoted or ANSI-C-quoted string.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // A double quote ends a double-quoted string.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if state.in_escape {
                // This character is escaped; append it literally.
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    self.consume_char()?;

                    // Look at what follows the dollar sign.
                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // A second '(' means arithmetic expansion, which needs two closing parens.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some(')'),
                                        true, /* include_space */
                                    )?;

                                    // Defer here-document tokens until we reach the end of the line.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parens so we know how many closers we need.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We found a ')'; if it balances the last open paren, we're done.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Otherwise consume the ')' and keep going.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final ')'.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Braced parameter expansion: ${...}
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        false, /* include_space */
                                    )?;

                                    // Defer here-document tokens until we reach the end of the line.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing '}' and finish the expansion.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not a substitution or expansion; keep the '$' as a literal.
                            state.append_char('$');
                        }
                    }
                } else {
                    // We saw a backquote; consume through the matching closing backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            } else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                // An extglob pattern like `!(...)`: consume through the balanced closing paren.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank but advance the token's start position past it.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            } else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                // The character is a normal part of the current (non-operator) token.
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment runs to the end of the line and is not part of any token.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // The character starts a new token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The tag only terminates the here-document if it appears on a line of its own.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }

        Ok(false)
    }

    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    fn is_operator(&self, s: &str) -> bool {
        // These extended operators are not recognized in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[expect(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

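// Illustrative sketch: the `Iterator` implementation above yields token results until it
// sees a result without a token (e.g., at end of input). A crate-internal caller could
// consume it like this:
//
//     let mut reader = std::io::BufReader::new("echo hi".as_bytes());
//     let tokenizer = Tokenizer::new(&mut reader, &TokenizerOptions::default());
//     for result in tokenizer {
//         let token_result = result?;
//         // ... inspect token_result.token and token_result.reason ...
//     }
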
const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, nothing changes the quoting state.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // Inside double quotes or ANSI-C quotes, only a backslash starts a new
        // quoting construct.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // Nothing affects quoting inside single quotes (until the closing quote).
        QuoteMode::Single(_) => false,
        QuoteMode::None => is_quoting_char(c),
    }
}

const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

/// Returns a new string with quoting removed from the given string.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}
1658}