use std::borrow::Cow;
use std::fmt::Display;
use std::sync::Arc;
use utf8_chars::BufReadCharsExt;

/// Indicates why the tokenizer stopped producing the current token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was reached.
    SpecifiedTerminatingChar,
    /// A blank (non-newline) character ended the token.
    NonNewLineBlank,
    /// The body of a here document starts after this token.
    HereDocumentBodyStart,
    /// The body of a here document ends with this token.
    HereDocumentBodyEnd,
    /// This token is the end tag of a here document.
    HereDocumentEndTag,
    /// An operator began after this token.
    OperatorStart,
    /// The current operator token was completed.
    OperatorEnd,
    /// The token ended for another reason.
    Other,
}

/// Represents a position within source shell input.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The 0-based index of the position within the input.
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: usize,
    /// The 1-based line number.
    pub line: usize,
    /// The 1-based column number.
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: usize,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

#[cfg(feature = "diagnostics")]
impl From<&SourcePosition> for miette::SourceOffset {
    #[allow(clippy::cast_sign_loss)]
    fn from(position: &SourcePosition) -> Self {
        position.index.into()
    }
}

/// Represents the location of a token in its source shell input.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: Arc<SourcePosition>,
    /// The end position of the token.
    pub end: Arc<SourcePosition>,
}

impl TokenLocation {
    /// Returns the length of the token.
    pub fn length(&self) -> usize {
        self.end.index - self.start.index
    }

    /// Constructs a location spanning from the start of `start` to the end of `end`.
    pub(crate) fn within(start: &Self, end: &Self) -> Self {
        Self {
            start: start.start.clone(),
            end: end.end.clone(),
        }
    }
}

/// Represents a token extracted from shell input.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token, along with its location.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token, along with its location.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source input.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    fn from(token: &Token) -> Self {
        let start = token.location().start.as_ref();
        Self::new(start.into(), token.location().length())
    }
}

/// The result of a tokenization operation.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token was terminated.
    pub reason: TokenEndReason,
    /// The token, if any was produced.
    pub token: Option<Token>,
}

/// An error that occurred while tokenizing shell input.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    #[error("unterminated variable expression")]
    UnterminatedVariable,

    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns true if the error may be resolved by reading more input.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

/// A collection of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}

/// The tokenizer's current quoting mode.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting is active.
    None,
    /// In an ANSI C quoted string ($'...') started at the given position.
    AnsiC(SourcePosition),
    /// In a single-quoted string started at the given position.
    Single(SourcePosition),
    /// In a double-quoted string started at the given position.
    Double(SourcePosition),
}

/// Tracks the state of here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here documents are in progress.
    #[default]
    None,
    /// The next token parsed should be the tag of a here document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being parsed is the tag of a here document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The body of a here document starts on the next line.
    NextLineIsHereDoc,
    /// The tokenizer is consuming the bodies of pending here documents.
    InHereDocs,
}

/// Describes a pending here document awaiting its body.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text (with a trailing newline appended).
    tag: String,
    /// Whether the tag was escaped or quoted where it appeared.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be removed from the body (i.e., `<<-`).
    remove_tabs: bool,
    /// The position at which the tag was found.
    position: SourcePosition,
    /// Tokens already produced for this here document's redirection.
    tokens: Vec<TokenizeResult>,
    /// Tokens parsed after the tag but before the here-document body.
    pending_tokens_after: Vec<TokenizeResult>,
}

/// State tracked across multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The current position in the source input.
    cursor: SourcePosition,
    /// The current here-document processing state.
    here_state: HereState,
    /// Here tags whose bodies have not yet been consumed.
    current_here_tags: Vec<HereTag>,
    /// Tokens already computed and queued for later retrieval.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options used to control the behavior of the tokenizer.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX compliance mode.
    pub posix_mode: bool,
    /// Whether to operate in sh emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

/// A tokenizer that produces tokens from a stream of shell input.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// State tracked while parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Takes the accumulated text and returns it as a completed token,
    /// resetting this state for the next token.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = TokenLocation {
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token (if any) with the given reason, applying any
    /// pending here-document state transitions, and returns the resulting token.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Handle any pending here-document state before emitting the token.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The current token is the body of the oldest pending here document;
                // queue up its redirection tokens, its body, and its end tag.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

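/// Breaks the given input shell script string into tokens, returning them in
/// the order encountered, or a `TokenizerError` if tokenization fails.
///
/// As a minimal illustration, `tokenize_str("echo hello")` is expected to
/// yield two `Word` tokens, `"echo"` and `"hello"`.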
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Breaks the given input shell script string into tokens, using the provided
/// tokenizer options.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Breaks the given input shell script string into tokens, bypassing the
/// tokenization cache.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token from the input, or a token-less result when the
    /// end of the input is reached.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, /* include_space */ false)
    }

    /// Returns the next token, stopping early if the given terminating character
    /// is encountered while unquoted. If `include_space` is true, blank characters
    /// are accumulated into the token instead of being skipped.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Return any tokens already queued up (e.g., from here-document processing).
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the middle
                // of an escape sequence or an unterminated quoted string.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're consuming the body of one or more here documents.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop leading tabs when the here document uses `<<-`.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // See whether the next character extends the current operator.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Line continuation: drop the backslash-newline pair entirely.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                // Reached the closing single quote (or ANSI C quote).
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // Reached the closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if state.in_escape {
                // This character is escaped; take it as-is.
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                // A second '(' signals arithmetic expansion: $(( ... )).
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self
                                        .next_token_until(Some(')'), /* include_space */ true)?;

                                    // Defer here-document tokens; they're replayed after the
                                    // unescaped newline that precedes their bodies.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parentheses so we know how many
                                    // closing parentheses are still required.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final closing parenthesis.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Braced parameter expansion: ${ ... }.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self
                                        .next_token_until(Some('}'), /* include_space */ false)?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing brace.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not an expansion we track here; just take the '$'.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;
                    state.append_char(c);

                    // Consume until the matching (unescaped) closing backquote.
                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            } else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                // An extended glob pattern; consume through the matching closing parenthesis.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank but keep the token's start position up to date.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            } else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment; consume up to (but not including) the next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    fn is_operator(&self, s: &str) -> bool {
        // These operators are only recognized outside of sh emulation mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[expect(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

/// Returns whether the given character would newly affect quoting, given the
/// current token-parsing state.
const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're already escaped, nothing new affects quoting.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // In double quotes or ANSI C quotes, only a backslash starts something new.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // In single quotes, nothing affects quoting until the closing quote.
        QuoteMode::Single(_) => false,
        // When unquoted, any quoting character starts a quoted region.
        QuoteMode::None => is_quoting_char(c),
    }
}

const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

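/// Returns the given string with unescaped quoting characters removed and
/// backslash escapes unwrapped; for example, `"hello"` and `'hello'` both
/// become `hello`.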
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            // An escaped character is kept as-is.
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            // A backslash starts an escape sequence.
            '\\' => in_escape = true,
            // Unescaped quoting characters are dropped.
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }
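
    // Illustrative addition (not from the original source): a minimal sanity
    // check that simple blank-delimited input splits into word tokens. It
    // assumes default options and only exercises behavior implemented above.
    #[test]
    fn tokenize_simple_words() -> Result<()> {
        let tokens = tokenize_str("echo hello world")?;
        assert_eq!(tokens.len(), 3);
        assert_matches!(tokens[0], Token::Word(..));
        assert_eq!(tokens[0].to_str(), "echo");
        assert_eq!(tokens[2].to_str(), "world");
        Ok(())
    }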

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }
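
    // Illustrative addition (not from the original source): checks that an
    // unquoted operator is split out from surrounding words, without relying
    // on snapshots. Assumes default options.
    #[test]
    fn tokenize_operator_splits_words() -> Result<()> {
        let tokens = tokenize_str("a && b")?;
        assert_eq!(tokens.len(), 3);
        assert_matches!(tokens[1], Token::Operator(..));
        assert_eq!(tokens[1].to_str(), "&&");
        Ok(())
    }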

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat <<HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat << HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }
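
    // Illustrative addition (not from the original source): demonstrates that
    // errors caused by truncated input report themselves as incomplete via
    // `TokenizerError::is_incomplete()`.
    #[test]
    fn tokenize_incomplete_input() {
        let result = tokenize_str("'unterminated");
        let err = result.expect_err("expected tokenization to fail");
        assert!(err.is_incomplete());
    }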

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}