1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// Why the tokenizer stopped accumulating characters into the current token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The input was exhausted.
    EndOfInput,
    /// The token being ended is the unquoted newline operator.
    UnescapedNewLine,
    /// The caller-supplied terminating character was reached while unquoted.
    SpecifiedTerminatingChar,
    /// An unquoted blank (space or tab) followed an in-progress token.
    NonNewLineBlank,
    /// Marker queued right before the body of a here-document; carries no token.
    HereDocumentBodyStart,
    /// The body of a here-document ended because its end tag was found.
    HereDocumentBodyEnd,
    /// The token is the end tag that closes a here-document.
    HereDocumentEndTag,
    /// A character that can begin an operator interrupted an in-progress token.
    OperatorStart,
    /// The next character could not extend the operator being read.
    OperatorEnd,
    /// Any other character that ends an in-progress token.
    Other,
}
30
/// Alias for the source span that locates a token.
pub type TokenLocation = SourceSpan;
33
/// A single token produced by the tokenizer, paired with where it was found.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(
    any(test, feature = "serde"),
    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
)]
pub enum Token {
    /// An operator token: its text and its source span.
    Operator(String, SourceSpan),
    /// A word token: its text and its source span.
    Word(String, SourceSpan),
}
47
48impl Token {
49 pub fn to_str(&self) -> &str {
51 match self {
52 Self::Operator(s, _) => s,
53 Self::Word(s, _) => s,
54 }
55 }
56
57 pub const fn location(&self) -> &SourceSpan {
59 match self {
60 Self::Operator(_, l) => l,
61 Self::Word(_, l) => l,
62 }
63 }
64}
65
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    /// Builds a `miette` span from the token's start position and its length.
    fn from(token: &Token) -> Self {
        let span = token.location();
        let start = span.start.as_ref();
        Self::new(start.into(), span.length())
    }
}
73
/// Outcome of one tokenization step: why it stopped and what it produced.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Why the tokenizer stopped at this point.
    pub reason: TokenEndReason,
    /// The completed token, if the step produced one.
    pub token: Option<Token>,
}
82
/// Errors that can be reported while tokenizing input.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// Input ended right after an escape character.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// Input ended inside a single-quoted string; carries the recorded quote position.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// Input ended inside an ANSI C quoted string; carries the recorded quote position.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// Input ended inside a double-quoted string; carries the recorded quote position.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// Input ended inside a backquoted section; carries the position of the opening backquote.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// Input ended inside an extended glob pattern; carries the position where input ran out.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// Input ended inside a `${...}` expression.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A command substitution was not terminated.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// Input ended inside a nested `$(...)`, `$((...))` or `$[...]` construct.
    #[error("unterminated expansion")]
    UnterminatedExpansion,

    /// The next character of the input could not be decoded.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A token following a here-document tag was seen but no tag had been recorded.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// A here-document operator was followed by a newline instead of a tag.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// Input ended before all here-documents were closed; carries the
    /// comma-separated tag names and their positions.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// Reading from the underlying input failed.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
144
145impl TokenizerError {
146 pub const fn is_incomplete(&self) -> bool {
149 matches!(
150 self,
151 Self::UnterminatedEscapeSequence
152 | Self::UnterminatedAnsiCQuote(..)
153 | Self::UnterminatedSingleQuote(..)
154 | Self::UnterminatedDoubleQuote(..)
155 | Self::UnterminatedBackquote(..)
156 | Self::UnterminatedCommandSubstitution
157 | Self::UnterminatedExpansion
158 | Self::UnterminatedVariable
159 | Self::UnterminatedExtendedGlob(..)
160 | Self::UnterminatedHereDocuments(..)
161 )
162 }
163}
164
/// A borrowed view over a sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens being viewed.
    pub tokens: &'a [Token],
}
171
/// Which kind of quoting, if any, is active while reading a token.
/// Quoted variants carry the cursor position recorded when the quote opened.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not inside any quotes.
    None,
    /// Inside a single-quoted section that was preceded by `$`.
    AnsiC(SourcePosition),
    /// Inside a single-quoted section.
    Single(SourcePosition),
    /// Inside a double-quoted section.
    Double(SourcePosition),
}
179
/// Progress of here-document processing, carried across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is being processed.
    #[default]
    None,
    /// A `<<` or `<<-` operator is being delimited; the token after it is the tag.
    /// `remove_tabs` is true for `<<-`.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token being read is the here-document tag.
    CurrentTokenIsHereTag {
        /// Whether the operator was `<<-`.
        remove_tabs: bool,
        /// The already-delimited operator token, held back until the document completes.
        operator_token_result: TokenizeResult,
    },
    /// A tag has been recorded; tokens are held back until a newline token
    /// switches the state to `InHereDocs`.
    NextLineIsHereDoc,
    /// Here-document body text is being read.
    InHereDocs,
}
199
/// Bookkeeping for one pending here-document.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text with leading ASCII whitespace trimmed and a trailing `\n` appended.
    tag: String,
    /// Whether the tag contained a backslash or quote character.
    tag_was_escaped_or_quoted: bool,
    /// Whether tabs at the start of body lines are skipped (set for `<<-`).
    remove_tabs: bool,
    /// Cursor position recorded when the tag was registered.
    position: SourcePosition,
    /// The held-back operator and tag token results, in that order.
    tokens: Vec<TokenizeResult>,
    /// Token results seen after the tag but before the body started.
    pending_tokens_after: Vec<TokenizeResult>,
}
209
/// Tokenizer state that persists from one token to the next.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the input.
    cursor: SourcePosition,
    /// Current stage of here-document processing.
    here_state: HereState,
    /// Here-documents whose bodies have not been completed yet, oldest first.
    current_here_tags: Vec<HereTag>,
    /// Token results that are ready to be returned before reading more input.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}
223
/// Options that control how input is tokenized.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns such as `@(...)` are recognized.
    pub enable_extended_globbing: bool,
    /// POSIX-compliance flag.
    // NOTE(review): not consulted by the visible tokenizer code; confirm its effect elsewhere.
    pub posix_mode: bool,
    /// When set, the operators `<<<`, `&>`, `&>>`, `;;&`, `;&` and `|&` are not recognized.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    /// Extended globbing on; POSIX and `sh` modes off.
    fn default() -> Self {
        Self {
            sh_mode: false,
            posix_mode: false,
            enable_extended_globbing: true,
        }
    }
}
244
/// Tokenizer that reads characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable stream of decoded characters from the reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State carried across tokens (cursor, here-document progress, queue).
    cross_state: CrossTokenParseState,
    /// Options in effect for this tokenizer.
    options: TokenizerOptions,
}
251
/// State for the single token currently being assembled.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Where the token being assembled starts.
    pub start_position: SourcePosition,
    /// Text accumulated for the token so far.
    pub token_so_far: String,
    /// Whether the token being assembled is an operator.
    pub token_is_operator: bool,
    /// Whether the previous character started an escape.
    pub in_escape: bool,
    /// The quoting currently in effect.
    pub quote_mode: QuoteMode,
}
261
262impl TokenParseState {
263 pub fn new(start_position: &SourcePosition) -> Self {
264 Self {
265 start_position: start_position.to_owned(),
266 token_so_far: String::new(),
267 token_is_operator: false,
268 in_escape: false,
269 quote_mode: QuoteMode::None,
270 }
271 }
272
273 pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
274 let end = Arc::new(end_position.to_owned());
275 let token_location = SourceSpan {
276 start: Arc::new(std::mem::take(&mut self.start_position)),
277 end,
278 };
279
280 let token = if std::mem::take(&mut self.token_is_operator) {
281 Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
282 } else {
283 Token::Word(std::mem::take(&mut self.token_so_far), token_location)
284 };
285
286 end_position.clone_into(&mut self.start_position);
287 self.in_escape = false;
288 self.quote_mode = QuoteMode::None;
289
290 token
291 }
292
293 pub const fn started_token(&self) -> bool {
294 !self.token_so_far.is_empty()
295 }
296
297 pub fn append_char(&mut self, c: char) {
298 self.token_so_far.push(c);
299 }
300
301 pub fn append_str(&mut self, s: &str) {
302 self.token_so_far.push_str(s);
303 }
304
305 pub const fn unquoted(&self) -> bool {
306 !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
307 }
308
309 pub fn current_token(&self) -> &str {
310 &self.token_so_far
311 }
312
313 pub fn is_specific_operator(&self, operator: &str) -> bool {
314 self.token_is_operator && self.current_token() == operator
315 }
316
317 pub const fn in_operator(&self) -> bool {
318 self.token_is_operator
319 }
320
321 fn is_newline(&self) -> bool {
322 self.token_so_far == "\n"
323 }
324
325 fn replace_with_here_doc(&mut self, s: String) {
326 self.token_so_far = s;
327 }
328
329 #[allow(clippy::too_many_lines)]
330 pub fn delimit_current_token(
331 &mut self,
332 reason: TokenEndReason,
333 cross_token_state: &mut CrossTokenParseState,
334 ) -> Result<Option<TokenizeResult>, TokenizerError> {
335 if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
338 return Ok(Some(TokenizeResult {
339 reason,
340 token: None,
341 }));
342 }
343
344 let current_here_state = std::mem::take(&mut cross_token_state.here_state);
346 match current_here_state {
347 HereState::NextTokenIsHereTag { remove_tabs } => {
348 let operator_token_result = TokenizeResult {
351 reason,
352 token: Some(self.pop(&cross_token_state.cursor)),
353 };
354
355 cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
356 remove_tabs,
357 operator_token_result,
358 };
359
360 return Ok(None);
361 }
362 HereState::CurrentTokenIsHereTag {
363 remove_tabs,
364 operator_token_result,
365 } => {
366 if self.is_newline() {
367 return Err(TokenizerError::MissingHereTag(
368 self.current_token().to_owned(),
369 ));
370 }
371
372 cross_token_state.here_state = HereState::NextLineIsHereDoc;
373
374 let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
376 let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);
377
378 let tag_token_result = TokenizeResult {
379 reason,
380 token: Some(self.pop(&cross_token_state.cursor)),
381 };
382
383 cross_token_state.current_here_tags.push(HereTag {
384 tag,
385 tag_was_escaped_or_quoted,
386 remove_tabs,
387 position: cross_token_state.cursor.clone(),
388 tokens: vec![operator_token_result, tag_token_result],
389 pending_tokens_after: vec![],
390 });
391
392 return Ok(None);
393 }
394 HereState::NextLineIsHereDoc => {
395 if self.is_newline() {
396 cross_token_state.here_state = HereState::InHereDocs;
397 } else {
398 cross_token_state.here_state = HereState::NextLineIsHereDoc;
399 }
400
401 if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
402 let token = self.pop(&cross_token_state.cursor);
403 let result = TokenizeResult {
404 reason,
405 token: Some(token),
406 };
407
408 last_here_tag.pending_tokens_after.push(result);
409 } else {
410 return Err(TokenizerError::MissingHereTagForDocumentBody);
411 }
412
413 return Ok(None);
414 }
415 HereState::InHereDocs => {
416 let completed_here_tag = cross_token_state.current_here_tags.remove(0);
418
419 for here_token in completed_here_tag.tokens {
421 cross_token_state.queued_tokens.push(here_token);
422 }
423
424 cross_token_state.queued_tokens.push(TokenizeResult {
426 reason: TokenEndReason::HereDocumentBodyStart,
427 token: None,
428 });
429
430 cross_token_state.queued_tokens.push(TokenizeResult {
432 reason,
433 token: Some(self.pop(&cross_token_state.cursor)),
434 });
435
436 let end_tag = if completed_here_tag.tag_was_escaped_or_quoted {
438 unquote_str(&completed_here_tag.tag)
439 } else {
440 completed_here_tag.tag
441 };
442 self.append_str(end_tag.trim_end_matches('\n'));
443 cross_token_state.queued_tokens.push(TokenizeResult {
444 reason: TokenEndReason::HereDocumentEndTag,
445 token: Some(self.pop(&cross_token_state.cursor)),
446 });
447
448 for pending_token in completed_here_tag.pending_tokens_after {
451 cross_token_state.queued_tokens.push(pending_token);
452 }
453
454 if cross_token_state.current_here_tags.is_empty() {
455 cross_token_state.here_state = HereState::None;
456 } else {
457 cross_token_state.here_state = HereState::InHereDocs;
458 }
459
460 return Ok(None);
461 }
462 HereState::None => (),
463 }
464
465 let token = self.pop(&cross_token_state.cursor);
466 let result = TokenizeResult {
467 reason,
468 token: Some(token),
469 };
470
471 Ok(Some(result))
472 }
473}
474
475pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
481 tokenize_str_with_options(input, &TokenizerOptions::default())
482}
483
484pub fn tokenize_str_with_options(
491 input: &str,
492 options: &TokenizerOptions,
493) -> Result<Vec<Token>, TokenizerError> {
494 uncached_tokenize_string(input.to_owned(), options.to_owned())
495}
496
497#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
498fn uncached_tokenize_string(
499 input: String,
500 options: TokenizerOptions,
501) -> Result<Vec<Token>, TokenizerError> {
502 uncached_tokenize_str(input.as_str(), &options)
503}
504
505pub fn uncached_tokenize_str(
512 input: &str,
513 options: &TokenizerOptions,
514) -> Result<Vec<Token>, TokenizerError> {
515 let mut reader = std::io::BufReader::new(input.as_bytes());
516 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
517
518 let mut tokens = vec![];
519 loop {
520 match tokenizer.next_token()? {
521 TokenizeResult {
522 token: Some(token), ..
523 } => tokens.push(token),
524 TokenizeResult {
525 reason: TokenEndReason::EndOfInput,
526 ..
527 } => break,
528 _ => (),
529 }
530 }
531
532 Ok(tokens)
533}
534
535impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
536 pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
537 Tokenizer {
538 options: options.clone(),
539 char_reader: reader.chars().peekable(),
540 cross_state: CrossTokenParseState {
541 cursor: SourcePosition {
542 index: 0,
543 line: 1,
544 column: 1,
545 },
546 here_state: HereState::None,
547 current_here_tags: vec![],
548 queued_tokens: vec![],
549 arithmetic_expansion: false,
550 },
551 }
552 }
553
/// Returns a copy of the tokenizer's current cursor position (always `Some`).
#[expect(clippy::unnecessary_wraps)]
pub fn current_location(&self) -> Option<SourcePosition> {
    Some(self.cross_state.cursor.clone())
}
558
559 fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
560 let c = self
561 .char_reader
562 .next()
563 .transpose()
564 .map_err(TokenizerError::ReadError)?;
565
566 if let Some(ch) = c {
567 if ch == '\n' {
568 self.cross_state.cursor.line += 1;
569 self.cross_state.cursor.column = 1;
570 } else {
571 self.cross_state.cursor.column += 1;
572 }
573 self.cross_state.cursor.index += 1;
574 }
575
576 Ok(c)
577 }
578
579 fn consume_char(&mut self) -> Result<(), TokenizerError> {
580 let _ = self.next_char()?;
581 Ok(())
582 }
583
584 fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
585 match self.char_reader.peek() {
586 Some(result) => match result {
587 Ok(c) => Ok(Some(*c)),
588 Err(_) => Err(TokenizerError::FailedDecoding),
589 },
590 None => Ok(None),
591 }
592 }
593
/// Returns the next tokenization result, with no terminating character and
/// without keeping leading blanks.
pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
    self.next_token_until(None, false)
}
597
598 fn consume_nested_construct(
609 &mut self,
610 state: &mut TokenParseState,
611 terminating_char: char,
612 nesting_open: &str,
613 mut nesting_count: u32,
614 ) -> Result<(), TokenizerError> {
615 let mut pending_here_doc_tokens = vec![];
616 let mut drain_here_doc_tokens = false;
617
618 loop {
619 let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
620 if pending_here_doc_tokens.len() == 1 {
621 drain_here_doc_tokens = false;
622 }
623 pending_here_doc_tokens.remove(0)
624 } else {
625 let cur_token = self.next_token_until(Some(terminating_char), true)?;
626
627 if matches!(
628 cur_token.reason,
629 TokenEndReason::HereDocumentBodyStart
630 | TokenEndReason::HereDocumentBodyEnd
631 | TokenEndReason::HereDocumentEndTag
632 ) {
633 pending_here_doc_tokens.push(cur_token);
634 continue;
635 }
636 cur_token
637 };
638
639 if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
640 && !pending_here_doc_tokens.is_empty()
641 {
642 pending_here_doc_tokens.push(cur_token);
643 drain_here_doc_tokens = true;
644 continue;
645 }
646
647 if let Some(cur_token_value) = cur_token.token {
648 state.append_str(cur_token_value.to_str());
649
650 if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
651 nesting_count += 1;
652 }
653 }
654
655 match cur_token.reason {
656 TokenEndReason::HereDocumentBodyStart => {
657 state.append_char('\n');
658 }
659 TokenEndReason::NonNewLineBlank => state.append_char(' '),
660 TokenEndReason::SpecifiedTerminatingChar => {
661 nesting_count -= 1;
662 if nesting_count == 0 {
663 break;
664 }
665 state.append_char(self.next_char()?.unwrap());
666 }
667 TokenEndReason::EndOfInput => {
668 return Err(TokenizerError::UnterminatedExpansion);
669 }
670 _ => (),
671 }
672 }
673
674 state.append_char(self.next_char()?.unwrap());
675 Ok(())
676 }
677
/// Reads characters until one token has been delimited and returns the result.
///
/// * `terminating_char` - when `Some`, an unquoted occurrence of this
///   character ends the token with `SpecifiedTerminatingChar`; the character
///   itself is left unconsumed.
/// * `include_space` - when `true`, unquoted blanks seen before the token has
///   started are appended to it instead of being skipped.
///
/// Results queued by here-document processing are returned before any new
/// input is read.
///
/// # Errors
///
/// Returns a `TokenizerError` for read/decoding failures and for constructs
/// left open at the end of the input.
///
/// # Panics
///
/// Contains an `assert!` and several `unwrap()` calls on characters that were
/// just peeked.
#[expect(clippy::cognitive_complexity)]
#[expect(clippy::if_same_then_else)]
#[expect(clippy::panic_in_result_fn)]
#[expect(clippy::too_many_lines)]
#[allow(clippy::unwrap_in_result)]
fn next_token_until(
    &mut self,
    terminating_char: Option<char>,
    include_space: bool,
) -> Result<TokenizeResult, TokenizerError> {
    let mut state = TokenParseState::new(&self.cross_state.cursor);
    let mut result: Option<TokenizeResult> = None;

    while result.is_none() {
        // Hand out queued results (produced by here-documents) first.
        if !self.cross_state.queued_tokens.is_empty() {
            return Ok(self.cross_state.queued_tokens.remove(0));
        }

        let next = self.peek_char()?;
        // '\0' is only a placeholder; end of input is handled by the first branch.
        let c = next.unwrap_or('\0');

        if next.is_none() {
            // End of input: anything still open is an error.
            if state.in_escape {
                return Err(TokenizerError::UnterminatedEscapeSequence);
            }
            match state.quote_mode {
                QuoteMode::None => (),
                QuoteMode::AnsiC(pos) => {
                    return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                }
                QuoteMode::Single(pos) => {
                    return Err(TokenizerError::UnterminatedSingleQuote(pos));
                }
                QuoteMode::Double(pos) => {
                    return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                }
            }

            if !matches!(self.cross_state.here_state, HereState::None) {
                // The end tag may sit on the last line without a trailing newline.
                if self.remove_here_end_tag(&mut state, &mut result, false)? {
                    continue;
                }

                // Otherwise report every here-document that is still pending.
                let tag_names = self
                    .cross_state
                    .current_here_tags
                    .iter()
                    .map(|tag| tag.tag.trim())
                    .collect::<Vec<_>>()
                    .join(", ");
                let tag_positions = self
                    .cross_state
                    .current_here_tags
                    .iter()
                    .map(|tag| std::format!("{}", tag.position))
                    .collect::<Vec<_>>()
                    .join(", ");
                return Err(TokenizerError::UnterminatedHereDocuments(
                    tag_names,
                    tag_positions,
                ));
            }

            result = state
                .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
        } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
            // Inside a here-document body: text is taken verbatim, except that
            // tabs at the start of a line are dropped when the tag asks for it.
            if !self.cross_state.current_here_tags.is_empty()
                && self.cross_state.current_here_tags[0].remove_tabs
                && (!state.started_token() || state.current_token().ends_with('\n'))
                && c == '\t'
            {
                self.consume_char()?;
            } else {
                self.consume_char()?;
                state.append_char(c);

                // A completed line may be the end tag.
                if c == '\n' {
                    self.remove_here_end_tag(&mut state, &mut result, true)?;
                }
            }
        } else if state.unquoted() && terminating_char == Some(c) {
            // The requested terminator: stop without consuming it.
            result = state.delimit_current_token(
                TokenEndReason::SpecifiedTerminatingChar,
                &mut self.cross_state,
            )?;
        } else if state.in_operator() {
            // Extend the operator if the longer text is still an operator.
            let mut hypothetical_token = state.current_token().to_owned();
            hypothetical_token.push(c);

            if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                self.consume_char()?;
                state.append_char(c);
            } else {
                assert!(state.started_token());

                // Update cross-token state based on the operator that just ended.
                if self.cross_state.arithmetic_expansion {
                    if state.is_specific_operator(")") && c == ')' {
                        self.cross_state.arithmetic_expansion = false;
                    }
                } else if state.is_specific_operator("<<") {
                    self.cross_state.here_state =
                        HereState::NextTokenIsHereTag { remove_tabs: false };
                } else if state.is_specific_operator("<<-") {
                    self.cross_state.here_state =
                        HereState::NextTokenIsHereTag { remove_tabs: true };
                } else if state.is_specific_operator("(") && c == '(' {
                    self.cross_state.arithmetic_expansion = true;
                }

                let reason = if state.current_token() == "\n" {
                    TokenEndReason::UnescapedNewLine
                } else {
                    TokenEndReason::OperatorEnd
                };

                result = state.delimit_current_token(reason, &mut self.cross_state)?;
            }
        } else if does_char_newly_affect_quoting(&state, c) {
            if c == '\\' {
                self.consume_char()?;

                // Backslash-newline is dropped entirely (line continuation).
                if matches!(self.peek_char()?, Some('\n')) {
                    self.consume_char()?;
                } else {
                    state.in_escape = true;
                    state.append_char(c);
                }
            } else if c == '\'' {
                // A quote directly after `$` opens an ANSI C quote.
                if state.token_so_far.ends_with('$') {
                    state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                } else {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                }

                self.consume_char()?;
                state.append_char(c);
            } else if c == '\"' {
                state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                self.consume_char()?;
                state.append_char(c);
            }
        } else if !state.in_escape
            && matches!(
                state.quote_mode,
                QuoteMode::Single(..) | QuoteMode::AnsiC(..)
            )
            && c == '\''
        {
            // Closing single quote.
            state.quote_mode = QuoteMode::None;
            self.consume_char()?;
            state.append_char(c);
        } else if !state.in_escape
            && matches!(state.quote_mode, QuoteMode::Double(..))
            && c == '\"'
        {
            // Closing double quote.
            state.quote_mode = QuoteMode::None;
            self.consume_char()?;
            state.append_char(c);
        } else if state.in_escape {
            // The escaped character is taken literally.
            state.in_escape = false;
            self.consume_char()?;
            state.append_char(c);
        } else if (state.unquoted()
            || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
            && (c == '$' || c == '`')
        {
            if c == '$' {
                self.consume_char()?;

                let char_after_dollar_sign = self.peek_char()?;
                match char_after_dollar_sign {
                    Some('(') => {
                        state.append_char('$');

                        state.append_char(self.next_char()?.unwrap());

                        // A second '(' makes this `$((`, which starts two levels deep.
                        let (initial_nesting, is_arithmetic) =
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                (2, true)
                            } else {
                                (1, false)
                            };

                        if is_arithmetic {
                            self.cross_state.arithmetic_expansion = true;
                        }

                        self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;

                        if is_arithmetic {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    }

                    Some('[') => {
                        state.append_char('$');

                        state.append_char(self.next_char()?.unwrap());

                        // `$[` is treated as an arithmetic expansion.
                        self.cross_state.arithmetic_expansion = true;

                        self.consume_nested_construct(&mut state, ']', "[", 1)?;

                        self.cross_state.arithmetic_expansion = false;
                    }

                    Some('{') => {
                        state.append_char('$');

                        state.append_char(self.next_char()?.unwrap());

                        // Here-document results are held back until the newline
                        // after them has been seen, then replayed in order.
                        let mut pending_here_doc_tokens = vec![];
                        let mut drain_here_doc_tokens = false;

                        loop {
                            let cur_token = if drain_here_doc_tokens
                                && !pending_here_doc_tokens.is_empty()
                            {
                                if pending_here_doc_tokens.len() == 1 {
                                    drain_here_doc_tokens = false;
                                }

                                pending_here_doc_tokens.remove(0)
                            } else {
                                let cur_token = self.next_token_until(
                                    Some('}'),
                                    false,
                                )?;

                                if matches!(
                                    cur_token.reason,
                                    TokenEndReason::HereDocumentBodyStart
                                        | TokenEndReason::HereDocumentBodyEnd
                                        | TokenEndReason::HereDocumentEndTag
                                ) {
                                    pending_here_doc_tokens.push(cur_token);
                                    continue;
                                }

                                cur_token
                            };

                            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                && !pending_here_doc_tokens.is_empty()
                            {
                                pending_here_doc_tokens.push(cur_token);
                                drain_here_doc_tokens = true;
                                continue;
                            }

                            if let Some(cur_token_value) = cur_token.token {
                                state.append_str(cur_token_value.to_str());
                            }

                            match cur_token.reason {
                                TokenEndReason::HereDocumentBodyStart => {
                                    state.append_char('\n');
                                }
                                TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                TokenEndReason::SpecifiedTerminatingChar => {
                                    // Consume the closing '}' and finish.
                                    state.append_char(self.next_char()?.unwrap());
                                    break;
                                }
                                TokenEndReason::EndOfInput => {
                                    return Err(TokenizerError::UnterminatedVariable);
                                }
                                _ => (),
                            }
                        }
                    }
                    _ => {
                        // A plain '$': keep it and let the following characters
                        // be handled by later iterations.
                        state.append_char('$');
                    }
                }
            } else {
                // Backquoted section: copy verbatim up to the matching
                // unescaped backquote.
                let backquote_pos = self.cross_state.cursor.clone();
                self.consume_char()?;

                state.append_char(c);

                let mut escaping_enabled = false;
                let mut done = false;
                while !done {
                    let next_char_in_backquote = self.next_char()?;
                    if let Some(cib) = next_char_in_backquote {
                        state.append_char(cib);

                        if !escaping_enabled && cib == '\\' {
                            escaping_enabled = true;
                        } else {
                            if !escaping_enabled && cib == '`' {
                                done = true;
                            }
                            escaping_enabled = false;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                    }
                }
            }
        } else if c == '('
            && self.options.enable_extended_globbing
            && state.unquoted()
            && !state.in_operator()
            && state
                .current_token()
                .ends_with(|x| Self::can_start_extglob(x))
        {
            // Extended glob such as `@(...)`: copy through the balancing ')'.
            self.consume_char()?;
            state.append_char(c);

            let mut paren_depth = 1;
            let mut in_escape = false;

            while paren_depth > 0 {
                if let Some(extglob_char) = self.next_char()? {
                    state.append_char(extglob_char);

                    match extglob_char {
                        // An escaped character never changes the depth.
                        _ if in_escape => in_escape = false,
                        '\\' => in_escape = true,
                        '(' => paren_depth += 1,
                        ')' => paren_depth -= 1,
                        _ => (),
                    }
                } else {
                    return Err(TokenizerError::UnterminatedExtendedGlob(
                        self.cross_state.cursor.clone(),
                    ));
                }
            }
        } else if state.unquoted() && Self::can_start_operator(c) {
            if state.started_token() {
                // End the current token first; the operator character is left
                // unconsumed for the next call.
                result = state.delimit_current_token(
                    TokenEndReason::OperatorStart,
                    &mut self.cross_state,
                )?;
            } else {
                state.token_is_operator = true;
                self.consume_char()?;
                state.append_char(c);
            }
        } else if state.unquoted() && is_blank(c) {
            if state.started_token() {
                result = state.delimit_current_token(
                    TokenEndReason::NonNewLineBlank,
                    &mut self.cross_state,
                )?;
            } else if include_space {
                state.append_char(c);
            } else {
                // Skipped leading blank: move the token start past it.
                state.start_position.column += 1;
                state.start_position.index += 1;
            }

            // The blank is consumed in all three cases above.
            self.consume_char()?;
        } else if !state.token_is_operator
            && (state.started_token() || matches!(terminating_char, Some('}')))
        {
            // Ordinary character continuing a word.
            self.consume_char()?;
            state.append_char(c);
        } else if c == '#' {
            // Comment: discard everything up to, but not including, the newline.
            self.consume_char()?;

            let mut done = false;
            while !done {
                done = match self.peek_char()? {
                    Some('\n') => true,
                    None => true,
                    _ => {
                        self.consume_char()?;
                        false
                    }
                };
            }
        } else if state.started_token() {
            result =
                state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
        } else {
            // First character of a new word.
            self.consume_char()?;
            state.append_char(c);
        }
    }

    // The loop only exits once `result` is `Some`.
    let result = result.unwrap();

    Ok(result)
}
1183
1184 fn remove_here_end_tag(
1185 &mut self,
1186 state: &mut TokenParseState,
1187 result: &mut Option<TokenizeResult>,
1188 ends_with_newline: bool,
1189 ) -> Result<bool, TokenizerError> {
1190 if self.cross_state.current_here_tags.is_empty() {
1192 return Ok(false);
1193 }
1194
1195 let next_here_tag = &self.cross_state.current_here_tags[0];
1196
1197 let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
1198 unquote_str(next_here_tag.tag.as_str()).into()
1199 } else {
1200 next_here_tag.tag.as_str().into()
1201 };
1202
1203 let tag_str = if !ends_with_newline {
1204 tag_str
1205 .strip_suffix('\n')
1206 .unwrap_or_else(|| tag_str.as_ref())
1207 } else {
1208 tag_str.as_ref()
1209 };
1210
1211 if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
1212 if current_token_without_here_tag.is_empty()
1216 || current_token_without_here_tag.ends_with('\n')
1217 {
1218 state.replace_with_here_doc(current_token_without_here_tag.to_owned());
1219
1220 *result = state.delimit_current_token(
1222 TokenEndReason::HereDocumentBodyEnd,
1223 &mut self.cross_state,
1224 )?;
1225
1226 return Ok(true);
1227 }
1228 }
1229 Ok(false)
1230 }
1231
/// Reports whether `c` is one of the characters that may precede `(` in an
/// extended glob pattern.
const fn can_start_extglob(c: char) -> bool {
    match c {
        '@' | '!' | '?' | '+' | '*' => true,
        _ => false,
    }
}
1235
/// Reports whether `c` can be the first character of an operator.
const fn can_start_operator(c: char) -> bool {
    match c {
        '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>' => true,
        _ => false,
    }
}
1239
1240 fn is_operator(&self, s: &str) -> bool {
1241 if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
1243 return true;
1244 }
1245
1246 matches!(
1247 s,
1248 "&" | "&&"
1249 | "("
1250 | ")"
1251 | ";"
1252 | ";;"
1253 | "\n"
1254 | "|"
1255 | "||"
1256 | "<"
1257 | ">"
1258 | ">|"
1259 | "<<"
1260 | ">>"
1261 | "<&"
1262 | ">&"
1263 | "<<-"
1264 | "<>"
1265 )
1266 }
1267}
1268
1269impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1270 type Item = Result<TokenizeResult, TokenizerError>;
1271
1272 fn next(&mut self) -> Option<Self::Item> {
1273 match self.next_token() {
1274 #[expect(clippy::manual_map)]
1275 Ok(result) => match result.token {
1276 Some(_) => Some(Ok(result)),
1277 None => None,
1278 },
1279 Err(e) => Some(Err(e)),
1280 }
1281 }
1282}
1283
/// Reports whether `c` is a space or a tab.
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1287
1288const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1289 if state.in_escape {
1291 return false;
1292 }
1293
1294 match state.quote_mode {
1295 QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1298 if c == '\\' {
1299 true
1301 } else {
1302 false
1303 }
1304 }
1305 QuoteMode::Single(_) => false,
1307 QuoteMode::None => is_quoting_char(c),
1310 }
1311}
1312
/// Reports whether `c` is a backslash, a single quote or a double quote.
const fn is_quoting_char(c: char) -> bool {
    match c {
        '\\' | '\'' | '\"' => true,
        _ => false,
    }
}

/// Returns `s` with quoting removed: a backslash is dropped and the character
/// after it is kept literally; unescaped quote characters are dropped. A
/// trailing backslash with nothing after it is dropped as well.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::new();
    let mut escaped = false;

    for ch in s.chars() {
        if escaped {
            unquoted.push(ch);
            escaped = false;
        } else if ch == '\\' {
            escaped = true;
        } else if !is_quoting_char(ch) {
            unquoted.push(ch);
        }
    }

    unquoted
}
1340
1341#[cfg(test)]
1342mod tests {
1343
1344 use super::*;
1345 use anyhow::Result;
1346 use insta::assert_ron_snapshot;
1347 use pretty_assertions::{assert_eq, assert_matches};
1348
1349 #[derive(serde::Serialize, serde::Deserialize)]
1350 struct TokenizerResult<'a> {
1351 input: &'a str,
1352 result: Vec<Token>,
1353 }
1354
1355 fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
1356 Ok(TokenizerResult {
1357 input,
1358 result: tokenize_str(input)?,
1359 })
1360 }
1361
1362 #[test]
1363 fn tokenize_empty() -> Result<()> {
1364 let tokens = tokenize_str("")?;
1365 assert_eq!(tokens.len(), 0);
1366 Ok(())
1367 }
1368
1369 #[test]
1370 fn tokenize_line_continuation() -> Result<()> {
1371 assert_ron_snapshot!(test_tokenizer(
1372 r"a\
1373bc"
1374 )?);
1375 Ok(())
1376 }
1377
1378 #[test]
1379 fn tokenize_operators() -> Result<()> {
1380 assert_ron_snapshot!(test_tokenizer("a>>b")?);
1381 Ok(())
1382 }
1383
1384 #[test]
1385 fn tokenize_comment() -> Result<()> {
1386 assert_ron_snapshot!(test_tokenizer(
1387 r"a #comment
1388"
1389 )?);
1390 Ok(())
1391 }
1392
1393 #[test]
1394 fn tokenize_comment_at_eof() -> Result<()> {
1395 assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
1396 Ok(())
1397 }
1398
1399 #[test]
1400 fn tokenize_empty_here_doc() -> Result<()> {
1401 assert_ron_snapshot!(test_tokenizer(
1402 r"cat <<HERE
1403HERE
1404"
1405 )?);
1406 Ok(())
1407 }
1408
1409 #[test]
1410 fn tokenize_here_doc() -> Result<()> {
1411 assert_ron_snapshot!(test_tokenizer(
1412 r"cat <<HERE
1413SOMETHING
1414HERE
1415echo after
1416"
1417 )?);
1418 assert_ron_snapshot!(test_tokenizer(
1419 r"cat <<HERE
1420SOMETHING
1421HERE
1422"
1423 )?);
1424 assert_ron_snapshot!(test_tokenizer(
1425 r"cat <<HERE
1426SOMETHING
1427HERE
1428
1429"
1430 )?);
1431 assert_ron_snapshot!(test_tokenizer(
1432 r"cat <<HERE
1433SOMETHING
1434HERE"
1435 )?);
1436 Ok(())
1437 }
1438
1439 #[test]
1440 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1441 assert_ron_snapshot!(test_tokenizer(
1442 r"cat <<-HERE
1443 SOMETHING
1444 HERE
1445"
1446 )?);
1447 Ok(())
1448 }
1449
1450 #[test]
1451 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1452 assert_ron_snapshot!(test_tokenizer(
1453 r"cat <<EOF | wc -l
1454A B C
14551 2 3
1456D E F
1457EOF
1458"
1459 )?);
1460 Ok(())
1461 }
1462
1463 #[test]
1464 fn tokenize_multiple_here_docs() -> Result<()> {
1465 assert_ron_snapshot!(test_tokenizer(
1466 r"cat <<HERE1 <<HERE2
1467SOMETHING
1468HERE1
1469OTHER
1470HERE2
1471echo after
1472"
1473 )?);
1474 Ok(())
1475 }
1476
1477 #[test]
1478 fn tokenize_unterminated_here_doc() {
1479 let result = tokenize_str(
1480 r"cat <<HERE
1481SOMETHING
1482",
1483 );
1484 assert!(result.is_err());
1485 }
1486
1487 #[test]
1488 fn tokenize_missing_here_tag() {
1489 let result = tokenize_str(
1490 r"cat <<
1491",
1492 );
1493 assert!(result.is_err());
1494 }
1495
1496 #[test]
1497 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1498 assert_ron_snapshot!(test_tokenizer(
1499 r"echo $(cat <<HERE
1500TEXT
1501HERE
1502)"
1503 )?);
1504 Ok(())
1505 }
1506
/// Snapshots the result of `test_tokenizer` for a here-document that starts inside a
/// `$(...)` command substitution which is itself wrapped in double quotes; the
/// closing `)"` appears on the line after the `HERE` end tag.
1507 #[test]
1508 fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
1509 assert_ron_snapshot!(test_tokenizer(
1510 r#"echo "$(cat <<HERE
1511TEXT
1512HERE
1513)""#
1514 )?);
1515 Ok(())
1516 }
1517
/// Same scenario as the double-quoted command-substitution here-document case, except
/// that the input has a space between the `<<` operator and the `HERE` tag.
/// Snapshots the result of `test_tokenizer` for that input.
1518 #[test]
1519 fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
1520 assert_ron_snapshot!(test_tokenizer(
1521 r#"echo "$(cat << HERE
1522TEXT
1523HERE
1524)""#
1525 )?);
1526 Ok(())
1527 }
1528
/// Snapshots the result of `test_tokenizer` for a `$(...)` command substitution whose
/// first line opens two here-documents (`<<HERE1 <<HERE2`) and also contains a
/// pipeline (`| wc -l`); both bodies and end tags follow before the closing `)`.
1529 #[test]
1530 fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
1531 assert_ron_snapshot!(test_tokenizer(
1532 r"echo $(cat <<HERE1 <<HERE2 | wc -l
1533TEXT
1534HERE1
1535OTHER
1536HERE2
1537)"
1538 )?);
1539 Ok(())
1540 }
1541
/// Snapshots the result of `test_tokenizer` for an `echo` command whose argument is a
/// backquote-delimited span containing `echo hi`.
1542 #[test]
1543 fn tokenize_simple_backquote() -> Result<()> {
1544 assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
1545 Ok(())
1546 }
1547
/// Snapshots the result of `test_tokenizer` for a backquote-delimited span that
/// contains a backslash-escaped backquote between `echo` and `hi`.
1548 #[test]
1549 fn tokenize_backquote_with_escape() -> Result<()> {
1550 assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
1551 Ok(())
1552 }
1553
/// A lone backquote with no closing partner must be reported as
/// `TokenizerError::UnterminatedBackquote`.
#[test]
fn tokenize_unterminated_backquote() {
    let outcome = tokenize_str("`");
    assert_matches!(outcome, Err(TokenizerError::UnterminatedBackquote(_)));
}
1561
/// An opening `$(` with nothing after it must be reported as
/// `TokenizerError::UnterminatedExpansion`.
#[test]
fn tokenize_unterminated_command_substitution() {
    let outcome = tokenize_str("$(");
    assert_matches!(outcome, Err(TokenizerError::UnterminatedExpansion));
}
1571
/// An opening `$((` with nothing after it must be reported as
/// `TokenizerError::UnterminatedExpansion`.
#[test]
fn tokenize_unterminated_arithmetic_expansion() {
    let outcome = tokenize_str("$((");
    assert_matches!(outcome, Err(TokenizerError::UnterminatedExpansion));
}
1579
/// An opening `$[` (the bracket form the test name calls "legacy" arithmetic
/// expansion) with nothing after it must be reported as
/// `TokenizerError::UnterminatedExpansion`.
#[test]
fn tokenize_unterminated_legacy_arithmetic_expansion() {
    let outcome = tokenize_str("$[");
    assert_matches!(outcome, Err(TokenizerError::UnterminatedExpansion));
}
1587
/// Snapshots the result of `test_tokenizer` for an input in which `$(echo hi)` has
/// literal text directly on both sides (`a` and `b`), followed by a space and `c`.
1588 #[test]
1589 fn tokenize_command_substitution() -> Result<()> {
1590 assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
1591 Ok(())
1592 }
1593
/// Snapshots the result of `test_tokenizer` for a `$( ... )` command substitution
/// whose content is a parenthesized `(:)` group, separated from the outer
/// parentheses by spaces.
1594 #[test]
1595 fn tokenize_command_substitution_with_subshell() -> Result<()> {
1596 assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
1597 Ok(())
1598 }
1599
/// Snapshots the result of `test_tokenizer` for a `$(...)` command substitution that
/// contains the extglob-style pattern `!(x)`, so the input has nested parentheses
/// that end in `))`.
1600 #[test]
1601 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1602 assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
1603 Ok(())
1604 }
1605
/// Snapshots the result of `test_tokenizer` for an input in which `$((1+2))` has
/// literal text directly on both sides (`a` and `b`), followed by a space and `c`.
1606 #[test]
1607 fn tokenize_arithmetic_expression() -> Result<()> {
1608 assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
1609 Ok(())
1610 }
1611
/// Snapshots the result of `test_tokenizer` for a `$(( ... ))` expression whose
/// content (`1`) is padded with a space on each side.
1612 #[test]
1613 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1614 assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
1617 Ok(())
1618 }
/// Snapshots the result of `test_tokenizer` for a `$(( ... ))` expression whose
/// content is itself parenthesized (`(0)`), padded with spaces.
1619 #[test]
1620 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1621 assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
1622 Ok(())
1623 }
1624
/// Snapshots the result of `test_tokenizer` for each of the two-character inputs
/// `$$`, `$@`, `$!`, `$?` and `$*`, one snapshot per input.
/// NOTE(review): the snapshots are unnamed, so they are presumably keyed by the
/// order of the assertions below — confirm before reordering or inserting cases.
1625 #[test]
1626 fn tokenize_special_parameters() -> Result<()> {
1627 assert_ron_snapshot!(test_tokenizer("$$")?);
1628 assert_ron_snapshot!(test_tokenizer("$@")?);
1629 assert_ron_snapshot!(test_tokenizer("$!")?);
1630 assert_ron_snapshot!(test_tokenizer("$?")?);
1631 assert_ron_snapshot!(test_tokenizer("$*")?);
1632 Ok(())
1633 }
1634
/// Snapshots the result of `test_tokenizer` for an unbraced `$x`, first on its own
/// and then directly preceded by literal text (`a$x`).
1635 #[test]
1636 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1637 assert_ron_snapshot!(test_tokenizer("$x")?);
1638 assert_ron_snapshot!(test_tokenizer("a$x")?);
1639 Ok(())
1640 }
1641
/// A `${x` with no closing brace must be reported as
/// `TokenizerError::UnterminatedVariable`.
#[test]
fn tokenize_unterminated_parameter_expansion() {
    let outcome = tokenize_str("${x");
    assert_matches!(outcome, Err(TokenizerError::UnterminatedVariable));
}
1649
/// Snapshots the result of `test_tokenizer` for a braced `${x}`, first on its own and
/// then with literal text directly on both sides (`a${x}b`).
1650 #[test]
1651 fn tokenize_braced_parameter_expansion() -> Result<()> {
1652 assert_ron_snapshot!(test_tokenizer("${x}")?);
1653 assert_ron_snapshot!(test_tokenizer("a${x}b")?);
1654 Ok(())
1655 }
1656
/// Snapshots the result of `test_tokenizer` for a braced expansion that contains a
/// backslash-escaped `}` before the real closing brace, with literal text on both
/// sides of the expansion.
1657 #[test]
1658 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1659 assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
1660 Ok(())
1661 }
1662
/// Snapshots the result of `test_tokenizer` for three single-character words
/// separated by single spaces.
1663 #[test]
1664 fn tokenize_whitespace() -> Result<()> {
1665 assert_ron_snapshot!(test_tokenizer("1 2 3")?);
1666 Ok(())
1667 }
1668
/// Snapshots the result of `test_tokenizer` for an input where the first space is
/// preceded by a backslash (`1\ 2`) and the second space is not.
1669 #[test]
1670 fn tokenize_escaped_whitespace() -> Result<()> {
1671 assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
1672 Ok(())
1673 }
1674
/// Snapshots the result of `test_tokenizer` for a single-quoted span containing a
/// space (`'a b'`), with literal text directly on both sides (`x` and `y`).
1675 #[test]
1676 fn tokenize_single_quote() -> Result<()> {
1677 assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
1678 Ok(())
1679 }
1680
/// Snapshots the result of `test_tokenizer` for a double-quoted span containing a
/// space (`"a b"`), with literal text directly on both sides (`x` and `y`).
1681 #[test]
1682 fn tokenize_double_quote() -> Result<()> {
1683 assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
1684 Ok(())
1685 }
1686
/// Snapshots the result of `test_tokenizer` for a `$(echo hi)` command substitution
/// enclosed in double quotes, with literal text directly on both sides (`x` and `y`).
1687 #[test]
1688 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1689 assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
1690 Ok(())
1691 }
1692
/// Snapshots the result of `test_tokenizer` for a `$((1+2))` arithmetic expression
/// enclosed in double quotes, with literal text directly on both sides (`x` and `y`).
1693 #[test]
1694 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1695 assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
1696 Ok(())
1697 }
1698
/// Checks `unquote_str` against a table of quoted inputs and their expected
/// unquoted forms: plain double- and single-quoted words, plus each quote style
/// containing a backslash-escaped instance of its own quote character.
#[test]
fn test_quote_removal() {
    let cases = [
        (r#""hello""#, "hello"),
        (r"'hello'", "hello"),
        (r#""hel\"lo""#, r#"hel"lo"#),
        (r"'hel\'lo'", r"hel'lo"),
    ];

    for &(quoted, expected) in &cases {
        // Name the offending input, since a failure inside the loop no longer
        // points at a distinct source line per case.
        assert_eq!(unquote_str(quoted), expected, "unquoting {:?}", quoted);
    }
}
1706}