1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// Represents the reason why tokenization of a given token came to an end.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline character was encountered.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character (other than a newline) was encountered.
    NonNewLineBlank,
    /// The body of a here-document starts after this point.
    HereDocumentBodyStart,
    /// The body of a here-document ends at this point.
    HereDocumentBodyEnd,
    /// This token is a here-document end tag.
    HereDocumentEndTag,
    /// An operator begins after this token.
    OperatorStart,
    /// An operator token was completed.
    OperatorEnd,
    /// Some other delimiting condition occurred.
    Other,
}
30
/// The source location of a token.
pub type TokenLocation = SourceSpan;
33
/// A token produced by the tokenizer, together with its source location.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(
    any(test, feature = "serde"),
    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
)]
pub enum Token {
    /// An operator token (e.g. `&&`, `>>`).
    Operator(String, SourceSpan),
    /// A word token.
    Word(String, SourceSpan),
}
47
48impl Token {
49 pub fn to_str(&self) -> &str {
51 match self {
52 Self::Operator(s, _) => s,
53 Self::Word(s, _) => s,
54 }
55 }
56
57 pub const fn location(&self) -> &SourceSpan {
59 match self {
60 Self::Operator(_, l) => l,
61 Self::Word(_, l) => l,
62 }
63 }
64}
65
/// Allows a [`Token`]'s location to be used directly as a `miette` source span
/// when the `diagnostics` feature is enabled.
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    fn from(token: &Token) -> Self {
        let start = token.location().start.as_ref();
        // NOTE(review): assumes `SourceSpan::length()` yields the span's length
        // in the units miette expects — defined elsewhere in the crate.
        Self::new(start.into(), token.location().length())
    }
}
73
/// Result of a single tokenization step.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason tokenization stopped.
    pub reason: TokenEndReason,
    /// The token produced, if any.
    pub token: Option<Token>,
}
82
/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// The input ended in the middle of an escape sequence.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// A single-quoted string opened at the given position was never closed.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An ANSI-C quoted string (`$'...'`) opened at the given position was never closed.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// A double-quoted string opened at the given position was never closed.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// A backquoted command substitution opened near the given position was never closed.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An extended glob pattern opened near the given position was never closed.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// A variable expression was never closed.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A command substitution was never closed.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// An expansion construct was never closed.
    #[error("unterminated expansion")]
    UnterminatedExpansion,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was encountered without a corresponding tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// A here-document redirection was not followed by a tag.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// One or more here-documents were never terminated by their end tags.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading the input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
143
144impl TokenizerError {
145 pub const fn is_incomplete(&self) -> bool {
148 matches!(
149 self,
150 Self::UnterminatedEscapeSequence
151 | Self::UnterminatedAnsiCQuote(..)
152 | Self::UnterminatedSingleQuote(..)
153 | Self::UnterminatedDoubleQuote(..)
154 | Self::UnterminatedBackquote(..)
155 | Self::UnterminatedCommandSubstitution
156 | Self::UnterminatedExpansion
157 | Self::UnterminatedVariable
158 | Self::UnterminatedExtendedGlob(..)
159 | Self::UnterminatedHereDocuments(..)
160 )
161 }
162}
163
/// A borrowed view over a sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}
170
/// The quoting construct currently in effect; position-carrying variants record
/// where the construct opened, for use in error reporting.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting in effect.
    None,
    /// ANSI-C quoting (`$'...'`), opened at the given position.
    AnsiC(SourcePosition),
    /// Single-quoting, opened at the given position.
    Single(SourcePosition),
    /// Double-quoting, opened at the given position.
    Double(SourcePosition),
}
178
/// State machine tracking progress through here-document parsing.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is being parsed.
    #[default]
    None,
    /// The *next* token to be parsed will be a here-doc tag (we just finished
    /// a `<<` or `<<-` operator).
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being parsed is a here-doc tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        /// The already-delimited redirection operator token, held so it can be
        /// emitted in the correct order once the here-doc body is complete.
        operator_token_result: TokenizeResult,
    },
    /// The next line will start the body of the pending here-document(s).
    NextLineIsHereDoc,
    /// Currently consuming the body of here-document(s).
    InHereDocs,
}
198
/// Bookkeeping for a single pending here-document tag.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, normalized to end with a newline.
    tag: String,
    /// Whether the tag contained quoting or escape characters.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be stripped from body lines (`<<-`).
    remove_tabs: bool,
    /// Where the tag appeared in the source.
    position: SourcePosition,
    /// The redirection operator and tag tokens, held for later emission.
    tokens: Vec<TokenizeResult>,
    /// Tokens seen after the tag on the same line; emitted after the body.
    pending_tokens_after: Vec<TokenizeResult>,
}
208
/// Tokenization state that persists across individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the input stream.
    cursor: SourcePosition,
    /// Current here-document parsing state.
    here_state: HereState,
    /// Here-doc tags whose bodies have not yet been fully consumed.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced (e.g. by here-doc processing) awaiting emission.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether tokenization is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}
222
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns (e.g. `@(...)`) are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to tokenize in POSIX-compliant mode.
    pub posix_mode: bool,
    /// Whether to tokenize in `sh` compatibility mode (restricts the
    /// recognized operator set).
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235 fn default() -> Self {
236 Self {
237 enable_extended_globbing: true,
238 posix_mode: false,
239 sh_mode: false,
240 }
241 }
242}
243
/// A tokenizer for shell input, reading characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable UTF-8 character stream over the underlying reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State that persists across tokens (cursor, here-doc bookkeeping, etc.).
    cross_state: CrossTokenParseState,
    /// Options controlling tokenization behavior.
    options: TokenizerOptions,
}
250
/// Parse state for the single token currently being accumulated.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Where the token starts in the source.
    pub start_position: SourcePosition,
    /// The token text accumulated so far.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator.
    pub token_is_operator: bool,
    /// Whether the next character is backslash-escaped.
    pub in_escape: bool,
    /// The quoting construct currently in effect, if any.
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates a fresh parse state for a token starting at the given position.
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a [`Token`] ending at
    /// `end_position`, resetting the state so a new token can start at that
    /// same position.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = SourceSpan {
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        // The next token starts where this one ended.
        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any text has been accumulated for the current token.
    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the token being accumulated.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the token being accumulated.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether parsing is currently outside any escape or quoting
    /// construct.
    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far for the current token.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the current token is exactly the given operator string.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether the current token is being parsed as an operator.
    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the current token is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated token text (used to swap in a here-doc body
    /// with its end tag stripped).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token for the given reason, updating cross-token
    /// here-document state as needed.
    ///
    /// Returns `Ok(Some(..))` when a result should be yielded to the caller
    /// immediately; `Ok(None)` when the token was captured into here-document
    /// bookkeeping (queued for later emission) and scanning should continue.
    #[allow(clippy::too_many_lines)]
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // With no accumulated text there's no token to produce. Here-doc body
        // ends are the exception: an empty body is still significant.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // We just finished the redirection operator itself; hold on to
                // it until the tag token has been parsed.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A bare newline where the tag should be means it's missing.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Record the tag (normalized to end with a newline), along with
                // the operator and tag tokens, for emission once the body ends.
                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Tokens between the tag and the end of its line belong after
                // the here-doc body in the final ordering; queue them on the
                // most recently recorded tag.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The body for the frontmost pending tag is complete. Flush, in
                // order: the operator + tag tokens, a body-start marker, the
                // body itself, the end tag, and any tokens that followed the
                // tag on its original line.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Emit the end tag itself, unquoted if the tag was quoted.
                let end_tag = if completed_here_tag.tag_was_escaped_or_quoted {
                    unquote_str(&completed_here_tag.tag)
                } else {
                    completed_here_tag.tag
                };
                self.append_str(end_tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Default path: no here-document processing in effect.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
473
474pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
480 tokenize_str_with_options(input, &TokenizerOptions::default())
481}
482
483pub fn tokenize_str_with_options(
490 input: &str,
491 options: &TokenizerOptions,
492) -> Result<Vec<Token>, TokenizerError> {
493 uncached_tokenize_string(input.to_owned(), options.to_owned())
494}
495
496#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
497fn uncached_tokenize_string(
498 input: String,
499 options: TokenizerOptions,
500) -> Result<Vec<Token>, TokenizerError> {
501 uncached_tokenize_str(input.as_str(), &options)
502}
503
504pub fn uncached_tokenize_str(
511 input: &str,
512 options: &TokenizerOptions,
513) -> Result<Vec<Token>, TokenizerError> {
514 let mut reader = std::io::BufReader::new(input.as_bytes());
515 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
516
517 let mut tokens = vec![];
518 loop {
519 match tokenizer.next_token()? {
520 TokenizeResult {
521 token: Some(token), ..
522 } => tokens.push(token),
523 TokenizeResult {
524 reason: TokenEndReason::EndOfInput,
525 ..
526 } => break,
527 _ => (),
528 }
529 }
530
531 Ok(tokens)
532}
533
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a new tokenizer over the given reader, with the given options.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                // Positions are 1-based for line/column, 0-based for index.
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input stream.
    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Consumes and returns the next character, advancing the cursor
    /// (line/column/index) to account for it.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character, discarding it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it or moving the cursor.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Tokenizes and returns the next token from the input stream.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, false /* include_space */)
    }

    /// Consumes a nested construct (e.g. the interior of `$(...)` or `$[...]`),
    /// appending its raw text to `state`. `terminating_char` closes one nesting
    /// level, and an operator equal to `nesting_open` opens one; on entry,
    /// `nesting_count` levels are assumed already open.
    fn consume_nested_construct(
        &mut self,
        state: &mut TokenParseState,
        terminating_char: char,
        nesting_open: &str,
        mut nesting_count: u32,
    ) -> Result<(), TokenizerError> {
        let mut pending_here_doc_tokens = vec![];
        let mut drain_here_doc_tokens = false;

        loop {
            let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
                if pending_here_doc_tokens.len() == 1 {
                    drain_here_doc_tokens = false;
                }
                pending_here_doc_tokens.remove(0)
            } else {
                let cur_token = self.next_token_until(Some(terminating_char), true)?;

                // Here-doc body tokens are deferred until the newline that ends
                // the line containing the redirection has been seen.
                if matches!(
                    cur_token.reason,
                    TokenEndReason::HereDocumentBodyStart
                        | TokenEndReason::HereDocumentBodyEnd
                        | TokenEndReason::HereDocumentEndTag
                ) {
                    pending_here_doc_tokens.push(cur_token);
                    continue;
                }
                cur_token
            };

            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                && !pending_here_doc_tokens.is_empty()
            {
                pending_here_doc_tokens.push(cur_token);
                drain_here_doc_tokens = true;
                continue;
            }

            if let Some(cur_token_value) = cur_token.token {
                state.append_str(cur_token_value.to_str());

                // An occurrence of the opening operator deepens the nesting.
                if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
                    nesting_count += 1;
                }
            }

            match cur_token.reason {
                TokenEndReason::HereDocumentBodyStart => {
                    state.append_char('\n');
                }
                TokenEndReason::NonNewLineBlank => state.append_char(' '),
                TokenEndReason::SpecifiedTerminatingChar => {
                    nesting_count -= 1;
                    if nesting_count == 0 {
                        break;
                    }
                    // Not the outermost close; consume it into the token text.
                    state.append_char(self.next_char()?.unwrap());
                }
                TokenEndReason::EndOfInput => {
                    return Err(TokenizerError::UnterminatedExpansion);
                }
                _ => (),
            }
        }

        // Consume the final terminating character.
        state.append_char(self.next_char()?.unwrap());
        Ok(())
    }

    /// Core tokenization loop: reads characters until a complete token (or a
    /// significant delimiter) is produced.
    ///
    /// `terminating_char`: an unquoted occurrence of this character delimits
    /// the token with `SpecifiedTerminatingChar` (used for nested constructs).
    /// `include_space`: when true, leading blanks are preserved in the token
    /// text rather than skipped.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Tokens queued by here-document processing take precedence.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: report an error if we're mid-escape/mid-quote.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    // A here-doc body may legitimately end at EOF without a
                    // trailing newline after the end tag.
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    // Otherwise, report all still-unterminated here-docs.
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // In a here-doc body: strip a leading tab at line start when the
                // frontmost tag requested it (`<<-`); otherwise accumulate
                // the character verbatim.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At end of line, check whether this line was the end tag.
                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.unquoted() && terminating_char == Some(c) {
                // Caller-specified terminator (not consumed here).
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if state.in_operator() {
                // Try to extend the operator with `c`; if the longer string is
                // no longer a valid operator, the operator token ends here.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // Inside an arithmetic expansion, `))` ends it; here-doc
                        // operators are not processed here.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Line continuation: drop both backslash and newline.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    // `$'...'` opens an ANSI-C quote; a bare quote opens a
                    // single-quoted string.
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Closing quote for single-quoted or ANSI-C quoted text.
            else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // Closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is taken verbatim.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Start of an expansion (`$...`) or backquoted command
                // substitution; both are recognized inside double quotes too.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution `$(...)`, or arithmetic
                            // expansion `$((...))` when a second paren follows.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let (initial_nesting, is_arithmetic) =
                                if matches!(self.peek_char()?, Some('(')) {
                                    state.append_char(self.next_char()?.unwrap());
                                    (2, true)
                                } else {
                                    (1, false)
                                };

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = true;
                            }

                            self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = false;
                            }
                        }

                        Some('[') => {
                            // Legacy arithmetic expansion `$[...]`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            self.cross_state.arithmetic_expansion = true;

                            self.consume_nested_construct(&mut state, ']', "[", 1)?;

                            self.cross_state.arithmetic_expansion = false;
                        }

                        Some('{') => {
                            // Parameter expansion `${...}`; tokenize its
                            // interior until the matching closing brace.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        false, /* include_space */
                                    )?;

                                    // Defer here-doc body tokens until the line
                                    // containing the redirection ends.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing brace.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Simple variable reference or a lone `$`; keep the
                            // dollar sign and continue normal scanning.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: consume verbatim through
                    // the matching unescaped backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob pattern (e.g. `@(...)`): consume through the
            // matching close paren, honoring backslash escapes.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;
                let mut in_escape = false;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        match extglob_char {
                            _ if in_escape => in_escape = false,
                            '\\' => in_escape = true,
                            '(' => paren_depth += 1,
                            ')' => paren_depth -= 1,
                            _ => (),
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                // Start of an operator: delimit any word in progress first.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                // Blanks delimit words; leading blanks are skipped (or kept,
                // when `include_space` is set).
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skipped blank: advance the token's recorded start.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // Mid-word characters continue the word; `#` is only special at a
            // word boundary, so it can't start a comment here.
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Comment: skip up to (but not including) the next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // First character of a new word.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// If the accumulated here-doc body ends with the frontmost pending end
    /// tag (alone on its line), strips the tag from the body, delimits the body
    /// token into `result`, and returns `Ok(true)`. `ends_with_newline`
    /// indicates whether the candidate line ended with a newline (false when
    /// checking at end of input).
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        // Compare against the unquoted form when the tag was quoted/escaped.
        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        // The stored tag ends with '\n'; drop it when matching at EOF.
        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The tag only counts when it appears alone on its own line.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns whether the given character may prefix an extended glob pattern.
    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether the given character may start an operator.
    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether the given string is a complete operator. Some operators
    /// are only recognized outside of `sh` compatibility mode.
    fn is_operator(&self, s: &str) -> bool {
        // Non-POSIX operators recognized only when not in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1266
1267impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1268 type Item = Result<TokenizeResult, TokenizerError>;
1269
1270 fn next(&mut self) -> Option<Self::Item> {
1271 match self.next_token() {
1272 #[expect(clippy::manual_map)]
1273 Ok(result) => match result.token {
1274 Some(_) => Some(Ok(result)),
1275 None => None,
1276 },
1277 Err(e) => Some(Err(e)),
1278 }
1279 }
1280}
1281
/// Returns whether the given character is a blank (space or tab).
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1285
1286const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1287 if state.in_escape {
1289 return false;
1290 }
1291
1292 match state.quote_mode {
1293 QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1296 if c == '\\' {
1297 true
1299 } else {
1300 false
1301 }
1302 }
1303 QuoteMode::Single(_) => false,
1305 QuoteMode::None => is_quoting_char(c),
1308 }
1309}
1310
/// Returns whether the given character can introduce quoting: a backslash,
/// single quote, or double quote.
const fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '"'
}
1314
/// Returns the input string with quoting removed: unescaped single/double
/// quotes are dropped, and a backslash causes the following character to be
/// kept verbatim (the backslash itself is dropped). A trailing lone backslash
/// is discarded.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());
    let mut pending_escape = false;

    for ch in s.chars() {
        if pending_escape {
            // The escaped character is kept as-is, whatever it was.
            unquoted.push(ch);
            pending_escape = false;
        } else if ch == '\\' {
            pending_escape = true;
        } else if ch != '\'' && ch != '"' {
            // Unescaped quote characters are dropped; everything else is kept.
            unquoted.push(ch);
        }
    }

    unquoted
}
1338
1339#[cfg(test)]
1340mod tests {
1341
1342 use super::*;
1343 use anyhow::Result;
1344 use insta::assert_ron_snapshot;
1345 use pretty_assertions::{assert_eq, assert_matches};
1346
1347 #[derive(serde::Serialize, serde::Deserialize)]
1348 struct TokenizerResult<'a> {
1349 input: &'a str,
1350 result: Vec<Token>,
1351 }
1352
1353 fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
1354 Ok(TokenizerResult {
1355 input,
1356 result: tokenize_str(input)?,
1357 })
1358 }
1359
1360 #[test]
1361 fn tokenize_empty() -> Result<()> {
1362 let tokens = tokenize_str("")?;
1363 assert_eq!(tokens.len(), 0);
1364 Ok(())
1365 }
1366
1367 #[test]
1368 fn tokenize_line_continuation() -> Result<()> {
1369 assert_ron_snapshot!(test_tokenizer(
1370 r"a\
1371bc"
1372 )?);
1373 Ok(())
1374 }
1375
1376 #[test]
1377 fn tokenize_operators() -> Result<()> {
1378 assert_ron_snapshot!(test_tokenizer("a>>b")?);
1379 Ok(())
1380 }
1381
1382 #[test]
1383 fn tokenize_comment() -> Result<()> {
1384 assert_ron_snapshot!(test_tokenizer(
1385 r"a #comment
1386"
1387 )?);
1388 Ok(())
1389 }
1390
1391 #[test]
1392 fn tokenize_comment_at_eof() -> Result<()> {
1393 assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
1394 Ok(())
1395 }
1396
1397 #[test]
1398 fn tokenize_empty_here_doc() -> Result<()> {
1399 assert_ron_snapshot!(test_tokenizer(
1400 r"cat <<HERE
1401HERE
1402"
1403 )?);
1404 Ok(())
1405 }
1406
1407 #[test]
1408 fn tokenize_here_doc() -> Result<()> {
1409 assert_ron_snapshot!(test_tokenizer(
1410 r"cat <<HERE
1411SOMETHING
1412HERE
1413echo after
1414"
1415 )?);
1416 assert_ron_snapshot!(test_tokenizer(
1417 r"cat <<HERE
1418SOMETHING
1419HERE
1420"
1421 )?);
1422 assert_ron_snapshot!(test_tokenizer(
1423 r"cat <<HERE
1424SOMETHING
1425HERE
1426
1427"
1428 )?);
1429 assert_ron_snapshot!(test_tokenizer(
1430 r"cat <<HERE
1431SOMETHING
1432HERE"
1433 )?);
1434 Ok(())
1435 }
1436
1437 #[test]
1438 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1439 assert_ron_snapshot!(test_tokenizer(
1440 r"cat <<-HERE
1441 SOMETHING
1442 HERE
1443"
1444 )?);
1445 Ok(())
1446 }
1447
1448 #[test]
1449 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1450 assert_ron_snapshot!(test_tokenizer(
1451 r"cat <<EOF | wc -l
1452A B C
14531 2 3
1454D E F
1455EOF
1456"
1457 )?);
1458 Ok(())
1459 }
1460
1461 #[test]
1462 fn tokenize_multiple_here_docs() -> Result<()> {
1463 assert_ron_snapshot!(test_tokenizer(
1464 r"cat <<HERE1 <<HERE2
1465SOMETHING
1466HERE1
1467OTHER
1468HERE2
1469echo after
1470"
1471 )?);
1472 Ok(())
1473 }
1474
1475 #[test]
1476 fn tokenize_unterminated_here_doc() {
1477 let result = tokenize_str(
1478 r"cat <<HERE
1479SOMETHING
1480",
1481 );
1482 assert!(result.is_err());
1483 }
1484
1485 #[test]
1486 fn tokenize_missing_here_tag() {
1487 let result = tokenize_str(
1488 r"cat <<
1489",
1490 );
1491 assert!(result.is_err());
1492 }
1493
1494 #[test]
1495 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1496 assert_ron_snapshot!(test_tokenizer(
1497 r"echo $(cat <<HERE
1498TEXT
1499HERE
1500)"
1501 )?);
1502 Ok(())
1503 }
1504
1505 #[test]
1506 fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
1507 assert_ron_snapshot!(test_tokenizer(
1508 r#"echo "$(cat <<HERE
1509TEXT
1510HERE
1511)""#
1512 )?);
1513 Ok(())
1514 }
1515
1516 #[test]
1517 fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
1518 assert_ron_snapshot!(test_tokenizer(
1519 r#"echo "$(cat << HERE
1520TEXT
1521HERE
1522)""#
1523 )?);
1524 Ok(())
1525 }
1526
1527 #[test]
1528 fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
1529 assert_ron_snapshot!(test_tokenizer(
1530 r"echo $(cat <<HERE1 <<HERE2 | wc -l
1531TEXT
1532HERE1
1533OTHER
1534HERE2
1535)"
1536 )?);
1537 Ok(())
1538 }
1539
1540 #[test]
1541 fn tokenize_simple_backquote() -> Result<()> {
1542 assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
1543 Ok(())
1544 }
1545
1546 #[test]
1547 fn tokenize_backquote_with_escape() -> Result<()> {
1548 assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
1549 Ok(())
1550 }
1551
1552 #[test]
1553 fn tokenize_unterminated_backquote() {
1554 assert_matches!(
1555 tokenize_str("`"),
1556 Err(TokenizerError::UnterminatedBackquote(_))
1557 );
1558 }
1559
1560 #[test]
1561 fn tokenize_unterminated_command_substitution() {
1562 assert_matches!(
1565 tokenize_str("$("),
1566 Err(TokenizerError::UnterminatedExpansion)
1567 );
1568 }
1569
1570 #[test]
1571 fn tokenize_unterminated_arithmetic_expansion() {
1572 assert_matches!(
1573 tokenize_str("$(("),
1574 Err(TokenizerError::UnterminatedExpansion)
1575 );
1576 }
1577
1578 #[test]
1579 fn tokenize_unterminated_legacy_arithmetic_expansion() {
1580 assert_matches!(
1581 tokenize_str("$["),
1582 Err(TokenizerError::UnterminatedExpansion)
1583 );
1584 }
1585
1586 #[test]
1587 fn tokenize_command_substitution() -> Result<()> {
1588 assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
1589 Ok(())
1590 }
1591
1592 #[test]
1593 fn tokenize_command_substitution_with_subshell() -> Result<()> {
1594 assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
1595 Ok(())
1596 }
1597
1598 #[test]
1599 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1600 assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
1601 Ok(())
1602 }
1603
1604 #[test]
1605 fn tokenize_arithmetic_expression() -> Result<()> {
1606 assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
1607 Ok(())
1608 }
1609
1610 #[test]
1611 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1612 assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
1615 Ok(())
1616 }
1617 #[test]
1618 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1619 assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
1620 Ok(())
1621 }
1622
1623 #[test]
1624 fn tokenize_special_parameters() -> Result<()> {
1625 assert_ron_snapshot!(test_tokenizer("$$")?);
1626 assert_ron_snapshot!(test_tokenizer("$@")?);
1627 assert_ron_snapshot!(test_tokenizer("$!")?);
1628 assert_ron_snapshot!(test_tokenizer("$?")?);
1629 assert_ron_snapshot!(test_tokenizer("$*")?);
1630 Ok(())
1631 }
1632
1633 #[test]
1634 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1635 assert_ron_snapshot!(test_tokenizer("$x")?);
1636 assert_ron_snapshot!(test_tokenizer("a$x")?);
1637 Ok(())
1638 }
1639
1640 #[test]
1641 fn tokenize_unterminated_parameter_expansion() {
1642 assert_matches!(
1643 tokenize_str("${x"),
1644 Err(TokenizerError::UnterminatedVariable)
1645 );
1646 }
1647
1648 #[test]
1649 fn tokenize_braced_parameter_expansion() -> Result<()> {
1650 assert_ron_snapshot!(test_tokenizer("${x}")?);
1651 assert_ron_snapshot!(test_tokenizer("a${x}b")?);
1652 Ok(())
1653 }
1654
1655 #[test]
1656 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1657 assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
1658 Ok(())
1659 }
1660
1661 #[test]
1662 fn tokenize_whitespace() -> Result<()> {
1663 assert_ron_snapshot!(test_tokenizer("1 2 3")?);
1664 Ok(())
1665 }
1666
1667 #[test]
1668 fn tokenize_escaped_whitespace() -> Result<()> {
1669 assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
1670 Ok(())
1671 }
1672
1673 #[test]
1674 fn tokenize_single_quote() -> Result<()> {
1675 assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
1676 Ok(())
1677 }
1678
1679 #[test]
1680 fn tokenize_double_quote() -> Result<()> {
1681 assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
1682 Ok(())
1683 }
1684
1685 #[test]
1686 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1687 assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
1688 Ok(())
1689 }
1690
1691 #[test]
1692 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1693 assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
1694 Ok(())
1695 }
1696
1697 #[test]
1698 fn test_quote_removal() {
1699 assert_eq!(unquote_str(r#""hello""#), "hello");
1700 assert_eq!(unquote_str(r"'hello'"), "hello");
1701 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1702 assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
1703 }
1704}