1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// The reason why the tokenizer stopped accumulating a token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character (space or tab, not newline) ended the token.
    NonNewLineBlank,
    /// Marker: the body of a here-document starts after this result.
    HereDocumentBodyStart,
    /// Marker: the body of a here-document ends with this result.
    HereDocumentBodyEnd,
    /// This result carries the end tag of a here-document.
    HereDocumentEndTag,
    /// An operator-starting character ended the preceding word.
    OperatorStart,
    /// The operator being accumulated could not be extended further.
    OperatorEnd,
    /// Some other condition ended the token.
    Other,
}
30
/// Alias for the source span covered by a token.
pub type TokenLocation = SourceSpan;
33
/// A single token produced by the tokenizer.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(
    any(test, feature = "serde"),
    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
)]
pub enum Token {
    /// An operator token, with its text and source location.
    Operator(String, SourceSpan),
    /// A word token, with its text and source location.
    Word(String, SourceSpan),
}
47
48impl Token {
49 pub fn to_str(&self) -> &str {
51 match self {
52 Self::Operator(s, _) => s,
53 Self::Word(s, _) => s,
54 }
55 }
56
57 pub const fn location(&self) -> &SourceSpan {
59 match self {
60 Self::Operator(_, l) => l,
61 Self::Word(_, l) => l,
62 }
63 }
64}
65
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    /// Converts a token's location into a `miette` source span for diagnostics.
    fn from(token: &Token) -> Self {
        let start = token.location().start.as_ref();
        Self::new(start.into(), token.location().length())
    }
}
73
/// The outcome of a single tokenization step.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Why the token (or tokenization step) ended.
    pub reason: TokenEndReason,
    /// The token produced, if any.
    pub token: Option<Token>,
}
82
/// Errors that can occur while tokenizing shell input.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    #[error("unterminated variable expression")]
    UnterminatedVariable,

    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    #[error("unterminated expansion")]
    UnterminatedExpansion,

    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
143
144impl TokenizerError {
145 pub const fn is_incomplete(&self) -> bool {
148 matches!(
149 self,
150 Self::UnterminatedEscapeSequence
151 | Self::UnterminatedAnsiCQuote(..)
152 | Self::UnterminatedSingleQuote(..)
153 | Self::UnterminatedDoubleQuote(..)
154 | Self::UnterminatedBackquote(..)
155 | Self::UnterminatedCommandSubstitution
156 | Self::UnterminatedExpansion
157 | Self::UnterminatedVariable
158 | Self::UnterminatedExtendedGlob(..)
159 | Self::UnterminatedHereDocuments(..)
160 )
161 }
162}
163
/// A borrowed slice of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The underlying tokens.
    pub tokens: &'a [Token],
}
170
/// The current quoting context, recording where the quote started.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not inside any quoted region.
    None,
    /// Inside an ANSI-C quoted region (`$'...'`), started at the given position.
    AnsiC(SourcePosition),
    /// Inside a single-quoted region, started at the given position.
    Single(SourcePosition),
    /// Inside a double-quoted region, started at the given position.
    Double(SourcePosition),
}
178
/// Tracks progress through parsing a here-document sequence.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// Not parsing a here-document.
    #[default]
    None,
    /// The next token to be parsed will be a here tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being accumulated is a here tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        /// The already-delimited redirection operator token (`<<` / `<<-`).
        operator_token_result: TokenizeResult,
    },
    /// The next line starts the here-document body.
    NextLineIsHereDoc,
    /// Currently consuming one or more here-document bodies.
    InHereDocs,
}
198
/// A here-document tag whose body has not yet been fully consumed.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, stored with a trailing newline so it matches whole lines.
    tag: String,
    /// Whether the tag contained quoting characters; such tags are matched
    /// against their unquoted form.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be stripped from body lines (`<<-`).
    remove_tabs: bool,
    /// Position recorded when the tag was parsed.
    position: SourcePosition,
    /// The operator and tag tokens, replayed once the body completes.
    tokens: Vec<TokenizeResult>,
    /// Tokens parsed after the tag (on the same line) but before the body;
    /// replayed after the body's tokens.
    pending_tokens_after: Vec<TokenizeResult>,
}
208
/// Parse state that persists across individual token parses.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the source input.
    cursor: SourcePosition,
    /// Current here-document parsing state.
    here_state: HereState,
    /// Here tags whose document bodies have not yet been consumed.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced but not yet handed back to the caller.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}
222
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns (extglob) are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to tokenize in strict POSIX mode.
    pub posix_mode: bool,
    /// Whether to tokenize in legacy `sh` compatibility mode
    /// (disables some extension operators).
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235 fn default() -> Self {
236 Self {
237 enable_extended_globbing: true,
238 posix_mode: false,
239 sh_mode: false,
240 }
241 }
242}
243
/// Produces shell tokens from a buffered character stream.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable stream of UTF-8 characters decoded from the reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State preserved across token boundaries (cursor, here-docs, queue).
    cross_state: CrossTokenParseState,
    /// Options controlling tokenization behavior.
    options: TokenizerOptions,
}
250
/// Parse state for the single token currently being accumulated.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Position at which the current token starts.
    pub start_position: SourcePosition,
    /// The characters accumulated so far.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator (vs. a word).
    pub token_is_operator: bool,
    /// Whether the next character is backslash-escaped.
    pub in_escape: bool,
    /// The current quoting context.
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates a fresh parse state anchored at `start_position`.
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Consumes the accumulated text, producing a `Token` spanning from the
    /// recorded start position to `end_position`, and resets this state so
    /// the next token can start accumulating at `end_position`.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = SourceSpan {
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        // Operator vs. word is decided by the flag accumulated so far;
        // both the flag and the text buffer are reset via mem::take.
        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        // The next token starts where this one ended.
        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any characters have been accumulated for the current token.
    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the token being accumulated.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the token being accumulated.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether we are currently outside any quoting or escape context.
    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far for the current token.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the accumulated token is exactly the given operator.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether an operator token is being accumulated.
    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the accumulated token is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated text wholesale (used for here-document bodies).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Finalizes the current token for the given end `reason`, updating the
    /// cross-token here-document state as needed.
    ///
    /// Returns `Ok(Some(..))` when a result should be handed to the caller
    /// immediately, or `Ok(None)` when the token was captured into pending
    /// here-document state (or the queue) instead.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // With no accumulated text there's no token to produce -- except at the
        // end of a here-document body, where an empty body token is meaningful.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // The token just delimited is the redirection operator itself;
                // hold it until its tag token has been parsed.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A bare newline where the tag should be is an error.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Tags are matched against whole lines, so store the tag with a
                // trailing newline attached.
                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                // A newline switches us into the here-document body; anything
                // else keeps us waiting for the line to end.
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    // Tokens appearing after the tag (but before the body) are
                    // replayed once the corresponding body has been consumed.
                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The body for the frontmost tag just completed. Queue its
                // tokens in order: operator + tag, body-start marker, the body
                // itself, the synthesized end tag, then any tokens that
                // followed the tag on its line.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Synthesize the end-tag token (without its trailing newline).
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                // More tags may still be awaiting their bodies.
                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
467
/// Tokenizes the given string using default tokenizer options.
///
/// # Errors
/// Returns a `TokenizerError` if the input cannot be fully tokenized.
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}
476
/// Tokenizes the given string using the provided options.
///
/// Delegates to a memoized helper keyed on (input, options), which is why
/// owned copies of both arguments are made here.
///
/// # Errors
/// Returns a `TokenizerError` if the input cannot be fully tokenized.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}
489
// NOTE: despite the "uncached" name, this function is wrapped by the `cached`
// proc macro, which memoizes up to 64 recent (input, options) tokenizations.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}
497
498pub fn uncached_tokenize_str(
505 input: &str,
506 options: &TokenizerOptions,
507) -> Result<Vec<Token>, TokenizerError> {
508 let mut reader = std::io::BufReader::new(input.as_bytes());
509 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
510
511 let mut tokens = vec![];
512 loop {
513 match tokenizer.next_token()? {
514 TokenizeResult {
515 token: Some(token), ..
516 } => tokens.push(token),
517 TokenizeResult {
518 reason: TokenEndReason::EndOfInput,
519 ..
520 } => break,
521 _ => (),
522 }
523 }
524
525 Ok(tokens)
526}
527
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a new tokenizer over `reader`, configured with `options`.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                // Line and column are 1-based; the byte/char index is 0-based.
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input.
    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Reads and consumes the next character, advancing the cursor.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character, discarding it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token result from the input.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, false)
    }

    /// Consumes a nested construct (e.g., the interior of `$(...)` or `$[...]`)
    /// into `state`, counting occurrences of `nesting_open` so that nesting is
    /// balanced before `terminating_char` is treated as the real terminator.
    fn consume_nested_construct(
        &mut self,
        state: &mut TokenParseState,
        terminating_char: char,
        nesting_open: &str,
        mut nesting_count: u32,
    ) -> Result<(), TokenizerError> {
        // Here-document marker tokens encountered inside the construct are
        // buffered and drained only after an unescaped newline is seen.
        let mut pending_here_doc_tokens = vec![];
        let mut drain_here_doc_tokens = false;

        loop {
            let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
                if pending_here_doc_tokens.len() == 1 {
                    drain_here_doc_tokens = false;
                }
                pending_here_doc_tokens.remove(0)
            } else {
                let cur_token = self.next_token_until(Some(terminating_char), true)?;

                if matches!(
                    cur_token.reason,
                    TokenEndReason::HereDocumentBodyStart
                        | TokenEndReason::HereDocumentBodyEnd
                        | TokenEndReason::HereDocumentEndTag
                ) {
                    pending_here_doc_tokens.push(cur_token);
                    continue;
                }
                cur_token
            };

            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                && !pending_here_doc_tokens.is_empty()
            {
                pending_here_doc_tokens.push(cur_token);
                drain_here_doc_tokens = true;
                continue;
            }

            if let Some(cur_token_value) = cur_token.token {
                state.append_str(cur_token_value.to_str());

                // Seeing another opening operator deepens the nesting.
                if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
                    nesting_count += 1;
                }
            }

            match cur_token.reason {
                TokenEndReason::HereDocumentBodyStart => {
                    state.append_char('\n');
                }
                TokenEndReason::NonNewLineBlank => state.append_char(' '),
                TokenEndReason::SpecifiedTerminatingChar => {
                    nesting_count -= 1;
                    if nesting_count == 0 {
                        break;
                    }
                    // Not the outermost terminator: keep it as literal text.
                    state.append_char(self.next_char()?.unwrap());
                }
                TokenEndReason::EndOfInput => {
                    return Err(TokenizerError::UnterminatedExpansion);
                }
                _ => (),
            }
        }

        // Consume and append the final terminating character.
        state.append_char(self.next_char()?.unwrap());
        Ok(())
    }

    /// Core tokenization loop: accumulates characters until a token is
    /// delimited, optionally stopping at `terminating_char` (when unquoted).
    /// When `include_space` is set, leading blanks are kept in the token.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Tokens queued by completed here-documents take priority.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: unterminated escapes/quotes are hard errors.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    // Allow a here-doc end tag that isn't newline-terminated
                    // (i.e., the tag sits at the very end of the input).
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    // Otherwise report all still-open here-documents.
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // For `<<-` here-docs, tabs at the start of each body line are
                // stripped.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At each newline, check whether the line was the end tag.
                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // Extend the operator greedily while the result is still a
                // valid operator.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // `))` closes the arithmetic expansion.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    // Backslash-newline is a line continuation; both
                    // characters are dropped.
                    if matches!(self.peek_char()?, Some('\n')) {
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    // `$'...'` starts an ANSI-C quoted string; a bare quote
                    // starts a regular single-quoted string.
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Close a single-quoted or ANSI-C-quoted region.
            else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // Close a double-quoted region.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is taken literally.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Expansions: `$(..)`, `$((..))`, `$[..]`, `${..}`, a plain
                // `$`-reference, or a backquoted command substitution.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // A second '(' makes this arithmetic: `$((`.
                            let (initial_nesting, is_arithmetic) =
                                if matches!(self.peek_char()?, Some('(')) {
                                    state.append_char(self.next_char()?.unwrap());
                                    (2, true)
                                } else {
                                    (1, false)
                                };

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = true;
                            }

                            self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = false;
                            }
                        }

                        Some('[') => {
                            // Legacy arithmetic expansion: `$[ ... ]`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            self.cross_state.arithmetic_expansion = true;

                            self.consume_nested_construct(&mut state, ']', "[", 1)?;

                            self.cross_state.arithmetic_expansion = false;
                        }

                        Some('{') => {
                            // Parameter expansion: `${ ... }`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        false,
                                    )?;

                                    // Buffer here-document marker tokens until
                                    // the enclosing line completes.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume and append the closing '}'.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // A simple `$`-reference (or a lone '$').
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: copy the contents
                    // through verbatim, honoring backslash escapes.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob patterns such as `@(...)`, `!(...)`, `+(...)`.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                // Copy the pattern through until parentheses balance.
                let mut paren_depth = 1;
                let mut in_escape = false;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        match extglob_char {
                            _ if in_escape => in_escape = false,
                            '\\' => in_escape = true,
                            '(' => paren_depth += 1,
                            ')' => paren_depth -= 1,
                            _ => (),
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    // End the current word before starting an operator token.
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank, sliding the token's start forward.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // Inside a started word (or a `${...}` body), take the character
            // literally.
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Comment: skip to end of line without consuming the newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// If the accumulated here-doc body ends with the frontmost tag, strips
    /// the tag from the body and delimits the body token into `result`.
    /// Returns whether the tag was found and removed.
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        // Quoted/escaped tags are matched against their unquoted form.
        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        // At end of input, the final line may lack a trailing newline.
        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The tag only counts when it occupies a line of its own.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns whether `c` can introduce an extended glob pattern.
    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether `c` can start an operator token.
    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether `s` is a complete operator under the current options.
    fn is_operator(&self, s: &str) -> bool {
        // Several operators are extensions that aren't recognized in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1260
impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    /// Yields the next token result; iteration ends at the first successful
    /// result that carries no token (e.g., end of input).
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[expect(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}
1275
/// Returns whether `c` is a blank character (space or tab).
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1279
1280const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1281 if state.in_escape {
1283 return false;
1284 }
1285
1286 match state.quote_mode {
1287 QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1290 if c == '\\' {
1291 true
1293 } else {
1294 false
1295 }
1296 }
1297 QuoteMode::Single(_) => false,
1299 QuoteMode::None => is_quoting_char(c),
1302 }
1303}
1304
/// Returns whether `c` is a quoting character: backslash, single quote, or
/// double quote.
const fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '\"'
}
1308
/// Returns a copy of `s` with quoting removed: backslash escapes are resolved
/// to their escaped character, and unescaped quote characters are dropped.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());

    let mut escaped = false;
    for ch in s.chars() {
        if escaped {
            // The previous backslash makes this character literal.
            unquoted.push(ch);
            escaped = false;
        } else {
            match ch {
                '\\' => escaped = true,
                // Unescaped quote characters are removed entirely.
                '\'' | '\"' => (),
                _ => unquoted.push(ch),
            }
        }
    }

    unquoted
}
1332
1333#[cfg(test)]
1334mod tests {
1335
1336 use super::*;
1337 use anyhow::Result;
1338 use insta::assert_ron_snapshot;
1339 use pretty_assertions::{assert_eq, assert_matches};
1340
1341 #[derive(serde::Serialize, serde::Deserialize)]
1342 struct TokenizerResult<'a> {
1343 input: &'a str,
1344 result: Vec<Token>,
1345 }
1346
1347 fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
1348 Ok(TokenizerResult {
1349 input,
1350 result: tokenize_str(input)?,
1351 })
1352 }
1353
1354 #[test]
1355 fn tokenize_empty() -> Result<()> {
1356 let tokens = tokenize_str("")?;
1357 assert_eq!(tokens.len(), 0);
1358 Ok(())
1359 }
1360
1361 #[test]
1362 fn tokenize_line_continuation() -> Result<()> {
1363 assert_ron_snapshot!(test_tokenizer(
1364 r"a\
1365bc"
1366 )?);
1367 Ok(())
1368 }
1369
1370 #[test]
1371 fn tokenize_operators() -> Result<()> {
1372 assert_ron_snapshot!(test_tokenizer("a>>b")?);
1373 Ok(())
1374 }
1375
1376 #[test]
1377 fn tokenize_comment() -> Result<()> {
1378 assert_ron_snapshot!(test_tokenizer(
1379 r"a #comment
1380"
1381 )?);
1382 Ok(())
1383 }
1384
1385 #[test]
1386 fn tokenize_comment_at_eof() -> Result<()> {
1387 assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
1388 Ok(())
1389 }
1390
1391 #[test]
1392 fn tokenize_empty_here_doc() -> Result<()> {
1393 assert_ron_snapshot!(test_tokenizer(
1394 r"cat <<HERE
1395HERE
1396"
1397 )?);
1398 Ok(())
1399 }
1400
1401 #[test]
1402 fn tokenize_here_doc() -> Result<()> {
1403 assert_ron_snapshot!(test_tokenizer(
1404 r"cat <<HERE
1405SOMETHING
1406HERE
1407echo after
1408"
1409 )?);
1410 assert_ron_snapshot!(test_tokenizer(
1411 r"cat <<HERE
1412SOMETHING
1413HERE
1414"
1415 )?);
1416 assert_ron_snapshot!(test_tokenizer(
1417 r"cat <<HERE
1418SOMETHING
1419HERE
1420
1421"
1422 )?);
1423 assert_ron_snapshot!(test_tokenizer(
1424 r"cat <<HERE
1425SOMETHING
1426HERE"
1427 )?);
1428 Ok(())
1429 }
1430
1431 #[test]
1432 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1433 assert_ron_snapshot!(test_tokenizer(
1434 r"cat <<-HERE
1435 SOMETHING
1436 HERE
1437"
1438 )?);
1439 Ok(())
1440 }
1441
1442 #[test]
1443 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1444 assert_ron_snapshot!(test_tokenizer(
1445 r"cat <<EOF | wc -l
1446A B C
14471 2 3
1448D E F
1449EOF
1450"
1451 )?);
1452 Ok(())
1453 }
1454
1455 #[test]
1456 fn tokenize_multiple_here_docs() -> Result<()> {
1457 assert_ron_snapshot!(test_tokenizer(
1458 r"cat <<HERE1 <<HERE2
1459SOMETHING
1460HERE1
1461OTHER
1462HERE2
1463echo after
1464"
1465 )?);
1466 Ok(())
1467 }
1468
1469 #[test]
1470 fn tokenize_unterminated_here_doc() {
1471 let result = tokenize_str(
1472 r"cat <<HERE
1473SOMETHING
1474",
1475 );
1476 assert!(result.is_err());
1477 }
1478
1479 #[test]
1480 fn tokenize_missing_here_tag() {
1481 let result = tokenize_str(
1482 r"cat <<
1483",
1484 );
1485 assert!(result.is_err());
1486 }
1487
1488 #[test]
1489 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1490 assert_ron_snapshot!(test_tokenizer(
1491 r"echo $(cat <<HERE
1492TEXT
1493HERE
1494)"
1495 )?);
1496 Ok(())
1497 }
1498
1499 #[test]
1500 fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
1501 assert_ron_snapshot!(test_tokenizer(
1502 r#"echo "$(cat <<HERE
1503TEXT
1504HERE
1505)""#
1506 )?);
1507 Ok(())
1508 }
1509
    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
        // Variant with a space between `<<` and the tag; the tag must still be
        // picked up inside the quoted command substitution.
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat << HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }
1520
    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        // Stress case: two queued here-docs plus a pipeline, all nested inside
        // a command substitution that closes after both bodies end.
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }
1533
    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        // Legacy backquote command substitution tokenizes as part of one word.
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }
1539
    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        // A backslash-escaped backquote inside `...` must not terminate the
        // substitution; only the final unescaped backquote does.
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }
1545
1546 #[test]
1547 fn tokenize_unterminated_backquote() {
1548 assert_matches!(
1549 tokenize_str("`"),
1550 Err(TokenizerError::UnterminatedBackquote(_))
1551 );
1552 }
1553
1554 #[test]
1555 fn tokenize_unterminated_command_substitution() {
1556 assert_matches!(
1559 tokenize_str("$("),
1560 Err(TokenizerError::UnterminatedExpansion)
1561 );
1562 }
1563
1564 #[test]
1565 fn tokenize_unterminated_arithmetic_expansion() {
1566 assert_matches!(
1567 tokenize_str("$(("),
1568 Err(TokenizerError::UnterminatedExpansion)
1569 );
1570 }
1571
1572 #[test]
1573 fn tokenize_unterminated_legacy_arithmetic_expansion() {
1574 assert_matches!(
1575 tokenize_str("$["),
1576 Err(TokenizerError::UnterminatedExpansion)
1577 );
1578 }
1579
    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        // `$(...)` glued between word characters stays part of a single word.
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }
1585
    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        // Nested parens: a subshell inside `$( ... )` must not prematurely
        // close the command substitution.
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }
1591
    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        // An extglob pattern `!(x)` inside `$(...)`; its parens must be
        // balanced independently of the substitution's closing paren.
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }
1597
    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        // `$((...))` arithmetic expansion embedded inside a word.
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }
1603
    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // Whitespace inside `$(( ... ))` must be tolerated.
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }
    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        // Inner parens within arithmetic must not end the `$(( ... ))` early.
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }
1616
    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        // Single-character special parameters ($$, $@, $!, $?, $*) each
        // tokenize as a complete parameter expansion.
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }
1626
    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        // `$x` with no braces, both standalone and glued to a preceding word.
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }
1633
1634 #[test]
1635 fn tokenize_unterminated_parameter_expansion() {
1636 assert_matches!(
1637 tokenize_str("${x"),
1638 Err(TokenizerError::UnterminatedVariable)
1639 );
1640 }
1641
    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        // `${x}` standalone and embedded mid-word.
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }
1648
    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        // An escaped `\}` inside `${...}` must not close the expansion;
        // only the following unescaped `}` does.
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }
1654
    #[test]
    fn tokenize_whitespace() -> Result<()> {
        // Unquoted blanks split the input into separate word tokens.
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }
1660
    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        // A backslash-escaped space joins `1\ 2` into a single word token.
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }
1666
    #[test]
    fn tokenize_single_quote() -> Result<()> {
        // A single-quoted span containing a space stays within one word,
        // glued to the unquoted characters around it.
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }
1672
    #[test]
    fn tokenize_double_quote() -> Result<()> {
        // Same as the single-quote case, but with double quotes.
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }
1678
    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        // `$(...)` inside double quotes, glued into a single word.
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }
1684
    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        // `$((...))` inside double quotes, glued into a single word.
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }
1690
1691 #[test]
1692 fn test_quote_removal() {
1693 assert_eq!(unquote_str(r#""hello""#), "hello");
1694 assert_eq!(unquote_str(r"'hello'"), "hello");
1695 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1696 assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
1697 }
1698}