Skip to main content

just/
lexer.rs

1use {super::*, CompileErrorKind::*, TokenKind::*};
2
/// Just language lexer
///
/// The lexer proceeds character-by-character, as opposed to using regular
/// expressions to lex tokens or semi-tokens at a time. As a result, it is
/// verbose and straightforward. Just used to have a regex-based lexer, which
/// was slower and generally godawful.  However, this should not be taken as a
/// slight against regular expressions, the lexer was just idiosyncratically
/// bad.
pub(crate) struct Lexer<'src> {
  /// Char iterator over the source, positioned just past `next`
  chars: Chars<'src>,
  /// Indentation stack; initialized with a single empty entry
  indentation: Vec<&'src str>,
  /// Interpolation token start stack; one entry per unclosed `{{`
  interpolation_stack: Vec<Token<'src>>,
  /// Next character to be lexed, or `None` at end-of-file
  next: Option<char>,
  /// Current open delimiters, each paired with the line it was opened on
  open_delimiters: Vec<(Delimiter, usize)>,
  /// Path to source file
  path: &'src Path,
  /// Inside recipe body
  recipe_body: bool,
  /// Next indent will start a recipe body
  recipe_body_pending: bool,
  /// Source text
  src: &'src str,
  /// Current token end
  token_end: Position,
  /// Current token start
  token_start: Position,
  /// Tokens lexed so far
  tokens: Vec<Token<'src>>,
}
37
38impl<'src> Lexer<'src> {
  /// Sequence that closes an interpolation: `}}`
  pub(crate) const INTERPOLATION_END: &'static str = "}}";
  /// Escape sequence that lexes as a literal `{{` instead of starting an interpolation
  pub(crate) const INTERPOLATION_ESCAPE: &'static str = "{{{{";
  /// Sequence that opens an interpolation: `{{`
  pub(crate) const INTERPOLATION_START: &'static str = "{{";
42
43  /// Lex `src`
44  pub(crate) fn lex(path: &'src Path, src: &'src str) -> CompileResult<'src, Vec<Token<'src>>> {
45    Self::new(path, src).tokenize()
46  }
47
48  #[cfg(test)]
49  pub(crate) fn test_lex(src: &'src str) -> CompileResult<'src, Vec<Token<'src>>> {
50    Self::new("justfile".as_ref(), src).tokenize()
51  }
52
53  /// Create a new Lexer to lex `src`
54  fn new(path: &'src Path, src: &'src str) -> Self {
55    let mut chars = src.chars();
56    let next = chars.next();
57
58    let start = Position {
59      offset: 0,
60      column: 0,
61      line: 0,
62    };
63
64    Self {
65      indentation: vec![""],
66      tokens: Vec::new(),
67      token_start: start,
68      token_end: start,
69      recipe_body_pending: false,
70      recipe_body: false,
71      interpolation_stack: Vec::new(),
72      open_delimiters: Vec::new(),
73      chars,
74      next,
75      src,
76      path,
77    }
78  }
79
  /// Advance over the character in `self.next`, updating `self.token_end`
  /// accordingly.
  ///
  /// Returns an internal error if called when `self.next` is `None`, i.e.,
  /// after the lexer has consumed the entire source.
  fn advance(&mut self) -> CompileResult<'src> {
    match self.next {
      Some(c) => {
        let len_utf8 = c.len_utf8();

        // Offsets and columns are both measured in bytes, not characters.
        self.token_end.offset += len_utf8;
        self.token_end.column += len_utf8;

        if c == '\n' {
          self.token_end.column = 0;
          self.token_end.line += 1;
        }

        self.next = self.chars.next();

        Ok(())
      }
      None => Err(self.internal_error("Lexer advanced past end of text")),
    }
  }
102
103  /// Lexeme of in-progress token
104  fn lexeme(&self) -> &'src str {
105    &self.src[self.token_start.offset..self.token_end.offset]
106  }
107
108  /// Length of current token
109  fn current_token_length(&self) -> usize {
110    self.token_end.offset - self.token_start.offset
111  }
112
113  fn accepted(&mut self, c: char) -> CompileResult<'src, bool> {
114    if self.next_is(c) {
115      self.advance()?;
116      Ok(true)
117    } else {
118      Ok(false)
119    }
120  }
121
122  fn presume(&mut self, c: char) -> CompileResult<'src> {
123    if !self.next_is(c) {
124      return Err(self.internal_error(format!("Lexer presumed character `{c}`")));
125    }
126
127    self.advance()?;
128
129    Ok(())
130  }
131
132  fn presume_str(&mut self, s: &str) -> CompileResult<'src> {
133    for c in s.chars() {
134      self.presume(c)?;
135    }
136
137    Ok(())
138  }
139
140  /// Is next character c?
141  fn next_is(&self, c: char) -> bool {
142    self.next == Some(c)
143  }
144
145  /// Is next character ' ' or '\t'?
146  fn next_is_whitespace(&self) -> bool {
147    self.next_is(' ') || self.next_is('\t')
148  }
149
150  /// Un-lexed text
151  fn rest(&self) -> &'src str {
152    &self.src[self.token_end.offset..]
153  }
154
155  /// Check if unlexed text begins with prefix
156  fn rest_starts_with(&self, prefix: &str) -> bool {
157    self.rest().starts_with(prefix)
158  }
159
160  /// Does rest start with "\n" or "\r\n"?
161  fn at_eol(&self) -> bool {
162    self.next_is('\n') || self.rest_starts_with("\r\n")
163  }
164
165  /// Are we at end-of-file?
166  fn at_eof(&self) -> bool {
167    self.rest().is_empty()
168  }
169
170  /// Are we at end-of-line or end-of-file?
171  fn at_eol_or_eof(&self) -> bool {
172    self.at_eol() || self.at_eof()
173  }
174
175  /// Get current indentation
176  fn indentation(&self) -> &'src str {
177    self.indentation.last().unwrap()
178  }
179
180  /// Are we currently indented
181  fn indented(&self) -> bool {
182    !self.indentation().is_empty()
183  }
184
185  /// Create a new token with `kind` whose lexeme is between `self.token_start`
186  /// and `self.token_end`
187  fn token(&mut self, kind: TokenKind) {
188    self.tokens.push(Token {
189      column: self.token_start.column,
190      kind,
191      length: self.token_end.offset - self.token_start.offset,
192      line: self.token_start.line,
193      offset: self.token_start.offset,
194      path: self.path,
195      src: self.src,
196    });
197
198    // Set `token_start` to point after the lexed token
199    self.token_start = self.token_end;
200  }
201
202  /// Create an internal error with `message`
203  fn internal_error(&self, message: impl Into<String>) -> CompileError<'src> {
204    // Use `self.token_end` as the location of the error
205    let token = Token {
206      src: self.src,
207      offset: self.token_end.offset,
208      line: self.token_end.line,
209      column: self.token_end.column,
210      length: 0,
211      kind: Unspecified,
212      path: self.path,
213    };
214    CompileError::new(
215      token,
216      Internal {
217        message: message.into(),
218      },
219    )
220  }
221
  /// Create a compilation error with `kind`
  fn error(&self, kind: CompileErrorKind<'src>) -> CompileError<'src> {
    // Use the in-progress token span as the location of the error.

    // The width of the error site to highlight depends on the kind of error:
    let length = match kind {
      // for unterminated literals, highlight only the opening delimiter
      UnterminatedString | UnterminatedBacktick => {
        let Some(kind) = StringKind::from_token_start(self.lexeme()) else {
          return self.internal_error("Lexer::error: expected string or backtick token start");
        };
        kind.delimiter().len()
      }
      // highlight the full token
      _ => self.lexeme().len(),
    };

    let token = Token {
      kind: Unspecified,
      src: self.src,
      offset: self.token_start.offset,
      line: self.token_start.line,
      column: self.token_start.column,
      length,
      path: self.path,
    };

    CompileError::new(token, kind)
  }

  /// Create an error for an interpolation opened by `interpolation_start` but
  /// never closed with `}}`; the error highlights the opening token.
  fn unterminated_interpolation_error(interpolation_start: Token<'src>) -> CompileError<'src> {
    CompileError::new(interpolation_start, UnterminatedInterpolation)
  }
254
255  /// True if `c` can be the first character of an identifier
256  pub(crate) fn is_identifier_start(c: char) -> bool {
257    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
258  }
259
260  /// True if `c` can be a continuation character of an identifier
261  pub(crate) fn is_identifier_continue(c: char) -> bool {
262    Self::is_identifier_start(c) || matches!(c, '0'..='9' | '-')
263  }
264
  /// Consume the text and produce a series of tokens
  fn tokenize(mut self) -> CompileResult<'src, Vec<Token<'src>>> {
    loop {
      // At the start of a line, handle blank lines and indentation first.
      if self.token_start.column == 0 {
        self.lex_line_start()?;
      }

      match self.next {
        Some(first) => {
          // Dispatch on lexing mode: innermost open interpolation takes
          // precedence, then recipe body, then normal lexing.
          if let Some(&interpolation_start) = self.interpolation_stack.last() {
            self.lex_interpolation(interpolation_start, first)?;
          } else if self.recipe_body {
            self.lex_body()?;
          } else {
            self.lex_normal(first)?;
          }
        }
        None => break,
      }
    }

    // An interpolation still open at end-of-file is an error.
    if let Some(&interpolation_start) = self.interpolation_stack.last() {
      return Err(Self::unterminated_interpolation_error(interpolation_start));
    }

    // Close any indented blocks still open at end-of-file.
    while self.indented() {
      self.lex_dedent();
    }

    self.token(Eof);

    // Invariants: all source was consumed and only the initial, empty
    // indentation level remains.
    assert_eq!(self.token_start.offset, self.token_end.offset);
    assert_eq!(self.token_start.offset, self.src.len());
    assert_eq!(self.indentation.len(), 1);

    Ok(self.tokens)
  }
302
303  /// Handle blank lines and indentation
304  fn lex_line_start(&mut self) -> CompileResult<'src> {
305    enum Indentation<'src> {
306      // Line only contains whitespace
307      Blank,
308      // Indentation continues
309      Continue,
310      // Indentation decreases
311      Decrease,
312      // Indentation isn't consistent
313      Inconsistent,
314      // Indentation increases
315      Increase,
316      // Indentation mixes spaces and tabs
317      Mixed { whitespace: &'src str },
318    }
319
320    use Indentation::*;
321
322    let nonblank_index = self
323      .rest()
324      .char_indices()
325      .skip_while(|&(_, c)| c == ' ' || c == '\t')
326      .map(|(i, _)| i)
327      .next()
328      .unwrap_or_else(|| self.rest().len());
329
330    let rest = &self.rest()[nonblank_index..];
331
332    let whitespace = &self.rest()[..nonblank_index];
333
334    if self.open_delimiters_or_interpolation() {
335      if !whitespace.is_empty() {
336        while self.next_is_whitespace() {
337          self.advance()?;
338        }
339
340        self.token(Whitespace);
341      }
342
343      return Ok(());
344    }
345
346    let body_whitespace = &whitespace[..whitespace
347      .char_indices()
348      .take(self.indentation().chars().count())
349      .map(|(i, _c)| i)
350      .next()
351      .unwrap_or(0)];
352
353    let spaces = whitespace.chars().any(|c| c == ' ');
354    let tabs = whitespace.chars().any(|c| c == '\t');
355
356    let body_spaces = body_whitespace.chars().any(|c| c == ' ');
357    let body_tabs = body_whitespace.chars().any(|c| c == '\t');
358
359    #[allow(clippy::if_same_then_else)]
360    let indentation = if rest.starts_with('\n') || rest.starts_with("\r\n") || rest.is_empty() {
361      Blank
362    } else if whitespace == self.indentation() {
363      Continue
364    } else if self.indentation.contains(&whitespace) {
365      Decrease
366    } else if self.recipe_body && whitespace.starts_with(self.indentation()) {
367      Continue
368    } else if self.recipe_body && body_spaces && body_tabs {
369      Mixed {
370        whitespace: body_whitespace,
371      }
372    } else if !self.recipe_body && spaces && tabs {
373      Mixed { whitespace }
374    } else if whitespace.len() < self.indentation().len() {
375      Inconsistent
376    } else if self.recipe_body
377      && body_whitespace.len() >= self.indentation().len()
378      && !body_whitespace.starts_with(self.indentation())
379    {
380      Inconsistent
381    } else if whitespace.len() >= self.indentation().len()
382      && !whitespace.starts_with(self.indentation())
383    {
384      Inconsistent
385    } else {
386      Increase
387    };
388
389    match indentation {
390      Blank => {
391        if !whitespace.is_empty() {
392          while self.next_is_whitespace() {
393            self.advance()?;
394          }
395
396          self.token(Whitespace);
397        }
398
399        Ok(())
400      }
401      Continue => {
402        if !self.indentation().is_empty() {
403          for _ in self.indentation().chars() {
404            self.advance()?;
405          }
406
407          self.token(Whitespace);
408        }
409
410        Ok(())
411      }
412      Decrease => {
413        while self.indentation() != whitespace {
414          self.lex_dedent();
415        }
416
417        if !whitespace.is_empty() {
418          while self.next_is_whitespace() {
419            self.advance()?;
420          }
421
422          self.token(Whitespace);
423        }
424
425        Ok(())
426      }
427      Mixed { whitespace } => {
428        for _ in whitespace.chars() {
429          self.advance()?;
430        }
431
432        Err(self.error(MixedLeadingWhitespace { whitespace }))
433      }
434      Inconsistent => {
435        for _ in whitespace.chars() {
436          self.advance()?;
437        }
438
439        Err(self.error(InconsistentLeadingWhitespace {
440          expected: self.indentation(),
441          found: whitespace,
442        }))
443      }
444      Increase => {
445        while self.next_is_whitespace() {
446          self.advance()?;
447        }
448
449        let indentation = self.lexeme();
450        self.indentation.push(indentation);
451        self.token(Indent);
452        if self.recipe_body_pending {
453          self.recipe_body = true;
454        }
455
456        Ok(())
457      }
458    }
459  }
460
  /// Lex token beginning with `start` outside of a recipe body
  fn lex_normal(&mut self, start: char) -> CompileResult<'src> {
    match start {
      ' ' | '\t' => self.lex_whitespace(),
      // a `!include` directive gets a dedicated error
      '!' if self.rest().starts_with("!include") => Err(self.error(Include)),
      '!' => self.lex_choices('!', &[('=', BangEquals), ('~', BangTilde)], None),
      '#' => self.lex_comment(),
      '$' => self.lex_single(Dollar),
      '&' => self.lex_digraph('&', '&', AmpersandAmpersand),
      '(' => self.lex_delimiter(ParenL),
      ')' => self.lex_delimiter(ParenR),
      '*' => self.lex_single(Asterisk),
      '+' => self.lex_single(Plus),
      ',' => self.lex_single(Comma),
      '/' => self.lex_single(Slash),
      ':' => self.lex_colon(),
      '=' => self.lex_choices(
        '=',
        &[('=', EqualsEquals), ('~', EqualsTilde)],
        Some(Equals),
      ),
      '?' => self.lex_single(QuestionMark),
      '@' => self.lex_single(At),
      '[' => self.lex_delimiter(BracketL),
      '\\' => self.lex_escape(),
      '\n' | '\r' => self.lex_eol(),
      '\u{feff}' => self.lex_single(ByteOrderMark),
      ']' => self.lex_delimiter(BracketR),
      '`' | '"' | '\'' => self.lex_string(None),
      '{' => self.lex_delimiter(BraceL),
      '|' => self.lex_digraph('|', '|', BarBar),
      '}' => {
        // A `}` beginning `}}` while the innermost open delimiter is a format
        // string closes that string's interpolation and resumes lexing the
        // string; otherwise `}` is an ordinary closing brace.
        let format_string_kind = self.open_delimiters.last().and_then(|(delimiter, _line)| {
          if !self.rest().starts_with(Self::INTERPOLATION_END) {
            None
          } else if let Delimiter::FormatString(kind) = delimiter {
            Some(kind)
          } else {
            None
          }
        });

        if let Some(format_string_kind) = format_string_kind {
          self.lex_string(Some(*format_string_kind))
        } else {
          self.lex_delimiter(BraceR)
        }
      }
      _ if Self::is_identifier_start(start) => self.lex_identifier(),
      _ => {
        // Advance past the unknown character so the error highlights it.
        self.advance()?;
        Err(self.error(UnknownStartOfToken { start }))
      }
    }
  }
516
  /// Lex token beginning with `start` inside an interpolation
  fn lex_interpolation(
    &mut self,
    interpolation_start: Token<'src>,
    start: char,
  ) -> CompileResult<'src> {
    // `}}` only closes the interpolation when no delimiters are open inside it
    if self.rest_starts_with(Self::INTERPOLATION_END) && self.open_delimiters.is_empty() {
      // end current interpolation
      if self.interpolation_stack.pop().is_none() {
        self.presume_str(Self::INTERPOLATION_END)?;
        return Err(self.internal_error(
          "Lexer::lex_interpolation found `}}` but was called with empty interpolation stack.",
        ));
      }
      // Emit interpolation end token
      self.lex_double(InterpolationEnd)
    } else if self.at_eof() && self.open_delimiters.is_empty() {
      // Return unterminated interpolation error that highlights the opening
      // {{
      Err(Self::unterminated_interpolation_error(interpolation_start))
    } else {
      // Otherwise lex as per normal
      self.lex_normal(start)
    }
  }
542
  /// Lex token while in recipe body
  ///
  /// Accumulates a `Text` token until one of the terminators below, then
  /// handles the terminator.
  fn lex_body(&mut self) -> CompileResult<'src> {
    enum Terminator {
      EndOfFile,
      Interpolation,
      Newline,
      NewlineCarriageReturn,
    }

    use Terminator::*;

    let terminator = loop {
      // `{{{{` is an escape: consume it whole so it lexes as literal text
      // rather than starting an interpolation.
      if self.rest_starts_with(Self::INTERPOLATION_ESCAPE) {
        self.presume_str(Self::INTERPOLATION_ESCAPE)?;
        continue;
      }

      if self.rest_starts_with("\n") {
        break Newline;
      }

      if self.rest_starts_with("\r\n") {
        break NewlineCarriageReturn;
      }

      if self.rest_starts_with(Self::INTERPOLATION_START) {
        break Interpolation;
      }

      if self.at_eof() {
        break EndOfFile;
      }

      self.advance()?;
    };

    // emit text token containing text so far
    if self.current_token_length() > 0 {
      self.token(Text);
    }

    match terminator {
      Newline => self.lex_single(Eol),
      NewlineCarriageReturn => self.lex_double(Eol),
      Interpolation => {
        // Emit the `{{` token and record it so an unterminated interpolation
        // error can point back at it.
        self.lex_double(InterpolationStart)?;
        self
          .interpolation_stack
          .push(self.tokens[self.tokens.len() - 1]);
        Ok(())
      }
      EndOfFile => Ok(()),
    }
  }
597
598  fn lex_dedent(&mut self) {
599    assert_eq!(self.current_token_length(), 0);
600    self.token(Dedent);
601    self.indentation.pop();
602    self.recipe_body_pending = false;
603    self.recipe_body = false;
604  }
605
606  /// Lex a single-character token
607  fn lex_single(&mut self, kind: TokenKind) -> CompileResult<'src> {
608    self.advance()?;
609    self.token(kind);
610    Ok(())
611  }
612
613  /// Lex a double-character token
614  fn lex_double(&mut self, kind: TokenKind) -> CompileResult<'src> {
615    self.advance()?;
616    self.advance()?;
617    self.token(kind);
618    Ok(())
619  }
620
  /// Lex a token beginning with `first`: if the following character matches
  /// the first element of a pair in `choices`, lex a two-character token of
  /// the paired kind; otherwise lex a single-character token of kind
  /// `otherwise`, or produce an error if `otherwise` is `None`.
  fn lex_choices(
    &mut self,
    first: char,
    choices: &[(char, TokenKind)],
    otherwise: Option<TokenKind>,
  ) -> CompileResult<'src> {
    self.presume(first)?;

    for (second, then) in choices {
      if self.accepted(*second)? {
        self.token(*then);
        return Ok(());
      }
    }

    if let Some(token) = otherwise {
      self.token(token);
    } else {
      // Emit an unspecified token to consume the current character,
      self.token(Unspecified);

      let expected = choices.iter().map(|choice| choice.0).collect();

      if self.at_eof() {
        return Err(self.error(UnexpectedEndOfToken { expected }));
      }

      // …and advance past another character,
      self.advance()?;

      // …so that the error we produce highlights the unexpected character.
      return Err(self.error(UnexpectedCharacter { expected }));
    }

    Ok(())
  }
660
  /// Lex an opening or closing delimiter, maintaining the open delimiter
  /// stack and erroring on mismatched or unexpected closing delimiters
  fn lex_delimiter(&mut self, kind: TokenKind) -> CompileResult<'src> {
    match kind {
      BraceL => self.open_delimiter(Delimiter::Brace),
      BraceR => self.close_delimiter(Delimiter::Brace)?,
      BracketL => self.open_delimiter(Delimiter::Bracket),
      BracketR => self.close_delimiter(Delimiter::Bracket)?,
      ParenL => self.open_delimiter(Delimiter::Paren),
      ParenR => self.close_delimiter(Delimiter::Paren)?,
      _ => {
        return Err(self.internal_error(format!(
          "Lexer::lex_delimiter called with non-delimiter token: `{kind}`",
        )));
      }
    }

    // Emit the delimiter token
    self.lex_single(kind)?;

    Ok(())
  }

  /// Push a delimiter onto the open delimiter stack, recording the line on
  /// which it was opened for use in mismatch errors
  fn open_delimiter(&mut self, delimiter: Delimiter) {
    self
      .open_delimiters
      .push((delimiter, self.token_start.line));
  }

  /// Pop a delimiter from the open delimiter stack and error if incorrect type
  fn close_delimiter(&mut self, close: Delimiter) -> CompileResult<'src> {
    match self.open_delimiters.pop() {
      Some((open, _)) if open == close => Ok(()),
      Some((open, open_line)) => Err(self.error(MismatchedClosingDelimiter {
        open,
        close,
        open_line,
      })),
      None => Err(self.error(UnexpectedClosingDelimiter { close })),
    }
  }

  /// Return true if there are any unclosed delimiters or interpolations
  fn open_delimiters_or_interpolation(&self) -> bool {
    !self.open_delimiters.is_empty() || !self.interpolation_stack.is_empty()
  }
707
  /// Lex a two-character digraph `left``right` of kind `token`; anything else
  /// after `left` is an error
  fn lex_digraph(&mut self, left: char, right: char, token: TokenKind) -> CompileResult<'src> {
    self.presume(left)?;

    if self.accepted(right)? {
      self.token(token);
      Ok(())
    } else {
      // Emit an unspecified token to consume the current character,
      self.token(Unspecified);

      if self.at_eof() {
        return Err(self.error(UnexpectedEndOfToken {
          expected: vec![right],
        }));
      }

      // …and advance past another character,
      self.advance()?;

      // …so that the error we produce highlights the unexpected character.
      Err(self.error(UnexpectedCharacter {
        expected: vec![right],
      }))
    }
  }
734
735  /// Lex a token starting with ':'
736  fn lex_colon(&mut self) -> CompileResult<'src> {
737    self.presume(':')?;
738
739    if self.accepted('=')? {
740      self.token(ColonEquals);
741    } else if self.accepted(':')? {
742      self.token(ColonColon);
743    } else {
744      self.token(Colon);
745      self.recipe_body_pending = true;
746    }
747
748    Ok(())
749  }
750
  /// Lex a token starting with a '\' escape
  ///
  /// A backslash followed by a line ending, plus any subsequent indentation,
  /// lexes as whitespace, allowing line continuations.
  fn lex_escape(&mut self) -> CompileResult<'src> {
    self.presume('\\')?;

    // Treat newline escaped with \ as whitespace
    if self.accepted('\n')? {
      while self.next_is_whitespace() {
        self.advance()?;
      }
      self.token(Whitespace);
    } else if self.accepted('\r')? {
      // A carriage return must be paired with a line feed
      if !self.accepted('\n')? {
        return Err(self.error(UnpairedCarriageReturn));
      }
      while self.next_is_whitespace() {
        self.advance()?;
      }
      self.token(Whitespace);
    } else if let Some(character) = self.next {
      return Err(self.error(InvalidEscapeSequence { character }));
    }

    // NOTE(review): a `\` at end-of-file falls through to here without
    // emitting a token or an error, leaving the backslash in the in-progress
    // token — confirm this is handled downstream.
    Ok(())
  }
775
776  /// Lex a carriage return and line feed
777  fn lex_eol(&mut self) -> CompileResult<'src> {
778    if self.accepted('\r')? {
779      if !self.accepted('\n')? {
780        return Err(self.error(UnpairedCarriageReturn));
781      }
782    } else {
783      self.presume('\n')?;
784    }
785
786    // Emit eol if there are no open delimiters, otherwise emit whitespace.
787    if self.open_delimiters_or_interpolation() {
788      self.token(Whitespace);
789    } else {
790      self.token(Eol);
791    }
792
793    Ok(())
794  }
795
796  /// Lex name: [a-zA-Z_][a-zA-Z0-9_]*
797  fn lex_identifier(&mut self) -> CompileResult<'src> {
798    self.advance()?;
799
800    while let Some(c) = self.next {
801      if !Self::is_identifier_continue(c) {
802        break;
803      }
804
805      self.advance()?;
806    }
807
808    self.token(Identifier);
809
810    Ok(())
811  }
812
813  /// Lex comment: #[^\r\n]
814  fn lex_comment(&mut self) -> CompileResult<'src> {
815    self.presume('#')?;
816
817    while !self.at_eol_or_eof() {
818      self.advance()?;
819    }
820
821    self.token(Comment);
822
823    Ok(())
824  }
825
826  /// Lex whitespace: [ \t]+
827  fn lex_whitespace(&mut self) -> CompileResult<'src> {
828    while self.next_is_whitespace() {
829      self.advance()?;
830    }
831
832    self.token(Whitespace);
833
834    Ok(())
835  }
836
  /// Lex a backtick, cooked string, or raw string.
  ///
  /// Backtick:      ``[^`]*``
  /// Cooked string: "[^"]*" # also processes escape sequences
  /// Raw string:    '[^']*'
  ///
  /// When `format_string_kind` is `Some`, we are resuming a format string of
  /// that kind after its interpolation closed with `}}`; otherwise a format
  /// string is recognized by an `f` identifier token immediately preceding
  /// the opening delimiter.
  fn lex_string(&mut self, format_string_kind: Option<StringKind>) -> CompileResult<'src> {
    let format = format_string_kind.is_some()
      || self.tokens.last().is_some_and(|token| {
        token.kind == TokenKind::Identifier && token.lexeme() == Keyword::F.lexeme()
      });

    let kind = if let Some(kind) = format_string_kind {
      // Resuming after an interpolation: consume the closing `}}`.
      self.presume_str(Self::INTERPOLATION_END)?;
      kind
    } else {
      let Some(kind) = StringKind::from_token_start(self.rest()) else {
        self.advance()?;
        return Err(self.internal_error("Lexer::lex_string: invalid string start"));
      };
      self.presume_str(kind.delimiter())?;
      kind
    };

    // True when the previous character was a backslash beginning an escape
    // sequence in a string kind that processes them.
    let mut escape = false;

    loop {
      if self.next.is_none() {
        return Err(self.error(kind.unterminated_error_kind()));
      } else if !escape && kind.processes_escape_sequences() && self.next_is('\\') {
        escape = true;
      } else if escape && kind.processes_escape_sequences() && self.next_is('u') {
        // NOTE(review): `u` after a backslash clears the escape flag before
        // the checks below — presumably so `\u{…}` escapes interact correctly
        // with brace handling; confirm intent.
        escape = false;
      } else if format && self.rest_starts_with(Self::INTERPOLATION_ESCAPE) {
        // `{{{{` is an escaped `{{`: consume all four characters (three here,
        // one at the bottom of the loop).
        escape = false;
        self.advance()?;
        self.advance()?;
        self.advance()?;
      } else if !escape
        && (self.rest_starts_with(kind.delimiter())
          || format && self.rest_starts_with(Self::INTERPOLATION_START))
      {
        // Unescaped closing delimiter, or `{{` in a format string, ends the
        // token.
        break;
      } else {
        escape = false;
      }

      self.advance()?;
    }

    if format && self.rest_starts_with(Self::INTERPOLATION_START) {
      self.presume_str(Self::INTERPOLATION_START)?;
      if format_string_kind.is_some() {
        self.token(FormatStringContinue);
      } else {
        // First segment of a format string: record the open format string so
        // the closing segment can match it.
        self.token(FormatStringStart);
        self.open_delimiter(Delimiter::FormatString(kind));
      }
    } else {
      self.presume_str(kind.delimiter())?;

      if let Some(format_string_kind) = format_string_kind {
        self.close_delimiter(Delimiter::FormatString(format_string_kind))?;
        self.token(FormatStringEnd);
      } else {
        self.token(kind.token_kind());
      }
    }

    Ok(())
  }
907}
908
909#[cfg(test)]
910mod tests {
911  use super::*;
912
913  use pretty_assertions::assert_eq;
914
  /// Define a `#[test]` that lexes `text` and compares the resulting token
  /// kinds and lexemes; the second form additionally controls whether `text`
  /// is unindented before lexing (the first form always unindents).
  macro_rules! test {
    {
      name:     $name:ident,
      text:     $text:expr,
      tokens:   ($($kind:ident $(: $lexeme:literal)?),* $(,)?)$(,)?
    } => {
      #[test]
      fn $name() {
        let kinds: &[TokenKind] = &[$($kind,)* Eof];

        let lexemes: &[&str] = &[$(lexeme!($kind $(, $lexeme)?),)* ""];

        test($text, true, kinds, lexemes);
      }
    };
    {
      name:     $name:ident,
      text:     $text:expr,
      tokens:   ($($kind:ident $(: $lexeme:literal)?),* $(,)?)$(,)?
      unindent: $unindent:expr,
    } => {
      #[test]
      fn $name() {
        let kinds: &[TokenKind] = &[$($kind,)* Eof];

        let lexemes: &[&str] = &[$(lexeme!($kind $(, $lexeme)?),)* ""];

        test($text, $unindent, kinds, lexemes);
      }
    }
  }

  /// Expand to the explicit `$lexeme` when one is given, otherwise to the
  /// kind's fixed default lexeme.
  macro_rules! lexeme {
    {
      $kind:ident, $lexeme:literal
    } => {
      $lexeme
    };
    {
      $kind:ident
    } => {
      default_lexeme($kind)
    }
  }
959
  /// Lex `text` and check, in order: token kinds, token lexemes, that the
  /// concatenated lexemes round-trip to the input, and that each token's
  /// offset/line/column/length are mutually consistent.
  #[track_caller]
  fn test(text: &str, unindent_text: bool, want_kinds: &[TokenKind], want_lexemes: &[&str]) {
    let text = if unindent_text {
      unindent(text)
    } else {
      text.to_owned()
    };

    let have = Lexer::test_lex(&text).unwrap();

    let have_kinds = have
      .iter()
      .map(|token| token.kind)
      .collect::<Vec<TokenKind>>();

    let have_lexemes = have.iter().map(Token::lexeme).collect::<Vec<&str>>();

    assert_eq!(have_kinds, want_kinds, "Token kind mismatch");
    assert_eq!(have_lexemes, want_lexemes, "Token lexeme mismatch");

    // Tokens must tile the source: concatenating lexemes rebuilds the input.
    let mut roundtrip = String::new();

    for lexeme in have_lexemes {
      roundtrip.push_str(lexeme);
    }

    assert_eq!(roundtrip, text, "Roundtrip mismatch");

    // Recompute positions from the lexemes and compare against the tokens.
    // Columns are tracked in bytes, matching the lexer.
    let mut offset = 0;
    let mut line = 0;
    let mut column = 0;

    for token in have {
      assert_eq!(token.offset, offset);
      assert_eq!(token.line, line);
      assert_eq!(token.lexeme().len(), token.length);
      assert_eq!(token.column, column);

      for c in token.lexeme().chars() {
        if c == '\n' {
          line += 1;
          column = 0;
        } else {
          column += c.len_utf8();
        }
      }

      offset += token.length;
    }
  }
1010
  /// The lexeme that a token of `kind` always has, for kinds with a fixed
  /// lexeme; panics for kinds whose lexeme varies, which must be given
  /// explicitly in the test.
  fn default_lexeme(kind: TokenKind) -> &'static str {
    match kind {
      // Fixed lexemes
      AmpersandAmpersand => "&&",
      Asterisk => "*",
      At => "@",
      BangEquals => "!=",
      BangTilde => "!~",
      BarBar => "||",
      BraceL => "{",
      BraceR => "}",
      BracketL => "[",
      BracketR => "]",
      ByteOrderMark => "\u{feff}",
      Colon => ":",
      ColonColon => "::",
      ColonEquals => ":=",
      Comma => ",",
      Dollar => "$",
      Eol => "\n",
      Equals => "=",
      EqualsEquals => "==",
      EqualsTilde => "=~",
      Indent => "  ",
      InterpolationEnd => "}}",
      InterpolationStart => "{{",
      ParenL => "(",
      ParenR => ")",
      Plus => "+",
      QuestionMark => "?",
      Slash => "/",
      Whitespace => " ",

      // Empty lexemes
      Dedent | Eof => "",

      // Variable lexemes
      Backtick | Comment | FormatStringContinue | FormatStringEnd | FormatStringStart
      | Identifier | StringToken | Text | Unspecified => {
        panic!("Token {kind:?} has no default lexeme")
      }
    }
  }
1054
  /// Define a `#[test]` asserting that lexing `$input` fails with an error of
  /// `$kind` at the given offset/line/column with the given width.
  macro_rules! error {
    (
      name:   $name:ident,
      input:  $input:expr,
      offset: $offset:expr,
      line:   $line:expr,
      column: $column:expr,
      width:  $width:expr,
      kind:   $kind:expr,
    ) => {
      #[test]
      fn $name() {
        error($input, $offset, $line, $column, $width, $kind);
      }
    };
  }

  /// Lex `src`, expect failure, and compare the error's position, length, and
  /// kind against the expected values.
  #[track_caller]
  fn error(
    src: &str,
    offset: usize,
    line: usize,
    column: usize,
    length: usize,
    kind: CompileErrorKind,
  ) {
    match Lexer::test_lex(src) {
      // NOTE(review): this panic message reads as truncated — consider
      // including the expected error kind in it.
      Ok(_) => panic!("Lexing succeeded but expected"),
      Err(have) => {
        // The token kind is copied from the actual error, since errors are
        // compared by position, length, and error kind, not token kind.
        let want = CompileError {
          token: Token {
            kind: have.token.kind,
            src,
            offset,
            line,
            column,
            length,
            path: "justfile".as_ref(),
          },
          kind: kind.into(),
        };
        assert_eq!(have, want);
      }
    }
  }
1100
  // Token-stream tests: each `test!` invocation lexes `text` and asserts the
  // exact sequence of token kinds (and, where given, lexemes) produced.
  test! {
    name:   name_new,
    text:   "foo",
    tokens: (Identifier:"foo"),
  }

  test! {
    name:   comment,
    text:   "# hello",
    tokens: (Comment:"# hello"),
  }

  test! {
    name:   backtick,
    text:   "`echo`",
    tokens: (Backtick:"`echo`"),
  }

  test! {
    name:   backtick_multi_line,
    text:   "`echo\necho`",
    tokens: (Backtick:"`echo\necho`"),
  }

  test! {
    name:   raw_string,
    text:   "'hello'",
    tokens: (StringToken:"'hello'"),
  }

  test! {
    name:   raw_string_multi_line,
    text:   "'hello\ngoodbye'",
    tokens: (StringToken:"'hello\ngoodbye'"),
  }

  test! {
    name:   cooked_string,
    text:   "\"hello\"",
    tokens: (StringToken:"\"hello\""),
  }

  test! {
    name:   cooked_string_multi_line,
    text:   "\"hello\ngoodbye\"",
    tokens: (StringToken:"\"hello\ngoodbye\""),
  }

  test! {
    name:   cooked_multiline_string,
    text:   "\"\"\"hello\ngoodbye\"\"\"",
    tokens: (StringToken:"\"\"\"hello\ngoodbye\"\"\""),
  }

  // Operator and punctuation tokens.
  test! {
    name:   ampersand_ampersand,
    text:   "&&",
    tokens: (AmpersandAmpersand),
  }

  test! {
    name:   equals,
    text:   "=",
    tokens: (Equals),
  }

  test! {
    name:   equals_equals,
    text:   "==",
    tokens: (EqualsEquals),
  }

  test! {
    name:   bang_equals,
    text:   "!=",
    tokens: (BangEquals),
  }

  test! {
    name:   brace_l,
    text:   "{",
    tokens: (BraceL),
  }

  test! {
    name:   brace_r,
    text:   "{}",
    tokens: (BraceL, BraceR),
  }

  test! {
    name:   brace_lll,
    text:   "{{{",
    tokens: (BraceL, BraceL, BraceL),
  }

  test! {
    name:   brace_rrr,
    text:   "{{{}}}",
    tokens: (BraceL, BraceL, BraceL, BraceR, BraceR, BraceR),
  }

  test! {
    name:   dollar,
    text:   "$",
    tokens: (Dollar),
  }

  test! {
    name:   export_concatenation,
    text:   "export foo = 'foo' + 'bar'",
    tokens: (
      Identifier:"export",
      Whitespace,
      Identifier:"foo",
      Whitespace,
      Equals,
      Whitespace,
      StringToken:"'foo'",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"'bar'",
    )
  }

  test! {
    name: export_complex,
    text: "export foo = ('foo' + 'bar') + `baz`",
    tokens: (
      Identifier:"export",
      Whitespace,
      Identifier:"foo",
      Whitespace,
      Equals,
      Whitespace,
      ParenL,
      StringToken:"'foo'",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"'bar'",
      ParenR,
      Whitespace,
      Plus,
      Whitespace,
      Backtick:"`baz`",
    ),
  }

  test! {
    name:     eol_linefeed,
    text:     "\n",
    tokens:   (Eol),
    unindent: false,
  }

  test! {
    name:     eol_carriage_return_linefeed,
    text:     "\r\n",
    tokens:   (Eol:"\r\n"),
    unindent: false,
  }

  // Indentation handling: Indent/Dedent tokens and continuation whitespace.
  test! {
    name:   indented_line,
    text:   "foo:\n a",
    tokens: (Identifier:"foo", Colon, Eol, Indent:" ", Text:"a", Dedent),
  }

  test! {
    name:   indented_normal,
    text:   "
      a
        b
        c
    ",
    tokens: (
      Identifier:"a",
      Eol,
      Indent:"  ",
      Identifier:"b",
      Eol,
      Whitespace:"  ",
      Identifier:"c",
      Eol,
      Dedent,
    ),
  }

  test! {
    name:   indented_normal_nonempty_blank,
    text:   "a\n  b\n\t\t\n  c\n",
    tokens: (
      Identifier:"a",
      Eol,
      Indent:"  ",
      Identifier:"b",
      Eol,
      Whitespace:"\t\t",
      Eol,
      Whitespace:"  ",
      Identifier:"c",
      Eol,
      Dedent,
    ),
    unindent: false,
  }

  test! {
    name:   indented_normal_multiple,
    text:   "
      a
        b
          c
    ",
    tokens: (
      Identifier:"a",
      Eol,
      Indent:"  ",
      Identifier:"b",
      Eol,
      Indent:"    ",
      Identifier:"c",
      Eol,
      Dedent,
      Dedent,
    ),
  }

  test! {
    name:   indent_indent_dedent_indent,
    text:   "
      a
        b
          c
        d
          e
    ",
    tokens: (
      Identifier:"a",
      Eol,
      Indent:"  ",
        Identifier:"b",
        Eol,
        Indent:"    ",
          Identifier:"c",
          Eol,
        Dedent,
        Whitespace:"  ",
        Identifier:"d",
        Eol,
        Indent:"    ",
          Identifier:"e",
          Eol,
        Dedent,
      Dedent,
    ),
  }

  test! {
    name:   indent_recipe_dedent_indent,
    text:   "
      a
        b:
          c
        d
          e
    ",
    tokens: (
      Identifier:"a",
      Eol,
      Indent:"  ",
        Identifier:"b",
        Colon,
        Eol,
        Indent:"    ",
          Text:"c",
          Eol,
        Dedent,
        Whitespace:"  ",
        Identifier:"d",
        Eol,
        Indent:"    ",
          Identifier:"e",
          Eol,
        Dedent,
      Dedent,
    ),
  }

  // Recipe bodies: lines inside a body lex as Text, not as expression tokens.
  test! {
    name: indented_block,
    text: "
      foo:
        a
        b
        c
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent,
      Text:"a",
      Eol,
      Whitespace:"  ",
      Text:"b",
      Eol,
      Whitespace:"  ",
      Text:"c",
      Eol,
      Dedent,
    )
  }

  test! {
    name: brace_escape,
    text: "
      foo:
        {{{{
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent,
      Text:"{{{{",
      Eol,
      Dedent,
    )
  }

  test! {
    name: indented_block_followed_by_item,
    text: "
      foo:
        a
      b:
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent,
      Text:"a",
      Eol,
      Dedent,
      Identifier:"b",
      Colon,
      Eol,
    )
  }

  test! {
    name: indented_block_followed_by_blank,
    text: "
      foo:
          a

      b:
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent:"    ",
      Text:"a",
      Eol,
      Eol,
      Dedent,
      Identifier:"b",
      Colon,
      Eol,
    ),
  }

  test! {
    name: indented_line_containing_unpaired_carriage_return,
    text: "foo:\n \r \n",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent:" ",
      Text:"\r ",
      Eol,
      Dedent,
    ),
    unindent: false,
  }

  test! {
    name: indented_blocks,
    text: "
      b: a
        @mv a b

      a:
        @touch F
        @touch a

      d: c
        @rm c

      c: b
        @mv b c
    ",
    tokens: (
      Identifier:"b",
      Colon,
      Whitespace,
      Identifier:"a",
      Eol,
      Indent,
      Text:"@mv a b",
      Eol,
      Eol,
      Dedent,
      Identifier:"a",
      Colon,
      Eol,
      Indent,
      Text:"@touch F",
      Eol,
      Whitespace:"  ",
      Text:"@touch a",
      Eol,
      Eol,
      Dedent,
      Identifier:"d",
      Colon,
      Whitespace,
      Identifier:"c",
      Eol,
      Indent,
      Text:"@rm c",
      Eol,
      Eol,
      Dedent,
      Identifier:"c",
      Colon,
      Whitespace,
      Identifier:"b",
      Eol,
      Indent,
      Text:"@mv b c",
      Eol,
      Dedent
    ),
  }

  // Interpolations ({{...}}) inside recipe bodies.
  test! {
    name: interpolation_empty,
    text: "hello:\n echo {{}}",
    tokens: (
      Identifier:"hello",
      Colon,
      Eol,
      Indent:" ",
      Text:"echo ",
      InterpolationStart,
      InterpolationEnd,
      Dedent,
    ),
  }

  test! {
    name: interpolation_expression,
    text: "hello:\n echo {{`echo hello` + `echo goodbye`}}",
    tokens: (
      Identifier:"hello",
      Colon,
      Eol,
      Indent:" ",
      Text:"echo ",
      InterpolationStart,
      Backtick:"`echo hello`",
      Whitespace,
      Plus,
      Whitespace,
      Backtick:"`echo goodbye`",
      InterpolationEnd,
      Dedent,
    ),
  }

  test! {
    name: interpolation_raw_multiline_string,
    text: "hello:\n echo {{'\n'}}",
    tokens: (
      Identifier:"hello",
      Colon,
      Eol,
      Indent:" ",
      Text:"echo ",
      InterpolationStart,
      StringToken:"'\n'",
      InterpolationEnd,
      Dedent,
    ),
  }

  test! {
    name: tokenize_names,
    text: "
      foo
      bar-bob
      b-bob_asdfAAAA
      test123
    ",
    tokens: (
      Identifier:"foo",
      Eol,
      Identifier:"bar-bob",
      Eol,
      Identifier:"b-bob_asdfAAAA",
      Eol,
      Identifier:"test123",
      Eol,
    ),
  }

  test! {
    name: tokenize_indented_line,
    text: "foo:\n a",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent:" ",
      Text:"a",
      Dedent,
    ),
  }

  test! {
    name: tokenize_indented_block,
    text: "
      foo:
        a
        b
        c
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Eol,
      Indent,
      Text:"a",
      Eol,
      Whitespace:"  ",
      Text:"b",
      Eol,
      Whitespace:"  ",
      Text:"c",
      Eol,
      Dedent,
    ),
  }

  test! {
    name: tokenize_strings,
    text: r#"a = "'a'" + '"b"' + "'c'" + '"d"'#echo hello"#,
    tokens: (
      Identifier:"a",
      Whitespace,
      Equals,
      Whitespace,
      StringToken:"\"'a'\"",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"'\"b\"'",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"\"'c'\"",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"'\"d\"'",
      Comment:"#echo hello",
    )
  }

  test! {
    name: tokenize_recipe_interpolation_eol,
    text: "
      foo: # some comment
       {{hello}}
    ",
    tokens: (
      Identifier:"foo",
      Colon,
      Whitespace,
      Comment:"# some comment",
      Eol,
      Indent:" ",
      InterpolationStart,
      Identifier:"hello",
      InterpolationEnd,
      Eol,
      Dedent
    ),
  }

  test! {
    name: tokenize_recipe_interpolation_eof,
    text: "foo: # more comments
 {{hello}}
# another comment
",
    tokens: (
      Identifier:"foo",
      Colon,
      Whitespace,
      Comment:"# more comments",
      Eol,
      Indent:" ",
      InterpolationStart,
      Identifier:"hello",
      InterpolationEnd,
      Eol,
      Dedent,
      Comment:"# another comment",
      Eol,
    ),
  }

  test! {
    name: tokenize_recipe_complex_interpolation_expression,
    text: "foo: #lol\n {{a + b + \"z\" + blarg}}",
    tokens: (
      Identifier:"foo",
      Colon,
      Whitespace:" ",
      Comment:"#lol",
      Eol,
      Indent:" ",
      InterpolationStart,
      Identifier:"a",
      Whitespace,
      Plus,
      Whitespace,
      Identifier:"b",
      Whitespace,
      Plus,
      Whitespace,
      StringToken:"\"z\"",
      Whitespace,
      Plus,
      Whitespace,
      Identifier:"blarg",
      InterpolationEnd,
      Dedent,
    ),
  }

  test! {
    name: tokenize_recipe_multiple_interpolations,
    text: "foo:,#ok\n {{a}}0{{b}}1{{c}}",
    tokens: (
      Identifier:"foo",
      Colon,
      Comma,
      Comment:"#ok",
      Eol,
      Indent:" ",
      InterpolationStart,
      Identifier:"a",
      InterpolationEnd,
      Text:"0",
      InterpolationStart,
      Identifier:"b",
      InterpolationEnd,
      Text:"1",
      InterpolationStart,
      Identifier:"c",
      InterpolationEnd,
      Dedent,

    ),
  }

  test! {
    name: tokenize_junk,
    text: "
      bob

      hello blah blah blah : a b c #whatever
    ",
    tokens: (
      Identifier:"bob",
      Eol,
      Eol,
      Identifier:"hello",
      Whitespace,
      Identifier:"blah",
      Whitespace,
      Identifier:"blah",
      Whitespace,
      Identifier:"blah",
      Whitespace,
      Colon,
      Whitespace,
      Identifier:"a",
      Whitespace,
      Identifier:"b",
      Whitespace,
      Identifier:"c",
      Whitespace,
      Comment:"#whatever",
      Eol,
    )
  }

  test! {
    name: tokenize_empty_lines,
    text: "

      # this does something
      hello:
        asdf
        bsdf

        csdf

        dsdf # whatever

      # yolo
    ",
    tokens: (
      Eol,
      Comment:"# this does something",
      Eol,
      Identifier:"hello",
      Colon,
      Eol,
      Indent,
      Text:"asdf",
      Eol,
      Whitespace:"  ",
      Text:"bsdf",
      Eol,
      Eol,
      Whitespace:"  ",
      Text:"csdf",
      Eol,
      Eol,
      Whitespace:"  ",
      Text:"dsdf # whatever",
      Eol,
      Eol,
      Dedent,
      Comment:"# yolo",
      Eol,
    ),
  }

  test! {
    name: tokenize_comment_before_variable,
    text: "
      #
      A='1'
      echo:
        echo {{A}}
    ",
    tokens: (
      Comment:"#",
      Eol,
      Identifier:"A",
      Equals,
      StringToken:"'1'",
      Eol,
      Identifier:"echo",
      Colon,
      Eol,
      Indent,
      Text:"echo ",
      InterpolationStart,
      Identifier:"A",
      InterpolationEnd,
      Eol,
      Dedent,
    ),
  }

  test! {
    name: tokenize_interpolation_backticks,
    text: "hello:\n echo {{`echo hello` + `echo goodbye`}}",
    tokens: (
      Identifier:"hello",
      Colon,
      Eol,
      Indent:" ",
      Text:"echo ",
      InterpolationStart,
      Backtick:"`echo hello`",
      Whitespace,
      Plus,
      Whitespace,
      Backtick:"`echo goodbye`",
      InterpolationEnd,
      Dedent
    ),
  }

  test! {
    name: tokenize_empty_interpolation,
    text: "hello:\n echo {{}}",
    tokens: (
      Identifier:"hello",
      Colon,
      Eol,
      Indent:" ",
      Text:"echo ",
      InterpolationStart,
      InterpolationEnd,
      Dedent,
    ),
  }

  test! {
    name: tokenize_assignment_backticks,
    text: "a = `echo hello` + `echo goodbye`",
    tokens: (
      Identifier:"a",
      Whitespace,
      Equals,
      Whitespace,
      Backtick:"`echo hello`",
      Whitespace,
      Plus,
      Whitespace,
      Backtick:"`echo goodbye`",
    ),
  }

  test! {
    name: tokenize_multiple,
    text: "

      hello:
        a
        b

        c

        d

      # hello
      bob:
        frank
       \t
    ",
    tokens: (
      Eol,
      Identifier:"hello",
      Colon,
      Eol,
      Indent,
      Text:"a",
      Eol,
      Whitespace:"  ",
      Text:"b",
      Eol,
      Eol,
      Whitespace:"  ",
      Text:"c",
      Eol,
      Eol,
      Whitespace:"  ",
      Text:"d",
      Eol,
      Eol,
      Dedent,
      Comment:"# hello",
      Eol,
      Identifier:"bob",
      Colon,
      Eol,
      Indent:"  ",
      Text:"frank",
      Eol,
      Eol,
      Dedent,
    ),
  }

  test! {
    name: tokenize_comment,
    text: "a:=#",
    tokens: (
      Identifier:"a",
      ColonEquals,
      Comment:"#",
    ),
  }

  test! {
    name: tokenize_comment_with_bang,
    text: "a:=#foo!",
    tokens: (
      Identifier:"a",
      ColonEquals,
      Comment:"#foo!",
    ),
  }

  test! {
    name: tokenize_order,
    text: "
      b: a
        @mv a b

      a:
        @touch F
        @touch a

      d: c
        @rm c

      c: b
        @mv b c
    ",
    tokens: (
      Identifier:"b",
      Colon,
      Whitespace,
      Identifier:"a",
      Eol,
      Indent,
      Text:"@mv a b",
      Eol,
      Eol,
      Dedent,
      Identifier:"a",
      Colon,
      Eol,
      Indent,
      Text:"@touch F",
      Eol,
      Whitespace:"  ",
      Text:"@touch a",
      Eol,
      Eol,
      Dedent,
      Identifier:"d",
      Colon,
      Whitespace,
      Identifier:"c",
      Eol,
      Indent,
      Text:"@rm c",
      Eol,
      Eol,
      Dedent,
      Identifier:"c",
      Colon,
      Whitespace,
      Identifier:"b",
      Eol,
      Indent,
      Text:"@mv b c",
      Eol,
      Dedent,
    ),
  }

  test! {
    name: tokenize_parens,
    text: "((())) ()abc(+",
    tokens: (
      ParenL,
      ParenL,
      ParenL,
      ParenR,
      ParenR,
      ParenR,
      Whitespace,
      ParenL,
      ParenR,
      Identifier:"abc",
      ParenL,
      Plus,
    ),
  }

  test! {
    name: crlf_newline,
    text: "#\r\n#asdf\r\n",
    tokens: (
      Comment:"#",
      Eol:"\r\n",
      Comment:"#asdf",
      Eol:"\r\n",
    ),
  }

  test! {
    name: multiple_recipes,
    text: "a:\n  foo\nb:",
    tokens: (
      Identifier:"a",
      Colon,
      Eol,
      Indent:"  ",
      Text:"foo",
      Eol,
      Dedent,
      Identifier:"b",
      Colon,
    ),
  }

  test! {
    name:   brackets,
    text:   "[][]",
    tokens: (BracketL, BracketR, BracketL, BracketR),
  }

  test! {
    name:   open_delimiter_eol,
    text:   "[\n](\n){\n}",
    tokens: (
      BracketL, Whitespace:"\n", BracketR,
      ParenL, Whitespace:"\n", ParenR,
      BraceL, Whitespace:"\n", BraceR
    ),
  }

  // Format strings: f'...{{expr}}...' lexes into FormatString* tokens only
  // when the `f` identifier immediately precedes the quote.
  test! {
    name:   format_string_empty,
    text:   "f''",
    tokens: (
      Identifier: "f",
      StringToken: "''",
    ),
  }

  test! {
    name:   format_string_identifier,
    text:   "f'{{foo}}'",
    tokens: (
      Identifier: "f",
      FormatStringStart: "'{{",
      Identifier: "foo",
      FormatStringEnd: "}}'",
    ),
  }

  test! {
    name:   format_string_continue,
    text:   "f'{{foo}}bar{{baz}}'",
    tokens: (
      Identifier: "f",
      FormatStringStart: "'{{",
      Identifier: "foo",
      FormatStringContinue: "}}bar{{",
      Identifier: "baz",
      FormatStringEnd: "}}'",
    ),
  }

  test! {
    name:   format_string_whitespace,
    text:   "f '{{foo}}'",
    tokens: (
      Identifier: "f",
      Whitespace,
      StringToken: "'{{foo}}'",
    ),
  }

  test! {
    name:   format_string_wrong_identifier,
    text:   "g'{{foo}}'",
    tokens: (
      Identifier: "g",
      StringToken: "'{{foo}}'",
    ),
  }

  test! {
    name:   format_string_followed_by_recipe,
    text:   "foo := f'{{'foo'}}{{'bar'}}'\nbar:",
    tokens: (
      Identifier: "foo",
      Whitespace: " ",
      ColonEquals: ":=",
      Whitespace: " ",
      Identifier: "f",
      FormatStringStart: "'{{",
      StringToken: "'foo'",
      FormatStringContinue: "}}{{",
      StringToken: "'bar'",
      FormatStringEnd: "}}'",
      Eol: "\n",
      Identifier: "bar",
      Colon,
    ),
  }

  test! {
    name:   indented_format_string_followed_by_recipe,
    text:   "foo := f'''{{'foo'}}{{'bar'}}'''\nbar:",
    tokens: (
      Identifier: "foo",
      Whitespace: " ",
      ColonEquals: ":=",
      Whitespace: " ",
      Identifier: "f",
      FormatStringStart: "'''{{",
      StringToken: "'foo'",
      FormatStringContinue: "}}{{",
      StringToken: "'bar'",
      FormatStringEnd: "}}'''",
      Eol: "\n",
      Identifier: "bar",
      Colon,
    ),
  }
2224
  // Error tests: each `error!` invocation lexes `input` and asserts lexing
  // fails at the given offset/line/column, with the given error width and
  // `CompileErrorKind`.
  error! {
    name:  tokenize_space_then_tab,
    input: "a:
 0
 1
\t2
",
    offset: 9,
    line:   3,
    column: 0,
    width:  1,
    kind:   InconsistentLeadingWhitespace{expected: " ", found: "\t"},
  }

  error! {
    name:  tokenize_tabs_then_tab_space,
    input: "a:
\t\t0
\t\t 1
\t  2
",
    offset: 12,
    line:   3,
    column: 0,
    width:  3,
    kind:   InconsistentLeadingWhitespace{expected: "\t\t", found: "\t  "},
  }

  error! {
    name:   tokenize_unknown,
    input:  "%",
    offset: 0,
    line:   0,
    column: 0,
    width:  1,
    kind:   UnknownStartOfToken { start: '%'},
  }

  error! {
    name:   unterminated_string_with_escapes,
    input:  r#"a = "\n\t\r\"\\"#,
    offset: 4,
    line:   0,
    column: 4,
    width:  1,
    kind:   UnterminatedString,
  }

  error! {
    name:   unterminated_raw_string,
    input:  "r a='asdf",
    offset: 4,
    line:   0,
    column: 4,
    width:  1,
    kind:   UnterminatedString,
  }

  error! {
    name:   unterminated_interpolation,
    input:  "foo:\n echo {{
  ",
    offset: 11,
    line:   1,
    column: 6,
    width:  2,
    kind:   UnterminatedInterpolation,
  }

  error! {
    name:   unterminated_backtick,
    input:  "`echo",
    offset: 0,
    line:   0,
    column: 0,
    width:  1,
    kind:   UnterminatedBacktick,
  }

  error! {
    name:   unpaired_carriage_return,
    input:  "foo\rbar",
    offset: 3,
    line:   0,
    column: 3,
    width:  1,
    kind:   UnpairedCarriageReturn,
  }

  error! {
    name:   invalid_name_start_dash,
    input:  "-foo",
    offset: 0,
    line:   0,
    column: 0,
    width:  1,
    kind:   UnknownStartOfToken{ start: '-'},
  }

  error! {
    name:   invalid_name_start_digit,
    input:  "0foo",
    offset: 0,
    line:   0,
    column: 0,
    width:  1,
    kind:   UnknownStartOfToken { start: '0' },
  }

  error! {
    name:   unterminated_string,
    input:  r#"a = ""#,
    offset: 4,
    line:   0,
    column: 4,
    width:  1,
    kind:   UnterminatedString,
  }

  error! {
    name:   mixed_leading_whitespace_recipe,
    input:  "a:\n\t echo hello",
    offset: 3,
    line:   1,
    column: 0,
    width:  2,
    kind:   MixedLeadingWhitespace{whitespace: "\t "},
  }

  error! {
    name:   mixed_leading_whitespace_normal,
    input:  "a\n\t echo hello",
    offset: 2,
    line:   1,
    column: 0,
    width:  2,
    kind:   MixedLeadingWhitespace{whitespace: "\t "},
  }

  error! {
    name:   mixed_leading_whitespace_indent,
    input:  "a\n foo\n \tbar",
    offset: 7,
    line:   2,
    column: 0,
    width:  2,
    kind:   MixedLeadingWhitespace{whitespace: " \t"},
  }

  error! {
    name:   bad_dedent,
    input:  "a\n foo\n   bar\n  baz",
    offset: 14,
    line:   3,
    column: 0,
    width:  2,
    kind:   InconsistentLeadingWhitespace{expected: "   ", found: "  "},
  }

  error! {
    name:   unclosed_interpolation_delimiter,
    input:  "a:\n echo {{ foo",
    offset: 9,
    line:   1,
    column: 6,
    width:  2,
    kind:   UnterminatedInterpolation,
  }

  error! {
    name:   unexpected_character_after_at,
    input:  "@%",
    offset: 1,
    line:   0,
    column: 1,
    width:  1,
    kind:   UnknownStartOfToken { start: '%'},
  }

  error! {
    name:   mismatched_closing_brace,
    input:  "(]",
    offset: 1,
    line:   0,
    column: 1,
    width:  0,
    kind:   MismatchedClosingDelimiter {
      open:      Delimiter::Paren,
      close:     Delimiter::Bracket,
      open_line: 0,
    },
  }

  error! {
    name:   ampersand_eof,
    input:  "&",
    offset: 1,
    line:   0,
    column: 1,
    width:  0,
    kind:   UnexpectedEndOfToken {
      expected: vec!['&'],
    },
  }

  error! {
    name:   ampersand_unexpected,
    input:  "&%",
    offset: 1,
    line:   0,
    column: 1,
    width:  1,
    kind:   UnexpectedCharacter {
      expected: vec!['&'],
    },
  }

  error! {
    name:   bang_eof,
    input:  "!",
    offset: 1,
    line:   0,
    column: 1,
    width:  0,
    kind:   UnexpectedEndOfToken {
      expected: vec!['=', '~'],
    },
  }

  error! {
    name:   unclosed_parenthesis_in_interpolation,
    input:  "a:\n echo {{foo(}}",
    offset:  15,
    line:   1,
    column: 12,
    width:  0,
    kind:   MismatchedClosingDelimiter {
      close: Delimiter::Brace,
      open: Delimiter::Paren,
      open_line: 1,
    },
  }
2467
  /// Calling `Lexer::presume` with a character that is not actually next
  /// must return an internal error (not panic). Checks the error token, the
  /// error kind, and the full rendered error message.
  #[test]
  fn presume_error() {
    let compile_error = Lexer::new("justfile".as_ref(), "!")
      .presume('-')
      .unwrap_err();
    // The error token covers position 0 with zero length and no real kind.
    assert_matches!(
      compile_error.token,
      Token {
        offset: 0,
        line: 0,
        column: 0,
        length: 0,
        src: "!",
        kind: Unspecified,
        path: _,
      }
    );
    assert_matches!(&*compile_error.kind,
        Internal { message }
        if message == "Lexer presumed character `-`"
    );

    // The rendered message asks the user to file an issue, since this error
    // indicates a bug in just itself.
    assert_eq!(
      Error::Compile { compile_error }
        .color_display(Color::never())
        .to_string(),
      "error: Internal error, this may indicate a bug in just: Lexer presumed character `-`
consider filing an issue: https://github.com/casey/just/issues/new
 ——▶ justfile:1:1
  │
1 │ !
  │ ^"
    );
  }
2502}