Skip to main content

nu_parser/
lex.rs

1use nu_protocol::{ParseError, Span};
2
/// The kind of a token produced by the lexer.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenContents {
    /// Any token that is not recognised as one of the more specific kinds below.
    Item,
    /// A `#` line comment, running up to (but not including) the next newline.
    Comment,
    /// A single `|`. Also used as a recovery token for `out>|`, `&&` and `2>&1`,
    /// which are reported as errors by `lex_item`.
    Pipe,
    /// `||`
    PipePipe,
    /// One of the operators accepted by `is_assignment_operator` (`=`, `+=`, ...).
    AssignmentOperator,
    /// `err>|` or `e>|`
    ErrGreaterPipe,
    /// `out+err>|`, `err+out>|`, `o+e>|` or `e+o>|`
    OutErrGreaterPipe,
    /// `;`
    Semicolon,
    /// `out>` or `o>`
    OutGreaterThan,
    /// `out>>` or `o>>`
    OutGreaterGreaterThan,
    /// `err>` or `e>`. Also used as a recovery token for `2>`.
    ErrGreaterThan,
    /// `err>>` or `e>>`
    ErrGreaterGreaterThan,
    /// `out+err>`, `err+out>`, `o+e>` or `e+o>`
    OutErrGreaterThan,
    /// `out+err>>`, `err+out>>`, `o+e>>` or `e+o>>`
    OutErrGreaterGreaterThan,
    /// A newline (`\n`) that is not being treated as plain whitespace.
    Eol,
}
21
22#[derive(Debug, PartialEq, Eq)]
23pub struct Token {
24    pub contents: TokenContents,
25    pub span: Span,
26}
27
28impl Token {
29    pub fn new(contents: TokenContents, span: Span) -> Token {
30        Token { contents, span }
31    }
32}
33
/// The kinds of paired delimiter the lexer keeps track of while it is
/// inside a single item.
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
    /// `(` ... `)`
    Paren,
    /// `{` ... `}`
    CurlyBracket,
    /// `[` ... `]`
    SquareBracket,
    /// `<` ... `>`
    AngleBracket,
}

impl BlockKind {
    /// Returns the byte that closes a block of this kind.
    fn closing(self) -> u8 {
        let closer = match self {
            Self::AngleBracket => b'>',
            Self::CurlyBracket => b'}',
            Self::SquareBracket => b']',
            Self::Paren => b')',
        };
        closer
    }
}
52
53// A baseline token is terminated if it's not nested inside of a paired
54// delimiter and the next character is one of: `|`, `;` or any
55// whitespace.
56fn is_item_terminator(
57    block_level: &[BlockKind],
58    c: u8,
59    additional_whitespace: &[u8],
60    special_tokens: &[u8],
61) -> bool {
62    block_level.is_empty()
63        && (c == b' '
64            || c == b'\t'
65            || c == b'\n'
66            || c == b'\r'
67            || c == b'|'
68            || c == b';'
69            || additional_whitespace.contains(&c)
70            || special_tokens.contains(&c))
71}
72
/// Returns `true` when `bytes` is exactly one of the assignment operators
/// `=`, `+=`, `++=`, `-=`, `*=` or `/=`.
///
/// Assignment operators are handled separately from ordinary math operators
/// because they cause the rest of the pipeline to be consumed.
pub fn is_assignment_operator(bytes: &[u8]) -> bool {
    const ASSIGNMENT_OPERATORS: [&[u8]; 6] = [b"=", b"+=", b"++=", b"-=", b"*=", b"/="];

    ASSIGNMENT_OPERATORS.iter().any(|op| *op == bytes)
}
78
79// A special token is one that is a byte that stands alone as its own token. For example
80// when parsing a signature you may want to have `:` be able to separate tokens and also
81// to be handled as its own token to notify you you're about to parse a type in the example
82// `foo:bar`
83fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool {
84    block_level.is_empty() && special_tokens.contains(&c)
85}
86
/// Lexes one item (a "baseline token") from `input`, starting at `*curr_offset`.
///
/// `*curr_offset` is advanced as bytes are consumed and is left at the position
/// where lexing stopped. Every span in the returned token and error is built by
/// adding `span_offset` to an offset into `input`.
///
/// * `additional_whitespace` - extra bytes that terminate an item like whitespace.
/// * `special_tokens` - bytes that terminate an item and, when they are the first
///   byte of the item, are returned as a one-byte token of their own.
/// * `in_signature` - when `true`, `<` and `>` are tracked as a paired delimiter.
///
/// Returns the lexed token together with an optional error. A token is always
/// returned, even when an error is reported, so callers can keep working with a
/// partial result.
pub fn lex_item(
    input: &[u8],
    curr_offset: &mut usize,
    span_offset: usize,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    in_signature: bool,
) -> (Token, Option<ParseError>) {
    // Tracks the opening quote character of the string literal we are currently
    // inside, so that we stay in string-literal mode until the matching closing
    // quote is seen.
    let mut quote_start: Option<u8> = None;

    // True while we are inside a `#` comment embedded in the item.
    let mut in_comment = false;

    let token_start = *curr_offset;

    // Stack of the paired delimiters that are currently open.
    let mut block_level: Vec<BlockKind> = vec![];

    // The process of slurping up a baseline token repeats:
    //
    // - String literal, which begins with `'`, `"` or a backtick, and continues
    //   until the same character is encountered again.
    // - Delimiter pair, which begins with `[`, `(`, or `{` (and `<` when lexing
    //   a signature), and continues until the matching closing delimiter is
    //   found, skipping comments and string literals.
    // - When not nested inside of a delimiter pair, when a terminating
    //   character (see `is_item_terminator`) is encountered, the baseline
    //   token is done.
    // - Otherwise, accumulate the character into the current baseline token.
    let mut previous_char = None;
    while let Some(c) = input.get(*curr_offset) {
        let c = *c;

        if let Some(start) = quote_start {
            // Backslash escapes are only recognised inside double-quoted strings.
            if c == b'\\' && start == b'"' {
                // Go ahead and consume the escape character if possible
                if input.get(*curr_offset + 1).is_some() {
                    // Skip the backslash and the escaped byte together, so an
                    // escaped `"` can never be mistaken for the closing quote.
                    *curr_offset += 2;
                    continue;
                } else {
                    // The input ends right after the backslash: the string is
                    // unterminated. Note that the returned span stops before
                    // the backslash and `*curr_offset` is not advanced past it.
                    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                    return (
                        Token {
                            contents: TokenContents::Item,
                            span,
                        },
                        Some(ParseError::UnexpectedEof(
                            (start as char).to_string(),
                            Span::new(span.end - 1, span.end),
                        )),
                    );
                }
            }
            // If we encountered the closing quote character for the current
            // string, we're done with the current string.
            if c == start {
                // Escaped quotes in `"` strings were already skipped above, so
                // this really is the closing quote.
                quote_start = None;
            }
        } else if c == b'#' && !in_comment {
            // A `#` only starts a comment when it is the first character of the
            // token or is preceded by whitespace.
            in_comment = previous_char
                .map(char::from)
                .map(char::is_whitespace)
                .unwrap_or(true);
        } else if c == b'\n' || c == b'\r' {
            // A line break always ends a comment; it only ends the item when we
            // are not nested inside a delimiter pair.
            in_comment = false;
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if in_comment {
            // Inside a comment nothing is interpreted, but a terminator outside
            // of any delimiter pair still ends the item.
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset {
            // A special token at the very start of the item is a token on its own.
            *curr_offset += 1;
            break;
        } else if c == b'\'' || c == b'"' || c == b'`' {
            // We encountered the opening quote of a string literal.
            quote_start = Some(c);
        } else if c == b'[' {
            // We encountered an opening `[` delimiter.
            block_level.push(BlockKind::SquareBracket);
        } else if c == b'<' && in_signature {
            // In signatures `<` opens an angle-bracket block.
            block_level.push(BlockKind::AngleBracket);
        } else if c == b'>' && in_signature {
            // Pop the matching `<`; an unmatched `>` is silently accepted.
            if let Some(BlockKind::AngleBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b']' {
            // We encountered a closing `]` delimiter. Pop off the opening `[`
            // delimiter. An unmatched `]` is silently accepted.
            if let Some(BlockKind::SquareBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b'{' {
            // We encountered an opening `{` delimiter.
            block_level.push(BlockKind::CurlyBracket);
        } else if c == b'}' {
            // We encountered a closing `}` delimiter. Pop off the opening `{`.
            if let Some(BlockKind::CurlyBracket) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // We encountered a closing `}` delimiter, but the last opening
                // delimiter was not a `{`. This is an error. The stray `}` is
                // consumed and included in the returned token.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "{",
                        "}",
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'(' {
            // We encountered an opening `(` delimiter.
            block_level.push(BlockKind::Paren);
        } else if c == b')' {
            // We encountered a closing `)` delimiter. Pop off the opening `(`.
            if let Some(BlockKind::Paren) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // We encountered a closing `)` delimiter, but the last opening
                // delimiter was not a `(`. This is an error. The stray `)` is
                // consumed and included in the returned token.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "(",
                        ")",
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'r' && input.get(*curr_offset + 1) == Some(b'#').as_ref() {
            // We just saw the `r#` pattern, so treat what follows as a raw string.
            // On success `lex_raw_string` leaves `*curr_offset` on the last byte
            // of the literal; the shared increment below steps past it.
            let lex_result = lex_raw_string(input, curr_offset, span_offset);
            let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
            if let Err(e) = lex_result {
                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(e),
                );
            }
        } else if c == b'|' && is_redirection(&input[token_start..*curr_offset]) {
            // The token so far is a redirection such as `err>`, so this `|`
            // belongs to it (`err>|` etc.) rather than being a pipe.
            *curr_offset += 1;
            break;
        } else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
            break;
        }

        *curr_offset += 1;
        previous_char = Some(c);
    }

    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

    if let Some(delim) = quote_start {
        // The input ended inside a string literal.
        // The non-lite parse trims quotes on both sides, so we report the expected quote so that
        // anyone wanting to consume this partial parse (e.g., completions) will be able to get
        // correct information from the non-lite parse.
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof(
                (delim as char).to_string(),
                Span::new(span.end - 1, span.end),
            )),
        );
    }

    // If there are still unclosed opening delimiters, report the closing
    // delimiter of the innermost one as missing.
    if let Some(block) = block_level.last() {
        let delim = block.closing();
        let cause = ParseError::UnexpectedEof(
            (delim as char).to_string(),
            Span::new(span.end - 1, span.end),
        );

        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(cause),
        );
    }

    // If we didn't accumulate any characters, it's an unexpected error.
    if *curr_offset - token_start == 0 {
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof("command".to_string(), span)),
        );
    }

    // Classify the finished token by its exact bytes. Some shell-isms from other
    // shells are mapped to the closest token and reported through `err`.
    let mut err = None;
    let output = match &input[(span.start - span_offset)..(span.end - span_offset)] {
        bytes if is_assignment_operator(bytes) => Token {
            contents: TokenContents::AssignmentOperator,
            span,
        },
        b"out>" | b"o>" => Token {
            contents: TokenContents::OutGreaterThan,
            span,
        },
        b"out>>" | b"o>>" => Token {
            contents: TokenContents::OutGreaterGreaterThan,
            span,
        },
        b"out>|" | b"o>|" => {
            err = Some(ParseError::Expected(
                "`|`.  Redirecting stdout to a pipe is the same as normal piping.",
                span,
            ));
            Token {
                // HACK: For more accurate parsing aligned with user intention
                contents: TokenContents::Pipe,
                span,
            }
        }
        b"err>" | b"e>" => Token {
            contents: TokenContents::ErrGreaterThan,
            span,
        },
        b"err>>" | b"e>>" => Token {
            contents: TokenContents::ErrGreaterGreaterThan,
            span,
        },
        b"err>|" | b"e>|" => Token {
            contents: TokenContents::ErrGreaterPipe,
            span,
        },
        b"out+err>" | b"err+out>" | b"o+e>" | b"e+o>" => Token {
            contents: TokenContents::OutErrGreaterThan,
            span,
        },
        b"out+err>>" | b"err+out>>" | b"o+e>>" | b"e+o>>" => Token {
            contents: TokenContents::OutErrGreaterGreaterThan,
            span,
        },
        b"out+err>|" | b"err+out>|" | b"o+e>|" | b"e+o>|" => Token {
            contents: TokenContents::OutErrGreaterPipe,
            span,
        },
        b"&&" => {
            err = Some(ParseError::ShellAndAnd(span));
            Token {
                // HACK: For more accurate parsing aligned with user intention
                contents: TokenContents::Pipe,
                span,
            }
        }
        b"2>" => {
            err = Some(ParseError::ShellErrRedirect(span));
            Token {
                // HACK: For more accurate parsing aligned with user intention
                contents: TokenContents::ErrGreaterThan,
                span,
            }
        }
        b"2>&1" => {
            err = Some(ParseError::ShellOutErrRedirect(span));
            Token {
                // HACK: For more accurate parsing aligned with user intention
                contents: TokenContents::Pipe,
                span,
            }
        }
        _ => Token {
            contents: TokenContents::Item,
            span,
        },
    };
    (output, err)
}
388
389fn lex_raw_string(
390    input: &[u8],
391    curr_offset: &mut usize,
392    span_offset: usize,
393) -> Result<(), ParseError> {
394    // A raw string literal looks like `echo r#'Look, I can use 'single quotes'!'#`
395    // If the next character is `#` we're probably looking at a raw string literal
396    // so we need to read all the text until we find a closing `#`. This raw string
397    // can contain any character, including newlines and double quotes without needing
398    // to escape them.
399    //
400    // A raw string can contain many `#` as prefix,
401    // incase if there is a `'#` or `#'` in the string itself.
402    // E.g: r##'I can use '#' in a raw string'##
403    let mut prefix_sharp_cnt = 0;
404    let start = *curr_offset;
405    while let Some(b'#') = input.get(start + prefix_sharp_cnt + 1) {
406        prefix_sharp_cnt += 1;
407    }
408
409    // curr_offset is the character `r`, we need to move forward and skip all `#`
410    // characters.
411    //
412    // e.g: r###'<body>
413    //      ^
414    //      ^
415    //   curr_offset
416    *curr_offset += prefix_sharp_cnt + 1;
417    // the next one should be a single quote.
418    if input.get(*curr_offset) != Some(&b'\'') {
419        return Err(ParseError::Expected(
420            "'",
421            Span::new(span_offset + *curr_offset, span_offset + *curr_offset + 1),
422        ));
423    }
424
425    *curr_offset += 1;
426    let mut matches = false;
427    while let Some(ch) = input.get(*curr_offset) {
428        // check for postfix '###
429        if *ch == b'#' {
430            let start_ch = input[*curr_offset - prefix_sharp_cnt];
431            let postfix = &input[*curr_offset - prefix_sharp_cnt + 1..=*curr_offset];
432            if start_ch == b'\'' && postfix.iter().all(|x| *x == b'#') {
433                matches = true;
434                break;
435            }
436        }
437        *curr_offset += 1
438    }
439    if !matches {
440        let mut expected = '\''.to_string();
441        expected.push_str(&"#".repeat(prefix_sharp_cnt));
442        return Err(ParseError::UnexpectedEof(
443            expected,
444            Span::new(span_offset + *curr_offset - 1, span_offset + *curr_offset),
445        ));
446    }
447    Ok(())
448}
449
450pub fn lex_signature(
451    input: &[u8],
452    span_offset: usize,
453    additional_whitespace: &[u8],
454    special_tokens: &[u8],
455    skip_comment: bool,
456) -> (Vec<Token>, Option<ParseError>) {
457    let mut state = LexState {
458        input,
459        output: Vec::new(),
460        error: None,
461        span_offset,
462    };
463    lex_internal(
464        &mut state,
465        additional_whitespace,
466        special_tokens,
467        skip_comment,
468        true,
469        None,
470    );
471    (state.output, state.error)
472}
473
/// Lexer state that can be carried across several lexing calls
/// (see `lex_n_tokens`).
#[derive(Debug)]
pub struct LexState<'a> {
    /// The bytes that remain to be lexed.
    pub input: &'a [u8],
    /// The tokens produced so far.
    pub output: Vec<Token>,
    /// The first error encountered while lexing, if any.
    pub error: Option<ParseError>,
    /// Value added to every offset into `input` when building a span.
    pub span_offset: usize,
}
481
482/// Lex until the output is `max_tokens` longer than before the call, or until the input is exhausted.
483/// The return value indicates how many tokens the call added to / removed from the output.
484///
485/// The behaviour here is non-obvious when `additional_whitespace` doesn't include newline:
486/// If you pass a `state` where the last token in the output is an Eol, this might *remove* tokens.
487pub fn lex_n_tokens(
488    state: &mut LexState,
489    additional_whitespace: &[u8],
490    special_tokens: &[u8],
491    skip_comment: bool,
492    max_tokens: usize,
493) -> isize {
494    let n_tokens = state.output.len();
495    lex_internal(
496        state,
497        additional_whitespace,
498        special_tokens,
499        skip_comment,
500        false,
501        Some(max_tokens),
502    );
503    // If this lex_internal call reached the end of the input, there may now be fewer tokens
504    // in the output than before.
505    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
506    let next_offset = state.output.last().map(|token| token.span.end);
507    if let Some(next_offset) = next_offset {
508        state.input = &state.input[next_offset - state.span_offset..];
509        state.span_offset = next_offset;
510    }
511    tokens_n_diff
512}
513
514pub fn lex(
515    input: &[u8],
516    span_offset: usize,
517    additional_whitespace: &[u8],
518    special_tokens: &[u8],
519    skip_comment: bool,
520) -> (Vec<Token>, Option<ParseError>) {
521    let mut state = LexState {
522        input,
523        output: Vec::new(),
524        error: None,
525        span_offset,
526    };
527    lex_internal(
528        &mut state,
529        additional_whitespace,
530        special_tokens,
531        skip_comment,
532        false,
533        None,
534    );
535    (state.output, state.error)
536}
537
538fn lex_internal(
539    state: &mut LexState,
540    additional_whitespace: &[u8],
541    special_tokens: &[u8],
542    skip_comment: bool,
543    // within signatures we want to treat `<` and `>` specially
544    in_signature: bool,
545    max_tokens: Option<usize>,
546) {
547    let initial_output_len = state.output.len();
548
549    let mut curr_offset = 0;
550
551    let mut is_complete = true;
552    while let Some(c) = state.input.get(curr_offset) {
553        if max_tokens
554            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
555        {
556            break;
557        }
558        let c = *c;
559        if c == b'|' {
560            // If the next character is `|`, it's either `|` or `||`.
561            let idx = curr_offset;
562            let prev_idx = idx;
563            curr_offset += 1;
564
565            // If the next character is `|`, we're looking at a `||`.
566            if let Some(c) = state.input.get(curr_offset)
567                && *c == b'|'
568            {
569                let idx = curr_offset;
570                curr_offset += 1;
571                state.output.push(Token::new(
572                    TokenContents::PipePipe,
573                    Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
574                ));
575                continue;
576            }
577
578            // Otherwise, it's just a regular `|` token.
579
580            // Before we push, check to see if the previous character was a newline.
581            // If so, then this is a continuation of the previous line
582            if let Some(prev) = state.output.last_mut() {
583                match prev.contents {
584                    TokenContents::Eol => {
585                        *prev = Token::new(
586                            TokenContents::Pipe,
587                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
588                        );
589                        // And this is a continuation of the previous line if previous line is a
590                        // comment line (combined with EOL + Comment)
591                        //
592                        // Initially, the last one token is TokenContents::Pipe, we don't need to
593                        // check it, so the beginning offset is 2.
594                        let mut offset = 2;
595                        while state.output.len() > offset {
596                            let index = state.output.len() - offset;
597                            if state.output[index].contents == TokenContents::Comment
598                                && state.output[index - 1].contents == TokenContents::Eol
599                            {
600                                state.output.remove(index - 1);
601                                offset += 1;
602                            } else {
603                                break;
604                            }
605                        }
606                    }
607                    _ => {
608                        state.output.push(Token::new(
609                            TokenContents::Pipe,
610                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
611                        ));
612                    }
613                }
614            } else {
615                state.output.push(Token::new(
616                    TokenContents::Pipe,
617                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
618                ));
619            }
620
621            is_complete = false;
622        } else if c == b';' {
623            // If the next character is a `;`, we're looking at a semicolon token.
624
625            if !is_complete && state.error.is_none() {
626                state.error = Some(ParseError::ExtraTokens(Span::new(
627                    curr_offset,
628                    curr_offset + 1,
629                )));
630            }
631            let idx = curr_offset;
632            curr_offset += 1;
633            state.output.push(Token::new(
634                TokenContents::Semicolon,
635                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
636            ));
637        } else if c == b'\r' {
638            // Ignore a stand-alone carriage return
639            curr_offset += 1;
640        } else if c == b'\n' {
641            // If the next character is a newline, we're looking at an EOL (end of line) token.
642            let idx = curr_offset;
643            curr_offset += 1;
644            if !additional_whitespace.contains(&c) {
645                state.output.push(Token::new(
646                    TokenContents::Eol,
647                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
648                ));
649            }
650        } else if c == b'#' {
651            // If the next character is `#`, we're at the beginning of a line
652            // comment. The comment continues until the next newline.
653            let mut start = curr_offset;
654
655            while let Some(input) = state.input.get(curr_offset) {
656                if *input == b'\n' {
657                    if !skip_comment {
658                        state.output.push(Token::new(
659                            TokenContents::Comment,
660                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
661                        ));
662                    }
663                    start = curr_offset;
664
665                    break;
666                } else {
667                    curr_offset += 1;
668                }
669            }
670            if start != curr_offset && !skip_comment {
671                state.output.push(Token::new(
672                    TokenContents::Comment,
673                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
674                ));
675            }
676        } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
677            // If the next character is non-newline whitespace, skip it.
678            curr_offset += 1;
679        } else {
680            let (token, err) = lex_item(
681                state.input,
682                &mut curr_offset,
683                state.span_offset,
684                additional_whitespace,
685                special_tokens,
686                in_signature,
687            );
688            if state.error.is_none() {
689                state.error = err;
690            }
691            is_complete = true;
692            state.output.push(token);
693        }
694    }
695}
696
/// Returns `true` if `token` is the start of a redirection: one of the stream
/// names `o`, `out`, `e`, `err`, `o+e`, `e+o`, `out+err` or `err+out`
/// followed by a single `>`. The `>>` and `>|` forms do not match.
fn is_redirection(token: &[u8]) -> bool {
    let Some(stream) = token.strip_suffix(b">") else {
        return false;
    };

    matches!(
        stream,
        b"o" | b"out" | b"e" | b"err" | b"o+e" | b"e+o" | b"out+err" | b"err+out"
    )
}