// nu-parser/lex.rs

1use nu_protocol::{ParseError, Span};
2
/// The category of a single lexed token.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenContents {
    /// A baseline item: command name, argument, quoted string, etc.
    Item,
    /// A `#` comment running to the end of the line.
    Comment,
    /// `|`
    Pipe,
    /// `||`
    PipePipe,
    /// An assignment operator such as `=` or `+=` (see [`is_assignment_operator`]).
    AssignmentOperator,
    /// `err>|` / `e>|`: pipe stderr into the next command.
    ErrGreaterPipe,
    /// `out+err>|` / `o+e>|` (and reversed forms): pipe both streams.
    OutErrGreaterPipe,
    /// `;`
    Semicolon,
    /// `out>` / `o>`: redirect stdout.
    OutGreaterThan,
    /// `out>>` / `o>>`: append-redirect stdout.
    OutGreaterGreaterThan,
    /// `err>` / `e>`: redirect stderr.
    ErrGreaterThan,
    /// `err>>` / `e>>`: append-redirect stderr.
    ErrGreaterGreaterThan,
    /// `out+err>` / `err+out>` / `o+e>` / `e+o>`: redirect both streams.
    OutErrGreaterThan,
    /// `out+err>>` (and variants): append-redirect both streams.
    OutErrGreaterGreaterThan,
    /// End of line (`\n`).
    Eol,
}
21
/// A single lexed token: its category plus the source span it covers.
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
    // What kind of token this is.
    pub contents: TokenContents,
    // The region of the original source this token was lexed from.
    pub span: Span,
}
27
28impl Token {
29    pub fn new(contents: TokenContents, span: Span) -> Token {
30        Token { contents, span }
31    }
32}
33
/// A kind of paired delimiter that the lexer may be nested inside while
/// scanning a baseline token.
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
    /// `(` … `)`
    Paren,
    /// `{` … `}`
    CurlyBracket,
    /// `[` … `]`
    SquareBracket,
    /// `<` … `>` (only tracked when lexing signatures).
    AngleBracket,
}
41
42impl BlockKind {
43    fn closing(self) -> u8 {
44        match self {
45            BlockKind::Paren => b')',
46            BlockKind::SquareBracket => b']',
47            BlockKind::CurlyBracket => b'}',
48            BlockKind::AngleBracket => b'>',
49        }
50    }
51}
52
53// A baseline token is terminated if it's not nested inside of a paired
54// delimiter and the next character is one of: `|`, `;` or any
55// whitespace.
56fn is_item_terminator(
57    block_level: &[BlockKind],
58    c: u8,
59    additional_whitespace: &[u8],
60    special_tokens: &[u8],
61) -> bool {
62    block_level.is_empty()
63        && (c == b' '
64            || c == b'\t'
65            || c == b'\n'
66            || c == b'\r'
67            || c == b'|'
68            || c == b';'
69            || additional_whitespace.contains(&c)
70            || special_tokens.contains(&c))
71}
72
/// Assignment operators have special handling distinct from math expressions, as they cause the
/// rest of the pipeline to be consumed.
pub fn is_assignment_operator(bytes: &[u8]) -> bool {
    // The complete set of assignment operators recognized by the lexer.
    const ASSIGNMENT_OPERATORS: [&[u8]; 6] = [b"=", b"+=", b"++=", b"-=", b"*=", b"/="];
    ASSIGNMENT_OPERATORS.contains(&bytes)
}
78
79// A special token is one that is a byte that stands alone as its own token. For example
80// when parsing a signature you may want to have `:` be able to separate tokens and also
81// to be handled as its own token to notify you you're about to parse a type in the example
82// `foo:bar`
83fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool {
84    block_level.is_empty() && special_tokens.contains(&c)
85}
86
/// Lexes a single baseline item token starting at `*curr_offset` in `input`,
/// advancing `*curr_offset` past it.
///
/// `span_offset` is added to every reported span so that spans refer to the
/// whole original source rather than this slice. `additional_whitespace` and
/// `special_tokens` extend the set of bytes that terminate an item, and
/// `in_signature` makes `<`/`>` act as paired delimiters.
///
/// Returns the lexed token together with an optional error (unexpected EOF
/// inside a string or delimiter pair, unbalanced `}` / `)`, shell-style
/// redirects, etc.). Even on error a best-effort `Item` token with a span is
/// returned so consumers such as completions still get usable information.
pub fn lex_item(
    input: &[u8],
    curr_offset: &mut usize,
    span_offset: usize,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    in_signature: bool,
) -> (Token, Option<ParseError>) {
    // This variable tracks the starting character of a string literal, so that
    // we remain inside the string literal lexer mode until we encounter the
    // closing quote.
    let mut quote_start: Option<u8> = None;

    let mut in_comment = false;

    let token_start = *curr_offset;

    // This Vec tracks paired delimiters
    let mut block_level: Vec<BlockKind> = vec![];

    // The process of slurping up a baseline token repeats:
    //
    // - String literal, which begins with `'` or `"`, and continues until
    //   the same character is encountered again.
    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
    //   the matching closing delimiter is found, skipping comments and string
    //   literals.
    // - When not nested inside of a delimiter pair, when a terminating
    //   character (whitespace, `|`, `;` or `#`) is encountered, the baseline
    //   token is done.
    // - Otherwise, accumulate the character into the current baseline token.
    let mut previous_char = None;
    while let Some(c) = input.get(*curr_offset) {
        let c = *c;

        if let Some(start) = quote_start {
            // Check if we're in an escape sequence
            // (backslash escapes only apply inside double-quoted strings)
            if c == b'\\' && start == b'"' {
                // Go ahead and consume the escape character if possible
                if input.get(*curr_offset + 1).is_some() {
                    // Successfully escaped the character
                    *curr_offset += 2;
                    continue;
                } else {
                    // Trailing backslash at EOF: the string is unterminated.
                    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                    return (
                        Token {
                            contents: TokenContents::Item,
                            span,
                        },
                        Some(ParseError::UnexpectedEof(
                            (start as char).to_string(),
                            Span::new(span.end - 1, span.end),
                        )),
                    );
                }
            }
            // If we encountered the closing quote character for the current
            // string, we're done with the current string.
            if c == start {
                // Also need to check to make sure we aren't escaped
                quote_start = None;
            }
        } else if c == b'#' && !in_comment {
            // To start a comment, It either need to be the first character of the token or prefixed with whitespace.
            in_comment = previous_char
                .map(char::from)
                .map(char::is_whitespace)
                .unwrap_or(true);
        } else if c == b'\n' || c == b'\r' {
            // A newline always ends any in-progress comment.
            in_comment = false;
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if in_comment {
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset {
            // A special byte at the very start of the token stands alone as
            // its own one-byte token.
            *curr_offset += 1;
            break;
        } else if c == b'\'' || c == b'"' || c == b'`' {
            // We encountered the opening quote of a string literal.
            quote_start = Some(c);
        } else if c == b'[' {
            // We encountered an opening `[` delimiter.
            block_level.push(BlockKind::SquareBracket);
        } else if c == b'<' && in_signature {
            block_level.push(BlockKind::AngleBracket);
        } else if c == b'>' && in_signature {
            // Only pops a matching `<`; an unmatched `>` is left for the
            // terminator/redirection logic below.
            if let Some(BlockKind::AngleBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b']' {
            // We encountered a closing `]` delimiter. Pop off the opening `[`
            // delimiter.
            if let Some(BlockKind::SquareBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b'{' {
            // We encountered an opening `{` delimiter.
            block_level.push(BlockKind::CurlyBracket);
        } else if c == b'}' {
            // We encountered a closing `}` delimiter. Pop off the opening `{`.
            if let Some(BlockKind::CurlyBracket) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // We encountered a closing `}` delimiter, but the last opening
                // delimiter was not a `{`. This is an error.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "{".to_string(),
                        "}".to_string(),
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'(' {
            // We encountered an opening `(` delimiter.
            block_level.push(BlockKind::Paren);
        } else if c == b')' {
            // We encountered a closing `)` delimiter. Pop off the opening `(`.
            if let Some(BlockKind::Paren) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // We encountered a closing `)` delimiter, but the last opening
                // delimiter was not a `(`. This is an error.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "(".to_string(),
                        ")".to_string(),
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'r' && input.get(*curr_offset + 1) == Some(b'#').as_ref() {
            // already checked `r#` pattern, so it's a raw string.
            // NOTE(review): this branch also fires when `r#` appears mid-item,
            // not only at the start of a token — confirm that is intended.
            let lex_result = lex_raw_string(input, curr_offset, span_offset);
            let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
            if let Err(e) = lex_result {
                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(e),
                );
            }
        } else if c == b'|' && is_redirection(&input[token_start..*curr_offset]) {
            // matches err>| etc.
            *curr_offset += 1;
            break;
        } else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
            break;
        }

        *curr_offset += 1;
        previous_char = Some(c);
    }

    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

    if let Some(delim) = quote_start {
        // The non-lite parse trims quotes on both sides, so we add the expected quote so that
        // anyone wanting to consume this partial parse (e.g., completions) will be able to get
        // correct information from the non-lite parse.
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof(
                (delim as char).to_string(),
                Span::new(span.end - 1, span.end),
            )),
        );
    }

    // If there is still unclosed opening delimiters, remember they were missing
    if let Some(block) = block_level.last() {
        let delim = block.closing();
        let cause = ParseError::UnexpectedEof(
            (delim as char).to_string(),
            Span::new(span.end - 1, span.end),
        );

        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(cause),
        );
    }

    // If we didn't accumulate any characters, it's an unexpected error.
    if *curr_offset - token_start == 0 {
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof("command".to_string(), span)),
        );
    }

    // Classify the raw bytes of the finished item: assignment operators,
    // redirection operators, and common shell-isms get dedicated token kinds
    // or targeted errors; everything else stays a plain `Item`.
    let mut err = None;
    let output = match &input[(span.start - span_offset)..(span.end - span_offset)] {
        bytes if is_assignment_operator(bytes) => Token {
            contents: TokenContents::AssignmentOperator,
            span,
        },
        b"out>" | b"o>" => Token {
            contents: TokenContents::OutGreaterThan,
            span,
        },
        b"out>>" | b"o>>" => Token {
            contents: TokenContents::OutGreaterGreaterThan,
            span,
        },
        b"out>|" | b"o>|" => {
            // Piping stdout is just `|`; reject the redundant spelling.
            err = Some(ParseError::Expected(
                "`|`.  Redirecting stdout to a pipe is the same as normal piping.",
                span,
            ));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"err>" | b"e>" => Token {
            contents: TokenContents::ErrGreaterThan,
            span,
        },
        b"err>>" | b"e>>" => Token {
            contents: TokenContents::ErrGreaterGreaterThan,
            span,
        },
        b"err>|" | b"e>|" => Token {
            contents: TokenContents::ErrGreaterPipe,
            span,
        },
        b"out+err>" | b"err+out>" | b"o+e>" | b"e+o>" => Token {
            contents: TokenContents::OutErrGreaterThan,
            span,
        },
        b"out+err>>" | b"err+out>>" | b"o+e>>" | b"e+o>>" => Token {
            contents: TokenContents::OutErrGreaterGreaterThan,
            span,
        },
        b"out+err>|" | b"err+out>|" | b"o+e>|" | b"e+o>|" => Token {
            contents: TokenContents::OutErrGreaterPipe,
            span,
        },
        b"&&" => {
            // POSIX-shell `&&` is not valid here; point the user at the right syntax.
            err = Some(ParseError::ShellAndAnd(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"2>" => {
            // POSIX-shell stderr redirect; suggest the Nushell equivalent.
            err = Some(ParseError::ShellErrRedirect(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"2>&1" => {
            // POSIX-shell combined redirect; suggest the Nushell equivalent.
            err = Some(ParseError::ShellOutErrRedirect(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        _ => Token {
            contents: TokenContents::Item,
            span,
        },
    };
    (output, err)
}
384
/// Lexes the remainder of a raw string literal, assuming `*curr_offset`
/// currently points at the leading `r` (the `r#` prefix has already been
/// checked by the caller).
///
/// On success, `*curr_offset` is left on the final `#` of the closing
/// delimiter (the caller's loop then advances past it). On failure, it
/// returns a `ParseError` describing the expected delimiter.
fn lex_raw_string(
    input: &[u8],
    curr_offset: &mut usize,
    span_offset: usize,
) -> Result<(), ParseError> {
    // A raw string literal looks like `echo r#'Look, I can use 'single quotes'!'#`
    // If the next character is `#` we're probably looking at a raw string literal
    // so we need to read all the text until we find a closing `#`. This raw string
    // can contain any character, including newlines and double quotes without needing
    // to escape them.
    //
    // A raw string can contain many `#` as prefix,
    // incase if there is a `'#` or `#'` in the string itself.
    // E.g: r##'I can use '#' in a raw string'##
    let mut prefix_sharp_cnt = 0;
    let start = *curr_offset;
    // Count the run of `#` immediately after the `r`.
    while let Some(b'#') = input.get(start + prefix_sharp_cnt + 1) {
        prefix_sharp_cnt += 1;
    }

    // curr_offset is the character `r`, we need to move forward and skip all `#`
    // characters.
    //
    // e.g: r###'<body>
    //      ^
    //      ^
    //   curr_offset
    *curr_offset += prefix_sharp_cnt + 1;
    // the next one should be a single quote.
    if input.get(*curr_offset) != Some(&b'\'') {
        return Err(ParseError::Expected(
            "'",
            Span::new(span_offset + *curr_offset, span_offset + *curr_offset + 1),
        ));
    }

    *curr_offset += 1;
    let mut matches = false;
    while let Some(ch) = input.get(*curr_offset) {
        // check for postfix '###
        // The closing delimiter is a `'` followed by exactly `prefix_sharp_cnt`
        // `#` bytes; we detect it by looking back from the current `#`.
        if *ch == b'#' {
            let start_ch = input[*curr_offset - prefix_sharp_cnt];
            let postfix = &input[*curr_offset - prefix_sharp_cnt + 1..=*curr_offset];
            if start_ch == b'\'' && postfix.iter().all(|x| *x == b'#') {
                matches = true;
                break;
            }
        }
        *curr_offset += 1
    }
    if !matches {
        // Ran off the end of input without finding `'` + the right number of `#`.
        let mut expected = '\''.to_string();
        expected.push_str(&"#".repeat(prefix_sharp_cnt));
        return Err(ParseError::UnexpectedEof(
            expected,
            Span::new(span_offset + *curr_offset - 1, span_offset + *curr_offset),
        ));
    }
    Ok(())
}
445
446pub fn lex_signature(
447    input: &[u8],
448    span_offset: usize,
449    additional_whitespace: &[u8],
450    special_tokens: &[u8],
451    skip_comment: bool,
452) -> (Vec<Token>, Option<ParseError>) {
453    let mut state = LexState {
454        input,
455        output: Vec::new(),
456        error: None,
457        span_offset,
458    };
459    lex_internal(
460        &mut state,
461        additional_whitespace,
462        special_tokens,
463        skip_comment,
464        true,
465        None,
466    );
467    (state.output, state.error)
468}
469
/// Resumable lexer state, used by [`lex_n_tokens`] for incremental lexing.
#[derive(Debug)]
pub struct LexState<'a> {
    // The remaining, not-yet-lexed input.
    pub input: &'a [u8],
    // Tokens produced so far.
    pub output: Vec<Token>,
    // The first error encountered, if any.
    pub error: Option<ParseError>,
    // Offset of `input` within the overall source; added to all spans.
    pub span_offset: usize,
}
477
/// Lex until the output is `max_tokens` longer than before the call, or until the input is exhausted.
/// The return value indicates how many tokens the call added to / removed from the output.
///
/// The behaviour here is non-obvious when `additional_whitespace` doesn't include newline:
/// If you pass a `state` where the last token in the output is an Eol, this might *remove* tokens.
pub fn lex_n_tokens(
    state: &mut LexState,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    skip_comment: bool,
    max_tokens: usize,
) -> isize {
    let n_tokens = state.output.len();
    lex_internal(
        state,
        additional_whitespace,
        special_tokens,
        skip_comment,
        false,
        Some(max_tokens),
    );
    // If this lex_internal call reached the end of the input, there may now be fewer tokens
    // in the output than before.
    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
    let next_offset = state.output.last().map(|token| token.span.end);
    if let Some(next_offset) = next_offset {
        // Advance `input` and `span_offset` so the next call resumes right
        // after the last emitted token. Spans are absolute, so subtracting the
        // current span_offset converts back to an index into `state.input`.
        state.input = &state.input[next_offset - state.span_offset..];
        state.span_offset = next_offset;
    }
    tokens_n_diff
}
509
510pub fn lex(
511    input: &[u8],
512    span_offset: usize,
513    additional_whitespace: &[u8],
514    special_tokens: &[u8],
515    skip_comment: bool,
516) -> (Vec<Token>, Option<ParseError>) {
517    let mut state = LexState {
518        input,
519        output: Vec::new(),
520        error: None,
521        span_offset,
522    };
523    lex_internal(
524        &mut state,
525        additional_whitespace,
526        special_tokens,
527        skip_comment,
528        false,
529        None,
530    );
531    (state.output, state.error)
532}
533
534fn lex_internal(
535    state: &mut LexState,
536    additional_whitespace: &[u8],
537    special_tokens: &[u8],
538    skip_comment: bool,
539    // within signatures we want to treat `<` and `>` specially
540    in_signature: bool,
541    max_tokens: Option<usize>,
542) {
543    let initial_output_len = state.output.len();
544
545    let mut curr_offset = 0;
546
547    let mut is_complete = true;
548    while let Some(c) = state.input.get(curr_offset) {
549        if max_tokens
550            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
551        {
552            break;
553        }
554        let c = *c;
555        if c == b'|' {
556            // If the next character is `|`, it's either `|` or `||`.
557            let idx = curr_offset;
558            let prev_idx = idx;
559            curr_offset += 1;
560
561            // If the next character is `|`, we're looking at a `||`.
562            if let Some(c) = state.input.get(curr_offset)
563                && *c == b'|'
564            {
565                let idx = curr_offset;
566                curr_offset += 1;
567                state.output.push(Token::new(
568                    TokenContents::PipePipe,
569                    Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
570                ));
571                continue;
572            }
573
574            // Otherwise, it's just a regular `|` token.
575
576            // Before we push, check to see if the previous character was a newline.
577            // If so, then this is a continuation of the previous line
578            if let Some(prev) = state.output.last_mut() {
579                match prev.contents {
580                    TokenContents::Eol => {
581                        *prev = Token::new(
582                            TokenContents::Pipe,
583                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
584                        );
585                        // And this is a continuation of the previous line if previous line is a
586                        // comment line (combined with EOL + Comment)
587                        //
588                        // Initially, the last one token is TokenContents::Pipe, we don't need to
589                        // check it, so the beginning offset is 2.
590                        let mut offset = 2;
591                        while state.output.len() > offset {
592                            let index = state.output.len() - offset;
593                            if state.output[index].contents == TokenContents::Comment
594                                && state.output[index - 1].contents == TokenContents::Eol
595                            {
596                                state.output.remove(index - 1);
597                                offset += 1;
598                            } else {
599                                break;
600                            }
601                        }
602                    }
603                    _ => {
604                        state.output.push(Token::new(
605                            TokenContents::Pipe,
606                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
607                        ));
608                    }
609                }
610            } else {
611                state.output.push(Token::new(
612                    TokenContents::Pipe,
613                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
614                ));
615            }
616
617            is_complete = false;
618        } else if c == b';' {
619            // If the next character is a `;`, we're looking at a semicolon token.
620
621            if !is_complete && state.error.is_none() {
622                state.error = Some(ParseError::ExtraTokens(Span::new(
623                    curr_offset,
624                    curr_offset + 1,
625                )));
626            }
627            let idx = curr_offset;
628            curr_offset += 1;
629            state.output.push(Token::new(
630                TokenContents::Semicolon,
631                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
632            ));
633        } else if c == b'\r' {
634            // Ignore a stand-alone carriage return
635            curr_offset += 1;
636        } else if c == b'\n' {
637            // If the next character is a newline, we're looking at an EOL (end of line) token.
638            let idx = curr_offset;
639            curr_offset += 1;
640            if !additional_whitespace.contains(&c) {
641                state.output.push(Token::new(
642                    TokenContents::Eol,
643                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
644                ));
645            }
646        } else if c == b'#' {
647            // If the next character is `#`, we're at the beginning of a line
648            // comment. The comment continues until the next newline.
649            let mut start = curr_offset;
650
651            while let Some(input) = state.input.get(curr_offset) {
652                if *input == b'\n' {
653                    if !skip_comment {
654                        state.output.push(Token::new(
655                            TokenContents::Comment,
656                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
657                        ));
658                    }
659                    start = curr_offset;
660
661                    break;
662                } else {
663                    curr_offset += 1;
664                }
665            }
666            if start != curr_offset && !skip_comment {
667                state.output.push(Token::new(
668                    TokenContents::Comment,
669                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
670                ));
671            }
672        } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
673            // If the next character is non-newline whitespace, skip it.
674            curr_offset += 1;
675        } else {
676            let (token, err) = lex_item(
677                state.input,
678                &mut curr_offset,
679                state.span_offset,
680                additional_whitespace,
681                special_tokens,
682                in_signature,
683            );
684            if state.error.is_none() {
685                state.error = err;
686            }
687            is_complete = true;
688            state.output.push(token);
689        }
690    }
691}
692
/// True if this token is the start of a redirection. The append (`>>`) and
/// pipe (`>|`) forms are deliberately not matched here.
fn is_redirection(token: &[u8]) -> bool {
    // The full set of single-`>` redirection prefixes.
    const REDIRECTIONS: [&[u8]; 8] = [
        b"o>",
        b"out>",
        b"e>",
        b"err>",
        b"o+e>",
        b"e+o>",
        b"out+err>",
        b"err+out>",
    ];
    REDIRECTIONS.contains(&token)
}