1use nu_protocol::{ParseError, Span};
2
/// The kind of a lexed [`Token`].
///
/// Variant order is part of the type's observable layout (discriminants), so new variants
/// should be appended rather than inserted.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenContents {
    /// Any token that is not one of the special forms below (words, strings, blocks, ...).
    Item,
    /// A `#` comment, running up to (but not including) the next newline.
    Comment,
    /// `|`
    Pipe,
    /// `||`
    PipePipe,
    /// One of `=`, `+=`, `++=`, `-=`, `*=`, `/=`.
    AssignmentOperator,
    /// `err>|` or `e>|`
    ErrGreaterPipe,
    /// `out+err>|`, `err+out>|`, `o+e>|` or `e+o>|`
    OutErrGreaterPipe,
    /// `;`
    Semicolon,
    /// `out>` or `o>`
    OutGreaterThan,
    /// `out>>` or `o>>`
    OutGreaterGreaterThan,
    /// `err>` or `e>`
    ErrGreaterThan,
    /// `err>>` or `e>>`
    ErrGreaterGreaterThan,
    /// `out+err>`, `err+out>`, `o+e>` or `e+o>`
    OutErrGreaterThan,
    /// `out+err>>`, `err+out>>`, `o+e>>` or `e+o>>`
    OutErrGreaterGreaterThan,
    /// A line feed (`\n`).
    Eol,
}
21
22#[derive(Debug, PartialEq, Eq)]
23pub struct Token {
24 pub contents: TokenContents,
25 pub span: Span,
26}
27
28impl Token {
29 pub fn new(contents: TokenContents, span: Span) -> Token {
30 Token { contents, span }
31 }
32}
33
/// The kinds of bracketed region the lexer tracks while scanning an item.
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
    /// `( ... )`
    Paren,
    /// `{ ... }`
    CurlyBracket,
    /// `[ ... ]`
    SquareBracket,
    /// `< ... >`
    AngleBracket,
}

impl BlockKind {
    /// Returns the byte that closes a block of this kind.
    fn closing(self) -> u8 {
        match self {
            Self::Paren => b')',
            Self::CurlyBracket => b'}',
            Self::SquareBracket => b']',
            Self::AngleBracket => b'>',
        }
    }
}
52
53fn is_item_terminator(
57 block_level: &[BlockKind],
58 c: u8,
59 additional_whitespace: &[u8],
60 special_tokens: &[u8],
61) -> bool {
62 block_level.is_empty()
63 && (c == b' '
64 || c == b'\t'
65 || c == b'\n'
66 || c == b'\r'
67 || c == b'|'
68 || c == b';'
69 || additional_whitespace.contains(&c)
70 || special_tokens.contains(&c))
71}
72
/// Reports whether `bytes` is exactly one of the assignment operators
/// `=`, `+=`, `++=`, `-=`, `*=` or `/=`.
pub fn is_assignment_operator(bytes: &[u8]) -> bool {
    // Every assignment operator is an optional arithmetic prefix followed by `=`.
    match bytes.split_last() {
        Some((&b'=', prefix)) => matches!(
            prefix,
            [] | [b'+'] | [b'+', b'+'] | [b'-'] | [b'*'] | [b'/']
        ),
        _ => false,
    }
}
78
79fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool {
84 block_level.is_empty() && special_tokens.contains(&c)
85}
86
87pub fn lex_item(
88 input: &[u8],
89 curr_offset: &mut usize,
90 span_offset: usize,
91 additional_whitespace: &[u8],
92 special_tokens: &[u8],
93 in_signature: bool,
94) -> (Token, Option<ParseError>) {
95 let mut quote_start: Option<u8> = None;
99
100 let mut in_comment = false;
101
102 let token_start = *curr_offset;
103
104 let mut block_level: Vec<BlockKind> = vec![];
106
107 let mut previous_char = None;
119 while let Some(c) = input.get(*curr_offset) {
120 let c = *c;
121
122 if let Some(start) = quote_start {
123 if c == b'\\' && start == b'"' {
125 if input.get(*curr_offset + 1).is_some() {
127 *curr_offset += 2;
129 continue;
130 } else {
131 let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
132
133 return (
134 Token {
135 contents: TokenContents::Item,
136 span,
137 },
138 Some(ParseError::UnexpectedEof(
139 (start as char).to_string(),
140 Span::new(span.end - 1, span.end),
141 )),
142 );
143 }
144 }
145 if c == start {
148 quote_start = None;
150 }
151 } else if c == b'#' && !in_comment {
152 in_comment = previous_char
154 .map(char::from)
155 .map(char::is_whitespace)
156 .unwrap_or(true);
157 } else if c == b'\n' || c == b'\r' {
158 in_comment = false;
159 if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
160 break;
161 }
162 } else if in_comment {
163 if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
164 break;
165 }
166 } else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset {
167 *curr_offset += 1;
168 break;
169 } else if c == b'\'' || c == b'"' || c == b'`' {
170 quote_start = Some(c);
172 } else if c == b'[' {
173 block_level.push(BlockKind::SquareBracket);
175 } else if c == b'<' && in_signature {
176 block_level.push(BlockKind::AngleBracket);
177 } else if c == b'>' && in_signature {
178 if let Some(BlockKind::AngleBracket) = block_level.last() {
179 let _ = block_level.pop();
180 }
181 } else if c == b']' {
182 if let Some(BlockKind::SquareBracket) = block_level.last() {
185 let _ = block_level.pop();
186 }
187 } else if c == b'{' {
188 block_level.push(BlockKind::CurlyBracket);
190 } else if c == b'}' {
191 if let Some(BlockKind::CurlyBracket) = block_level.last() {
193 let _ = block_level.pop();
194 } else {
195 *curr_offset += 1;
198 let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
199
200 return (
201 Token {
202 contents: TokenContents::Item,
203 span,
204 },
205 Some(ParseError::Unbalanced(
206 "{",
207 "}",
208 Span::new(span.end - 1, span.end),
209 )),
210 );
211 }
212 } else if c == b'(' {
213 block_level.push(BlockKind::Paren);
215 } else if c == b')' {
216 if let Some(BlockKind::Paren) = block_level.last() {
218 let _ = block_level.pop();
219 } else {
220 *curr_offset += 1;
223 let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
224
225 return (
226 Token {
227 contents: TokenContents::Item,
228 span,
229 },
230 Some(ParseError::Unbalanced(
231 "(",
232 ")",
233 Span::new(span.end - 1, span.end),
234 )),
235 );
236 }
237 } else if c == b'r' && input.get(*curr_offset + 1) == Some(b'#').as_ref() {
238 let lex_result = lex_raw_string(input, curr_offset, span_offset);
240 let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
241 if let Err(e) = lex_result {
242 return (
243 Token {
244 contents: TokenContents::Item,
245 span,
246 },
247 Some(e),
248 );
249 }
250 } else if c == b'|' && is_redirection(&input[token_start..*curr_offset]) {
251 *curr_offset += 1;
253 break;
254 } else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
255 break;
256 }
257
258 *curr_offset += 1;
259 previous_char = Some(c);
260 }
261
262 let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
263
264 if let Some(delim) = quote_start {
265 return (
269 Token {
270 contents: TokenContents::Item,
271 span,
272 },
273 Some(ParseError::UnexpectedEof(
274 (delim as char).to_string(),
275 Span::new(span.end - 1, span.end),
276 )),
277 );
278 }
279
280 if let Some(block) = block_level.last() {
282 let delim = block.closing();
283 let cause = ParseError::UnexpectedEof(
284 (delim as char).to_string(),
285 Span::new(span.end - 1, span.end),
286 );
287
288 return (
289 Token {
290 contents: TokenContents::Item,
291 span,
292 },
293 Some(cause),
294 );
295 }
296
297 if *curr_offset - token_start == 0 {
299 return (
300 Token {
301 contents: TokenContents::Item,
302 span,
303 },
304 Some(ParseError::UnexpectedEof("command".to_string(), span)),
305 );
306 }
307
308 let mut err = None;
309 let output = match &input[(span.start - span_offset)..(span.end - span_offset)] {
310 bytes if is_assignment_operator(bytes) => Token {
311 contents: TokenContents::AssignmentOperator,
312 span,
313 },
314 b"out>" | b"o>" => Token {
315 contents: TokenContents::OutGreaterThan,
316 span,
317 },
318 b"out>>" | b"o>>" => Token {
319 contents: TokenContents::OutGreaterGreaterThan,
320 span,
321 },
322 b"out>|" | b"o>|" => {
323 err = Some(ParseError::Expected(
324 "`|`. Redirecting stdout to a pipe is the same as normal piping.",
325 span,
326 ));
327 Token {
328 contents: TokenContents::Pipe,
330 span,
331 }
332 }
333 b"err>" | b"e>" => Token {
334 contents: TokenContents::ErrGreaterThan,
335 span,
336 },
337 b"err>>" | b"e>>" => Token {
338 contents: TokenContents::ErrGreaterGreaterThan,
339 span,
340 },
341 b"err>|" | b"e>|" => Token {
342 contents: TokenContents::ErrGreaterPipe,
343 span,
344 },
345 b"out+err>" | b"err+out>" | b"o+e>" | b"e+o>" => Token {
346 contents: TokenContents::OutErrGreaterThan,
347 span,
348 },
349 b"out+err>>" | b"err+out>>" | b"o+e>>" | b"e+o>>" => Token {
350 contents: TokenContents::OutErrGreaterGreaterThan,
351 span,
352 },
353 b"out+err>|" | b"err+out>|" | b"o+e>|" | b"e+o>|" => Token {
354 contents: TokenContents::OutErrGreaterPipe,
355 span,
356 },
357 b"&&" => {
358 err = Some(ParseError::ShellAndAnd(span));
359 Token {
360 contents: TokenContents::Pipe,
362 span,
363 }
364 }
365 b"2>" => {
366 err = Some(ParseError::ShellErrRedirect(span));
367 Token {
368 contents: TokenContents::ErrGreaterThan,
370 span,
371 }
372 }
373 b"2>&1" => {
374 err = Some(ParseError::ShellOutErrRedirect(span));
375 Token {
376 contents: TokenContents::Pipe,
378 span,
379 }
380 }
381 _ => Token {
382 contents: TokenContents::Item,
383 span,
384 },
385 };
386 (output, err)
387}
388
389fn lex_raw_string(
390 input: &[u8],
391 curr_offset: &mut usize,
392 span_offset: usize,
393) -> Result<(), ParseError> {
394 let mut prefix_sharp_cnt = 0;
404 let start = *curr_offset;
405 while let Some(b'#') = input.get(start + prefix_sharp_cnt + 1) {
406 prefix_sharp_cnt += 1;
407 }
408
409 *curr_offset += prefix_sharp_cnt + 1;
417 if input.get(*curr_offset) != Some(&b'\'') {
419 return Err(ParseError::Expected(
420 "'",
421 Span::new(span_offset + *curr_offset, span_offset + *curr_offset + 1),
422 ));
423 }
424
425 *curr_offset += 1;
426 let mut matches = false;
427 while let Some(ch) = input.get(*curr_offset) {
428 if *ch == b'#' {
430 let start_ch = input[*curr_offset - prefix_sharp_cnt];
431 let postfix = &input[*curr_offset - prefix_sharp_cnt + 1..=*curr_offset];
432 if start_ch == b'\'' && postfix.iter().all(|x| *x == b'#') {
433 matches = true;
434 break;
435 }
436 }
437 *curr_offset += 1
438 }
439 if !matches {
440 let mut expected = '\''.to_string();
441 expected.push_str(&"#".repeat(prefix_sharp_cnt));
442 return Err(ParseError::UnexpectedEof(
443 expected,
444 Span::new(span_offset + *curr_offset - 1, span_offset + *curr_offset),
445 ));
446 }
447 Ok(())
448}
449
450pub fn lex_signature(
451 input: &[u8],
452 span_offset: usize,
453 additional_whitespace: &[u8],
454 special_tokens: &[u8],
455 skip_comment: bool,
456) -> (Vec<Token>, Option<ParseError>) {
457 let mut state = LexState {
458 input,
459 output: Vec::new(),
460 error: None,
461 span_offset,
462 };
463 lex_internal(
464 &mut state,
465 additional_whitespace,
466 special_tokens,
467 skip_comment,
468 true,
469 None,
470 );
471 (state.output, state.error)
472}
473
474#[derive(Debug)]
475pub struct LexState<'a> {
476 pub input: &'a [u8],
477 pub output: Vec<Token>,
478 pub error: Option<ParseError>,
479 pub span_offset: usize,
480}
481
482pub fn lex_n_tokens(
488 state: &mut LexState,
489 additional_whitespace: &[u8],
490 special_tokens: &[u8],
491 skip_comment: bool,
492 max_tokens: usize,
493) -> isize {
494 let n_tokens = state.output.len();
495 lex_internal(
496 state,
497 additional_whitespace,
498 special_tokens,
499 skip_comment,
500 false,
501 Some(max_tokens),
502 );
503 let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
506 let next_offset = state.output.last().map(|token| token.span.end);
507 if let Some(next_offset) = next_offset {
508 state.input = &state.input[next_offset - state.span_offset..];
509 state.span_offset = next_offset;
510 }
511 tokens_n_diff
512}
513
514pub fn lex(
515 input: &[u8],
516 span_offset: usize,
517 additional_whitespace: &[u8],
518 special_tokens: &[u8],
519 skip_comment: bool,
520) -> (Vec<Token>, Option<ParseError>) {
521 let mut state = LexState {
522 input,
523 output: Vec::new(),
524 error: None,
525 span_offset,
526 };
527 lex_internal(
528 &mut state,
529 additional_whitespace,
530 special_tokens,
531 skip_comment,
532 false,
533 None,
534 );
535 (state.output, state.error)
536}
537
538fn lex_internal(
539 state: &mut LexState,
540 additional_whitespace: &[u8],
541 special_tokens: &[u8],
542 skip_comment: bool,
543 in_signature: bool,
545 max_tokens: Option<usize>,
546) {
547 let initial_output_len = state.output.len();
548
549 let mut curr_offset = 0;
550
551 let mut is_complete = true;
552 while let Some(c) = state.input.get(curr_offset) {
553 if max_tokens
554 .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
555 {
556 break;
557 }
558 let c = *c;
559 if c == b'|' {
560 let idx = curr_offset;
562 let prev_idx = idx;
563 curr_offset += 1;
564
565 if let Some(c) = state.input.get(curr_offset)
567 && *c == b'|'
568 {
569 let idx = curr_offset;
570 curr_offset += 1;
571 state.output.push(Token::new(
572 TokenContents::PipePipe,
573 Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
574 ));
575 continue;
576 }
577
578 if let Some(prev) = state.output.last_mut() {
583 match prev.contents {
584 TokenContents::Eol => {
585 *prev = Token::new(
586 TokenContents::Pipe,
587 Span::new(state.span_offset + idx, state.span_offset + idx + 1),
588 );
589 let mut offset = 2;
595 while state.output.len() > offset {
596 let index = state.output.len() - offset;
597 if state.output[index].contents == TokenContents::Comment
598 && state.output[index - 1].contents == TokenContents::Eol
599 {
600 state.output.remove(index - 1);
601 offset += 1;
602 } else {
603 break;
604 }
605 }
606 }
607 _ => {
608 state.output.push(Token::new(
609 TokenContents::Pipe,
610 Span::new(state.span_offset + idx, state.span_offset + idx + 1),
611 ));
612 }
613 }
614 } else {
615 state.output.push(Token::new(
616 TokenContents::Pipe,
617 Span::new(state.span_offset + idx, state.span_offset + idx + 1),
618 ));
619 }
620
621 is_complete = false;
622 } else if c == b';' {
623 if !is_complete && state.error.is_none() {
626 state.error = Some(ParseError::ExtraTokens(Span::new(
627 curr_offset,
628 curr_offset + 1,
629 )));
630 }
631 let idx = curr_offset;
632 curr_offset += 1;
633 state.output.push(Token::new(
634 TokenContents::Semicolon,
635 Span::new(state.span_offset + idx, state.span_offset + idx + 1),
636 ));
637 } else if c == b'\r' {
638 curr_offset += 1;
640 } else if c == b'\n' {
641 let idx = curr_offset;
643 curr_offset += 1;
644 if !additional_whitespace.contains(&c) {
645 state.output.push(Token::new(
646 TokenContents::Eol,
647 Span::new(state.span_offset + idx, state.span_offset + idx + 1),
648 ));
649 }
650 } else if c == b'#' {
651 let mut start = curr_offset;
654
655 while let Some(input) = state.input.get(curr_offset) {
656 if *input == b'\n' {
657 if !skip_comment {
658 state.output.push(Token::new(
659 TokenContents::Comment,
660 Span::new(state.span_offset + start, state.span_offset + curr_offset),
661 ));
662 }
663 start = curr_offset;
664
665 break;
666 } else {
667 curr_offset += 1;
668 }
669 }
670 if start != curr_offset && !skip_comment {
671 state.output.push(Token::new(
672 TokenContents::Comment,
673 Span::new(state.span_offset + start, state.span_offset + curr_offset),
674 ));
675 }
676 } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
677 curr_offset += 1;
679 } else {
680 let (token, err) = lex_item(
681 state.input,
682 &mut curr_offset,
683 state.span_offset,
684 additional_whitespace,
685 special_tokens,
686 in_signature,
687 );
688 if state.error.is_none() {
689 state.error = err;
690 }
691 is_complete = true;
692 state.output.push(token);
693 }
694 }
695}
696
/// Reports whether `token` is exactly one of the single-`>` redirection operators:
/// `o>`, `out>`, `e>`, `err>`, `o+e>`, `e+o>`, `out+err>` or `err+out>`.
fn is_redirection(token: &[u8]) -> bool {
    // Every redirection operator is a stream name followed by `>`.
    match token.split_last() {
        Some((&b'>', stream)) => matches!(
            stream,
            b"o" | b"out" | b"e" | b"err" | b"o+e" | b"e+o" | b"out+err" | b"err+out"
        ),
        _ => false,
    }
}
703}