1use nu_protocol::{ParseError, Span};
2
/// The category of a lexed token.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum TokenContents {
    /// A generic item: bare word, quoted string, or bracketed expression.
    Item,
    /// A `#`-to-end-of-line comment.
    Comment,
    /// `|`
    Pipe,
    /// `||`
    PipePipe,
    /// An assignment operator such as `=` or `+=` (see `is_assignment_operator`).
    AssignmentOperator,
    /// `e>|` / `err>|`: pipe stderr into the next command.
    ErrGreaterPipe,
    /// `o+e>|` (and spelled-out variants): pipe stdout and stderr.
    OutErrGreaterPipe,
    /// `;`
    Semicolon,
    /// `o>` / `out>`: redirect stdout.
    OutGreaterThan,
    /// `o>>` / `out>>`: append stdout.
    OutGreaterGreaterThan,
    /// `e>` / `err>`: redirect stderr.
    ErrGreaterThan,
    /// `e>>` / `err>>`: append stderr.
    ErrGreaterGreaterThan,
    /// `o+e>` (and variants): redirect stdout and stderr.
    OutErrGreaterThan,
    /// `o+e>>` (and variants): append stdout and stderr.
    OutErrGreaterGreaterThan,
    /// End of line (`\n`).
    Eol,
}
21
/// A single lexed token: its category plus the source span it covers.
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
    pub contents: TokenContents,
    pub span: Span,
}
27
28impl Token {
29 pub fn new(contents: TokenContents, span: Span) -> Token {
30 Token { contents, span }
31 }
32}
33
/// The kind of bracket currently open while lexing an item.
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
    /// `(` ... `)`
    Paren,
    /// `{` ... `}`
    CurlyBracket,
    /// `[` ... `]`
    SquareBracket,
    /// `<` ... `>` — only tracked in signature mode (see `lex_signature`).
    AngleBracket,
}
41
42impl BlockKind {
43 fn closing(self) -> u8 {
44 match self {
45 BlockKind::Paren => b')',
46 BlockKind::SquareBracket => b']',
47 BlockKind::CurlyBracket => b'}',
48 BlockKind::AngleBracket => b'>',
49 }
50 }
51}
52
53fn is_item_terminator(
57 block_level: &[BlockKind],
58 c: u8,
59 additional_whitespace: &[u8],
60 special_tokens: &[u8],
61) -> bool {
62 block_level.is_empty()
63 && (c == b' '
64 || c == b'\t'
65 || c == b'\n'
66 || c == b'\r'
67 || c == b'|'
68 || c == b';'
69 || additional_whitespace.contains(&c)
70 || special_tokens.contains(&c))
71}
72
/// Returns `true` if `bytes` spell one of the assignment operators:
/// `=`, `+=`, `++=`, `-=`, `*=`, or `/=`.
pub fn is_assignment_operator(bytes: &[u8]) -> bool {
    const OPERATORS: [&[u8]; 6] = [b"=", b"+=", b"++=", b"-=", b"*=", b"/="];
    OPERATORS.iter().any(|op| *op == bytes)
}
78
79fn is_special_item(block_level: &[BlockKind], c: u8, special_tokens: &[u8]) -> bool {
84 block_level.is_empty() && special_tokens.contains(&c)
85}
86
/// Lex a single item token starting at `*curr_offset`, advancing the offset
/// past it. An item is a bare word, quoted string, raw string (`r#'...'#`),
/// or bracketed expression; after scanning, the raw bytes are classified so
/// assignment and redirection operators get dedicated token kinds.
///
/// `span_offset` is added to all spans so they point into the overall source.
/// When `in_signature` is set, `<`/`>` are treated as brackets.
///
/// Returns the token plus an optional error: unbalanced `}`/`)`, an unclosed
/// quote or bracket at EOF, or shell syntax (`&&`, `2>`, `2>&1`, `o>|`) that
/// is rejected with a guidance message.
pub fn lex_item(
    input: &[u8],
    curr_offset: &mut usize,
    span_offset: usize,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    in_signature: bool,
) -> (Token, Option<ParseError>) {
    // The byte that opened the current quote (`'`, `"`, or backtick), if any.
    let mut quote_start: Option<u8> = None;

    let mut in_comment = false;

    let token_start = *curr_offset;

    // Stack of currently-open brackets; terminators only apply at top level.
    let mut block_level: Vec<BlockKind> = vec![];

    let mut previous_char = None;
    while let Some(c) = input.get(*curr_offset) {
        let c = *c;

        if let Some(start) = quote_start {
            // Inside double quotes, a backslash escapes the following byte.
            if c == b'\\' && start == b'"' {
                if input.get(*curr_offset + 1).is_some() {
                    // Skip the backslash and the escaped byte together.
                    *curr_offset += 2;
                    continue;
                } else {
                    // Backslash at EOF: the string can never be closed.
                    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                    return (
                        Token {
                            contents: TokenContents::Item,
                            span,
                        },
                        Some(ParseError::UnexpectedEof(
                            (start as char).to_string(),
                            Span::new(span.end - 1, span.end),
                        )),
                    );
                }
            }
            // A matching quote byte closes the quoted region.
            if c == start {
                quote_start = None;
            }
        } else if c == b'#' && !in_comment {
            // `#` begins a comment only at token start or after whitespace,
            // so `foo#bar` remains a single item.
            in_comment = previous_char
                .map(char::from)
                .map(char::is_whitespace)
                .unwrap_or(true);
        } else if c == b'\n' || c == b'\r' {
            // Newline always ends an in-item comment.
            in_comment = false;
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if in_comment {
            if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
                break;
            }
        } else if is_special_item(&block_level, c, special_tokens) && token_start == *curr_offset {
            // A special byte is a one-byte token, but only on its own
            // (i.e. at the very start of the item).
            *curr_offset += 1;
            break;
        } else if c == b'\'' || c == b'"' || c == b'`' {
            quote_start = Some(c);
        } else if c == b'[' {
            block_level.push(BlockKind::SquareBracket);
        } else if c == b'<' && in_signature {
            // Angle brackets are only structural inside signatures
            // (e.g. type annotations like `list<int>`).
            block_level.push(BlockKind::AngleBracket);
        } else if c == b'>' && in_signature {
            if let Some(BlockKind::AngleBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b']' {
            // A stray `]` with no matching `[` is silently ignored here
            // (unlike `}` / `)` below, which error).
            if let Some(BlockKind::SquareBracket) = block_level.last() {
                let _ = block_level.pop();
            }
        } else if c == b'{' {
            block_level.push(BlockKind::CurlyBracket);
        } else if c == b'}' {
            if let Some(BlockKind::CurlyBracket) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // `}` with no open `{`: report the unbalanced pair.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "{".to_string(),
                        "}".to_string(),
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'(' {
            block_level.push(BlockKind::Paren);
        } else if c == b')' {
            if let Some(BlockKind::Paren) = block_level.last() {
                let _ = block_level.pop();
            } else {
                // `)` with no open `(`: report the unbalanced pair.
                *curr_offset += 1;
                let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(ParseError::Unbalanced(
                        "(".to_string(),
                        ")".to_string(),
                        Span::new(span.end - 1, span.end),
                    )),
                );
            }
        } else if c == b'r' && input.get(*curr_offset + 1) == Some(b'#').as_ref() {
            // `r#` begins a raw string; consume it wholesale.
            let lex_result = lex_raw_string(input, curr_offset, span_offset);
            let span = Span::new(span_offset + token_start, span_offset + *curr_offset);
            if let Err(e) = lex_result {
                return (
                    Token {
                        contents: TokenContents::Item,
                        span,
                    },
                    Some(e),
                );
            }
        } else if c == b'|' && is_redirection(&input[token_start..*curr_offset]) {
            // e.g. `err>|`: the `|` belongs to the redirection token itself.
            *curr_offset += 1;
            break;
        } else if is_item_terminator(&block_level, c, additional_whitespace, special_tokens) {
            break;
        }

        *curr_offset += 1;
        previous_char = Some(c);
    }

    let span = Span::new(span_offset + token_start, span_offset + *curr_offset);

    // Reached end of input while still inside a quote.
    if let Some(delim) = quote_start {
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof(
                (delim as char).to_string(),
                Span::new(span.end - 1, span.end),
            )),
        );
    }

    // Reached end of input with at least one unclosed bracket.
    if let Some(block) = block_level.last() {
        let delim = block.closing();
        let cause = ParseError::UnexpectedEof(
            (delim as char).to_string(),
            Span::new(span.end - 1, span.end),
        );

        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(cause),
        );
    }

    // No bytes were consumed at all.
    if *curr_offset - token_start == 0 {
        return (
            Token {
                contents: TokenContents::Item,
                span,
            },
            Some(ParseError::UnexpectedEof("command".to_string(), span)),
        );
    }

    // Classify the raw bytes: assignment/redirection operators get dedicated
    // token kinds; Bash-isms produce guidance errors alongside an Item token.
    let mut err = None;
    let output = match &input[(span.start - span_offset)..(span.end - span_offset)] {
        bytes if is_assignment_operator(bytes) => Token {
            contents: TokenContents::AssignmentOperator,
            span,
        },
        b"out>" | b"o>" => Token {
            contents: TokenContents::OutGreaterThan,
            span,
        },
        b"out>>" | b"o>>" => Token {
            contents: TokenContents::OutGreaterGreaterThan,
            span,
        },
        b"out>|" | b"o>|" => {
            // Redirecting stdout to a pipe is just a pipe; reject with a hint.
            err = Some(ParseError::Expected(
                "`|`. Redirecting stdout to a pipe is the same as normal piping.",
                span,
            ));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"err>" | b"e>" => Token {
            contents: TokenContents::ErrGreaterThan,
            span,
        },
        b"err>>" | b"e>>" => Token {
            contents: TokenContents::ErrGreaterGreaterThan,
            span,
        },
        b"err>|" | b"e>|" => Token {
            contents: TokenContents::ErrGreaterPipe,
            span,
        },
        b"out+err>" | b"err+out>" | b"o+e>" | b"e+o>" => Token {
            contents: TokenContents::OutErrGreaterThan,
            span,
        },
        b"out+err>>" | b"err+out>>" | b"o+e>>" | b"e+o>>" => Token {
            contents: TokenContents::OutErrGreaterGreaterThan,
            span,
        },
        b"out+err>|" | b"err+out>|" | b"o+e>|" | b"e+o>|" => Token {
            contents: TokenContents::OutErrGreaterPipe,
            span,
        },
        b"&&" => {
            err = Some(ParseError::ShellAndAnd(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"2>" => {
            err = Some(ParseError::ShellErrRedirect(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        b"2>&1" => {
            err = Some(ParseError::ShellOutErrRedirect(span));
            Token {
                contents: TokenContents::Item,
                span,
            }
        }
        _ => Token {
            contents: TokenContents::Item,
            span,
        },
    };
    (output, err)
}
384
/// Advance past a raw string literal of the form `r#'...'#` (with one or more
/// matching `#`s). On entry `*curr_offset` points at the leading `r`; on
/// success it is left on the final closing `#`.
fn lex_raw_string(
    input: &[u8],
    curr_offset: &mut usize,
    span_offset: usize,
) -> Result<(), ParseError> {
    // Count the `#`s after the `r` — the closer must repeat the same count.
    let mut prefix_sharp_cnt = 0;
    let start = *curr_offset;
    while let Some(b'#') = input.get(start + prefix_sharp_cnt + 1) {
        prefix_sharp_cnt += 1;
    }

    // Skip `r` plus the `#`s; an opening `'` must follow.
    *curr_offset += prefix_sharp_cnt + 1;
    if input.get(*curr_offset) != Some(&b'\'') {
        return Err(ParseError::Expected(
            "'",
            Span::new(span_offset + *curr_offset, span_offset + *curr_offset + 1),
        ));
    }

    *curr_offset += 1;
    let mut matches = false;
    while let Some(ch) = input.get(*curr_offset) {
        if *ch == b'#' {
            // Candidate terminator: the `prefix_sharp_cnt` bytes ending here
            // must all be `#`, immediately preceded by `'`.
            let start_ch = input[*curr_offset - prefix_sharp_cnt];
            let postfix = &input[*curr_offset - prefix_sharp_cnt + 1..=*curr_offset];
            if start_ch == b'\'' && postfix.iter().all(|x| *x == b'#') {
                matches = true;
                break;
            }
        }
        *curr_offset += 1
    }
    if !matches {
        // Ran off the end of input without seeing `'` + `#` * prefix_sharp_cnt.
        let mut expected = '\''.to_string();
        expected.push_str(&"#".repeat(prefix_sharp_cnt));
        return Err(ParseError::UnexpectedEof(
            expected,
            Span::new(span_offset + *curr_offset - 1, span_offset + *curr_offset),
        ));
    }
    Ok(())
}
445
446pub fn lex_signature(
447 input: &[u8],
448 span_offset: usize,
449 additional_whitespace: &[u8],
450 special_tokens: &[u8],
451 skip_comment: bool,
452) -> (Vec<Token>, Option<ParseError>) {
453 let mut state = LexState {
454 input,
455 output: Vec::new(),
456 error: None,
457 span_offset,
458 };
459 lex_internal(
460 &mut state,
461 additional_whitespace,
462 special_tokens,
463 skip_comment,
464 true,
465 None,
466 );
467 (state.output, state.error)
468}
469
/// Mutable lexer state threaded through `lex_internal`, enabling incremental
/// lexing via `lex_n_tokens`.
#[derive(Debug)]
pub struct LexState<'a> {
    // Remaining, not-yet-lexed input bytes.
    pub input: &'a [u8],
    // Tokens produced so far.
    pub output: Vec<Token>,
    // First error encountered, if any (later errors are discarded).
    pub error: Option<ParseError>,
    // Offset of `input[0]` within the overall source, for span bookkeeping.
    pub span_offset: usize,
}
477
/// Lex at most `max_tokens` additional tokens, appending them to
/// `state.output`, then advance `state.input` / `state.span_offset` past the
/// last token produced so a subsequent call resumes where this one stopped.
/// Returns the number of tokens added by this call.
pub fn lex_n_tokens(
    state: &mut LexState,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    skip_comment: bool,
    max_tokens: usize,
) -> isize {
    let n_tokens = state.output.len();
    lex_internal(
        state,
        additional_whitespace,
        special_tokens,
        skip_comment,
        false,
        Some(max_tokens),
    );
    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
    // Re-anchor the remaining input at the end of the last emitted token;
    // the slice must happen before span_offset is overwritten.
    let next_offset = state.output.last().map(|token| token.span.end);
    if let Some(next_offset) = next_offset {
        state.input = &state.input[next_offset - state.span_offset..];
        state.span_offset = next_offset;
    }
    tokens_n_diff
}
509
510pub fn lex(
511 input: &[u8],
512 span_offset: usize,
513 additional_whitespace: &[u8],
514 special_tokens: &[u8],
515 skip_comment: bool,
516) -> (Vec<Token>, Option<ParseError>) {
517 let mut state = LexState {
518 input,
519 output: Vec::new(),
520 error: None,
521 span_offset,
522 };
523 lex_internal(
524 &mut state,
525 additional_whitespace,
526 special_tokens,
527 skip_comment,
528 false,
529 None,
530 );
531 (state.output, state.error)
532}
533
/// Core lexer loop: dispatch on the next byte, producing pipe, semicolon,
/// EOL, comment, or item tokens until input (or the `max_tokens` budget) is
/// exhausted. Errors are recorded in `state.error` (first one wins) rather
/// than aborting the scan.
fn lex_internal(
    state: &mut LexState,
    additional_whitespace: &[u8],
    special_tokens: &[u8],
    skip_comment: bool,
    in_signature: bool,
    max_tokens: Option<usize>,
) {
    let initial_output_len = state.output.len();

    let mut curr_offset = 0;

    // True once an item has been seen since the last `|`; used to flag a `;`
    // that directly follows a pipe.
    let mut is_complete = true;
    while let Some(c) = state.input.get(curr_offset) {
        // Stop once this call has produced `max_tokens` new tokens.
        if max_tokens
            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
        {
            break;
        }
        let c = *c;
        if c == b'|' {
            let idx = curr_offset;
            let prev_idx = idx;
            curr_offset += 1;

            // Two `|`s in a row form a single `||` token.
            if let Some(c) = state.input.get(curr_offset)
                && *c == b'|'
            {
                let idx = curr_offset;
                curr_offset += 1;
                state.output.push(Token::new(
                    TokenContents::PipePipe,
                    Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
                ));
                continue;
            }

            if let Some(prev) = state.output.last_mut() {
                match prev.contents {
                    // A `|` at the start of a line continues the previous
                    // pipeline: replace the Eol token with the Pipe...
                    TokenContents::Eol => {
                        *prev = Token::new(
                            TokenContents::Pipe,
                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                        );
                        // ...and drop the Eol preceding each comment line
                        // above, so comment-only lines between pipeline
                        // stages don't break the pipeline.
                        let mut offset = 2;
                        while state.output.len() > offset {
                            let index = state.output.len() - offset;
                            if state.output[index].contents == TokenContents::Comment
                                && state.output[index - 1].contents == TokenContents::Eol
                            {
                                state.output.remove(index - 1);
                                offset += 1;
                            } else {
                                break;
                            }
                        }
                    }
                    _ => {
                        state.output.push(Token::new(
                            TokenContents::Pipe,
                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                        ));
                    }
                }
            } else {
                state.output.push(Token::new(
                    TokenContents::Pipe,
                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                ));
            }

            is_complete = false;
        } else if c == b';' {
            // A `;` directly after a `|` has nothing to terminate.
            if !is_complete && state.error.is_none() {
                // NOTE(review): unlike every other span built in this
                // function, this one omits `state.span_offset` — looks like a
                // bug when lexing a sub-slice; confirm intended.
                state.error = Some(ParseError::ExtraTokens(Span::new(
                    curr_offset,
                    curr_offset + 1,
                )));
            }
            let idx = curr_offset;
            curr_offset += 1;
            state.output.push(Token::new(
                TokenContents::Semicolon,
                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
            ));
        } else if c == b'\r' {
            // Carriage returns are skipped; only `\n` produces Eol.
            curr_offset += 1;
        } else if c == b'\n' {
            let idx = curr_offset;
            curr_offset += 1;
            // Suppress Eol when the caller treats newline as plain whitespace.
            if !additional_whitespace.contains(&c) {
                state.output.push(Token::new(
                    TokenContents::Eol,
                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                ));
            }
        } else if c == b'#' {
            // A comment runs to (but not including) the next newline.
            let mut start = curr_offset;

            while let Some(input) = state.input.get(curr_offset) {
                if *input == b'\n' {
                    if !skip_comment {
                        state.output.push(Token::new(
                            TokenContents::Comment,
                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
                        ));
                    }
                    start = curr_offset;

                    break;
                } else {
                    curr_offset += 1;
                }
            }
            // Comment ran to EOF without a trailing newline.
            if start != curr_offset && !skip_comment {
                state.output.push(Token::new(
                    TokenContents::Comment,
                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
                ));
            }
        } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
            // Plain whitespace between tokens.
            curr_offset += 1;
        } else {
            // Anything else begins an item token.
            let (token, err) = lex_item(
                state.input,
                &mut curr_offset,
                state.span_offset,
                additional_whitespace,
                special_tokens,
                in_signature,
            );
            if state.error.is_none() {
                state.error = err;
            }
            is_complete = true;
            state.output.push(token);
        }
    }
}
692
/// Returns `true` if `token` is a stdout/stderr redirection operator that may
/// be immediately followed by `|` to form a redirect-to-pipe (e.g. `err>|`).
fn is_redirection(token: &[u8]) -> bool {
    const REDIRECTIONS: [&[u8]; 8] = [
        b"o>", b"out>", b"e>", b"err>", b"o+e>", b"e+o>", b"out+err>", b"err+out>",
    ];
    REDIRECTIONS.contains(&token)
}