1use std::convert::TryInto;
2use std::io::Write;
3
4use crate::maybe_byte::*;
5use crate::source::buffer::*;
6use crate::str_term::{str_types::*, StrTerm};
7use crate::TokenBuf;
8use crate::{lex_states::*, DiagnosticMessage};
9use crate::{lexer::*, str_term::StringLiteral};
10
/// Flag passed to `Lexer::read_escape` while decoding the target of a `\C-` / `\c` escape;
/// a second control escape seen with this flag set is rejected.
const ESCAPE_CONTROL: usize = 1;
/// Flag passed to `Lexer::read_escape` while decoding the target of a `\M-` escape;
/// a second meta escape seen with this flag set is rejected.
const ESCAPE_META: usize = 2;
13
14impl Lexer {
15 fn take_strterm(&mut self) -> StringLiteral {
16 match self.strterm.take().map(|v| *v) {
17 Some(StrTerm::StringLiteral(s)) => s,
18 _ => unreachable!("strterm must be string"),
19 }
20 }
21 fn restore_strterm(&mut self, literal: StringLiteral) {
22 self.strterm = Some(Box::new(StrTerm::StringLiteral(literal)));
23 }
24
25 pub(crate) fn parse_string(&mut self) -> i32 {
26 let mut quote = self.take_strterm();
27
28 let func = quote.func;
29 let term = quote.term;
30 let paren = quote.paren;
31 let mut space = false;
32 self.lval_start = Some(self.buffer.pcur);
33
34 println_if_debug_lexer!(
35 "parse_string: func = {}, pcur = {}, ptok = {}, term = {}",
36 func,
37 self.buffer.pcur,
38 self.buffer.ptok,
39 quote.term
40 );
41
42 if (func & STR_FUNC_TERM) != 0 {
43 if (func & STR_FUNC_QWORDS) != 0 {
44 self.nextc();
45 } self.lex_state.set(EXPR_END);
47 self.strterm = None;
48 if (func & STR_FUNC_REGEXP) != 0 {
49 return Self::tREGEXP_END;
50 } else {
51 if let Some(heredoc_end) = quote.heredoc_end {
52 self.lval_start = Some(heredoc_end.start);
53 self.lval_end = Some(heredoc_end.end);
54 self.set_yylval_str(&TokenBuf::new(&heredoc_end.value));
55 }
56 return Self::tSTRING_END;
57 }
58 }
59 let mut c = self.nextc();
60 if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
61 loop {
62 c = self.nextc();
63
64 if !c.is_space() {
65 break;
66 }
67 }
68 space = true;
69 }
70 if (func & STR_FUNC_LIST) != 0 {
71 quote.func &= !STR_FUNC_LIST;
72 space = true;
73 }
74 if c == term && quote.nest == 0 {
75 if (func & STR_FUNC_QWORDS) != 0 {
76 quote.func |= STR_FUNC_TERM;
77 self.buffer.pushback(c); self.restore_strterm(quote);
79 return Self::tSPACE;
80 }
81 self.restore_strterm(quote);
82 return self.string_term(term, func);
83 }
84 if space {
85 self.buffer.pushback(c);
86 self.restore_strterm(quote);
87 return Self::tSPACE;
88 }
89 self.newtok();
90 if ((func & STR_FUNC_EXPAND) != 0) && c == b'#' {
91 if let Some(t) = self.peek_variable_name() {
92 self.restore_strterm(quote);
93 return t;
94 }
95 self.tokadd(b'#');
96 c = self.nextc();
97 }
98 self.buffer.pushback(c);
99
100 let mut nest = quote.nest;
101 let added = self.tokadd_string(func, term, paren, &mut nest);
102 quote.nest = nest;
103
104 if added.is_some() && self.buffer.eofp {
105 self.literal_flush(self.buffer.pcur);
106 if (func & STR_FUNC_QWORDS) != 0 {
107 self.yyerror0(DiagnosticMessage::UnterminatedList {});
109 self.strterm = None;
110 return Self::tSTRING_END;
111 }
112 if (func & STR_FUNC_REGEXP) != 0 {
113 self.yyerror0(DiagnosticMessage::UnterminatedRegexp {});
114 } else {
115 self.yyerror0(DiagnosticMessage::UnterminatedString {});
116 }
117 quote.func |= STR_FUNC_TERM;
118 }
119
120 self.tokfix();
121 self.set_yylval_str(&self.tokenbuf.clone());
122 self.flush_string_content();
123 self.restore_strterm(quote);
124
125 Self::tSTRING_CONTENT
126 }
127
128 fn string_term(&mut self, term: u8, func: usize) -> i32 {
129 self.strterm = None;
130 if (func & STR_FUNC_REGEXP) != 0 {
131 let flags = self.regx_options();
132 self.set_yylval_num(format!("{}{}", term as char, flags));
133 self.lex_state.set(EXPR_END);
134 return Self::tREGEXP_END;
135 }
136 if (func & STR_FUNC_LABEL) != 0 && self.is_label_suffix(0) {
137 self.nextc();
138 self.lex_state.set(EXPR_BEG | EXPR_LABEL);
139 return Self::tLABEL_END;
140 }
141 self.lex_state.set(EXPR_END);
142 Self::tSTRING_END
143 }
144
145 fn regx_options(&mut self) -> String {
146 let mut c: MaybeByte;
147 let mut result = String::from("");
148
149 self.newtok();
150 loop {
151 c = self.nextc();
152
153 let ch = match c.as_option() {
154 Some(_) if !c.is_alpha() => break,
155 None => break,
156 Some(ch) => ch,
157 };
158
159 match ch {
160 b'o' | b'n' | b'e' | b's' | b'u' | b'i' | b'x' | b'm' => {
161 result.push(ch as char);
162 }
163 _ => {
164 self.tokadd(c);
165 }
166 }
167 }
168
169 self.buffer.pushback(c);
170 if self.toklen() > 0 {
171 self.tokfix();
172 self.compile_error(
173 DiagnosticMessage::UnknownRegexOptions {
174 options: self
175 .tokenbuf
176 .borrow_string()
177 .expect("expected buffer to have only utf-8 chars")
178 .to_string(),
179 },
180 self.current_loc(),
181 );
182 }
183
184 result
185 }
186
187 pub(crate) fn peek_variable_name(&mut self) -> Option<i32> {
188 let mut ptr: usize = self.buffer.pcur;
189
190 if ptr + 1 >= self.buffer.pend {
191 return None;
192 }
193 let mut c = self.char_at(ptr);
194 ptr += 1;
195
196 match c.as_option() {
197 Some(b'$') => {
198 c = self.char_at(ptr);
199 if c == b'-' {
200 ptr += 1;
201 if ptr >= self.buffer.pend {
202 return None;
203 }
204 c = self.char_at(ptr);
205 } else if c.is_global_name_punct() || c.is_digit() {
206 return Some(Self::tSTRING_DVAR);
207 }
208 }
209
210 Some(b'@') => {
211 c = self.char_at(ptr);
212 if c == b'@' {
213 ptr += 1;
214 if ptr >= self.buffer.pend {
215 return None;
216 }
217 c = self.char_at(ptr);
218 }
219 }
220
221 Some(b'{') => {
222 self.buffer.pcur = ptr;
223 self.command_start = true;
224 return Some(Self::tSTRING_DBEG);
225 }
226
227 _ => return None,
228 }
229
230 if !c.is_ascii() || c == b'_' || c.is_alpha() {
231 return Some(Self::tSTRING_DVAR);
232 }
233
234 None
235 }
236
237 pub(crate) fn tokadd_string(
238 &mut self,
239 func: usize,
240 term: u8,
241 paren: Option<u8>,
242 nest: &mut usize,
243 ) -> Option<MaybeByte> {
244 let mut c: MaybeByte;
245 let _erred = false;
246
247 loop {
248 c = self.nextc();
249 if c.is_eof() {
250 break;
251 }
252
253 if self.buffer.heredoc_indent > 0 {
254 self.update_heredoc_indent(c);
255 }
256
257 if c == paren {
258 *nest += 1;
259 } else if c == term {
260 if *nest == 0 {
261 self.buffer.pushback(c);
262 break;
263 }
264 *nest -= 1;
265 } else if ((func & STR_FUNC_EXPAND) != 0)
266 && c == b'#'
267 && self.buffer.pcur < self.buffer.pend
268 {
269 let c2 = self.char_at(self.buffer.pcur);
270 if c2 == b'$' || c2 == b'@' || c2 == b'{' {
271 self.buffer.pushback(c);
272 break;
273 }
274 } else if c == b'\\' {
275 self.literal_flush(self.buffer.pcur - 1);
276 c = self.nextc();
277 match c.as_option() {
278 Some(b'\n') => {
279 if (func & STR_FUNC_QWORDS) != 0 {
280 } else {
282 if (func & STR_FUNC_EXPAND) != 0 {
283 if (func & STR_FUNC_INDENT) == 0 || self.buffer.heredoc_indent < 0 {
284 continue;
285 }
286 if c == term {
287 return Some(MaybeByte::new(b'\\'));
288 }
289 }
290 self.tokadd(b'\\');
291 }
292 }
293 Some(b'\\') => {
294 if (func & STR_FUNC_ESCAPE) != 0 {
295 self.tokadd(c)
296 }
297 }
298 Some(b'u') => {
299 if (func & STR_FUNC_EXPAND) == 0 {
300 self.tokadd(b'\\');
301 } else {
302 self.tokadd_utf8(
303 Some(term),
304 func & STR_FUNC_SYMBOL,
305 func & STR_FUNC_REGEXP,
306 );
307 continue;
308 }
309 }
310 None => {
311 return None;
312 }
313 _ => {
314 if !c.is_ascii() && (func & STR_FUNC_EXPAND) == 0 {
315 self.tokadd(b'\\');
316 self.tokadd(c);
317 }
318 if (func & STR_FUNC_REGEXP) != 0 {
319 match c {
320 MaybeByte::Some(b'c')
321 | MaybeByte::Some(b'C')
322 | MaybeByte::Some(b'M') => {
323 self.buffer.pushback(c);
324 c = self.read_escape(0);
325
326 let mut escbuf = [0_u8; 5];
327 write!(&mut escbuf[..], "\\x{:X}", c.expect("bug")).unwrap();
328 for byte in escbuf.iter().take(4) {
329 self.tokadd(MaybeByte::Some(*byte));
330 }
331 continue;
332 }
333 _ => {}
334 }
335 if c == term && !self.simple_re_meta(c) {
336 self.tokadd(c);
337 continue;
338 }
339 self.buffer.pushback(c);
340 if self.tokadd_escape().is_err() {
341 return None;
342 }
343 continue;
344 } else if (func & STR_FUNC_EXPAND) != 0 {
345 self.buffer.pushback(c);
346 if (func & STR_FUNC_ESCAPE) != 0 {
347 self.tokadd(b'\\')
348 }
349 c = self.read_escape(0);
350 if c.is_eof() {
351 return None;
352 }
353 } else if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
354 } else if c != term && c != paren {
356 self.tokadd(b'\\');
357 self.buffer.pushback(c);
358 continue;
359 }
360 }
361 }
362 } else if !self.is_ascii() {
363 self.tokadd(c);
364 continue;
365 } else if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
366 self.buffer.pushback(c);
367 break;
368 }
369 self.tokadd(c);
370 }
371
372 Some(c)
373 }
374
    /// Hook called by `parse_string` after the string-content value has been stored.
    /// It currently does nothing.
    pub(crate) fn flush_string_content(&mut self) {
    }
378
379 fn tokadd_utf8_unterminated(&mut self) {
380 self.token_flush();
381 self.yyerror1(
382 DiagnosticMessage::UnterminatedUnicodeEscape {},
383 self.loc(self.buffer.ptok, self.buffer.pcur + 1),
384 );
385 }
386
387 fn scan_hex(&mut self, start: usize, len: usize, numlen: &mut usize) -> usize {
388 let mut s = start;
389 let mut result = 0;
390
391 for _ in 0..len {
392 match self.buffer.byte_at(s).as_option() {
393 None => break,
394 Some(c) => match usize::from_str_radix(&(c as char).to_string(), 16) {
395 Ok(hex) => {
396 result <<= 4;
397 result |= hex;
398 }
399 Err(_) => break,
400 },
401 }
402 s += 1;
403 }
404
405 *numlen = s - start;
406 result
407 }
408
409 fn scan_oct(&mut self, start: usize, len: usize, numlen: &mut usize) -> usize {
410 let mut s = start;
411 let mut result: usize = 0;
412
413 for _ in 0..len {
414 match self.buffer.byte_at(s).as_option() {
415 Some(c) if (b'0'..=b'7').contains(&c) => {
416 result <<= 3;
417 result |= (c - b'0') as usize;
418 }
419 _ => break,
420 }
421 s += 1;
422 }
423
424 *numlen = s - start;
425 result
426 }
427
428 pub(crate) fn tokcopy(&mut self, n: usize) {
429 let substr = self
430 .buffer
431 .substr_at(self.buffer.pcur - n, self.buffer.pcur)
432 .unwrap_or_else(|| panic!("no substr {}..{}", self.buffer.pcur - n, self.buffer.pcur));
433 self.tokenbuf.append(substr);
434 }
435
436 fn tokaddmbc(&mut self, codepoint: usize) {
437 let utf8_char =
438 std::char::from_u32(codepoint.try_into().expect("expected codepoint to be u32"))
439 .expect("expected codepoint to have digits");
440 let utf8_bytes = utf8_char.to_string().into_bytes();
441 for byte in utf8_bytes {
442 self.tokadd(byte)
443 }
444 }
445
446 fn tokadd_codepoint(&mut self, regexp_literal: usize, wide: bool) -> bool {
447 let mut numlen = 0;
448 let codepoint = self.scan_hex(
449 self.buffer.pcur,
450 if wide {
451 self.buffer.pend - self.buffer.pcur
452 } else {
453 4
454 },
455 &mut numlen,
456 );
457 self.literal_flush(self.buffer.pcur);
458 self.buffer.pcur += numlen;
459 if if wide {
460 numlen == 0 || numlen > 6
461 } else {
462 numlen < 4
463 } {
464 self.yyerror1(
465 DiagnosticMessage::InvalidUnicodeEscape {},
466 self.loc(self.buffer.pcur, self.buffer.pcur + 1),
467 );
468 return wide && numlen > 0;
469 }
470 if codepoint > 0x10ffff {
471 self.yyerror0(DiagnosticMessage::TooLargeUnicodeCodepoint {});
472 return wide;
473 }
474 if (codepoint & 0xfffff800) == 0xd800 {
475 self.yyerror0(DiagnosticMessage::InvalidUnicodeCodepoint {});
476 return wide;
477 }
478 if regexp_literal != 0 {
479 self.tokcopy(numlen);
480 } else if codepoint >= 0x80 {
481 self.tokaddmbc(codepoint);
485 } else {
486 self.tokadd(codepoint as u8)
487 }
488
489 true
490 }
491
492 pub(crate) fn tokadd_utf8(
493 &mut self,
494 term: Option<u8>,
495 _symbol_literal: usize,
496 regexp_literal: usize,
497 ) {
498 let open_brace = b'{';
499 let close_brace = b'}';
500 let mut err_multiple_codepoints = false;
501
502 if regexp_literal != 0 {
503 self.tokadd(b'\\');
504 self.tokadd(b'u')
505 }
506
507 if self.buffer.peek(open_brace) {
508 let mut second: Option<usize> = None;
509 let mut c;
510 let mut last = self.nextc();
511 if self.buffer.pcur >= self.buffer.pend {
512 return self.tokadd_utf8_unterminated();
513 }
514 loop {
515 c = self.buffer.byte_at(self.buffer.pcur);
516 if !c.is_space() {
517 break;
518 }
519 self.buffer.pcur += 1;
520 if self.buffer.pcur >= self.buffer.pend {
521 break;
522 }
523 }
524 while c != close_brace {
525 if c == term {
526 return self.tokadd_utf8_unterminated();
527 }
528 if err_multiple_codepoints {
529 second = Some(self.buffer.pcur);
530 }
531 if regexp_literal != 0 {
532 self.tokadd(last)
533 }
534 if !self.tokadd_codepoint(regexp_literal, true) {
535 break;
536 }
537 loop {
538 c = self.char_at(self.buffer.pcur);
539 if !c.is_space() {
540 break;
541 }
542 self.buffer.pcur += 1;
543 if self.buffer.pcur >= self.buffer.pend {
544 return self.tokadd_utf8_unterminated();
545 }
546 last = c;
547 }
548 if term.is_none() && second.is_none() {
549 err_multiple_codepoints = true;
550 }
551 }
552
553 if c != close_brace {
554 return self.tokadd_utf8_unterminated();
555 }
556 if let Some(second) = second {
557 if err_multiple_codepoints {
558 let pcur = self.buffer.pcur;
559 self.buffer.pcur = second;
560 self.token_flush();
561 self.buffer.pcur = pcur;
562 self.yyerror0(DiagnosticMessage::MultipleCodepointAtSingleChar {});
563 self.token_flush();
564 }
565 }
566
567 if regexp_literal != 0 {
568 self.tokadd(close_brace)
569 }
570 self.nextc();
571 } else if !self.tokadd_codepoint(regexp_literal, false) {
572 self.token_flush();
573 }
574 }
575
576 fn simple_re_meta(&mut self, c: MaybeByte) -> bool {
577 matches!(
578 c,
579 MaybeByte::Some(b'$')
580 | MaybeByte::Some(b'*')
581 | MaybeByte::Some(b'+')
582 | MaybeByte::Some(b'.')
583 | MaybeByte::Some(b'?')
584 | MaybeByte::Some(b'^')
585 | MaybeByte::Some(b'|')
586 | MaybeByte::Some(b')')
587 | MaybeByte::Some(b']')
588 | MaybeByte::Some(b'}')
589 | MaybeByte::Some(b'>')
590 )
591 }
592
593 fn tokadd_escape_eof(&mut self) -> Result<(), ()> {
594 self.yyerror0(DiagnosticMessage::InvalidEscapeCharacter {});
595 self.token_flush();
596 Err(())
597 }
598
599 fn tokadd_escape(&mut self) -> Result<(), ()> {
600 let mut numlen = 0;
601
602 let c = self.nextc();
603 match c.as_option() {
604 Some(b'\n') => Ok(()),
605
606 Some(octal) if (b'0'..b'8').contains(&octal) => {
607 self.buffer.pcur -= 1;
608 self.scan_oct(self.buffer.pcur, 3, &mut numlen);
609 self.buffer.pcur += numlen;
610 self.tokcopy(numlen + 1);
611 Ok(())
612 }
613
614 Some(b'x') => {
615 self.tok_hex(&mut numlen);
616 if numlen == 0 {
617 return Err(());
618 }
619 self.tokcopy(numlen + 2);
620 Ok(())
621 }
622
623 None => self.tokadd_escape_eof(),
625
626 Some(other) => {
627 self.tokadd(b'\\');
628 self.tokadd(other);
629 Ok(())
630 }
631 }
632 }
633
634 fn read_escape_eof(&mut self) -> MaybeByte {
635 self.yyerror0(DiagnosticMessage::InvalidEscapeCharacter {});
636 self.token_flush();
637 MaybeByte::new(0)
638 }
639
640 fn tok_hex(&mut self, numlen: &mut usize) -> MaybeByte {
641 let c = self.scan_hex(self.buffer.pcur, 2, numlen);
642 if *numlen == 0 {
643 self.yyerror1(DiagnosticMessage::InvalidHexEscape {}, self.current_loc());
644 self.token_flush();
645 return MaybeByte::new(0);
646 }
647 self.buffer.pcur += *numlen;
648 MaybeByte::new(c as u8)
649 }
650
651 pub(crate) fn read_escape(&mut self, flags: usize) -> MaybeByte {
652 let mut numlen: usize = 0;
653
654 let mut c = self.nextc();
655 match c.as_option() {
656 Some(b'\\') => c,
657 Some(b'n') => MaybeByte::new(b'\n'),
658 Some(b't') => MaybeByte::new(b'\t'),
659 Some(b'r') => MaybeByte::new(b'\r'),
660 Some(b'f') => MaybeByte::new(Self::LF_CHAR),
661 Some(b'v') => MaybeByte::new(Self::VTAB_CHAR),
662 Some(b'a') => MaybeByte::new(0x07_u8),
663 Some(b'e') => MaybeByte::new(0x1b_u8),
664
665 Some(b'0') | Some(b'1') | Some(b'2') | Some(b'3') | Some(b'4') | Some(b'5')
666 | Some(b'6') | Some(b'7') | Some(b'8') | Some(b'9') => {
667 self.buffer.pushback(c);
668 let c = self.scan_oct(self.buffer.pcur, 3, &mut numlen);
669 self.buffer.pcur += numlen;
670 MaybeByte::new(c as u8)
671 }
672
673 Some(b'x') => {
674 let c = self.tok_hex(&mut numlen);
675 if numlen == 0 {
676 return MaybeByte::new(0);
677 }
678 c
679 }
680
681 Some(b'b') => MaybeByte::new(0x08),
682 Some(b's') => MaybeByte::new(b' '),
683
684 Some(b'M') => {
685 if (flags & ESCAPE_META) != 0 {
686 return self.read_escape_eof();
687 }
688 c = self.nextc();
689 if c != b'-' {
690 return self.read_escape_eof();
691 }
692 c = self.nextc();
693 if c == b'\\' {
694 match self.buffer.peekc() {
695 MaybeByte::Some(b'u') | MaybeByte::Some(b'U') => {
696 self.nextc();
697 return self.read_escape_eof();
698 }
699 _ => {}
700 }
701 self.read_escape(flags | ESCAPE_META)
702 .map(|byte| MaybeByte::Some(byte | 0x80))
703 } else if c.is_eof() || !c.is_ascii() {
704 self.read_escape_eof()
705 } else {
706 if let Some(c2) = c.escaped_control_code() {
707 if c.is_control() || (flags & ESCAPE_CONTROL) == 0 {
708 self.warn_space_char(c2, "\\M-");
709 } else {
710 self.warn_space_char(c2, "\\C-\\M-");
711 }
712 } else if c.is_control() {
713 return self.read_escape_eof();
714 }
715 c.map(|c| MaybeByte::Some(c | 0x80))
716 }
717 }
718
719 Some(b'C') | Some(b'c') => {
720 if c == b'C' {
721 c = self.nextc();
723 if c != b'-' {
724 return self.read_escape_eof();
725 }
726 }
727 if (flags & ESCAPE_CONTROL) != 0 {
728 return self.read_escape_eof();
729 }
730 c = self.nextc();
731 if c == b'\\' {
732 match self.buffer.peekc() {
733 MaybeByte::Some(b'u') | MaybeByte::Some(b'U') => {
734 self.nextc();
735 return self.read_escape_eof();
736 }
737 _ => {}
738 }
739 c = self.read_escape(flags | ESCAPE_CONTROL)
740 } else if c == b'?' {
741 return MaybeByte::new(0x7f_u8);
742 } else if c.is_eof() || !c.is_ascii() {
743 return self.read_escape_eof();
744 } else if let Some(c2) = c.escaped_control_code() {
745 if c.is_control() {
746 if (flags & ESCAPE_META) != 0 {
747 self.warn_space_char(c2, "\\M-");
748 } else {
749 self.warn_space_char(c2, "");
750 }
751 } else if (flags & ESCAPE_META) != 0 {
752 self.warn_space_char(c2, "\\M-\\C-");
753 } else {
754 self.warn_space_char(c2, "\\C-");
755 }
756 } else if c.is_control() {
757 return self.read_escape_eof();
758 }
759 c.map(|c| MaybeByte::Some(c & 0x9f))
760 }
761
762 None => self.read_escape_eof(),
763
764 _ => c,
765 }
766 }
767
768 pub(crate) fn is_ascii(&self) -> bool {
769 self.char_at(self.buffer.pcur - 1).is_ascii()
770 }
771
772 pub(crate) fn warn_space_char(&mut self, c: u8, prefix: &'static str) {
773 self.warn(
774 DiagnosticMessage::InvalidCharacterSyntax {
775 suggestion: format!("{}\\{}", prefix, c),
776 },
777 self.current_loc(),
778 )
779 }
780}