lib_ruby_parser/lexer/
parse_string.rs

1use std::convert::TryInto;
2use std::io::Write;
3
4use crate::maybe_byte::*;
5use crate::source::buffer::*;
6use crate::str_term::{str_types::*, StrTerm};
7use crate::TokenBuf;
8use crate::{lex_states::*, DiagnosticMessage};
9use crate::{lexer::*, str_term::StringLiteral};
10
/// `read_escape` flag: set while decoding the operand of a `\C-` / `\c` escape,
/// so that a second control escape nested inside it is rejected.
const ESCAPE_CONTROL: usize = 1;
/// `read_escape` flag: set while decoding the operand of a `\M-` escape,
/// so that a second meta escape nested inside it is rejected.
const ESCAPE_META: usize = 2;
13
14impl Lexer {
15    fn take_strterm(&mut self) -> StringLiteral {
16        match self.strterm.take().map(|v| *v) {
17            Some(StrTerm::StringLiteral(s)) => s,
18            _ => unreachable!("strterm must be string"),
19        }
20    }
21    fn restore_strterm(&mut self, literal: StringLiteral) {
22        self.strterm = Some(Box::new(StrTerm::StringLiteral(literal)));
23    }
24
25    pub(crate) fn parse_string(&mut self) -> i32 {
26        let mut quote = self.take_strterm();
27
28        let func = quote.func;
29        let term = quote.term;
30        let paren = quote.paren;
31        let mut space = false;
32        self.lval_start = Some(self.buffer.pcur);
33
34        println_if_debug_lexer!(
35            "parse_string: func = {}, pcur = {}, ptok = {}, term = {}",
36            func,
37            self.buffer.pcur,
38            self.buffer.ptok,
39            quote.term
40        );
41
42        if (func & STR_FUNC_TERM) != 0 {
43            if (func & STR_FUNC_QWORDS) != 0 {
44                self.nextc();
45            } /* delayed term */
46            self.lex_state.set(EXPR_END);
47            self.strterm = None;
48            if (func & STR_FUNC_REGEXP) != 0 {
49                return Self::tREGEXP_END;
50            } else {
51                if let Some(heredoc_end) = quote.heredoc_end {
52                    self.lval_start = Some(heredoc_end.start);
53                    self.lval_end = Some(heredoc_end.end);
54                    self.set_yylval_str(&TokenBuf::new(&heredoc_end.value));
55                }
56                return Self::tSTRING_END;
57            }
58        }
59        let mut c = self.nextc();
60        if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
61            loop {
62                c = self.nextc();
63
64                if !c.is_space() {
65                    break;
66                }
67            }
68            space = true;
69        }
70        if (func & STR_FUNC_LIST) != 0 {
71            quote.func &= !STR_FUNC_LIST;
72            space = true;
73        }
74        if c == term && quote.nest == 0 {
75            if (func & STR_FUNC_QWORDS) != 0 {
76                quote.func |= STR_FUNC_TERM;
77                self.buffer.pushback(c); /* dispatch the term at tSTRING_END */
78                self.restore_strterm(quote);
79                return Self::tSPACE;
80            }
81            self.restore_strterm(quote);
82            return self.string_term(term, func);
83        }
84        if space {
85            self.buffer.pushback(c);
86            self.restore_strterm(quote);
87            return Self::tSPACE;
88        }
89        self.newtok();
90        if ((func & STR_FUNC_EXPAND) != 0) && c == b'#' {
91            if let Some(t) = self.peek_variable_name() {
92                self.restore_strterm(quote);
93                return t;
94            }
95            self.tokadd(b'#');
96            c = self.nextc();
97        }
98        self.buffer.pushback(c);
99
100        let mut nest = quote.nest;
101        let added = self.tokadd_string(func, term, paren, &mut nest);
102        quote.nest = nest;
103
104        if added.is_some() && self.buffer.eofp {
105            self.literal_flush(self.buffer.pcur);
106            if (func & STR_FUNC_QWORDS) != 0 {
107                /* no content to add, bailing out here */
108                self.yyerror0(DiagnosticMessage::UnterminatedList {});
109                self.strterm = None;
110                return Self::tSTRING_END;
111            }
112            if (func & STR_FUNC_REGEXP) != 0 {
113                self.yyerror0(DiagnosticMessage::UnterminatedRegexp {});
114            } else {
115                self.yyerror0(DiagnosticMessage::UnterminatedString {});
116            }
117            quote.func |= STR_FUNC_TERM;
118        }
119
120        self.tokfix();
121        self.set_yylval_str(&self.tokenbuf.clone());
122        self.flush_string_content();
123        self.restore_strterm(quote);
124
125        Self::tSTRING_CONTENT
126    }
127
128    fn string_term(&mut self, term: u8, func: usize) -> i32 {
129        self.strterm = None;
130        if (func & STR_FUNC_REGEXP) != 0 {
131            let flags = self.regx_options();
132            self.set_yylval_num(format!("{}{}", term as char, flags));
133            self.lex_state.set(EXPR_END);
134            return Self::tREGEXP_END;
135        }
136        if (func & STR_FUNC_LABEL) != 0 && self.is_label_suffix(0) {
137            self.nextc();
138            self.lex_state.set(EXPR_BEG | EXPR_LABEL);
139            return Self::tLABEL_END;
140        }
141        self.lex_state.set(EXPR_END);
142        Self::tSTRING_END
143    }
144
145    fn regx_options(&mut self) -> String {
146        let mut c: MaybeByte;
147        let mut result = String::from("");
148
149        self.newtok();
150        loop {
151            c = self.nextc();
152
153            let ch = match c.as_option() {
154                Some(_) if !c.is_alpha() => break,
155                None => break,
156                Some(ch) => ch,
157            };
158
159            match ch {
160                b'o' | b'n' | b'e' | b's' | b'u' | b'i' | b'x' | b'm' => {
161                    result.push(ch as char);
162                }
163                _ => {
164                    self.tokadd(c);
165                }
166            }
167        }
168
169        self.buffer.pushback(c);
170        if self.toklen() > 0 {
171            self.tokfix();
172            self.compile_error(
173                DiagnosticMessage::UnknownRegexOptions {
174                    options: self
175                        .tokenbuf
176                        .borrow_string()
177                        .expect("expected buffer to have only utf-8 chars")
178                        .to_string(),
179                },
180                self.current_loc(),
181            );
182        }
183
184        result
185    }
186
187    pub(crate) fn peek_variable_name(&mut self) -> Option<i32> {
188        let mut ptr: usize = self.buffer.pcur;
189
190        if ptr + 1 >= self.buffer.pend {
191            return None;
192        }
193        let mut c = self.char_at(ptr);
194        ptr += 1;
195
196        match c.as_option() {
197            Some(b'$') => {
198                c = self.char_at(ptr);
199                if c == b'-' {
200                    ptr += 1;
201                    if ptr >= self.buffer.pend {
202                        return None;
203                    }
204                    c = self.char_at(ptr);
205                } else if c.is_global_name_punct() || c.is_digit() {
206                    return Some(Self::tSTRING_DVAR);
207                }
208            }
209
210            Some(b'@') => {
211                c = self.char_at(ptr);
212                if c == b'@' {
213                    ptr += 1;
214                    if ptr >= self.buffer.pend {
215                        return None;
216                    }
217                    c = self.char_at(ptr);
218                }
219            }
220
221            Some(b'{') => {
222                self.buffer.pcur = ptr;
223                self.command_start = true;
224                return Some(Self::tSTRING_DBEG);
225            }
226
227            _ => return None,
228        }
229
230        if !c.is_ascii() || c == b'_' || c.is_alpha() {
231            return Some(Self::tSTRING_DVAR);
232        }
233
234        None
235    }
236
237    pub(crate) fn tokadd_string(
238        &mut self,
239        func: usize,
240        term: u8,
241        paren: Option<u8>,
242        nest: &mut usize,
243    ) -> Option<MaybeByte> {
244        let mut c: MaybeByte;
245        let _erred = false;
246
247        loop {
248            c = self.nextc();
249            if c.is_eof() {
250                break;
251            }
252
253            if self.buffer.heredoc_indent > 0 {
254                self.update_heredoc_indent(c);
255            }
256
257            if c == paren {
258                *nest += 1;
259            } else if c == term {
260                if *nest == 0 {
261                    self.buffer.pushback(c);
262                    break;
263                }
264                *nest -= 1;
265            } else if ((func & STR_FUNC_EXPAND) != 0)
266                && c == b'#'
267                && self.buffer.pcur < self.buffer.pend
268            {
269                let c2 = self.char_at(self.buffer.pcur);
270                if c2 == b'$' || c2 == b'@' || c2 == b'{' {
271                    self.buffer.pushback(c);
272                    break;
273                }
274            } else if c == b'\\' {
275                self.literal_flush(self.buffer.pcur - 1);
276                c = self.nextc();
277                match c.as_option() {
278                    Some(b'\n') => {
279                        if (func & STR_FUNC_QWORDS) != 0 {
280                            // break;
281                        } else {
282                            if (func & STR_FUNC_EXPAND) != 0 {
283                                if (func & STR_FUNC_INDENT) == 0 || self.buffer.heredoc_indent < 0 {
284                                    continue;
285                                }
286                                if c == term {
287                                    return Some(MaybeByte::new(b'\\'));
288                                }
289                            }
290                            self.tokadd(b'\\');
291                        }
292                    }
293                    Some(b'\\') => {
294                        if (func & STR_FUNC_ESCAPE) != 0 {
295                            self.tokadd(c)
296                        }
297                    }
298                    Some(b'u') => {
299                        if (func & STR_FUNC_EXPAND) == 0 {
300                            self.tokadd(b'\\');
301                        } else {
302                            self.tokadd_utf8(
303                                Some(term),
304                                func & STR_FUNC_SYMBOL,
305                                func & STR_FUNC_REGEXP,
306                            );
307                            continue;
308                        }
309                    }
310                    None => {
311                        return None;
312                    }
313                    _ => {
314                        if !c.is_ascii() && (func & STR_FUNC_EXPAND) == 0 {
315                            self.tokadd(b'\\');
316                            self.tokadd(c);
317                        }
318                        if (func & STR_FUNC_REGEXP) != 0 {
319                            match c {
320                                MaybeByte::Some(b'c')
321                                | MaybeByte::Some(b'C')
322                                | MaybeByte::Some(b'M') => {
323                                    self.buffer.pushback(c);
324                                    c = self.read_escape(0);
325
326                                    let mut escbuf = [0_u8; 5];
327                                    write!(&mut escbuf[..], "\\x{:X}", c.expect("bug")).unwrap();
328                                    for byte in escbuf.iter().take(4) {
329                                        self.tokadd(MaybeByte::Some(*byte));
330                                    }
331                                    continue;
332                                }
333                                _ => {}
334                            }
335                            if c == term && !self.simple_re_meta(c) {
336                                self.tokadd(c);
337                                continue;
338                            }
339                            self.buffer.pushback(c);
340                            if self.tokadd_escape().is_err() {
341                                return None;
342                            }
343                            continue;
344                        } else if (func & STR_FUNC_EXPAND) != 0 {
345                            self.buffer.pushback(c);
346                            if (func & STR_FUNC_ESCAPE) != 0 {
347                                self.tokadd(b'\\')
348                            }
349                            c = self.read_escape(0);
350                            if c.is_eof() {
351                                return None;
352                            }
353                        } else if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
354                            // ignore backslashed spaces in %w
355                        } else if c != term && c != paren {
356                            self.tokadd(b'\\');
357                            self.buffer.pushback(c);
358                            continue;
359                        }
360                    }
361                }
362            } else if !self.is_ascii() {
363                self.tokadd(c);
364                continue;
365            } else if (func & STR_FUNC_QWORDS) != 0 && c.is_space() {
366                self.buffer.pushback(c);
367                break;
368            }
369            self.tokadd(c);
370        }
371
372        Some(c)
373    }
374
    /// Intentionally does nothing; `parse_string` calls it just before returning
    /// `tSTRING_CONTENT`.
    pub(crate) fn flush_string_content(&mut self) {
        // noop
    }
378
379    fn tokadd_utf8_unterminated(&mut self) {
380        self.token_flush();
381        self.yyerror1(
382            DiagnosticMessage::UnterminatedUnicodeEscape {},
383            self.loc(self.buffer.ptok, self.buffer.pcur + 1),
384        );
385    }
386
387    fn scan_hex(&mut self, start: usize, len: usize, numlen: &mut usize) -> usize {
388        let mut s = start;
389        let mut result = 0;
390
391        for _ in 0..len {
392            match self.buffer.byte_at(s).as_option() {
393                None => break,
394                Some(c) => match usize::from_str_radix(&(c as char).to_string(), 16) {
395                    Ok(hex) => {
396                        result <<= 4;
397                        result |= hex;
398                    }
399                    Err(_) => break,
400                },
401            }
402            s += 1;
403        }
404
405        *numlen = s - start;
406        result
407    }
408
409    fn scan_oct(&mut self, start: usize, len: usize, numlen: &mut usize) -> usize {
410        let mut s = start;
411        let mut result: usize = 0;
412
413        for _ in 0..len {
414            match self.buffer.byte_at(s).as_option() {
415                Some(c) if (b'0'..=b'7').contains(&c) => {
416                    result <<= 3;
417                    result |= (c - b'0') as usize;
418                }
419                _ => break,
420            }
421            s += 1;
422        }
423
424        *numlen = s - start;
425        result
426    }
427
428    pub(crate) fn tokcopy(&mut self, n: usize) {
429        let substr = self
430            .buffer
431            .substr_at(self.buffer.pcur - n, self.buffer.pcur)
432            .unwrap_or_else(|| panic!("no substr {}..{}", self.buffer.pcur - n, self.buffer.pcur));
433        self.tokenbuf.append(substr);
434    }
435
436    fn tokaddmbc(&mut self, codepoint: usize) {
437        let utf8_char =
438            std::char::from_u32(codepoint.try_into().expect("expected codepoint to be u32"))
439                .expect("expected codepoint to have digits");
440        let utf8_bytes = utf8_char.to_string().into_bytes();
441        for byte in utf8_bytes {
442            self.tokadd(byte)
443        }
444    }
445
446    fn tokadd_codepoint(&mut self, regexp_literal: usize, wide: bool) -> bool {
447        let mut numlen = 0;
448        let codepoint = self.scan_hex(
449            self.buffer.pcur,
450            if wide {
451                self.buffer.pend - self.buffer.pcur
452            } else {
453                4
454            },
455            &mut numlen,
456        );
457        self.literal_flush(self.buffer.pcur);
458        self.buffer.pcur += numlen;
459        if if wide {
460            numlen == 0 || numlen > 6
461        } else {
462            numlen < 4
463        } {
464            self.yyerror1(
465                DiagnosticMessage::InvalidUnicodeEscape {},
466                self.loc(self.buffer.pcur, self.buffer.pcur + 1),
467            );
468            return wide && numlen > 0;
469        }
470        if codepoint > 0x10ffff {
471            self.yyerror0(DiagnosticMessage::TooLargeUnicodeCodepoint {});
472            return wide;
473        }
474        if (codepoint & 0xfffff800) == 0xd800 {
475            self.yyerror0(DiagnosticMessage::InvalidUnicodeCodepoint {});
476            return wide;
477        }
478        if regexp_literal != 0 {
479            self.tokcopy(numlen);
480        } else if codepoint >= 0x80 {
481            // if self.buffer.encoding != "utf-8" {
482            //     panic!("UTF-8 mixed within source");
483            // }
484            self.tokaddmbc(codepoint);
485        } else {
486            self.tokadd(codepoint as u8)
487        }
488
489        true
490    }
491
492    pub(crate) fn tokadd_utf8(
493        &mut self,
494        term: Option<u8>,
495        _symbol_literal: usize,
496        regexp_literal: usize,
497    ) {
498        let open_brace = b'{';
499        let close_brace = b'}';
500        let mut err_multiple_codepoints = false;
501
502        if regexp_literal != 0 {
503            self.tokadd(b'\\');
504            self.tokadd(b'u')
505        }
506
507        if self.buffer.peek(open_brace) {
508            let mut second: Option<usize> = None;
509            let mut c;
510            let mut last = self.nextc();
511            if self.buffer.pcur >= self.buffer.pend {
512                return self.tokadd_utf8_unterminated();
513            }
514            loop {
515                c = self.buffer.byte_at(self.buffer.pcur);
516                if !c.is_space() {
517                    break;
518                }
519                self.buffer.pcur += 1;
520                if self.buffer.pcur >= self.buffer.pend {
521                    break;
522                }
523            }
524            while c != close_brace {
525                if c == term {
526                    return self.tokadd_utf8_unterminated();
527                }
528                if err_multiple_codepoints {
529                    second = Some(self.buffer.pcur);
530                }
531                if regexp_literal != 0 {
532                    self.tokadd(last)
533                }
534                if !self.tokadd_codepoint(regexp_literal, true) {
535                    break;
536                }
537                loop {
538                    c = self.char_at(self.buffer.pcur);
539                    if !c.is_space() {
540                        break;
541                    }
542                    self.buffer.pcur += 1;
543                    if self.buffer.pcur >= self.buffer.pend {
544                        return self.tokadd_utf8_unterminated();
545                    }
546                    last = c;
547                }
548                if term.is_none() && second.is_none() {
549                    err_multiple_codepoints = true;
550                }
551            }
552
553            if c != close_brace {
554                return self.tokadd_utf8_unterminated();
555            }
556            if let Some(second) = second {
557                if err_multiple_codepoints {
558                    let pcur = self.buffer.pcur;
559                    self.buffer.pcur = second;
560                    self.token_flush();
561                    self.buffer.pcur = pcur;
562                    self.yyerror0(DiagnosticMessage::MultipleCodepointAtSingleChar {});
563                    self.token_flush();
564                }
565            }
566
567            if regexp_literal != 0 {
568                self.tokadd(close_brace)
569            }
570            self.nextc();
571        } else if !self.tokadd_codepoint(regexp_literal, false) {
572            self.token_flush();
573        }
574    }
575
576    fn simple_re_meta(&mut self, c: MaybeByte) -> bool {
577        matches!(
578            c,
579            MaybeByte::Some(b'$')
580                | MaybeByte::Some(b'*')
581                | MaybeByte::Some(b'+')
582                | MaybeByte::Some(b'.')
583                | MaybeByte::Some(b'?')
584                | MaybeByte::Some(b'^')
585                | MaybeByte::Some(b'|')
586                | MaybeByte::Some(b')')
587                | MaybeByte::Some(b']')
588                | MaybeByte::Some(b'}')
589                | MaybeByte::Some(b'>')
590        )
591    }
592
593    fn tokadd_escape_eof(&mut self) -> Result<(), ()> {
594        self.yyerror0(DiagnosticMessage::InvalidEscapeCharacter {});
595        self.token_flush();
596        Err(())
597    }
598
599    fn tokadd_escape(&mut self) -> Result<(), ()> {
600        let mut numlen = 0;
601
602        let c = self.nextc();
603        match c.as_option() {
604            Some(b'\n') => Ok(()),
605
606            Some(octal) if (b'0'..b'8').contains(&octal) => {
607                self.buffer.pcur -= 1;
608                self.scan_oct(self.buffer.pcur, 3, &mut numlen);
609                self.buffer.pcur += numlen;
610                self.tokcopy(numlen + 1);
611                Ok(())
612            }
613
614            Some(b'x') => {
615                self.tok_hex(&mut numlen);
616                if numlen == 0 {
617                    return Err(());
618                }
619                self.tokcopy(numlen + 2);
620                Ok(())
621            }
622
623            // eof:
624            None => self.tokadd_escape_eof(),
625
626            Some(other) => {
627                self.tokadd(b'\\');
628                self.tokadd(other);
629                Ok(())
630            }
631        }
632    }
633
634    fn read_escape_eof(&mut self) -> MaybeByte {
635        self.yyerror0(DiagnosticMessage::InvalidEscapeCharacter {});
636        self.token_flush();
637        MaybeByte::new(0)
638    }
639
640    fn tok_hex(&mut self, numlen: &mut usize) -> MaybeByte {
641        let c = self.scan_hex(self.buffer.pcur, 2, numlen);
642        if *numlen == 0 {
643            self.yyerror1(DiagnosticMessage::InvalidHexEscape {}, self.current_loc());
644            self.token_flush();
645            return MaybeByte::new(0);
646        }
647        self.buffer.pcur += *numlen;
648        MaybeByte::new(c as u8)
649    }
650
651    pub(crate) fn read_escape(&mut self, flags: usize) -> MaybeByte {
652        let mut numlen: usize = 0;
653
654        let mut c = self.nextc();
655        match c.as_option() {
656            Some(b'\\') => c,
657            Some(b'n') => MaybeByte::new(b'\n'),
658            Some(b't') => MaybeByte::new(b'\t'),
659            Some(b'r') => MaybeByte::new(b'\r'),
660            Some(b'f') => MaybeByte::new(Self::LF_CHAR),
661            Some(b'v') => MaybeByte::new(Self::VTAB_CHAR),
662            Some(b'a') => MaybeByte::new(0x07_u8),
663            Some(b'e') => MaybeByte::new(0x1b_u8),
664
665            Some(b'0') | Some(b'1') | Some(b'2') | Some(b'3') | Some(b'4') | Some(b'5')
666            | Some(b'6') | Some(b'7') | Some(b'8') | Some(b'9') => {
667                self.buffer.pushback(c);
668                let c = self.scan_oct(self.buffer.pcur, 3, &mut numlen);
669                self.buffer.pcur += numlen;
670                MaybeByte::new(c as u8)
671            }
672
673            Some(b'x') => {
674                let c = self.tok_hex(&mut numlen);
675                if numlen == 0 {
676                    return MaybeByte::new(0);
677                }
678                c
679            }
680
681            Some(b'b') => MaybeByte::new(0x08),
682            Some(b's') => MaybeByte::new(b' '),
683
684            Some(b'M') => {
685                if (flags & ESCAPE_META) != 0 {
686                    return self.read_escape_eof();
687                }
688                c = self.nextc();
689                if c != b'-' {
690                    return self.read_escape_eof();
691                }
692                c = self.nextc();
693                if c == b'\\' {
694                    match self.buffer.peekc() {
695                        MaybeByte::Some(b'u') | MaybeByte::Some(b'U') => {
696                            self.nextc();
697                            return self.read_escape_eof();
698                        }
699                        _ => {}
700                    }
701                    self.read_escape(flags | ESCAPE_META)
702                        .map(|byte| MaybeByte::Some(byte | 0x80))
703                } else if c.is_eof() || !c.is_ascii() {
704                    self.read_escape_eof()
705                } else {
706                    if let Some(c2) = c.escaped_control_code() {
707                        if c.is_control() || (flags & ESCAPE_CONTROL) == 0 {
708                            self.warn_space_char(c2, "\\M-");
709                        } else {
710                            self.warn_space_char(c2, "\\C-\\M-");
711                        }
712                    } else if c.is_control() {
713                        return self.read_escape_eof();
714                    }
715                    c.map(|c| MaybeByte::Some(c | 0x80))
716                }
717            }
718
719            Some(b'C') | Some(b'c') => {
720                if c == b'C' {
721                    // C fallthrough
722                    c = self.nextc();
723                    if c != b'-' {
724                        return self.read_escape_eof();
725                    }
726                }
727                if (flags & ESCAPE_CONTROL) != 0 {
728                    return self.read_escape_eof();
729                }
730                c = self.nextc();
731                if c == b'\\' {
732                    match self.buffer.peekc() {
733                        MaybeByte::Some(b'u') | MaybeByte::Some(b'U') => {
734                            self.nextc();
735                            return self.read_escape_eof();
736                        }
737                        _ => {}
738                    }
739                    c = self.read_escape(flags | ESCAPE_CONTROL)
740                } else if c == b'?' {
741                    return MaybeByte::new(0x7f_u8);
742                } else if c.is_eof() || !c.is_ascii() {
743                    return self.read_escape_eof();
744                } else if let Some(c2) = c.escaped_control_code() {
745                    if c.is_control() {
746                        if (flags & ESCAPE_META) != 0 {
747                            self.warn_space_char(c2, "\\M-");
748                        } else {
749                            self.warn_space_char(c2, "");
750                        }
751                    } else if (flags & ESCAPE_META) != 0 {
752                        self.warn_space_char(c2, "\\M-\\C-");
753                    } else {
754                        self.warn_space_char(c2, "\\C-");
755                    }
756                } else if c.is_control() {
757                    return self.read_escape_eof();
758                }
759                c.map(|c| MaybeByte::Some(c & 0x9f))
760            }
761
762            None => self.read_escape_eof(),
763
764            _ => c,
765        }
766    }
767
768    pub(crate) fn is_ascii(&self) -> bool {
769        self.char_at(self.buffer.pcur - 1).is_ascii()
770    }
771
772    pub(crate) fn warn_space_char(&mut self, c: u8, prefix: &'static str) {
773        self.warn(
774            DiagnosticMessage::InvalidCharacterSyntax {
775                suggestion: format!("{}\\{}", prefix, c),
776            },
777            self.current_loc(),
778        )
779    }
780}