yash_syntax/parser/lex/
text.rs

1// This file is part of yash, an extended POSIX shell.
2// Copyright (C) 2020 WATANABE Yuki
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Part of the lexer that parses texts
18
19use super::core::Lexer;
20use super::core::WordContext;
21use super::core::WordLexer;
22use crate::parser::core::Result;
23use crate::parser::error::Error;
24use crate::parser::error::SyntaxError;
25use crate::syntax::Backslashed;
26use crate::syntax::Literal;
27use crate::syntax::Text;
28use crate::syntax::TextUnit;
29
30impl WordLexer<'_, '_> {
31    /// Parses a [`TextUnit`].
32    ///
33    /// This function parses a literal character, backslash-escaped character,
34    /// [dollar unit](WordLexer::dollar_unit), or
35    /// [backquote](WordLexer::backquote).
36    ///
37    /// `is_delimiter` is a function that decides if a character is a delimiter.
38    /// An unquoted character is parsed only if `is_delimiter` returns false for
39    /// it.
40    ///
41    /// `is_escapable` decides if a character can be escaped by a backslash. When
42    /// `is_escapable` returns false, the preceding backslash is considered
43    /// literal.
44    ///
45    /// If the text unit is a backquote, treatment of `\"` inside the backquote
46    /// depends on `self.context`. If it is `Text`, `\"` is an escaped
47    /// double-quote. If `Word`, `\"` is treated literally.
48    pub async fn text_unit<F, G>(
49        &mut self,
50        mut is_delimiter: F,
51        mut is_escapable: G,
52    ) -> Result<Option<TextUnit>>
53    where
54        F: FnMut(char) -> bool,
55        G: FnMut(char) -> bool,
56    {
57        self.text_unit_dyn(&mut is_delimiter, &mut is_escapable)
58            .await
59    }
60
61    /// Dynamic version of [`Self::text_unit`]
62    async fn text_unit_dyn(
63        &mut self,
64        is_delimiter: &mut dyn FnMut(char) -> bool,
65        is_escapable: &mut dyn FnMut(char) -> bool,
66    ) -> Result<Option<TextUnit>> {
67        if self.skip_if(|c| c == '\\').await? {
68            if let Some(c) = self.consume_raw_char_if_dyn(is_escapable).await? {
69                return Ok(Some(Backslashed(c)));
70            } else {
71                return Ok(Some(Literal('\\')));
72            }
73        }
74
75        if let Some(u) = self.dollar_unit().await? {
76            return Ok(Some(u));
77        }
78
79        if let Some(u) = self.backquote().await? {
80            return Ok(Some(u));
81        }
82
83        if let Some(sc) = self.consume_char_if(|c| !is_delimiter(c)).await? {
84            return Ok(Some(Literal(sc.value)));
85        }
86
87        Ok(None)
88    }
89
90    /// Like `consume_char_if_dyn`, but ignores line continuation.
91    async fn consume_raw_char_if_dyn(
92        &mut self,
93        is_escapable: &mut dyn FnMut(char) -> bool,
94    ) -> Result<Option<char>> {
95        Ok(self
96            .disable_line_continuation()
97            .consume_char_if_dyn(is_escapable)
98            .await?
99            .map(|c| c.value))
100    }
101}
102
103impl Lexer<'_> {
104    /// Parses a text, i.e., a (possibly empty) sequence of [`TextUnit`]s.
105    ///
106    /// `is_delimiter` tests if an unquoted character is a delimiter. When
107    /// `is_delimiter` returns true, the parser stops parsing and returns the
108    /// text up to the delimiter.
109    ///
110    /// `is_escapable` tests if a backslash can escape a character. When the
111    /// parser founds an unquoted backslash, the next character is passed to
112    /// `is_escapable`. If `is_escapable` returns true, the backslash is treated
113    /// as a valid escape (`TextUnit::Backslashed`). Otherwise, it ia a
114    /// literal (`TextUnit::Literal`).
115    ///
116    /// `is_escapable` also affects escaping of double-quotes inside backquotes.
117    /// See [`text_unit`](WordLexer::text_unit) for details. Note that this
118    /// function calls `text_unit` with [`WordContext::Text`].
119    pub async fn text<F, G>(&mut self, mut is_delimiter: F, mut is_escapable: G) -> Result<Text>
120    where
121        F: FnMut(char) -> bool,
122        G: FnMut(char) -> bool,
123    {
124        self.text_dyn(&mut is_delimiter, &mut is_escapable).await
125    }
126
127    /// Dynamic version of [`Self::text`]
128    async fn text_dyn(
129        &mut self,
130        is_delimiter: &mut dyn FnMut(char) -> bool,
131        is_escapable: &mut dyn FnMut(char) -> bool,
132    ) -> Result<Text> {
133        let mut units = vec![];
134
135        let mut word_lexer = WordLexer {
136            lexer: self,
137            context: WordContext::Text,
138        };
139        while let Some(unit) = word_lexer.text_unit_dyn(is_delimiter, is_escapable).await? {
140            units.push(unit);
141        }
142
143        Ok(Text(units))
144    }
145
146    /// Parses a text that may contain nested parentheses.
147    ///
148    /// This function works similarly to [`text`](Self::text). However, if an
149    /// unquoted `(` is found in the text, all text units are parsed up to the
150    /// next matching unquoted `)`. Inside the parentheses, the `is_delimiter`
151    /// function is ignored and all non-special characters are parsed as literal
152    /// word units. After finding the `)`, this function continues parsing to
153    /// find a delimiter (as per `is_delimiter`) or another parentheses.
154    ///
155    /// Nested parentheses are supported: the number of `(`s and `)`s must
156    /// match. In other words, the final delimiter is recognized only outside
157    /// outermost parentheses.
158    pub async fn text_with_parentheses<F, G>(
159        &mut self,
160        mut is_delimiter: F,
161        mut is_escapable: G,
162    ) -> Result<Text>
163    where
164        F: FnMut(char) -> bool,
165        G: FnMut(char) -> bool,
166    {
167        self.text_with_parentheses_dyn(&mut is_delimiter, &mut is_escapable)
168            .await
169    }
170
171    /// Dynamic version of [`Self::text_with_parentheses`]
172    async fn text_with_parentheses_dyn(
173        &mut self,
174        is_delimiter: &mut dyn FnMut(char) -> bool,
175        is_escapable: &mut dyn FnMut(char) -> bool,
176    ) -> Result<Text> {
177        let mut units = Vec::new();
178        let mut open_paren_locations = Vec::new();
179        loop {
180            let mut is_delimiter_or_paren = |c| {
181                if c == '(' {
182                    return true;
183                }
184                if open_paren_locations.is_empty() {
185                    is_delimiter(c)
186                } else {
187                    c == ')'
188                }
189            };
190            let next_units = self
191                .text_dyn(&mut is_delimiter_or_paren, is_escapable)
192                .await?
193                .0;
194
195            units.extend(next_units);
196
197            if let Some(sc) = self.consume_char_if(|c| c == '(').await? {
198                units.push(Literal('('));
199                open_paren_locations.push(sc.location.clone());
200            } else if let Some(opening_location) = open_paren_locations.pop() {
201                if self.skip_if(|c| c == ')').await? {
202                    units.push(Literal(')'));
203                } else {
204                    let cause = SyntaxError::UnclosedParen { opening_location }.into();
205                    let location = self.location().await?.clone();
206                    return Err(Error { cause, location });
207                }
208            } else {
209                break;
210            }
211        }
212        Ok(Text(units))
213    }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parser::error::ErrorCause;
    use crate::source::Source;
    use crate::syntax::Backquote;
    use crate::syntax::BackquoteUnit;
    use crate::syntax::CommandSubst;
    use assert_matches::assert_matches;
    use futures_util::FutureExt;

    /// Stand-in for `is_delimiter` in tests where it must never be called.
    fn unexpected_is_delimiter(c: char) -> bool {
        unreachable!("unexpected call to is_delimiter({:?})", c)
    }

    /// Stand-in for `is_escapable` in tests where it must never be called.
    fn unexpected_is_escapable(c: char) -> bool {
        unreachable!("unexpected call to is_escapable({:?})", c)
    }

    /// Builds the text units of a string made of literal characters only.
    fn literals(chars: &str) -> Vec<TextUnit> {
        chars.chars().map(Literal).collect()
    }

    #[test]
    fn lexer_text_unit_literal_accepted() {
        let mut base = Lexer::with_code("X");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let mut delimiter_tested = false;
        let unit = lexer
            .text_unit(
                |c| {
                    delimiter_tested = true;
                    assert_eq!(c, 'X');
                    false
                },
                unexpected_is_escapable,
            )
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert!(delimiter_tested);
        assert_matches!(unit, Literal('X'));

        // The character has been consumed.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_literal_rejected() {
        let mut base = Lexer::with_code(";");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let mut delimiter_tested = false;
        let unit = lexer
            .text_unit(
                |c| {
                    delimiter_tested = true;
                    assert_eq!(c, ';');
                    true
                },
                unexpected_is_escapable,
            )
            .now_or_never()
            .unwrap()
            .unwrap();
        assert!(delimiter_tested);
        assert_eq!(unit, None);

        // The delimiter is left in place.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(';')));
    }

    #[test]
    fn lexer_text_unit_backslash_accepted() {
        let mut base = Lexer::with_code(r"\#");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let mut escapable_tested = false;
        let unit = lexer
            .text_unit(unexpected_is_delimiter, |c| {
                escapable_tested = true;
                assert_eq!(c, '#');
                true
            })
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert!(escapable_tested);
        assert_eq!(unit, Backslashed('#'));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_backslash_eof() {
        let mut base = Lexer::with_code(r"\");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let unit = lexer
            .text_unit(unexpected_is_delimiter, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        // With nothing to escape, the backslash is a literal.
        assert_eq!(unit, Literal('\\'));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_backslash_line_continuation_not_recognized() {
        let mut base = Lexer::with_code("\\\\\n");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let mut escapable_tested = false;
        let unit = lexer
            .text_unit(unexpected_is_delimiter, |c| {
                escapable_tested = true;
                assert_eq!(c, '\\');
                true
            })
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert!(escapable_tested);
        assert_eq!(unit, Backslashed('\\'));

        // The second backslash was escaped rather than joined with the
        // newline, so the newline remains.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('\n')));
    }

    #[test]
    fn lexer_text_unit_dollar() {
        let mut base = Lexer::with_code("$()");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let unit = lexer
            .text_unit(unexpected_is_delimiter, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert_matches!(unit, CommandSubst { content, location } => {
            assert_eq!(&*content, "");
            assert_eq!(location.range, 0..3);
        });

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_backquote_double_quote_escapable() {
        let mut base = Lexer::with_code(r#"`\"`"#);
        let mut lexer = WordLexer {
            context: WordContext::Text,
            lexer: &mut base,
        };
        let unit = lexer
            .text_unit(unexpected_is_delimiter, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert_matches!(unit, Backquote { content, location } => {
            assert_eq!(content, [BackquoteUnit::Backslashed('"')]);
            assert_eq!(location.range, 0..4);
        });

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_backquote_double_quote_not_escapable() {
        let mut base = Lexer::with_code(r#"`\"`"#);
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let unit = lexer
            .text_unit(unexpected_is_delimiter, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert_matches!(unit, Backquote { content, location } => {
            assert_eq!(
                content,
                [BackquoteUnit::Literal('\\'), BackquoteUnit::Literal('"')]
            );
            assert_eq!(location.range, 0..4);
        });

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_unit_line_continuations() {
        let mut base = Lexer::with_code("\\\n\\\nX");
        let mut lexer = WordLexer {
            context: WordContext::Word,
            lexer: &mut base,
        };
        let unit = lexer
            .text_unit(|_| false, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap()
            .unwrap();
        assert_eq!(unit, Literal('X'));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_empty() {
        let mut lexer = Lexer::with_code("");
        let Text(units) = lexer
            .text(unexpected_is_delimiter, unexpected_is_escapable)
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, &[]);
    }

    #[test]
    fn lexer_text_nonempty() {
        let mut lexer = Lexer::with_code("abc");
        let mut called = 0;
        let Text(units) = lexer
            .text(
                |c| {
                    assert!(
                        matches!(c, 'a' | 'b' | 'c'),
                        "unexpected call to is_delimiter({c:?}), called={called}"
                    );
                    called += 1;
                    false
                },
                unexpected_is_escapable,
            )
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, literals("abc"));
        assert_eq!(called, 3);

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_delimiter() {
        let mut lexer = Lexer::with_code("abc");
        let mut called = 0;
        let Text(units) = lexer
            .text(
                |c| {
                    assert!(
                        matches!(c, 'a' | 'b' | 'c'),
                        "unexpected call to is_delimiter({c:?}), called={called}"
                    );
                    called += 1;
                    c == 'c'
                },
                unexpected_is_escapable,
            )
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, literals("ab"));
        assert_eq!(called, 3);

        // The delimiter itself is not consumed.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('c')));
    }

    #[test]
    fn lexer_text_escaping() {
        let mut lexer = Lexer::with_code(r"a\b\c");
        let mut tested_chars = String::new();
        let Text(units) = lexer
            .text(
                |_| false,
                |c| {
                    tested_chars.push(c);
                    c == 'b'
                },
            )
            .now_or_never()
            .unwrap()
            .unwrap();
        let expected = vec![Literal('a'), Backslashed('b'), Literal('\\'), Literal('c')];
        assert_eq!(units, expected);
        assert_eq!(tested_chars, "bc");

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_with_parentheses_no_parentheses() {
        let mut lexer = Lexer::with_code("abc");
        let Text(units) = lexer
            .text_with_parentheses(|_| false, |_| false)
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, literals("abc"));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(None));
    }

    #[test]
    fn lexer_text_with_parentheses_nest_1() {
        let mut lexer = Lexer::with_code("a(b)c)");
        let Text(units) = lexer
            .text_with_parentheses(|c| c == 'b' || c == ')', |_| false)
            .now_or_never()
            .unwrap()
            .unwrap();
        // `b` is a delimiter only outside the parentheses.
        assert_eq!(units, literals("a(b)c"));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(')')));
    }

    #[test]
    fn lexer_text_with_parentheses_nest_1_1() {
        let mut lexer = Lexer::with_code("ab(CD)ef(GH)ij;");
        let Text(units) = lexer
            .text_with_parentheses(|c| c.is_ascii_uppercase() || c == ';', |_| false)
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, literals("ab(CD)ef(GH)ij"));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(';')));
    }

    #[test]
    fn lexer_text_with_parentheses_nest_3() {
        let mut lexer = Lexer::with_code("a(B((C)D))e;");
        let Text(units) = lexer
            .text_with_parentheses(|c| c.is_ascii_uppercase() || c == ';', |_| false)
            .now_or_never()
            .unwrap()
            .unwrap();
        assert_eq!(units, literals("a(B((C)D))e"));

        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(';')));
    }

    #[test]
    fn lexer_text_with_parentheses_unclosed() {
        let mut lexer = Lexer::with_code("x(()");
        let error = lexer
            .text_with_parentheses(|_| false, |_| false)
            .now_or_never()
            .unwrap()
            .unwrap_err();

        // The cause points at the outer `(`, which is the one left unclosed.
        assert_matches!(error.cause,
            ErrorCause::Syntax(SyntaxError::UnclosedParen { opening_location }) => {
            assert_eq!(*opening_location.code.value.borrow(), "x(()");
            assert_eq!(opening_location.code.start_line_number.get(), 1);
            assert_eq!(*opening_location.code.source, Source::Unknown);
            assert_eq!(opening_location.range, 1..2);
        });

        // The error itself is reported at the end of the input.
        assert_eq!(*error.location.code.value.borrow(), "x(()");
        assert_eq!(error.location.code.start_line_number.get(), 1);
        assert_eq!(*error.location.code.source, Source::Unknown);
        assert_eq!(error.location.range, 4..4);
    }
}