yash_syntax/parser/lex/
token.rs

1// This file is part of yash, an extended POSIX shell.
2// Copyright (C) 2020 WATANABE Yuki
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! Part of the lexer that parses tokens
18
19use super::core::Lexer;
20use super::core::Token;
21use super::core::TokenId;
22use super::core::WordContext;
23use super::core::WordLexer;
24use super::core::is_blank;
25use super::op::is_operator_char;
26use crate::parser::core::Result;
27use crate::syntax::MaybeLiteral;
28use crate::syntax::TextUnit;
29use crate::syntax::Word;
30use crate::syntax::WordUnit;
31
32/// Checks whether the given character delimits a token.
33///
34/// Token delimiters are blank characters and [operator](is_operator_char) characters.
35pub fn is_token_delimiter_char(c: char) -> bool {
36    is_blank(c) || is_operator_char(c)
37}
38
39impl Lexer<'_> {
40    /// Classifies a word, determining its token ID.
41    ///
42    /// This is a helper function used by [`Lexer::token`]. Operator tokens
43    /// are not handled here; see [`Lexer::operator`].
44    async fn token_id(&mut self, word: &Word) -> Result<TokenId> {
45        // A word with no units means there was nothing left to parse.
46        if word.units.is_empty() {
47            return Ok(TokenId::EndOfInput);
48        }
49
50        if let Some(text) = word.to_string_if_literal() {
51            // A fully literal word may be a reserved word (keyword).
52            if let Ok(keyword) = text.parse() {
53                return Ok(TokenId::Token(Some(keyword)));
54            }
55
56            // A literal word of digits immediately preceding `<` or `>` is
57            // an IO_NUMBER. The peek is only awaited for all-digit words.
58            let all_digits = text.chars().all(|c| c.is_ascii_digit());
59            if all_digits && matches!(self.peek_char().await?, Some('<' | '>')) {
60                return Ok(TokenId::IoNumber);
61            }
62        }
63
64        // A brace-enclosed word immediately preceding `<` or `>` is an
65        // IO_LOCATION.
66        let opens_with_brace =
67            word.units.first() == Some(&WordUnit::Unquoted(TextUnit::Literal('{')));
68        if opens_with_brace {
69            let closes_with_brace = match word.units.last() {
70                // Require at least one unit between the braces so that a bare
71                // `{}` (or a lone `{`) does not count.
72                Some(WordUnit::Unquoted(TextUnit::Literal('}'))) => word.units.len() >= 3,
73                Some(WordUnit::Unquoted(TextUnit::Backslashed('}'))) => true,
74                Some(WordUnit::Unquoted(TextUnit::BracedParam(_))) => true,
75                _ => false,
76            };
77            if closes_with_brace && matches!(self.peek_char().await?, Some('<' | '>')) {
78                return Ok(TokenId::IoLocation);
79            }
80        }
81
82        Ok(TokenId::Token(None))
83    }
84
85    /// Parses a token.
86    ///
87    /// When no more token can be parsed, the result is a token with an empty
88    /// word and the [`EndOfInput`](TokenId::EndOfInput) token identifier.
89    pub async fn token(&mut self) -> Result<Token> {
90        // Operators take precedence over ordinary words.
91        if let Some(op) = self.operator().await? {
92            return Ok(op);
93        }
94
95        // Remember where the token starts before consuming any characters.
96        let index = self.index();
97
98        let mut delegate = WordLexer {
99            lexer: self,
100            context: WordContext::Word,
101        };
102        let mut word = delegate.word(is_token_delimiter_char).await?;
103        word.parse_tilde_front();
104
105        let id = self.token_id(&word).await?;
106        Ok(Token { word, id, index })
107    }
108}
102
103#[cfg(test)]
104mod tests {
105    use super::*;
106    use crate::source::Source;
107    use futures_util::FutureExt as _;

    // The lexer futures below complete immediately because the whole code is
    // already in memory, so `now_or_never` resolves them synchronously.

109    #[test]
110    fn lexer_token_empty() {
111        // If there's no word unit that can be parsed, it is the end of input.
112        let mut lexer = Lexer::with_code("");

114        let t = lexer.token().now_or_never().unwrap().unwrap();
115        assert_eq!(*t.word.location.code.value.borrow(), "");
116        assert_eq!(t.word.location.code.start_line_number.get(), 1);
117        assert_eq!(*t.word.location.code.source, Source::Unknown);
118        assert_eq!(t.word.location.range, 0..0);
119        assert_eq!(t.id, TokenId::EndOfInput);
120        assert_eq!(t.index, 0);
121    }

123    #[test]
124    fn lexer_token_non_empty() {
        // A literal word delimited by a blank yields a plain token; the
        // delimiting blank itself is not consumed.
125        let mut lexer = Lexer::with_code("abc ");

127        let t = lexer.token().now_or_never().unwrap().unwrap();
128        assert_eq!(t.word.units.len(), 3);
129        assert_eq!(t.word.units[0], WordUnit::Unquoted(TextUnit::Literal('a')));
130        assert_eq!(t.word.units[1], WordUnit::Unquoted(TextUnit::Literal('b')));
131        assert_eq!(t.word.units[2], WordUnit::Unquoted(TextUnit::Literal('c')));
132        assert_eq!(*t.word.location.code.value.borrow(), "abc ");
133        assert_eq!(t.word.location.code.start_line_number.get(), 1);
134        assert_eq!(*t.word.location.code.source, Source::Unknown);
135        assert_eq!(t.word.location.range, 0..3);
136        assert_eq!(t.id, TokenId::Token(None));
137        assert_eq!(t.index, 0);

        // The trailing blank is left in the input.
139        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(' ')));
140    }

142    #[test]
143    fn lexer_token_tilde() {
        // `token` applies tilde expansion parsing to the front of the word.
144        let mut lexer = Lexer::with_code("~a:~");

146        let t = lexer.token().now_or_never().unwrap().unwrap();
147        assert_eq!(
148            t.word.units,
149            [WordUnit::Tilde {
150                name: "a:~".to_string(),
151                followed_by_slash: false
152            }]
153        );
154    }

156    #[test]
157    fn lexer_token_io_number_delimited_by_less() {
        // An all-digit word directly followed by `<` is an IO_NUMBER.
158        let mut lexer = Lexer::with_code("12<");

160        let t = lexer.token().now_or_never().unwrap().unwrap();
161        assert_eq!(t.word.units.len(), 2);
162        assert_eq!(t.word.units[0], WordUnit::Unquoted(TextUnit::Literal('1')));
163        assert_eq!(t.word.units[1], WordUnit::Unquoted(TextUnit::Literal('2')));
164        assert_eq!(*t.word.location.code.value.borrow(), "12<");
165        assert_eq!(t.word.location.code.start_line_number.get(), 1);
166        assert_eq!(*t.word.location.code.source, Source::Unknown);
167        assert_eq!(t.word.location.range, 0..2);
168        assert_eq!(t.id, TokenId::IoNumber);
169        assert_eq!(t.index, 0);

        // The delimiting `<` is not consumed.
171        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
172    }

174    #[test]
175    fn lexer_token_io_number_delimited_by_greater() {
        // An all-digit word directly followed by `>` is also an IO_NUMBER.
176        let mut lexer = Lexer::with_code("0>>");

178        let t = lexer.token().now_or_never().unwrap().unwrap();
179        assert_eq!(t.word.units.len(), 1);
180        assert_eq!(t.word.units[0], WordUnit::Unquoted(TextUnit::Literal('0')));
181        assert_eq!(*t.word.location.code.value.borrow(), "0>>");
182        assert_eq!(t.word.location.code.start_line_number.get(), 1);
183        assert_eq!(*t.word.location.code.source, Source::Unknown);
184        assert_eq!(t.word.location.range, 0..1);
185        assert_eq!(t.id, TokenId::IoNumber);
186        assert_eq!(t.index, 0);

        // The lexer position stops right before the `>>` operator.
188        assert_eq!(
189            lexer.location().now_or_never().unwrap().unwrap().range,
190            1..2
191        );
192    }

194    #[test]
195    fn lexer_token_digit_not_followed_by_less_or_greater() {
        // Digits not followed by `<` or `>` are an ordinary token, not an
        // IO_NUMBER.
196        let mut lexer = Lexer::with_code("12;");

198        let t = lexer.token().now_or_never().unwrap().unwrap();
199        assert_eq!(t.word.units.len(), 2);
200        assert_eq!(t.word.units[0], WordUnit::Unquoted(TextUnit::Literal('1')));
201        assert_eq!(t.word.units[1], WordUnit::Unquoted(TextUnit::Literal('2')));
202        assert_eq!(*t.word.location.code.value.borrow(), "12;");
203        assert_eq!(t.word.location.code.start_line_number.get(), 1);
204        assert_eq!(*t.word.location.code.source, Source::Unknown);
205        assert_eq!(t.word.location.range, 0..2);
206        assert_eq!(t.id, TokenId::Token(None));
207        assert_eq!(t.index, 0);

209        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(';')));
210    }

212    #[test]
213    fn lexer_token_io_location_delimited_by_less() {
        // A brace-enclosed word directly followed by `<` is an IO_LOCATION.
214        let mut lexer = Lexer::with_code("{n}<");

216        let t = lexer.token().now_or_never().unwrap().unwrap();
217        assert_eq!(t.word.units.len(), 3);
218        assert_eq!(t.word.units[0], WordUnit::Unquoted(TextUnit::Literal('{')));
219        assert_eq!(t.word.units[1], WordUnit::Unquoted(TextUnit::Literal('n')));
220        assert_eq!(t.word.units[2], WordUnit::Unquoted(TextUnit::Literal('}')));
221        assert_eq!(*t.word.location.code.value.borrow(), "{n}<");
222        assert_eq!(t.word.location.code.start_line_number.get(), 1);
223        assert_eq!(*t.word.location.code.source, Source::Unknown);
224        assert_eq!(t.word.location.range, 0..3);
225        assert_eq!(t.id, TokenId::IoLocation);
226        assert_eq!(t.index, 0);

228        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
229    }

231    #[test]
232    fn lexer_token_io_location_delimited_by_greater() {
        // IO_LOCATION is also recognized before `>`.
233        let mut lexer = Lexer::with_code("{n}>");

235        let t = lexer.token().now_or_never().unwrap().unwrap();
236        assert_eq!(t.id, TokenId::IoLocation);

238        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('>')));
239    }

241    #[test]
242    fn lexer_token_io_location_ending_with_backslashed_brace() {
        // A backslash-escaped `}` also counts as the closing brace.
243        let mut lexer = Lexer::with_code(r"{\}<");

245        let t = lexer.token().now_or_never().unwrap().unwrap();
246        assert_eq!(t.id, TokenId::IoLocation);

248        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
249    }

251    #[test]
252    fn lexer_token_io_location_ending_with_braced_parameter() {
        // A braced parameter expansion (`${n}`) supplies the closing brace.
253        let mut lexer = Lexer::with_code("{${n}<");

255        let t = lexer.token().now_or_never().unwrap().unwrap();
256        assert_eq!(t.id, TokenId::IoLocation);

258        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
259    }

261    #[test]
262    fn lexer_token_empty_braces_followed_by_less() {
        // `{}` has nothing between the braces, so it is not an IO_LOCATION.
263        let mut lexer = Lexer::with_code("{}<");

265        let t = lexer.token().now_or_never().unwrap().unwrap();
266        assert_eq!(t.id, TokenId::Token(None));

268        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
269    }

271    #[test]
272    fn lexer_token_braced_word_not_followed_by_less_or_greater() {
        // Without a following `<` or `>`, a braced word is an ordinary token.
273        let mut lexer = Lexer::with_code("{n};");

275        let t = lexer.token().now_or_never().unwrap().unwrap();
276        assert_eq!(t.id, TokenId::Token(None));

278        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(';')));
279    }

281    #[test]
282    fn lexer_token_after_blank() {
        // `token` starts at the current position; blanks must be skipped by
        // the caller, and `index`/`range` reflect the skipped position.
283        let mut lexer = Lexer::with_code(" a  ");

285        lexer.skip_blanks().now_or_never().unwrap().unwrap();
286        let t = lexer.token().now_or_never().unwrap().unwrap();
287        assert_eq!(*t.word.location.code.value.borrow(), " a  ");
288        assert_eq!(t.word.location.code.start_line_number.get(), 1);
289        assert_eq!(*t.word.location.code.source, Source::Unknown);
290        assert_eq!(t.word.location.range, 1..2);
291        assert_eq!(t.id, TokenId::Token(None));
292        assert_eq!(t.index, 1);

        // After the last word, the result is an end-of-input token at the
        // position past the trailing blanks.
294        lexer.skip_blanks().now_or_never().unwrap().unwrap();
295        let t = lexer.token().now_or_never().unwrap().unwrap();
296        assert_eq!(*t.word.location.code.value.borrow(), " a  ");
297        assert_eq!(t.word.location.code.start_line_number.get(), 1);
298        assert_eq!(*t.word.location.code.source, Source::Unknown);
299        assert_eq!(t.word.location.range, 4..4);
300        assert_eq!(t.id, TokenId::EndOfInput);
301        assert_eq!(t.index, 4);
302    }
303}
303}