yash_syntax/parser/lex/
token.rs

1// This file is part of yash, an extended POSIX shell.
2// Copyright (C) 2020 WATANABE Yuki
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
//! Part of the lexer that parses tokens
18
19use super::core::Lexer;
20use super::core::Token;
21use super::core::TokenId;
22use super::core::WordContext;
23use super::core::WordLexer;
24use super::core::is_blank;
25use super::op::is_operator_char;
26use crate::parser::core::Result;
27use crate::syntax::MaybeLiteral;
28use crate::syntax::Word;
29
30/// Tests whether the given character is a token delimiter.
31///
32/// A character is a token delimiter if it is either a whitespace or [operator](is_operator_char).
33pub fn is_token_delimiter_char(c: char) -> bool {
34    is_operator_char(c) || is_blank(c)
35}
36
37impl Lexer<'_> {
38    /// Determines the token ID for the word.
39    ///
40    /// This is a helper function used by [`Lexer::token`] and does not support
41    /// operators.
42    async fn token_id(&mut self, word: &Word) -> Result<TokenId> {
43        if word.units.is_empty() {
44            return Ok(TokenId::EndOfInput);
45        }
46
47        if let Some(literal) = word.to_string_if_literal() {
48            if let Ok(keyword) = literal.parse() {
49                return Ok(TokenId::Token(Some(keyword)));
50            }
51
52            if literal.chars().all(|c| c.is_ascii_digit()) {
53                if let Some(next) = self.peek_char().await? {
54                    if next == '<' || next == '>' {
55                        return Ok(TokenId::IoNumber);
56                    }
57                }
58            }
59        }
60
61        Ok(TokenId::Token(None))
62    }
63
64    /// Parses a token.
65    ///
66    /// If there is no more token that can be parsed, the result is a token with an empty word and
67    /// [`EndOfInput`](TokenId::EndOfInput) token identifier.
68    pub async fn token(&mut self) -> Result<Token> {
69        if let Some(op) = self.operator().await? {
70            return Ok(op);
71        }
72
73        let index = self.index();
74
75        let mut word_lexer = WordLexer {
76            lexer: self,
77            context: WordContext::Word,
78        };
79        let mut word = word_lexer.word(is_token_delimiter_char).await?;
80        word.parse_tilde_front();
81
82        let id = self.token_id(&word).await?;
83
84        Ok(Token { word, id, index })
85    }
86}
87
#[cfg(test)]
mod tests {
    use super::*;
    use crate::source::Source;
    use crate::syntax::TextUnit;
    use crate::syntax::WordUnit;
    use futures_util::FutureExt;

    /// Asserts the invariants shared by every token parsed from a one-line,
    /// unknown-source code fragment: the recorded code text, the starting line
    /// number, and the source kind.
    fn assert_source_code(tok: &Token, code: &str) {
        assert_eq!(*tok.word.location.code.value.borrow(), code);
        assert_eq!(tok.word.location.code.start_line_number.get(), 1);
        assert_eq!(*tok.word.location.code.source, Source::Unknown);
    }

    #[test]
    fn lexer_token_empty() {
        // If there's no word unit that can be parsed, it is the end of input.
        let mut lexer = Lexer::with_code("");

        let tok = lexer.token().now_or_never().unwrap().unwrap();
        assert_source_code(&tok, "");
        assert_eq!(tok.word.location.range, 0..0);
        assert_eq!(tok.id, TokenId::EndOfInput);
        assert_eq!(tok.index, 0);
    }

    #[test]
    fn lexer_token_non_empty() {
        let mut lexer = Lexer::with_code("abc ");

        let tok = lexer.token().now_or_never().unwrap().unwrap();
        let expected = [
            WordUnit::Unquoted(TextUnit::Literal('a')),
            WordUnit::Unquoted(TextUnit::Literal('b')),
            WordUnit::Unquoted(TextUnit::Literal('c')),
        ];
        assert_eq!(tok.word.units, expected);
        assert_source_code(&tok, "abc ");
        assert_eq!(tok.word.location.range, 0..3);
        assert_eq!(tok.id, TokenId::Token(None));
        assert_eq!(tok.index, 0);

        // The delimiting blank must remain unconsumed.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some(' ')));
    }

    #[test]
    fn lexer_token_tilde() {
        let mut lexer = Lexer::with_code("~a:~");

        let tok = lexer.token().now_or_never().unwrap().unwrap();
        assert_eq!(tok.word.units, [WordUnit::Tilde("a:~".to_string())]);
    }

    #[test]
    fn lexer_token_io_number_delimited_by_less() {
        let mut lexer = Lexer::with_code("12<");

        let tok = lexer.token().now_or_never().unwrap().unwrap();
        let expected = [
            WordUnit::Unquoted(TextUnit::Literal('1')),
            WordUnit::Unquoted(TextUnit::Literal('2')),
        ];
        assert_eq!(tok.word.units, expected);
        assert_source_code(&tok, "12<");
        assert_eq!(tok.word.location.range, 0..2);
        assert_eq!(tok.id, TokenId::IoNumber);
        assert_eq!(tok.index, 0);

        // The `<` that made this an I/O number is still pending.
        assert_eq!(lexer.peek_char().now_or_never().unwrap(), Ok(Some('<')));
    }

    #[test]
    fn lexer_token_io_number_delimited_by_greater() {
        let mut lexer = Lexer::with_code("0>>");

        let tok = lexer.token().now_or_never().unwrap().unwrap();
        assert_eq!(tok.word.units, [WordUnit::Unquoted(TextUnit::Literal('0'))]);
        assert_source_code(&tok, "0>>");
        assert_eq!(tok.word.location.range, 0..1);
        assert_eq!(tok.id, TokenId::IoNumber);
        assert_eq!(tok.index, 0);

        // The lexer should be positioned just after the digit.
        let location = lexer.location().now_or_never().unwrap().unwrap();
        assert_eq!(location.range, 1..2);
    }

    #[test]
    fn lexer_token_after_blank() {
        let mut lexer = Lexer::with_code(" a  ");

        // First token: the word after the leading blank.
        lexer.skip_blanks().now_or_never().unwrap().unwrap();
        let tok = lexer.token().now_or_never().unwrap().unwrap();
        assert_source_code(&tok, " a  ");
        assert_eq!(tok.word.location.range, 1..2);
        assert_eq!(tok.id, TokenId::Token(None));
        assert_eq!(tok.index, 1);

        // Second token: end of input after the trailing blanks.
        lexer.skip_blanks().now_or_never().unwrap().unwrap();
        let tok = lexer.token().now_or_never().unwrap().unwrap();
        assert_source_code(&tok, " a  ");
        assert_eq!(tok.word.location.range, 4..4);
        assert_eq!(tok.id, TokenId::EndOfInput);
        assert_eq!(tok.index, 4);
    }
}
197}