accumulo_access/lexer.rs

// Copyright 2024 Lars Wilhelmsen <sral-backwards@sral.org>. All rights reserved.
// Use of this source code is governed by the MIT or Apache-2.0 license that can be found in the LICENSE_MIT or LICENSE_APACHE files.

use std::fmt::Display;
use std::iter::Peekable;
use std::str::Chars;
use thiserror::Error;

#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    #[allow(clippy::enum_variant_names)]
    AccessToken(String),
    OpenParen,
    CloseParen,
    And,
    Or,
}

#[derive(Debug, PartialEq, Clone)]
pub enum Operator {
    Conjunction,
    Disjunction,
}

impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Token::AccessToken(token) => write!(f, "{:?}", token),
            Token::OpenParen => write!(f, "("),
            Token::CloseParen => write!(f, ")"),
            Token::And => write!(f, "&"),
            Token::Or => write!(f, "|"),
        }
    }
}

/// `Lexer` is a lexical analyzer (tokenizer) for authorization expressions.
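///
/// # Examples
///
/// A minimal usage sketch (assuming this module is exported as
/// `accumulo_access::lexer`; the path may differ in the actual crate layout):
///
/// ```
/// use accumulo_access::lexer::{Lexer, Token};
///
/// let tokens: Result<Vec<Token>, _> = Lexer::new("label1&(label2|label3)").collect();
/// assert!(tokens.is_ok());
/// ```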
#[derive(Debug, Clone)]
pub struct Lexer<'a> {
    inner_peekable_iterator: Peekable<Chars<'a>>,
    position: usize,
}

/// Errors that can occur while tokenizing an authorization expression.
#[derive(Error, Debug, PartialEq, Clone)]
pub enum LexerError {
    UnexpectedCharacter(char, usize),
}

impl Display for LexerError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            LexerError::UnexpectedCharacter(c, position) => {
                write!(f, "Unexpected character '{}' at position {}", c, position)
            }
        }
    }
}

impl<'a> Lexer<'a> {
    /// Creates a new `Lexer` instance.
    ///
    /// # Arguments
    ///
    /// * `input` - The authorization expression to tokenize.
    pub fn new(input: &'a str) -> Self {
        let inner_peekable_iterator = input.chars().peekable();
        Lexer {
            inner_peekable_iterator,
            position: 0,
        }
    }

    /// Consumes and returns the next character, advancing the 1-based `position`.
    fn read_char(&mut self) -> Option<char> {
        let c = self.inner_peekable_iterator.next();
        if c.is_some() {
            self.position += 1;
        }
        c
    }

    /// Returns the next character without consuming it.
    fn peek_char(&mut self) -> Option<&char> {
        self.inner_peekable_iterator.peek()
    }
}

fn is_allowed_char_for_unquoted_access_token(c: char) -> bool {
    c.is_ascii_alphanumeric()
        || c == '_'
        || c == '-'
        || c == '.'
        || c == ':'
        || c == '/'
}

fn is_allowed_char_for_quoted_access_token(c: char) -> bool {
    // From SPECIFICATION.md, quoted access tokens may use:
    //   utf8-subset          = %x20-21 / %x23-5B / %x5D-7E / unicode-beyond-ascii ; utf8 minus '"' and '\'
    //   unicode-beyond-ascii = %x0080-D7FF / %xE000-10FFFF
    //
    // Note: '"' and '\' also pass this check; the caller treats them as the
    // closing quote and the escape character, respectively.
    c.is_ascii_graphic()
        || c == ' '
        || ((c as u32) >= 0x0080 && (c as u32) <= 0xD7FF)
        || ((c as u32) >= 0xE000 && (c as u32) <= 0x10FFFF)
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token, LexerError>;

    fn next(&mut self) -> Option<Self::Item> {
        let c = self.read_char()?;
        let r = match c {
            '(' => Ok(Token::OpenParen),
            ')' => Ok(Token::CloseParen),
            '&' => Ok(Token::And),
            '|' => Ok(Token::Or),
            '"' => self.handle_quoted_access_token(),
            _ if is_allowed_char_for_unquoted_access_token(c) => {
                self.handle_unquoted_access_token(c)
            }
            _ => Err(LexerError::UnexpectedCharacter(c, self.position)),
        };
        Some(r)
    }
}

impl<'a> Lexer<'a> {
    fn handle_quoted_access_token(&mut self) -> Result<Token, LexerError> {
        // The opening quote has already been consumed by `next()`.
        let mut value = String::new();
        while let Some(c) = self.read_char() {
            if !is_allowed_char_for_quoted_access_token(c) {
                return Err(LexerError::UnexpectedCharacter(c, self.position));
            }
            match c {
                '\\' => {
                    // Only the escape sequences `\"` and `\\` are accepted.
                    if let Some(next_char) = self.read_char() {
                        if next_char == '"' || next_char == '\\' {
                            value.push(next_char);
                        } else {
                            return Err(LexerError::UnexpectedCharacter(next_char, self.position));
                        }
                    }
                }
                '"' => {
                    // Closing quote terminates the token.
                    break;
                }
                _ => {
                    value.push(c);
                }
            }
        }
        Ok(Token::AccessToken(value))
    }

    fn handle_unquoted_access_token(&mut self, first_char: char) -> Result<Token, LexerError> {
        let mut value = String::new();
        value.push(first_char);
        while let Some(c) = self.peek_char() {
            if is_allowed_char_for_unquoted_access_token(*c) {
                let c = self.read_char().unwrap();
                value.push(c);
            } else {
                break;
            }
        }
        Ok(Token::AccessToken(value))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lexer_valid() {
        let input =
            "label1&\"label 🕺\"|(\"hello \\\\ \\\"world\"|label4|(label5&label6)))";
        let lexer = Lexer::new(input);
        let tokens: Vec<Result<Token, LexerError>> = lexer.collect();
        assert_eq!(
            tokens,
            vec![
                Ok(Token::AccessToken("label1".to_string())),
                Ok(Token::And),
                Ok(Token::AccessToken("label 🕺".to_string())),
                Ok(Token::Or),
                Ok(Token::OpenParen),
                Ok(Token::AccessToken("hello \\ \"world".to_string())),
                Ok(Token::Or),
                Ok(Token::AccessToken("label4".to_string())),
                Ok(Token::Or),
                Ok(Token::OpenParen),
                Ok(Token::AccessToken("label5".to_string())),
                Ok(Token::And),
                Ok(Token::AccessToken("label6".to_string())),
                Ok(Token::CloseParen),
                Ok(Token::CloseParen),
                Ok(Token::CloseParen),
            ]
        );
    }

    #[test]
    fn test_lexer_valid2() {
        let input = "\"abc!12\"&\"abc\\\\xyz\"&GHI";

        let lexer = Lexer::new(input);
        let tokens: Vec<Result<Token, LexerError>> = lexer.collect();

        assert_eq!(
            tokens,
            vec![
                Ok(Token::AccessToken("abc!12".to_string())),
                Ok(Token::And),
                Ok(Token::AccessToken("abc\\xyz".to_string())),
                Ok(Token::And),
                Ok(Token::AccessToken("GHI".to_string())),
            ]
        );
    }

    #[test]
    fn test_lexer_invalid() {
        let input = "label1 & [";
        let lexer = Lexer::new(input);
        let tokens: Vec<Result<Token, LexerError>> = lexer.collect();
        assert_eq!(
            tokens,
            vec![
                Ok(Token::AccessToken("label1".to_string())),
                Err(LexerError::UnexpectedCharacter(' ', 7)),
                Ok(Token::And),
                Err(LexerError::UnexpectedCharacter(' ', 9)),
                Err(LexerError::UnexpectedCharacter('[', 10)),
            ]
        );
    }
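
    // A small supplementary check sketching how the quoted-token character
    // ranges from SPECIFICATION.md are enforced: a tab is outside the allowed
    // set (below %x20 and not beyond ASCII), so the lexer reports it as an
    // unexpected character at its 1-based position.
    #[test]
    fn test_lexer_quoted_token_rejects_control_chars() {
        let input = "\"a\tb\"";
        let mut lexer = Lexer::new(input);
        assert_eq!(
            lexer.next(),
            Some(Err(LexerError::UnexpectedCharacter('\t', 3)))
        );
    }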
}