// dalia/lexer.rs

use std::borrow::Cow;
use std::fmt::Formatter;

/// Human-readable names for each token kind, indexed by the `TOKEN_*`
/// constants below; index 0 ("n/a") is the placeholder for unknown kinds.
const TOKEN_NAMES: [&str; 7] = ["n/a", "<EOF>", "LBRACK", "RBRACK", "ALIAS", "PATH", "GLOB"];

/// End of input.
pub const TOKEN_EOF: i32 = 1;
/// Opening bracket `[` of an alias section.
pub const TOKEN_LBRACK: i32 = 2;
/// Closing bracket `]` of an alias section.
pub const TOKEN_RBRACK: i32 = 3;
/// An alias name (ASCII alphanumerics, `_`, `-`).
pub const TOKEN_ALIAS: i32 = 4;
/// A path: everything up to the end of the line.
pub const TOKEN_PATH: i32 = 5;
/// The glob alias marker `*`.
pub const TOKEN_GLOB: i32 = 6;

/// Sentinel character marking end of input. `!0` is inferred as `u8` by the
/// cast, so this is `'\u{ff}'`.
const EOF: char = !0 as char;

const UNDERSCORE: char = '_';
const HYPHEN: char = '-';
const ASTERISK: char = '*';

/// Token identifies a text and the kind of token it represents.
#[derive(Debug, Eq, PartialEq)]
pub struct Token<'a> {
    /// The specific atom this token represents (one of the `TOKEN_*` constants).
    pub kind: i32,
    /// The particular text associated with this token when it was parsed.
    pub text: Cow<'a, String>,
}

impl<'a> Token<'a> {
    /// Constructs a token from a kind constant and its source text.
    pub fn new(kind: i32, text: Cow<'a, String>) -> Self {
        Self { kind, text }
    }
}

impl<'a> std::fmt::Display for Token<'a> {
    /// Formats as `<'text', KIND_NAME>`.
    ///
    /// Uses checked indexing so an out-of-range or negative `kind` prints the
    /// "n/a" placeholder instead of panicking (the previous
    /// `TOKEN_NAMES[self.kind as usize]` would panic, and a negative `kind`
    /// cast to `usize` wraps to a huge index).
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let name = usize::try_from(self.kind)
            .ok()
            .and_then(|i| TOKEN_NAMES.get(i))
            .unwrap_or(&"n/a");
        write!(f, "<'{}', {}>", self.text, name)
    }
}

40/// Cursor allows traversing through an input String character by character while lexing.
41#[derive(Debug)]
42pub struct Cursor {
43    /// The input String being processed.
44    input: String,
45    /// A pointer to the current character.
46    pointer: usize,
47    /// The current character being processed.
48    current_char: char,
49}
50
51impl Cursor {
52    /// Constructs a new Cursor.
53    fn new(input: &str, pointer: usize, c: char) -> Self {
54        Self {
55            input: input.to_string(),
56            pointer,
57            current_char: c,
58        }
59    }
60
61    /// Consumes one character moving forward and detects "end of file".
62    fn consume(&mut self) {
63        self.pointer += 1;
64        if self.pointer >= self.input.len() {
65            self.current_char = EOF;
66        } else if let Some(c) = self.input.chars().nth(self.pointer) {
67            self.current_char = c
68        }
69    }
70}
71
72/// Creates and identifies tokens using the underlying cursor.
73#[derive(Debug)]
74pub struct Lexer<'a> {
75    pub cursor: Cursor,
76    token_names: Vec<&'a str>,
77}
78
79impl<'a> Lexer<'a> {
80    pub fn new(input: &str, pointer: usize, c: char) -> Self {
81        Self {
82            cursor: Cursor::new(input, pointer, c),
83            token_names: Vec::from(TOKEN_NAMES),
84        }
85    }
86
87    pub fn token_names(&self, i: usize) -> String {
88        self.token_names[i].to_string()
89    }
90
91    fn is_not_end_line(&self) -> bool {
92        !matches!(self.cursor.current_char, '\u{ff}' | '\0' | '\n')
93    }
94
95    fn is_alias_name(&self) -> bool {
96        self.cursor.current_char.is_ascii_alphanumeric()
97            || self.cursor.current_char == UNDERSCORE
98            || self.cursor.current_char == HYPHEN
99    }
100
101    fn is_glob_alias(&self) -> bool {
102        self.cursor.current_char == ASTERISK
103    }
104
105    pub fn next_token(&mut self) -> Result<Token<'a>, String> {
106        while self.cursor.current_char != EOF {
107            match self.cursor.current_char {
108                ' ' | '\t' | '\n' | '\r' => {
109                    self.whitespace();
110                    continue;
111                }
112                '[' => {
113                    self.cursor.consume();
114                    return Ok(Token::new(TOKEN_LBRACK, Cow::Owned("[".into())));
115                }
116                ']' => {
117                    self.cursor.consume();
118                    return Ok(Token::new(TOKEN_RBRACK, Cow::Owned("]".into())));
119                }
120                _ => {
121                    if self.is_alias_name() {
122                        return Ok(self.alias());
123                    } else if self.is_glob_alias() {
124                        return Ok(self.glob());
125                    } else if self.is_not_end_line() {
126                        return Ok(self.path());
127                    }
128                    return Err(format!("invalid character {}", self.cursor.current_char));
129                }
130            }
131        }
132
133        Ok(Token::new(TOKEN_EOF, Cow::Owned("<EOF>".into())))
134    }
135
136    fn whitespace(&mut self) {
137        while self.cursor.current_char.is_whitespace() {
138            self.cursor.consume()
139        }
140    }
141
142    fn alias(&mut self) -> crate::lexer::Token<'a> {
143        let mut a: String = String::new();
144        while self.is_alias_name() {
145            a.push(self.cursor.current_char);
146            self.cursor.consume();
147        }
148        Token::new(TOKEN_ALIAS, Cow::Owned(a))
149    }
150
151    fn glob(&mut self) -> crate::lexer::Token<'a> {
152        let mut a: String = String::new();
153        a.push(self.cursor.current_char);
154        self.cursor.consume();
155        Token::new(TOKEN_GLOB, Cow::Owned(a))
156    }
157
158    fn path(&mut self) -> crate::lexer::Token<'a> {
159        let mut p = String::new();
160        while self.is_not_end_line() {
161            p.push(self.cursor.current_char);
162            self.cursor.consume();
163        }
164        Token::new(TOKEN_PATH, Cow::Owned(p))
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_token_display() {
        let tok = Token::new(TOKEN_EOF, Cow::Owned("<EOF>".into()));
        assert_eq!("<'<EOF>', <EOF>>", tok.to_string())
    }

    #[test]
    fn test_create_cursor() {
        let cur = Cursor::new("", 0, !0 as char);
        assert_eq!("".to_string(), cur.input);
        assert_eq!(0, cur.pointer);
        assert_eq!(!0 as char, cur.current_char);
    }

    #[test]
    fn test_cursor_consumes_characters() {
        let mut cur = Cursor::new("test", 0, 'e');
        cur.consume();
        assert_eq!("test".to_string(), cur.input);
        assert_eq!(1, cur.pointer);
        assert_eq!('e', cur.current_char);
    }

    #[test]
    fn test_cursor_consumes_end_of_file() {
        let mut cur = Cursor::new("test", 4, 't');
        cur.consume();
        assert_eq!("test".to_string(), cur.input);
        assert_eq!(5, cur.pointer);
        assert_eq!(!0 as char, cur.current_char);
    }

    #[test]
    fn test_lexer_gets_token_name() {
        let lexer = Lexer::new("test", 0, 't');
        let token_name = lexer.token_names(2);
        assert_eq!(TOKEN_NAMES[2], token_name);
    }

    // Renamed from test_lexer_detects_line_feed_character: the input is a
    // NUL byte, not LINE FEED — the test checks end-of-line detection.
    #[test]
    fn test_lexer_detects_end_of_line_character() {
        let lexer = Lexer::new("\0", 0, '\0');
        assert!(
            !lexer.is_not_end_line(),
            "current character was not an end-of-line character"
        );
    }

    #[test]
    fn test_lexer_does_not_detect_regular_character_as_end_of_line() {
        let lexer = Lexer::new("test", 0, 't');
        assert!(
            lexer.is_not_end_line(),
            "current character was an end-of-line character"
        );
    }

    #[test]
    fn test_lexer_consumes_whitespace() {
        let mut lexer = Lexer::new("   test", 0, ' ');
        lexer.whitespace();
        assert_eq!('t', lexer.cursor.current_char);
    }

    // "alis" typo fixed in these two test names.
    #[test]
    fn test_lexer_can_check_is_alias_name() {
        let lexer = Lexer::new("test0123", 0, 't');
        assert!(lexer.is_alias_name());
    }

    #[test]
    fn test_lexer_can_check_is_alias_name_fails() {
        let lexer = Lexer::new("*", 0, '*');
        assert!(!lexer.is_alias_name());
    }

    #[test]
    fn test_lexer_creates_alias_token() {
        let mut lexer = Lexer::new("alias", 0, 'a');
        let token = lexer.alias();
        assert_eq!(TOKEN_ALIAS, token.kind);
        assert_eq!("alias", token.text.as_str());
    }

    #[test]
    fn test_lexer_creates_path_token() {
        let mut lexer = Lexer::new("/some/absolute/path", 0, '/');
        let token = lexer.path();
        assert_eq!(TOKEN_PATH, token.kind);
        assert_eq!("/some/absolute/path", token.text.as_str());
    }

    #[test]
    fn test_lexer_next_token() {
        let input = r#"[test]/some/absolute/path
        /another/absolute/path
        "#;
        let mut lexer = Lexer::new(input, 0, '[');
        let mut tokens: Vec<Token> = Vec::new();
        while let Ok(t) = lexer.next_token() {
            if t.kind == TOKEN_EOF {
                break;
            }
            tokens.push(t);
        }
        assert_eq!(Token::new(TOKEN_LBRACK, Cow::Owned("[".into())), tokens[0]);
        assert_eq!(
            Token::new(TOKEN_ALIAS, Cow::Owned("test".into())),
            tokens[1]
        );
        assert_eq!(Token::new(TOKEN_RBRACK, Cow::Owned("]".into())), tokens[2]);
        assert_eq!(
            Token::new(TOKEN_PATH, Cow::Owned("/some/absolute/path".into())),
            tokens[3]
        );
        assert_eq!(
            Token::new(TOKEN_PATH, Cow::Owned("/another/absolute/path".into())),
            tokens[4]
        );
    }

    #[test]
    fn test_lexer_parses_path_without_initial_slash() {
        let input = "some/absolute/path";
        let mut lexer = Lexer::new(input, 0, 's');
        let mut tokens: Vec<Token> = Vec::new();
        while let Ok(t) = lexer.next_token() {
            if t.kind == TOKEN_EOF {
                break;
            }
            tokens.push(t);
        }
        assert!(!tokens.is_empty());
        assert_eq!(2, tokens.len())
    }

    #[test]
    fn test_lexer_parses_glob() {
        let input = "[*]/some/absolute/path";
        let mut lexer = Lexer::new(input, 0, '[');
        let mut tokens: Vec<Token> = Vec::new();
        while let Ok(t) = lexer.next_token() {
            if t.kind == TOKEN_EOF {
                break;
            }
            tokens.push(t);
        }
        assert_eq!(Token::new(TOKEN_LBRACK, Cow::Owned("[".into())), tokens[0]);
        assert_eq!(Token::new(TOKEN_GLOB, Cow::Owned("*".into())), tokens[1]);
        assert_eq!(Token::new(TOKEN_RBRACK, Cow::Owned("]".into())), tokens[2]);
        assert_eq!(
            Token::new(TOKEN_PATH, Cow::Owned("/some/absolute/path".into())),
            tokens[3]
        );
    }
}