romulus/lex/
mod.rs

1//! A module which extracts romulus tokens out of string content
2
3#[cfg(test)]
4mod tests;
5mod utils;
6
7use utils::*;
8
///
/// Represents an individual grammar entity in romulus
///
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token<'a> {
    /// Represents (, {, [, ], }, )
    Paren(char),

    /// Represents simple symbols like ^ and $
    Symbol(char),

    /// Represents non-negative decimal numbers <br>
    /// such as `42`
    Number(i64),

    /// Represents a Regular Expression with flags in the form of /regex/flags <br>
    /// such as `/[a-z]+/i`
    ///
    /// The first field is the pattern, the second holds the flags.
    ///
    /// flags supported
    /// 1. `i` - case insensitive
    /// 2. `U` - swap greediness semantics
    Regex(String, String),

    /// Represents a comment
    ///
    /// Currently only line comments are supported <br>
    /// any characters after `#` are a part of the comment
    Comment(&'a str),

    /// Represents a variable identifier
    ///
    /// any bare characters that match `/[_a-z][_a-z0-9]*/i` <br>
    /// i.e. _the_answer_42
    Identifier(String),

    /// Represents a string
    ///
    /// single quotes may not interpolate variables, whereas double quotes
    /// may interpolate variables with a `${identifier}`
    ///
    /// such as `'some string'`, `"Ip Address: ${ip}"`
    ///
    /// The first field is the string content; the flag is `true` for
    /// double-quoted strings and `false` for single-quoted ones.
    String(String, bool),

    /// A newline character, carriage return, or semicolon
    Newline,

    /// A comma
    Comma,
}
58
59impl Token<'_> {
60    fn significant(&self) -> bool {
61        match self {
62            Token::Number(_) => true,
63            Token::Paren(_) => true,
64            Token::Regex(_, _) => true,
65            Token::Comment(_) => false,
66            Token::Identifier(_) => true,
67            Token::String(_, _) => true,
68            Token::Symbol(_) => true,
69            Token::Newline => false,
70            Token::Comma => true,
71        }
72    }
73}
74
75/// Lexes a given string and returns only significant tokens in
76/// a romulus program
77///
78/// for example newlines and comments are not significant for parsing
79/// a romulus program
80pub fn lex(buf: &str) -> Result<Vec<Token>, String> {
81    let tokens = full_lex(buf)?;
82
83    Ok(tokens
84        .into_iter()
85        .filter(|t| t.significant())
86        .collect::<Vec<Token>>())
87}
88
89/// Lexes a given string and returns all tokens found
90pub fn full_lex(buf: &str) -> Result<Vec<Token>, String> {
91    let mut tokens = Vec::new();
92    let mut it = buf.chars().enumerate().peekable();
93
94    let lower = 'a'..='z';
95    let upper = 'A'..='Z';
96    let under_score = &['_'];
97    let newline_chars = &['\n', '\r', ';'];
98    let number_chars = '0'..='9';
99    let regexflag_chars = &['i', 'U'];
100    let x = [&lower, &upper, &number_chars];
101    let ident_chars = (Multi(&x), under_score);
102
103    while let Some((start, ch)) = it.peek() {
104        let start = *start;
105        match ch {
106            '0'..='9' => {
107                let end = chomp(&number_chars, &mut it);
108                tokens.push(Token::Number(get_number(&buf[start..end])));
109            }
110
111            '{' | '[' | '(' | '}' | ']' | ')' => {
112                tokens.push(Token::Paren(*ch));
113                it.next();
114            }
115
116            ' ' | '\t' => {
117                it.next();
118            }
119
120            '\n' | '\r' | ';' => {
121                chomp(&newline_chars, &mut it);
122                tokens.push(Token::Newline);
123            }
124
125            '#' => {
126                it.next();
127                let end = chomp_until(&newline_chars, &mut it);
128                tokens.push(Token::Comment(&buf[start + 1..end]));
129            }
130
131            ',' => {
132                it.next();
133                tokens.push(Token::Comma);
134            }
135
136            '/' => {
137                it.next();
138                let chars = chomp_until_escaped(
139                    &mut it,
140                    '/',
141                    &[
142                        '{', '}', '[', ']', '.', '^', '$', '*', '+', '?', '|', '(', ')', 'd', 'D',
143                        's', 'S', 'w', 'W', 'p', 'P', 'b', 'B', 'A', 'z', 'a', 'f', 't', 'n', 'r',
144                        'v', 'x', 'u', 'U', '\\',
145                    ],
146                )?;
147                let pattern = chars;
148                if let Some((_, '/')) = it.next() {
149                } else {
150                    return Err("expected character: '/'".to_string());
151                }
152
153                let flags = chomp_str(&regexflag_chars, &mut it);
154
155                tokens.push(Token::Regex(pattern, flags));
156            }
157
158            '"' => {
159                it.next();
160                let content = chomp_until_escaped(&mut it, '"', &['$'])?;
161                it.next();
162
163                tokens.push(Token::String(content, true));
164            }
165
166            '\'' => {
167                it.next();
168                let content = chomp_until_escaped(&mut it, '\'', &[])?;
169                it.next();
170
171                tokens.push(Token::String(content, false));
172            }
173
174            '_' | 'a'..='z' | 'A'..='Z' => {
175                let content = chomp_str(&ident_chars, &mut it);
176
177                tokens.push(Token::Identifier(content));
178            }
179
180            '^' | '$' | '!' | '&' | '|' => {
181                tokens.push(Token::Symbol(*ch));
182                it.next();
183            }
184
185            a => {
186                return Err(format!("unknown character: '{}'", a));
187            }
188        }
189    }
190
191    Ok(tokens)
192}