runtime_lexer/
lib.rs

//! A simple lexer
//!
//! A lexer built on the `regex` crate. Patterns are matched at the current
//! input position; when several patterns match, the longest match wins.
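//!
//! A minimal usage sketch (marked `ignore` since it is illustrative only;
//! it assumes the crate is named `runtime_lexer` and defines a hypothetical
//! token type `Tok`):
//!
//! ```ignore
//! use runtime_lexer::lexer::LexerBuilder;
//!
//! enum Tok { Int(i32) }
//!
//! let mut lexer = LexerBuilder::<Tok>::new()
//!     .push(r"\d+", |s: &str| Tok::Int(s.parse().unwrap()))
//!     .build();
//!
//! lexer.init(String::from("1 2 3"));
//! while let Some(tok) = lexer.tok(true) {
//!     // handle `tok`
//! }
//! ```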

#![warn(missing_docs)]

/// Contains the main lexer
pub mod lexer {
    use lazy_static::lazy_static;
    use regex::{Regex, RegexSet};

    /// Represents a lexer action, mapping a regex pattern to a `TokenType`
    #[derive(Clone)]
    pub struct LexAction<'s, TokenType> {
        /// Regex pattern for the token
        pub token:  &'s str,
        /// Function converting the matched `&str` to a `TokenType`
        pub action: fn(&str) -> TokenType,
    }

    /// Struct used to generate a [Lexer]
    ///
    /// It can either be initialised with an array of [LexAction]s, or using the
    /// [push](LexerBuilder::push) method (recommended).
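    ///
    /// A sketch of the array route (`Tok` is a hypothetical token type;
    /// marked `ignore` since it is illustrative only):
    ///
    /// ```ignore
    /// let lexer = LexerBuilder {
    ///     actions: [LexAction { token: r"\d+", action: |s: &str| Tok::Int(s.parse().unwrap()) }].to_vec(),
    /// }.build();
    /// ```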
    #[derive(Default)]
    pub struct LexerBuilder<'s, TokenType> {
        /// The token patterns and conversion functions used by the resulting [Lexer]
        pub actions: Vec<LexAction<'s, TokenType>>,
    }

    /// Represents a finished Lexer
    pub struct Lexer<TokenType> {
        /// Set of all anchored patterns, used to find which tokens match
        regex_set: RegexSet,
        /// The individual anchored patterns, used to measure match lengths
        regexes: Vec<Regex>,
        /// Conversion functions, indexed in parallel with `regexes`
        actions: Vec<fn(&str) -> TokenType>,
        /// The input currently being lexed
        data: String,
        /// Byte offset of the next unread character in `data`
        curr_pos: usize,
    }

    impl<'s, TokenType> LexerBuilder<'s, TokenType> {
        /// Returns an empty LexerBuilder
        pub fn new() -> Self {
            LexerBuilder { actions: Vec::new() }
        }

        /// Adds a new token to the LexerBuilder
        ///
        /// `token` is the regex pattern for the token.  
        /// `action` is a function converting the matched `&str` to a `TokenType`.
        pub fn push(&mut self, token: &'s str, action: fn(&str) -> TokenType) -> &mut Self {
            self.actions.push(LexAction { token, action });
            self
        }

        /// Builds a new [Lexer] from the actions configured in the builder
        pub fn build(&self) -> Lexer<TokenType> {
            // Anchor every pattern with `^` so it can only match at the
            // current input position.
            Lexer {
                regex_set: RegexSet::new(self.actions.iter().map(|a| format!("^{}", a.token))).unwrap(),
                regexes: self.actions.iter().map(|a| Regex::new(&format!("^{}", a.token)).unwrap()).collect(),
                actions: self.actions.iter().map(|a| a.action).collect(),
                data: String::new(),
                curr_pos: 0,
            }
        }
    }

    impl<TokenType> Lexer<TokenType> {
        /// Resets the lexer to the starting state with new input data
        pub fn init(&mut self, data: String) {
            self.data = data;
            self.curr_pos = 0;
        }

        /// Returns the next token, or `None` if no pattern matches at the
        /// current position
        ///
        /// If `skip_ws` is true, leading whitespace is skipped first. When
        /// several patterns match, the longest match wins; ties go to the
        /// pattern registered first.
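        ///
        /// A sketch of the `skip_ws` flag (marked `ignore` since it is
        /// illustrative only):
        ///
        /// ```ignore
        /// lexer.init(String::from("  42"));
        /// assert!(lexer.tok(false).is_none()); // whitespace is not a token
        /// let tok = lexer.tok(true);           // skips "  ", then matches "42"
        /// ```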
        pub fn tok(&mut self, skip_ws: bool) -> Option<TokenType> {
            if skip_ws {
                lazy_static! {
                    // `\s+` consumes a whole run of whitespace at once; `\s`
                    // alone would only skip a single character per call.
                    static ref WS: Regex = Regex::new(r"^\s+").unwrap();
                }

                if let Some(m) = WS.find(&self.data[self.curr_pos..]) {
                    self.curr_pos += m.end();
                }
            }

            // Indices of all patterns that match at the current position.
            let matches: Vec<_> = self.regex_set.matches(&self.data[self.curr_pos..]).into_iter().collect();

            if matches.is_empty() {
                return None;
            }

            // Longest-match rule: keep the pattern whose match ends furthest
            // into the input; ties go to the pattern registered first.
            let mut longest_end = self.curr_pos;
            let mut longest_id = 0;

            for m in matches {
                let end = self.regexes[m].find(&self.data[self.curr_pos..]).unwrap().end() + self.curr_pos;
                if end > longest_end {
                    longest_end = end;
                    longest_id = m;
                }
            }

            let token = self.actions[longest_id](&self.data[self.curr_pos..longest_end]);
            self.curr_pos = longest_end;
            Some(token)
        }

        /// Returns true if the end of the input has been reached
        ///
        /// Note: trailing whitespace counts as unread input, since whitespace
        /// is only skipped inside [tok](Lexer::tok).
        pub fn is_eof(&self) -> bool {
            self.curr_pos == self.data.len()
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::lexer::{LexAction, Lexer, LexerBuilder};

    #[derive(Clone)]
    enum Token1 {
        TokenInt    (i32),
        TokenString (String),
    }

    #[test]
    fn doesnt_panic_array() {
        let _l: Lexer<Token1> = LexerBuilder {
            actions: [LexAction { token: r"\d+", action: |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()) }].to_vec(),
        }.build();
    }

    #[test]
    fn doesnt_panic_append() {
        let _l: Lexer<Token1> = LexerBuilder::new()
            .push(r"\d+",          |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .push(r"[a-zA-Z_]\w*", |x: &str| Token1::TokenString(String::from(x)))
            .build();
    }

    #[test]
    fn simple_number_test() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => { assert!(v == 42); },
            _ => { panic!("Token is not of type int"); },
        }
    }

    #[test]
    fn simple_number_leading_ws() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from(" 42"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => { assert!(v == 42, "Expected 42: Actual {}", v); },
            _ => { panic!("Token is not of type int"); },
        }
    }

    #[test]
    fn two_numbers() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42 52"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => { assert!(v == 42, "Expected 42: Actual {}", v); },
            _ => { panic!("Token is not of type int"); },
        }

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => { assert!(v == 52); },
            _ => { panic!("Token is not of type int"); },
        }
    }

    #[test]
    fn many_numbers() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init((0..100).map(|x: i8| x.to_string()).collect::<Vec<String>>().join(" "));

        for i in 0..100 {
            match l.tok(true).unwrap() {
                Token1::TokenInt(v) => { assert!(v == i, "Expected {}: Actual {}", i, v); },
                _ => { panic!("Token is not of type int"); },
            }
        }
    }

    #[test]
    fn test_eof() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42"));

        assert!(!l.is_eof());

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => { assert!(v == 42, "Expected 42: Actual {}", v); },
            _ => { panic!("Token is not of type int"); },
        }

        assert!(l.is_eof());
    }
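
    // Not part of the original suite: a sketch checking the longest-match
    // rule in `tok`. When both `foo` and `\w+` match, the longer `\w+`
    // match should win.
    #[test]
    fn longest_match_wins() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"foo", |x: &str| Token1::TokenString(String::from(x)))
            .push(r"\w+", |x: &str| Token1::TokenString(String::from(x)))
            .build();

        l.init(String::from("foobar"));

        match l.tok(true).unwrap() {
            Token1::TokenString(v) => { assert!(v == "foobar", "Expected foobar: Actual {}", v); },
            _ => { panic!("Token is not of type string"); },
        }
    }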
}