#![warn(missing_docs)]

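/// A small maximal-munch lexer built on the `regex` crate's `RegexSet`.
///
/// Rules are registered on a `LexerBuilder` as regex patterns paired with
/// callbacks; the compiled `Lexer` repeatedly returns the longest match at
/// the current position, preferring earlier rules on ties.
///
/// A minimal usage sketch (the crate name `lexer` below is an assumption,
/// substitute your own crate name):
///
/// ```ignore
/// use lexer::lexer::LexerBuilder;
///
/// enum Tok {
///     Int(i32),
/// }
///
/// let mut l = LexerBuilder::<Tok>::new()
///     .push(r"\d+", |s: &str| Tok::Int(s.parse().unwrap()))
///     .build();
/// l.init(String::from("1 2 3"));
/// while !l.is_eof() {
///     match l.tok(true) {
///         Some(_t) => { /* handle the token */ }
///         None => break, // no rule matched: stop
///     }
/// }
/// ```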
pub mod lexer {
    use lazy_static::lazy_static;
    use regex::{Regex, RegexSet};

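    /// A single lexing rule: a regex pattern paired with the callback that
    /// turns the matched text into a token.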
    #[derive(Clone)]
    pub struct LexAction<'s, TokenType> {
        /// The regex pattern for this rule; `build` anchors it with `^`.
        pub token: &'s str,
        /// Converts the matched text into a `TokenType`.
        pub action: fn(&str) -> TokenType,
    }

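    /// Accumulates `LexAction` rules and compiles them into a `Lexer`.
    ///
    /// When several patterns match the same input, the longest match wins;
    /// on a tie, the rule registered first wins.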
    #[derive(Default)]
    pub struct LexerBuilder<'s, TokenType> {
        /// The rules registered so far, in registration order.
        pub actions: Vec<LexAction<'s, TokenType>>,
    }

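    /// A compiled lexer, produced by `LexerBuilder::build`.
    ///
    /// Feed it input with `init`, then pull tokens with `tok`.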
    pub struct Lexer<TokenType> {
        /// All patterns combined, used to find which rules match at all.
        regex_set: RegexSet,
        /// The individual patterns, used to measure each match's length.
        regexes: Vec<Regex>,
        /// One callback per rule, in registration order.
        actions: Vec<fn(&str) -> TokenType>,
        /// The input currently being lexed.
        data: String,
        /// Byte offset of the next unread character in `data`.
        curr_pos: usize,
    }

    impl<'s, TokenType> LexerBuilder<'s, TokenType> {
        /// Creates an empty builder.
        pub fn new() -> Self {
            LexerBuilder { actions: Vec::new() }
        }

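        /// Registers a rule: `token` is a regex pattern (anchored with `^`
        /// automatically by `build`) and `action` maps the matched text to a
        /// token. Returns `&mut Self` so calls can be chained.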
        pub fn push(&mut self, token: &'s str, action: fn(&str) -> TokenType) -> &mut Self {
            self.actions.push(LexAction { token, action });
            self
        }

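        /// Compiles every registered pattern and returns a ready-to-use
        /// `Lexer`.
        ///
        /// # Panics
        ///
        /// Panics if any registered pattern is not a valid regex.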
        pub fn build(&self) -> Lexer<TokenType> {
            Lexer {
                regex_set: RegexSet::new(self.actions.iter().map(|a| format!("^{}", a.token))).unwrap(),
                regexes: self.actions.iter().map(|a| Regex::new(&format!("^{}", a.token)).unwrap()).collect(),
                actions: self.actions.iter().map(|a| a.action).collect(),
                data: String::new(),
                curr_pos: 0,
            }
        }
    }

    impl<TokenType> Lexer<TokenType> {
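        /// Resets the lexer to read `data` from the beginning.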
        pub fn init(&mut self, data: String) {
            self.data = data;
            self.curr_pos = 0;
        }

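        /// Returns the next token, or `None` if no rule matches at the
        /// current position (including at end of input).
        ///
        /// If `skip_ws` is true, a leading run of whitespace is consumed
        /// before matching.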
        pub fn tok(&mut self, skip_ws: bool) -> Option<TokenType> {
            if skip_ws {
                lazy_static! {
                    // `\s+` consumes a whole run of whitespace, not just one character.
                    static ref WS: Regex = Regex::new(r"^\s+").unwrap();
                }

                if let Some(m) = WS.find(&self.data[self.curr_pos..]) {
                    self.curr_pos += m.end();
                }
            }

            let rest = &self.data[self.curr_pos..];
            let matches: Vec<_> = self.regex_set.matches(rest).into_iter().collect();

            if matches.is_empty() {
                return None;
            }

            // Maximal munch: among all rules that match, keep the longest
            // match; ties go to the rule that was registered first.
            let mut longest_end = 0;
            let mut longest_id = 0;

            for m in matches {
                // Every id reported by the set is guaranteed to match again here.
                let end = self.regexes[m].find(rest).unwrap().end();
                if end > longest_end {
                    longest_end = end;
                    longest_id = m;
                }
            }

            let token = self.actions[longest_id](&self.data[self.curr_pos..self.curr_pos + longest_end]);
            self.curr_pos += longest_end;
            Some(token)
        }

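        /// Returns `true` once the whole input has been consumed.
        ///
        /// Note that trailing whitespace counts as unconsumed input.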
        pub fn is_eof(&self) -> bool {
            self.curr_pos == self.data.len()
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::lexer::{LexAction, Lexer, LexerBuilder};

    #[derive(Clone)]
    enum Token1 {
        TokenInt(i32),
        TokenString(String),
    }

    #[test]
    fn doesnt_panic_array() {
        let _l: Lexer<Token1> = LexerBuilder {
            actions: vec![LexAction {
                token: r"\d+",
                action: |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()),
            }],
        }
        .build();
    }

    #[test]
    fn doesnt_panic_append() {
        let _l: Lexer<Token1> = LexerBuilder::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .push(r"[a-zA-Z_]\w*", |x: &str| Token1::TokenString(String::from(x)))
            .build();
    }

    #[test]
    fn simple_number_test() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => assert_eq!(v, 42),
            _ => panic!("Token is not of type int"),
        }
    }

    #[test]
    fn simple_number_leading_ws() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from(" 42"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => assert_eq!(v, 42),
            _ => panic!("Token is not of type int"),
        }
    }

    #[test]
    fn two_numbers() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42 52"));

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => assert_eq!(v, 42),
            _ => panic!("Token is not of type int"),
        }

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => assert_eq!(v, 52),
            _ => panic!("Token is not of type int"),
        }
    }

    #[test]
    fn many_numbers() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init((0..100).map(|x: i32| x.to_string()).collect::<Vec<String>>().join(" "));

        for i in 0..100 {
            match l.tok(true).unwrap() {
                Token1::TokenInt(v) => assert_eq!(v, i),
                _ => panic!("Token is not of type int"),
            }
        }
    }

    #[test]
    fn test_eof() {
        let mut l = LexerBuilder::<Token1>::new()
            .push(r"\d+", |x: &str| Token1::TokenInt(x.parse::<i32>().unwrap()))
            .build();

        l.init(String::from("42"));

        assert!(!l.is_eof());

        match l.tok(true).unwrap() {
            Token1::TokenInt(v) => assert_eq!(v, 42),
            _ => panic!("Token is not of type int"),
        }

        assert!(l.is_eof());
    }
}