ram/
lib.rs

1//! This library makes it easy to create finite state machines to tokenize strings.
2//!
3//! Here's the simplest automaton you can make with it; it simply finds EOF:
4//! ```
5//! use ram::Automaton;
6//!
7//! enum TokenType {
8//!     End,
9//! }
10//!
11//! // Create the FSM (2 states, 0 or 1) that will parse the source code
12//! let mut am = Automaton::new(0, 1);
13//! // When the FSM hits the end of the source, go to state 1, the final state
14//! am.find_end(TokenType::End as i32, 0, 1);
15//!
16//! // Run the FSM with an empty string as the source code
17//! let source_code = format!("");
18//! let runner = am.run(source_code);
19//!
20//! assert_eq!(runner.tokens.len(), 1);
21//! assert!(runner.completed());
22//!
23//! // With a non-empty string, the result is not complete
24//! let source_code = format!("Invalid entry");
25//! let runner = am.run(source_code);
26//!
27//! assert_eq!(runner.tokens.len(), 0);
28//! assert!(!runner.completed());
29//! ```
30//!
31//! Run `cargo run --example let-it-be-42` to see a more complete example.
32
33use regex::Regex;
34use std::ops::Deref;
35
36/// Describes a set of valid transitions from one state to the next
37pub struct Automaton {
38    pub state_initial: i32,
39    pub state_final: i32,
40    finders: Vec<Finder>,
41}
42
/// A piece of the source recognised during tokenization.
pub struct Token {
    /// Caller-defined identifier of the kind of token that was found.
    pub type_id: i32,
    /// Text associated with the token.
    pub text: String,
}
48
49/// Reads the source into a list of tokens following the transitions allowed by an automaton
50pub struct Runner<'a> {
51    pub source: std::string::String,
52    automaton: &'a Automaton,
53    pub state: i32,
54    pub tokens: Vec<Token>,
55}
56
57/// Describes a custom way to tokenize a piece of the source
58pub struct Finder {
59    pub state_from: i32,
60    pub state_to: i32,
61    callback: fn(runner: &mut Runner, finder: &Finder) -> bool,
62    regex: Option<Regex>,
63    automaton: Option<Automaton>,
64    pub token_type: i32,
65    pub join_tokens: bool,
66}
67
68impl<'a> Automaton {
69    pub fn new(state_initial: i32, state_final: i32) -> Automaton {
70        Automaton {
71            state_initial: state_initial,
72            state_final: state_final,
73            finders: vec![],
74        }
75    }
76
77    pub fn run(&'a self, source: std::string::String) -> Runner<'a> {
78        let mut runner = Runner {
79            source: source,
80            automaton: self,
81            state: self.state_initial,
82            tokens: vec![],
83        };
84
85        runner.run();
86
87        runner
88    }
89
90    pub fn run_loop(&'a self, source: std::string::String) -> Runner<'a> {
91        let mut runner = Runner {
92            source: source,
93            automaton: self,
94            state: self.state_initial,
95            tokens: vec![],
96        };
97
98        runner.run_loop();
99
100        runner
101    }
102
103    pub fn find_custom(
104        &mut self,
105        token_type: i32,
106        state_from: i32,
107        state_to: i32,
108        callback: fn(runner: &mut Runner, finder: &Finder) -> bool,
109    ) {
110        self.finders.push(Finder {
111            state_from: state_from,
112            state_to: state_to,
113            callback: callback,
114            regex: None,
115            automaton: None,
116            token_type: token_type,
117            join_tokens: false,
118        })
119    }
120
121    fn finder_whitespace(runner: &mut Runner, finder: &Finder) -> bool {
122        let ws = &[' ', '\t'];
123        if runner.source.len() > 0 && ws.contains(&(runner.source.as_bytes()[0] as char)) {
124            let mut num_spaces = 1;
125            for i in 1..runner.source.len() {
126                if ws.contains(&(runner.source.as_bytes()[i] as char)) {
127                    num_spaces += 1;
128                } else {
129                    break;
130                }
131            }
132            if num_spaces > 0 {
133                let text = runner.source.deref()[..num_spaces].to_string();
134                runner.add_token(Token::new(finder.token_type, text));
135                return true;
136            }
137        }
138        return false;
139    }
140
141    pub fn find_whitespace(&mut self, token_type: i32, state_from: i32, state_to: i32) {
142        self.find_custom(
143            token_type,
144            state_from,
145            state_to,
146            Automaton::finder_whitespace,
147        );
148    }
149
150    fn finder_end(runner: &mut Runner, finder: &Finder) -> bool {
151        if runner.source.len() == 0 {
152            runner.add_token(Token::new(finder.token_type, "".to_string()));
153            true
154        } else {
155            false
156        }
157    }
158
159    pub fn find_end(&mut self, token_type: i32, state_from: i32, state_to: i32) {
160        self.find_custom(token_type, state_from, state_to, Automaton::finder_end);
161    }
162
163    fn finder_regex(runner: &mut Runner, finder: &Finder) -> bool {
164        match finder
165            .regex
166            .clone()
167            .unwrap()
168            .find(runner.source.clone().deref())
169        {
170            Some(regex_match) => {
171                if regex_match.start() == 0 {
172                    let text = runner.source.clone().deref()[..regex_match.end()].to_string();
173                    runner.add_token(Token::new(finder.token_type, text));
174                    true
175                } else {
176                    false
177                }
178            }
179            None => false,
180        }
181    }
182
183    pub fn find_regex(&mut self, token_type: i32, state_from: i32, state_to: i32, re: Regex) {
184        self.finders.push(Finder {
185            state_from: state_from,
186            state_to: state_to,
187            callback: Automaton::finder_regex,
188            regex: Some(re),
189            automaton: None,
190            token_type: token_type,
191            join_tokens: false,
192        })
193    }
194
195    fn automaton_run(runner: &mut Runner, finder: &Finder, am: &Automaton) -> bool {
196        let sub_runner = am.run(runner.source.clone());
197        if sub_runner.state == am.state_final {
198            if finder.join_tokens {
199                let mut full_text = std::string::String::new();
200                for part in sub_runner.tokens.iter() {
201                    full_text.push_str(part.text.deref());
202                }
203                runner.tokens.push(Token {
204                    type_id: finder.token_type,
205                    text: full_text,
206                });
207            } else {
208                for t in sub_runner.tokens.deref().iter() {
209                    runner.tokens.push(t.clone());
210                }
211            }
212            runner.source = sub_runner.source.clone();
213            true
214        } else {
215            false
216        }
217    }
218
219    fn finder_automaton(runner: &mut Runner, finder: &Finder) -> bool {
220        match finder.automaton {
221            Some(ref am) => Automaton::automaton_run(runner, finder, am),
222            None => panic!(),
223        }
224    }
225
226    pub fn find_automaton(
227        &'a mut self,
228        state_from: i32,
229        state_to: i32,
230        am: Automaton,
231    ) -> &'a mut Finder {
232        self.finders.push(Finder {
233            state_from: state_from,
234            state_to: state_to,
235            callback: Automaton::finder_automaton,
236            regex: None,
237            automaton: Some(am),
238            token_type: -1,
239            join_tokens: false,
240        });
241        self.finders.last_mut().unwrap()
242    }
243
244    fn finder_me(runner: &mut Runner, finder: &Finder) -> bool {
245        Automaton::automaton_run(runner, finder, runner.automaton)
246    }
247
248    pub fn find_me(&'a mut self, state_from: i32, state_to: i32) -> &'a mut Finder {
249        self.finders.push(Finder {
250            state_from: state_from,
251            state_to: state_to,
252            callback: Automaton::finder_me,
253            regex: None,
254            automaton: None,
255            token_type: -1,
256            join_tokens: false,
257        });
258        self.finders.last_mut().unwrap()
259    }
260}
261
262impl std::fmt::Debug for Automaton {
263    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
264        write!(f, "([{} --> {}])", self.state_initial, self.state_final)
265    }
266}
267
268impl std::clone::Clone for Automaton {
269    fn clone(&self) -> Automaton {
270        Automaton {
271            state_initial: self.state_initial,
272            state_final: self.state_final,
273            finders: self.finders.clone(),
274        }
275    }
276}
277
278impl Token {
279    pub fn new(type_id: i32, text: std::string::String) -> Token {
280        Token {
281            type_id: type_id,
282            text: text,
283        }
284    }
285}
286
287impl std::fmt::Debug for Token {
288    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
289        write!(f, "([{}] \"{}\")", self.type_id, self.text)
290    }
291}
292
293impl std::clone::Clone for Token {
294    fn clone(&self) -> Token {
295        Token::new(self.type_id, self.text.clone())
296    }
297}
298
299impl<'a> Runner<'a> {
300    fn run(&mut self) {
301        for finder in self.automaton.finders.iter() {
302            let func = finder.callback;
303            if self.state == finder.state_from && func(self, finder) == true {
304                self.state = finder.state_to;
305            }
306        }
307    }
308
309    fn run_loop(&mut self) {
310        let mut has_reached_end = false;
311        loop {
312            self.run();
313            if self.completed() == false || has_reached_end {
314                break;
315            }
316            self.state = self.automaton.state_initial;
317            // we let the automaton run one last time before going out,
318            // which allows it to catch an "EOF" token type if needed
319            has_reached_end = self.source.len() == 0;
320        }
321    }
322
323    pub fn add_token(&mut self, token: Token) {
324        let len = token.text.len();
325        self.tokens.push(token);
326        self.source = self.source.deref()[len..].to_string();
327    }
328
329    pub fn completed(&self) -> bool {
330        self.state == self.automaton.state_final
331    }
332}
333
334impl<'a> std::fmt::Debug for Runner<'a> {
335    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
336        write!(
337            f,
338            "(runner [automaton: {:?}, current_state: {}])",
339            self.automaton, self.state
340        )
341    }
342}
343
344impl Finder {
345    pub fn join_tokens(&mut self, token_type: i32) {
346        self.join_tokens = true;
347        self.token_type = token_type;
348    }
349}
350
351impl std::clone::Clone for Finder {
352    fn clone(&self) -> Finder {
353        Finder {
354            state_from: self.state_from,
355            state_to: self.state_to,
356            callback: self.callback,
357            regex: self.regex.clone(),
358            automaton: self.automaton.clone(),
359            token_type: self.token_type,
360            join_tokens: self.join_tokens,
361        }
362    }
363}