lexer_generator/
lib.rs

//! # lexer-generator
//!
//! A small lexer library whose lexing rules are defined in JSON.
//!
//! # Example: Basic Tokenizing
//!
//! Rules and code one might use to lex tokens for a calculator.
//!
//! `key.json`:
//! ```json
//! {
//!     "literals": {
//!         "number": "[0-9]+(\\.[0-9]*)?",
//!         "subtract": "-",
//!         "add": "\\+",
//!         "divide": "/",
//!         "multiply": "\\*"
//!     },
//!     "whitespace": "\n| |\r|\t"
//! }
//! ```
//! `main.rs`:
//! ```no_run
//! use lexer_generator::Lexer;
//!
//! let json: String = std::fs::read_to_string("key.json").unwrap();
//! let source: String = String::from("123 + 456 * 789");
//!
//! let mut lexer = Lexer::from(json, source);
//! // parsing, runtime, whatever one would want to do with their tokens
//! ```
//!
//! Ignoring line positions and the incremental nature of the lexer, the source above lexes to:
//! ```text
//! "123 + 456 * 789" -> Token("number", "123"), Token("add", "+"),
//!                      Token("number", "456"), Token("multiply", "*"),
//!                      Token("number", "789")
//! ```

use serde::{Deserialize, Serialize};

use std::collections::HashMap;
use regex::Regex;

#[derive(Serialize, Deserialize)]
struct RuleSet { // Parsed rule set from the JSON file
    literals: HashMap<String, String>,
    whitespace: String
}

#[derive(Clone)]
struct RegexRuleSet { // The same rule set with its patterns compiled to regexes
    literals: HashMap<String, Regex>,
    whitespace: Regex
}

#[allow(dead_code)]
impl RegexRuleSet {
    fn from(ruleset: RuleSet) -> Self {
        Self {
            // list of literal values, operators, keywords, etc., "name": "regex pattern"
            literals: {
                let mut hm: HashMap<String, Regex> = HashMap::new();
                for (k, v) in ruleset.literals {
                    hm.insert(k, Regex::new(&v).unwrap());
                }
                hm
            },
            whitespace: Regex::new(&ruleset.whitespace).unwrap()
        }
    }
    fn from_string(json: String) -> Self {
        Self::from(serde_json::from_str::<RuleSet>(&json).unwrap())
    }
}

/// A token parsed from source code; its type is defined by the `Lexer`'s ruleset
#[derive(Clone)]
pub struct Token {
    pub token_type: String,
    pub value: String,
    pub line: usize
}

#[allow(dead_code)]
impl Token {
    /// Returns `true` if the token's `token_type` matches any of the given types.
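    ///
    /// A minimal illustrative sketch (the token values here are made up for the example):
    /// ```
    /// use lexer_generator::Token;
    ///
    /// let token = Token { token_type: String::from("number"), value: String::from("42"), line: 0 };
    /// assert!(token.is(vec!["number", "identifier"]));
    /// assert!(!token.is(vec!["add", "subtract"]));
    /// ```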
    pub fn is<T: ToString>(&self, types: Vec<T>) -> bool {
        types.iter().any(|t| t.to_string() == self.token_type)
    }
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}({})", self.token_type, self.value)
    }
}

/// Lexes tokens from source code based on a JSON-parsed ruleset
///
/// # Example:
/// ```no_run
/// use lexer_generator::Lexer;
///
/// let json = std::fs::read_to_string("key.json").unwrap();
/// let source = String::from("123 + 456 * 789");
///
/// let mut lexer = Lexer::from(json, source);
/// while !lexer.done() {
///     println!("{}", lexer.next_token().unwrap());
/// }
/// ```
#[derive(Clone)]
pub struct Lexer {
    source: String,
    last_token: Option<Result<Token, ParsingError>>,
    cache: Option<Result<Token, ParsingError>>,
    rules: RegexRuleSet,
    line: usize
}

/// Errors produced while lexing
#[derive(Clone, Debug)]
pub enum ParsingError {
    /// The end of the source was reached with no token left to lex
    EndOfFileError,
    /// No literal pattern matched at the current position; carries the offending character
    UnrecognizedPatternError(String),
}

#[allow(dead_code)]
impl Lexer {
    /// Generates a lexer from JSON
    pub fn from(json: String, source: String) -> Self {
        Self {
            source,
            last_token: None,
            cache: None,
            rules: RegexRuleSet::from_string(json),
            line: 0
        }
    }

    /// Initializes a lexer directly from rule maps, without JSON parsing
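    ///
    /// A minimal sketch; the rule strings below are illustrative, not part of the crate:
    /// ```
    /// use std::collections::HashMap;
    /// use lexer_generator::Lexer;
    ///
    /// let mut literals = HashMap::new();
    /// literals.insert(String::from("number"), String::from("[0-9]+"));
    /// literals.insert(String::from("add"), String::from("\\+"));
    ///
    /// let mut lexer = Lexer::from_args(literals, String::from(" "), String::from("1 + 2"));
    /// assert_eq!(lexer.next_token_x().value, "1");
    /// assert_eq!(lexer.next_token_x().value, "+");
    /// ```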
    pub fn from_args(literals: HashMap<String, String>, whitespace: String, source: String) -> Self {
        Self {
            source,
            last_token: None,
            cache: None,
            rules: RegexRuleSet::from(RuleSet { literals, whitespace }),
            line: 0
        }
    }

    fn ch(&self) -> char {
        // Peek at the next character without consuming it
        self.source.chars().next().unwrap()
    }

    fn skip_whitespace(&mut self) {
        // Strip a single leading whitespace match (if any), tracking newlines for line numbering
        let end = match self.rules.whitespace.find(&self.source) {
            Some(m) if m.start() == 0 => m.end(),
            _ => return
        };
        for _ in 0..end {
            if self.source.remove(0) == '\n' {
                self.line += 1;
            }
        }
    }

    /// Returns true once the entire source has been consumed
    pub fn done(&self) -> bool {
        self.source.is_empty()
    }

    fn get(&mut self) -> char {
        // Consume and return the next character, tracking newlines for line numbering
        let c = self.source.remove(0);
        if c == '\n' {
            self.line += 1;
        }
        c
    }

    fn parse_next(&mut self) -> Result<Token, ParsingError> {
        self.skip_whitespace();
        if self.done() {
            return Err(ParsingError::EndOfFileError);
        }
        // Find the literal pattern with the longest match anchored at the current position
        let mut name = String::new();
        let mut best: Option<usize> = None;
        for (lit_type, pat) in &self.rules.literals {
            if let Some(m) = pat.find(&self.source) {
                if m.start() == 0 && best.map_or(true, |end| m.end() > end) {
                    best = Some(m.end());
                    name = lit_type.clone();
                }
            }
        }
        let end = match best {
            Some(end) => end,
            // No pattern matched here: consume the offending character and report it
            None => return Err(ParsingError::UnrecognizedPatternError(String::from(self.get())))
        };
        let mut lexeme = String::new();
        for _ in 0..end {
            lexeme.push(self.get());
        }
        Ok(Token { token_type: name, value: lexeme, line: self.line })
    }

    /// Advances and returns the next token
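    ///
    /// A minimal sketch of handling the possible errors (the rules here are illustrative):
    /// ```
    /// # use std::collections::HashMap;
    /// # use lexer_generator::{Lexer, ParsingError};
    /// # let mut literals = HashMap::new();
    /// # literals.insert(String::from("number"), String::from("[0-9]+"));
    /// # let mut lexer = Lexer::from_args(literals, String::from(" "), String::from("1 ?"));
    /// loop {
    ///     match lexer.next_token() {
    ///         Ok(token) => println!("{}", token),
    ///         Err(ParsingError::EndOfFileError) => break,
    ///         Err(ParsingError::UnrecognizedPatternError(c)) => eprintln!("unexpected: {}", c),
    ///     }
    /// }
    /// ```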
    pub fn next_token(&mut self) -> Result<Token, ParsingError> {
        // Hand out the cached (peeked) token if there is one, otherwise lex a new one
        let token = match self.cache.take() {
            Some(token) => token,
            None => self.parse_next()
        };
        self.last_token = Some(token.clone());
        token
    }

    /// Advances and returns the next token, panicking on error
    pub fn next_token_x(&mut self) -> Token {
        self.next_token().unwrap()
    }

    /// Returns the last token lexed
    pub fn current_token(&self) -> Option<Result<Token, ParsingError>> {
        self.last_token.clone()
    }

    /// Returns the last token lexed, panicking if nothing has been lexed or the last lex failed
    pub fn current_token_x(&self) -> Token {
        self.current_token().unwrap().unwrap()
    }

    /// Returns the next token to be lexed, without consuming it
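    ///
    /// A minimal sketch (the rules here are illustrative):
    /// ```
    /// # use std::collections::HashMap;
    /// # use lexer_generator::Lexer;
    /// # let mut literals = HashMap::new();
    /// # literals.insert(String::from("number"), String::from("[0-9]+"));
    /// # let mut lexer = Lexer::from_args(literals, String::from(" "), String::from("1 2"));
    /// let peeked = lexer.peek_next_token_x().value;
    /// // The peeked token is returned again by the next call to next_token
    /// assert_eq!(peeked, lexer.next_token_x().value);
    /// ```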
    pub fn peek_next_token(&mut self) -> Option<Result<Token, ParsingError>> {
        self.cache = Some(self.next_token());
        self.cache.clone()
    }

    /// Returns the next token to be lexed without consuming it, panicking on error
    pub fn peek_next_token_x(&mut self) -> Token {
        self.peek_next_token().unwrap().unwrap()
    }
}