// lib_lexin/lib.rs

1use std::fs;
2
/// Source location as `(line, column)`, both 1-based (see `Lexer::tokenize`).
type Loc = (usize, usize);
4
/// A lexed token; every variant carries its `(line, column)` location.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    /// Exact match against one of `Lexer::keywords`.
    Keyword(String, Loc),
    /// A delimited region: section name, then the contents with the
    /// delimiters stripped.
    Section(String, String, Loc),
    /// Token text that parses as `usize`.
    Integer(usize, Loc),
    /// Token text that parses as `f64` (but not as `usize`).
    Float(f64, Loc),
    /// A configured symbol character and its configured name.
    Symbol(char, String, Loc),
    /// Fallback: anything that is none of the above.
    Ident(String, Loc),
}
14
// Query argument for `Lexer::is_section`: either a candidate start
// delimiter, or a `(start, end)` delimiter pair to match exactly.
#[derive(Debug)]
enum Value {
    Start(String),
    End(String, String),
}
20
// Result of `Lexer::is_section`: either the end delimiters of every
// section whose start matched, or a reference to the fully matched section.
enum StartOrSection<'a> {
    Start(Vec<String>),
    Section(&'a Section),
}
25
// Lexing state inside `tokenize`: `Section` while between section
// delimiters (input collected verbatim), `Normal` otherwise.
#[derive(PartialEq, Eq)]
enum Mode {
    Section,
    Normal,
}
31
/// Configurable lexer: splits `buffer` into `Token`s according to the
/// configured keywords, sections and symbols.
#[derive(Debug, Clone)]
pub struct Lexer {
    // Exact strings recognized as `Token::Keyword`.
    pub keywords: Vec<String>,
    // Delimited regions (e.g. string literals) lexed as `Token::Section`.
    pub sections: Vec<Section>,
    // Single characters recognized as `Token::Symbol`, with a display name.
    pub symbols: Vec<(char, String)>,
    // Raw input bytes, set by `load_str` / `load_file`.
    pub buffer: Vec<u8>,
    // When true, `lex_token` turns an empty pending token (as produced
    // around runs of symbol characters, e.g. spaces) into `Ident(" ")`
    // instead of dropping it.
    pub allow_whitespace: bool,
}
40
/// Describes a delimited region: input between `start` and `end` becomes a
/// single `Token::Section(name, contents, loc)`.
///
/// NOTE(review): `tokenize` only ever matches one character at a time
/// against `start`/`end`, so multi-character delimiters appear to be
/// unsupported — confirm before relying on longer delimiter strings.
#[derive(Debug, Clone)]
pub struct Section {
    pub name: String,
    pub start: String,
    pub end: String,
}
47
48
49impl Token {
50    pub fn as_string(&self) -> String {
51        return match self {
52            Token::Keyword(keyword, _) => keyword.clone(),
53            Token::Section(_, value, _) => value.clone(),
54            Token::Integer(integer, _) => integer.to_string(),
55            Token::Float(float, _) => float.to_string(),
56            Token::Symbol(value, _, _) => value.to_string(),
57            Token::Ident(ident, _) => ident.clone(),
58        };
59    }
60
61    pub fn loc(&self) -> Loc {
62        return match self {
63            Token::Keyword(_, loc) => *loc,
64            Token::Section(_, _, loc) => *loc,
65            Token::Integer(_, loc) => *loc,
66            Token::Float(_, loc) => *loc,
67            Token::Symbol(_, _, loc) => *loc,
68            Token::Ident(_, loc) => *loc,
69        };
70    }
71
72    pub fn is_keyword(&self, keyword: &str) -> Result<(), Box<dyn std::error::Error>> {
73        if let Token::Keyword(value, _) = self {
74            if value == keyword {
75                return Ok(());
76            }
77        }
78        return Err(format!("expected keyword: {:?}", self).into());
79    }
80
81    pub fn is_section(&self, name: &str) -> Result<String, Box<dyn std::error::Error>> {
82        if let Token::Section(s_name, value, _) = self {
83            if name == s_name {
84                return Ok(value.clone());
85            }
86        }
87        return Err(format!("expected section: {:?}", self).into());
88    }
89
90    pub fn is_ident(&self) -> Result<String, Box<dyn std::error::Error>> {
91        if let Token::Ident(value, _) = self {
92            return Ok(value.clone());
93        }
94        return Err(format!("expected ident: {:?}", self).into());
95    }
96
97    pub fn is_integer(&self) -> Result<usize, Box<dyn std::error::Error>> {
98        if let Token::Integer(integer, _) = self {
99            return Ok(*integer);
100        }
101        return Err(format!("expected integer: {:?}", self).into());
102    }
103
104    pub fn is_float(&self) -> Result<f64, Box<dyn std::error::Error>> {
105        if let Token::Float(float, _) = self {
106            return Ok(*float);
107        }
108        return Err(format!("expected float: {:?}", self).into());
109    }
110
111    pub fn is_symbol(&self, name: &str) -> Result<(), Box<dyn std::error::Error>> {
112        if let Token::Symbol(_, s_name, _) = self {
113            if s_name == name {
114                return Ok(());
115            }
116        }
117        return Err(format!("expected symbol: {:?}", self).into());
118    }
119}
120
121impl Section {
122    pub fn new(name: &str, start: &str, end: &str) -> Section {
123        return Section {
124            name: name.to_string(),
125            start: start.to_string(),
126            end: end.to_string(),
127        };
128    }
129
130    pub fn from_end(end: String) -> Section {
131        return Section {
132            name: String::new(),
133            start: String::new(),
134            end,
135        };
136    }
137}
138
impl Lexer {
    /// Creates a lexer with the given vocabulary. `buffer` starts empty;
    /// call `load_str` or `load_file` before `tokenize`.
    pub fn new(keywords: &[String], sections: &[Section], symbols: &[(char, String)], allow_whitespace: bool) -> Lexer {
        return Lexer {
            keywords: keywords.to_vec(),
            sections: sections.to_vec(),
            symbols: symbols.to_vec(),
            buffer: Vec::new(),
            allow_whitespace,
        };
    }

    /// Replaces the input buffer with the bytes of `string`.
    pub fn load_str(&mut self, string: &str) {
        self.buffer = string.as_bytes().to_vec();
    }

    /// Replaces the input buffer with the raw bytes of `filename`.
    pub fn load_file(&mut self, filename: &str) -> Result<(), Box<dyn std::error::Error>> {
        self.buffer = fs::read(filename)?;
        return Ok(());
    }

    /// Linear scan over `symbols`: returns the configured name if `value`
    /// is a symbol character, `None` otherwise.
    fn symbols_contain(&self, value: &char) -> Option<&str> {
        for symbol in &self.symbols {
            if symbol.0 == *value {
                return Some(&symbol.1);
            }
        }
        return None;
    }

    /// Returns the name of the section whose delimiters are exactly
    /// (`start`, `end`), or `Err(())` if no such section is configured.
    fn section_exists(&self, start: &str, end: &str) -> Result<String, ()> {
        for section in &self.sections {
            if section.start == start && section.end == end {
                return Ok(section.name.to_string());
            }
        }
        return Err(());
    }

    /// Matches `value` against the configured sections.
    ///
    /// * `Value::Start(s)` — collects the end delimiter of every section
    ///   whose start is `s`; returns them as `StartOrSection::Start`.
    /// * `Value::End(s, e)` — returns the first section whose start AND end
    ///   match, as `StartOrSection::Section`.
    ///
    /// `Err(())` means no section matched.
    fn is_section(&self, value: Value) -> Result<StartOrSection, ()> {
        let mut matches: Vec<String> = Vec::new();
        for section in &self.sections {
            if let Value::Start(start) = &value {
                if &section.start == start {
                    matches.push(section.end.clone());
                }
            } else if let Value::End(start, end) = &value {
                if &section.end == end && &section.start == start {
                    // `matches` is unused on this path: an End query
                    // returns on the first full match.
                    return Ok(StartOrSection::Section(section));
                }
            }
        }

        if matches.len() != 0 {
            return Ok(StartOrSection::Start(matches));
        }
        return Err(());
    }

    /// Classifies `token` as `Integer` (parses as `usize`), else `Float`
    /// (parses as `f64`), else falls back to `Ident`.
    fn is_numeric(&self, token: &String, loc: Loc) -> Token {
        if let Ok(integer) = token.parse::<usize>() {
            return Token::Integer(integer, loc);
        // (binding name is misleading here: this is the parsed f64)
        } else if let Ok(integer) = token.parse::<f64>() {
            return Token::Float(integer, loc);
        } else {
            return Token::Ident(token.clone(), loc);
        }
    }

    /// Turns one accumulated token string into a `Token`, or `None` for
    /// text that produces no token (newlines; empty text when
    /// `allow_whitespace` is off).
    ///
    /// Precedence: keyword > single-char symbol > section (first and last
    /// character match a configured delimiter pair) > number > ident.
    fn lex_token(&self, token: &String, loc: Loc) -> Option<Token> {
        if token == "\n" {
            return None;
        } else if token == "" {
            // An empty flush stands for whitespace between symbols.
            if self.allow_whitespace {
                return Some(Token::Ident(" ".to_string(), loc));
            } else {
                return None;
            }
        } else if self.keywords.contains(&token) {
            return Some(Token::Keyword(token.clone(), loc));
        } else if token.len() == 1 {
            let character = token.chars().collect::<Vec<char>>()[0];
            if let Some(symbol_name) = self.symbols_contain(&character) {
                return Some(Token::Symbol(character, symbol_name.to_string(), loc));
            } else {
                return Some(self.is_numeric(token, loc));
            }
        // First and last byte form a known delimiter pair: emit the inner
        // text as a Section token.
        } else if let Ok(name) = self.section_exists(&token[0..1], &token[token.len()-1..token.len()]) {
            return Some(Token::Section(name, token[1..token.len() - 1].to_string(), loc));
        } else {
            return Some(self.is_numeric(token, loc));
        }
    }

    /// Tokenizes `buffer` into a flat token stream.
    ///
    /// Walks the input byte-by-byte, accumulating characters into `token`
    /// and flushing via `lex_token` at newlines and symbol boundaries.
    /// Seeing a section start delimiter switches to `Mode::Section`, where
    /// input is collected verbatim (`\` escapes the next byte) until the
    /// matching end delimiter closes the section.
    ///
    /// The `loc` passed to `lex_token` is the position of the character
    /// that triggered the flush, not of the token's first character.
    ///
    /// NOTE(review): bytes are decoded one at a time, so any non-ASCII
    /// UTF-8 input makes `String::from_utf8` fail — confirm ASCII-only
    /// input is intended.
    ///
    /// NOTE(review): the `(index + 1) < len` guard means the final byte is
    /// never processed in `Mode::Normal` and a pending `token` is not
    /// flushed at end of input — trailing tokens appear to require a
    /// trailing newline/space; confirm this is intended.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, Box<dyn std::error::Error>> {
        // Space is always treated as a symbol so it acts as a token
        // boundary; add it only if the caller didn't configure one.
        if self.symbols_contain(&' ').is_none() {
            self.symbols.push((' ', "Space".to_string()));
        }

        let mut mode = Mode::Normal;
        let mut token = String::new();
        let mut tokens: Vec<Token> = Vec::new();
        // Candidate sections opened by the current start delimiter (one
        // entry per section sharing that start); emptied when one closes.
        let mut section: Vec<Section> = Vec::new();
        let mut loc = (1, 1);

        let mut index = 0;
        while index < self.buffer.len() {
            let byte = &self.buffer[index];
            // One byte -> one-char String; fails on non-ASCII (see NOTE).
            let character = String::from_utf8(vec![byte.clone()])?;
            if (index + 1) < self.buffer.len() {
                if mode == Mode::Normal {
                    // Section start: remember every section sharing this
                    // start delimiter and switch modes.
                    if let Ok(StartOrSection::Start(ends)) = self.is_section(Value::Start(character.clone())) {
                        token = token + &character;
                        for end in ends {
                            section.push(Section::from_end(end.clone()));
                            let idx = section.len() - 1;
                            section[idx].start = character.clone();
                        }
                        mode = Mode::Section;
                    } else if character.as_str() == "\n" {
                        // Newline flushes the pending token.
                        self.lex_token(&token, loc).map(|t| tokens.push(t));
                        token = String::new();
                    } else if character.as_str() != " " {
                        token = token + &character;
                    }
                    // A symbol at the current or next byte also flushes —
                    // but not while inside a section.
                    if (self.symbols_contain(&char::from(byte.clone())).is_some() || self.symbols_contain(&char::from(self.buffer[index + 1])).is_some()) &&
                       section.len() == 0 { // making sure we arent lexing symbols when we're in a section
                        self.lex_token(&token, loc).map(|t| tokens.push(t));
                        token = String::new();
                    }
                } else if mode == Mode::Section {
                    if &character == "\\" {
                        // Backslash escapes the next byte (taken literally).
                        if index + 1 >= self.buffer.len() {
                            return Ok(tokens);
                        } else {
                            index += 1;
                            token = token + &(self.buffer[index] as char).to_string();
                        }
                    // End delimiter reached (or input about to run out):
                    // close the section. All pending candidates share the
                    // same start, so checking section[0] suffices.
                    } else if self.is_section(Value::End(section[0].start.to_string(), character.clone())).is_ok() || index + 2 >= self.buffer.len() {
                        token = token + &character;
                        self.lex_token(&token, loc).map(|t| tokens.push(t));
                        section = Vec::new();
                        token = String::new();
                        mode = Mode::Normal;
                    } else {
                        token = token + &character;
                    }
                }
            }

            // Track (line, column); both are 1-based.
            if &character == "\n" {
                loc.0 += 1;
                loc.1 = 1;
            } else {
                loc.1 += 1;
            }
            index += 1;
        }

        return Ok(tokens);
    }
}
300
301
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: build a small lexer vocabulary, feed it a short input
    /// and check that `tokenize` succeeds (the resulting token stream is
    /// printed for manual inspection).
    #[test]
    fn load_test() -> Result<(), Box<dyn std::error::Error>> {
        let keywords: Vec<String> = ["def", "if", "return"]
            .iter()
            .map(|k| k.to_string())
            .collect();
        let sections = [Section::new("string", "\"", "\"")];
        let symbols = [
            (':', "column".to_string()),
            ('(', "openbrace".to_string()),
            (')', "closebrace".to_string()),
            (' ', "space".to_string()),
        ];

        let mut lexer = Lexer::new(&keywords, &sections, &symbols, true);
        lexer.load_str("\"  ");

        println!("tokens: {:?}", lexer.tokenize()?);
        Ok(())
    }
}
322
323