A simple lexer generator based on regex_automata
Warning: this is alpha-quality code (as evidenced by the 0.0.1 version); it has not been exhaustively tested or documented. Unless you wish to help with either, I recommend waiting for version 0.2 (though feel free to prod me to get there).
In particular, the API is likely to change to support the following:
- Support for non-'static tokens
- Support for lexing non-UTF-8 input (&[u8] tokens)
- Builtin support for skipping a token
- Better error messages (as in, any error messages at all)
- Hooks to be called before and after each token is lexed (e.g., for position tracking)
Example
use leqx::leqxer;
use regex_automata::{Anchored, Input};

pub enum Token<'a> {
    Word(&'a str),
    Number(isize),
}

leqxer! {
    #[derive(Default)]
    struct State {
        line: usize,
        column: usize,
    }

    #[leqxer(dfa=sparse, embed=true)]
    mode lex_raw(&mut self, tok) -> Option<(usize, usize, Token)> {
        // Skip horizontal whitespace, but keep the column count up to date.
        "[ \t]+" => {
            self.column += tok.len();
            None
        },
        // A newline resets the column and advances the line count.
        "\r?\n|\r" => {
            self.column = 0;
            self.line += 1;
            None
        },
        "[a-z]+" => {
            let col = self.column;
            self.column += tok.len();
            Some((self.line, col, Token::Word(tok)))
        },
        // Digit runs become Number tokens, parsed from the matched text.
        "[0-9]+" => {
            let col = self.column;
            self.column += tok.len();
            Some((self.line, col, Token::Number(tok.parse().unwrap())))
        }
    }
}

pub struct Lexer<'a> {
    state: State,
    input: Input<'a>,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            state: State::default(),
            input: Input::new(input).anchored(Anchored::Yes),
        }
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = (usize, usize, Token<'a>);

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `lex_raw` is generated from the mode name above. The `?` stops
            // iteration when it returns `None`; an inner `None` (a skipped
            // match such as whitespace) makes us loop and lex again.
            let tok = self.state.lex_raw(&mut self.input)?;
            if let Some(tok) = tok {
                return Some(tok);
            }
        }
    }
}
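
For completeness, here is a minimal sketch of how the Lexer from the example might be driven. The main function and the input string are illustrative only (they are not part of the crate), and the reported positions follow whatever the tok.len() calls above count (bytes, in this example).

fn main() {
    // Illustrative input: two words and a number spread over two lines.
    for (line, column, token) in Lexer::new("hello 42\nworld") {
        match token {
            Token::Word(w) => println!("{line}:{column} word {w:?}"),
            Token::Number(n) => println!("{line}:{column} number {n}"),
        }
    }
}

Keeping the skip logic in the Iterator wrapper (rather than in the generated mode) is just how this example is structured; the rules that return None never reach the consumer.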