use crate::*;
use regex::RegexSet;
use std::fmt::Debug;
use std::rc::Rc;
/// Accumulates token [`Rule`]s and builds a [`Recognizer`] from them.
///
/// Rules are kept in insertion order; that order is the match priority
/// (the earliest-added rule that matches wins).
#[derive(Debug)]
pub struct RecognizerBuilder<K: TokenKind> {
    // Rules in insertion order; the index doubles as match priority.
    rules: Vec<Rule<K>>,
}
impl<K: TokenKind> Default for lexer::RecognizerBuilder<K> {
fn default() -> Self {
Self::new()
}
}
impl<K: TokenKind> RecognizerBuilder<K> {
    /// Creates a builder with no rules.
    pub fn new() -> Self {
        RecognizerBuilder { rules: vec![] }
    }

    /// Appends `rule`. Rules added earlier take priority when several
    /// match at the same position.
    pub fn token(mut self, rule: Rule<K>) -> Self {
        self.rules.push(rule);
        self
    }

    /// Compiles all rule patterns into one [`RegexSet`] and returns the
    /// finished [`Recognizer`].
    ///
    /// # Panics
    /// Panics if the combined set fails to compile — not expected, since
    /// each pattern was already compiled individually when its `Rule` was
    /// created (each rule carries a ready `Regex` in `r.re`).
    pub fn build(self) -> Recognizer<K> {
        // `RegexSet::new` takes any iterator of pattern strings; no need
        // to collect into an intermediate Vec first.
        let re_set = RegexSet::new(self.rules.iter().map(|r| r.re.as_str()))
            .expect("rule patterns were individually validated");
        Recognizer {
            re_set,
            rules: self.rules,
        }
    }
}
/// Matches tokens in a source string against a fixed set of rules.
#[derive(Debug, Clone)]
pub struct Recognizer<K: TokenKind> {
    // All rule patterns compiled together for a single-pass "which rules
    // match here?" test.
    re_set: RegexSet,
    // Rules in priority order; indices line up with `re_set`'s patterns.
    rules: Vec<Rule<K>>,
}
impl<K: TokenKind> Recognizer<K> {
    /// Reads a single token starting at byte offset `position`.
    ///
    /// Returns an EOF token at or past the end of `source`. When no rule
    /// matches, returns an "unrecognized" token covering the maximal run of
    /// unmatched input, with the first following recognized (or EOF) token
    /// attached via `next`.
    ///
    /// `position` must lie on a `char` boundary of `source`.
    pub fn read_at<'a>(&self, source: &'a str, position: usize) -> Token<'a, K> {
        if position >= source.len() {
            return Token::eof(source);
        }
        // Public, stable RegexSet API (replaces the doc-hidden
        // `read_matches_at` and its manual `Vec<bool>` scratch buffer).
        let matched = self.re_set.matches(&source[position..]);
        if !matched.matched_any() {
            // Advance by one full character — not one byte — so the
            // recursive probe never lands inside a multi-byte UTF-8
            // sequence, which would panic when slicing `source`.
            let step = source[position..]
                .chars()
                .next()
                .map_or(1, char::len_utf8);
            let next = self.read_at(source, position + step);
            // Fold adjacent unrecognized characters into a single token,
            // keeping a link to whatever terminates the run.
            let (sequence, next) = if next.kind.is_eof() {
                (&source[position..], next)
            } else if next.kind.is_unrecognized() {
                (&source[position..next.end], *next.next.unwrap())
            } else {
                (&source[position..next.start], next)
            };
            return Token::new(
                source,
                K::unrecognized(),
                position,
                position + sequence.len(),
            )
            .next(Some(Box::new(next)));
        }
        // `SetMatches::iter` yields pattern indices in ascending order, so
        // the first index is the highest-priority (earliest-added) rule.
        let first_idx = matched.iter().next().unwrap();
        let rule = &self.rules[first_idx];
        // A RegexSet only reports *which* patterns hit; re-run the single
        // rule's regex to recover the span (and captures when requested).
        let (re_match, maybe_captures) = if rule.capture {
            let cap = rule.re.captures(&source[position..]).unwrap();
            (cap.get(0).unwrap(), Some(cap))
        } else {
            let re_match = rule.re.find(&source[position..]).unwrap();
            (re_match, None)
        };
        let start = position + re_match.start();
        let end = position + re_match.end();
        Token::new(source, rule.kind.clone(), start, end)
            .skip(rule.skip)
            .captures(maybe_captures.map(|captures| {
                captures
                    .iter()
                    .map(|maybe_capture| {
                        maybe_capture.map(|capture| (capture.start(), capture.end()))
                    })
                    .collect()
            }))
    }

    /// Reads tokens until one that is not marked `skip`, attaching any
    /// skipped tokens (e.g. whitespace) to the returned one.
    pub fn scan<'a>(&self, source: &'a str, mut position: usize) -> Token<'a, K> {
        let mut skipped = vec![];
        let token = loop {
            let token = self.read_at(source, position);
            position = token.end;
            if !token.skip {
                break token;
            }
            skipped.push(token);
        };
        token.skipped(skipped)
    }

    /// Consumes the recognizer and wraps it in a [`Lexer`] over `source`,
    /// starting at byte offset `position`.
    pub fn into_lexer<'a>(self, source: &'a str, position: usize) -> Lexer<'a, K> {
        Lexer {
            source,
            position,
            recognizer: Rc::new(self),
        }
    }
}
/// Iterator-style tokenizer: a shared [`Recognizer`] plus a cursor into `source`.
#[derive(Clone, Debug)]
pub struct Lexer<'a, K: TokenKind> {
    /// Text being tokenized.
    pub source: &'a str,
    /// Current byte offset into `source`; advanced as tokens are consumed.
    pub position: usize,
    /// Shared recognizer — `Rc` makes cloning the lexer cheap.
    pub recognizer: Rc<Recognizer<K>>,
}
impl<'a, K: TokenKind> Lexer<'a, K> {
    /// Scans the next token. If `f` accepts it, the lexer advances past it
    /// and `Ok(token)` is returned; otherwise the position is left
    /// untouched and the rejected token comes back as `Err(token)`.
    pub fn next_of<F>(&mut self, f: F) -> Result<Token<'_, K>, Token<'_, K>>
    where
        F: Fn(&Token<'_, K>) -> bool,
    {
        let token = self.recognizer.scan(self.source, self.position);
        if !f(&token) {
            return Err(token);
        }
        self.position = token.end;
        Ok(token)
    }

    /// Like [`Lexer::next_of`], accepting any token whose kind appears in `kinds`.
    pub fn next_of_kinds<S>(&mut self, kinds: S) -> Result<Token<'_, K>, Token<'_, K>>
    where
        S: AsRef<[K]>,
    {
        let wanted = kinds.as_ref();
        self.next_of(|token| wanted.contains(&token.kind))
    }

    /// Like [`Lexer::next_of`], accepting only tokens of exactly `kind`.
    pub fn next_of_kind(&mut self, kind: K) -> Result<Token<'_, K>, Token<'_, K>> {
        self.next_of_kinds([kind])
    }
}
impl<'a, K: TokenKind> std::iter::Iterator for Lexer<'a, K> {
    type Item = Token<'a, K>;

    /// Yields recognized tokens in order; iteration ends at EOF.
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.recognizer.scan(self.source, self.position);
        self.position = token.end;
        if token.kind.is_eof() {
            return None;
        }
        Some(token)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_read_at() {
        let rec = RecognizerBuilder::<&str>::new()
            .token(Rule::new("ID", "[A-Za-z]+").unwrap())
            .token(Rule::new("WS", "\\s+").unwrap().skip(true))
            .build();
        // Leading whitespace is matched by the skip rule.
        assert_eq!(
            rec.read_at(" hello", 0),
            Token::from_text("WS", " ", 0).skip(true)
        );
        // Reading just past the whitespace yields the identifier.
        assert_eq!(rec.read_at(" hello", 1), Token::from_text("ID", "hello", 1));
        // Unmatched input is folded into one UNRECOGNIZED token linked to
        // the next recognized token...
        let expected = Token::from_text("UNRECOGNIZED", "123", 1)
            .next(Some(Box::new(Token::from_text("WS", " ", 4).skip(true))));
        assert_eq!(rec.read_at(" 123 hello", 1), expected);
        // ...or to EOF when the input ends inside the unmatched run.
        let expected = Token::from_text("UNRECOGNIZED", "123", 1)
            .next(Some(Box::new(Token::from_text("EOF", "", 4))));
        assert_eq!(rec.read_at(" 123", 1), expected);
    }

    #[test]
    fn test_lexer_iter() {
        let rec = RecognizerBuilder::<&str>::new()
            .token(Rule::new("NUM", "[0-9]+").unwrap())
            .token(Rule::new("WS", "\\s+").unwrap().skip(true))
            .build();
        // Skipped whitespace tokens are attached to the token that follows.
        let expected = vec![
            Token::from_text("NUM", "1", 0),
            Token::from_text("NUM", "2", 2)
                .skipped(vec![Token::from_text("WS", " ", 1).skip(true)]),
            Token::from_text("NUM", "3", 4)
                .skipped(vec![Token::from_text("WS", " ", 3).skip(true)]),
        ];
        let scanned: Vec<_> = rec.into_lexer("1 2 3", 0).collect();
        assert_eq!(scanned, expected);
    }

    #[test]
    fn test_capture() {
        let rec = RecognizerBuilder::new()
            .token(
                Rule::new("FLOAT", r"(\d+)(?:\.(\d+))?")
                    .unwrap()
                    .capture(true),
            )
            .build();
        let tok = rec.into_lexer("1.234", 0).next().unwrap();
        // Capture 0 is the whole match.
        assert_eq!(
            tok.get_capture(0).unwrap(),
            TokenCapture {
                text: "1.234",
                start: tok.start,
                end: tok.end
            }
        );
        // Capture 1 is the integral part, capture 2 the fractional part.
        assert_eq!(
            tok.get_capture(1).unwrap(),
            TokenCapture {
                text: "1",
                start: 0,
                end: 1,
            }
        );
        assert_eq!(
            tok.get_capture(2).unwrap(),
            TokenCapture {
                text: "234",
                start: 2,
                end: 5
            }
        );
    }
}