// regex_tokenizer/tokenization/matcher.rs

1use std::fmt::Debug;
2
/// A single token extracted from an input string.
///
/// `TypeEnum` is the caller-defined classification attached to every token.
pub struct Token<TypeEnum: Copy + Debug> {
    /// The text that was matched for this token.
    pub value: String,
    /// Offset at which the token starts; [`ValidToken::position`] reports the span
    /// `(position, position + value.len())`.
    pub position: usize,
    /// The class this token was matched as.
    pub type_: TypeEnum,
}
11
/// Behaviour required from any token type produced by a matcher.
///
/// Every implementor must also be convertible to a `String` through [`ToString`].
pub trait ValidToken
where
    Self: ToString,
{
    /// Returns a pair of offsets locating the token.
    ///
    /// The implementation for [`Token`] returns `(start, start + value.len())`.
    fn position(&self) -> (usize, usize);
}
15
16impl<TypeEnum> ToString for Token<TypeEnum>
17where
18    TypeEnum: Copy + Debug,
19{
20    fn to_string(&self) -> String {
21        self.value.clone()
22    }
23}
24
25impl<TypeEnum> ValidToken for Token<TypeEnum>
26where
27    TypeEnum: Copy + Debug,
28{
29    fn position(&self) -> (usize, usize) {
30        (self.position, self.position + self.value.len())
31    }
32}
33
/// A matcher capable of extracting tokens and associating them to a class of tokens.
///
/// Instances are created with [`Matcher::build`], which compiles the patterns.
pub struct Matcher<TypeEnum> {
    /// Compiled token patterns, each paired with the class given to the text it matches.
    /// `try_match` tries them in order and returns on the first one that matches.
    pub(crate) matchers: Vec<(regex::Regex, TypeEnum)>,
    /// Compiled patterns whose matches are skipped over instead of being turned into tokens.
    pub(crate) ignored: Vec<regex::Regex>,
}
39
40impl<TypeEnum: Copy> Matcher<TypeEnum> {
41    /// Build a new Matcher for TypeEnum, this function should be invoked only by the `tokenizer!` macro
42    pub fn build(regexes: Vec<(String, TypeEnum)>, ignored: Vec<String>) -> Self {
43        Self {
44            matchers: regexes
45                .iter()
46                .map(|(regex, type_)| (regex::Regex::new(regex.as_str()).unwrap(), type_.clone()))
47                .collect(),
48            ignored: ignored
49                .iter()
50                .map(|regex| regex::Regex::new(regex.as_str()).unwrap())
51                .collect(),
52        }
53    }
54}
55
/// Implemented for matchers that can be built without any external data.
///
/// NOTE(review): implementations are presumably generated by the `tokenizer!` macro;
/// none are visible in this file.
pub trait BuildableMatcher<TypeEnum> {
    /// Builds a new [`Matcher`] instance.
    fn new() -> Matcher<TypeEnum>;
}
61
/// A matcher that can attempt to extract one token from an input string.
pub trait ValidMatcher {
    /// The token type produced on a successful match.
    type TokenType: ValidToken;

    /// Attempts to extract a token from `query`.
    ///
    /// `position` is passed alongside `query`; the implementation for [`Matcher`] uses it
    /// as the base offset for the position stored in the returned token. Returns `None`
    /// when no token can be extracted.
    fn try_match(&self, query: &str, position: usize) -> Option<Self::TokenType>;
}
67
68impl<TypeEnum> ValidMatcher for Matcher<TypeEnum>
69where
70    TypeEnum: Copy + Debug,
71{
72    type TokenType = Token<TypeEnum>;
73
74    fn try_match(&self, query: &str, position: usize) -> Option<Self::TokenType> {
75        for (regex, type_) in &self.matchers {
76            let m = regex.find(&query);
77            if let Some(token) = m {
78                return Some(Token {
79                    value: String::from(token.as_str()),
80                    position,
81                    type_: type_.clone(),
82                });
83            }
84        }
85
86        for regex in &self.ignored {
87            let m = regex.find(&query);
88            if let Some(token) = m {
89                let token = token.as_str();
90
91                return self.try_match(
92                    &String::from(query.strip_prefix(token).unwrap()),
93                    position + token.len(),
94                );
95            }
96        }
97
98        None
99    }
100}