regex_tokenizer/tokenization/
matcher.rs1use std::fmt::Debug;
2
/// A single token produced by matching a pattern against input text.
///
/// `TypeEnum` is the caller-supplied tag that says which kind of token this
/// is; it must be cheap to copy and printable for diagnostics.
pub struct Token<TypeEnum: Copy + Debug> {
    /// The exact text that was matched.
    pub value: String,
    /// Offset at which the token starts, as supplied by whoever built it.
    pub position: usize,
    /// Tag identifying the kind of token.
    pub type_: TypeEnum,
}
11
/// Behaviour required of any token type: it can be turned into a string
/// (via the `ToString` supertrait) and it can report where it sits.
pub trait ValidToken: ToString {
    /// Returns the token's location as a pair of offsets.
    ///
    /// The `Token` implementation in this file returns
    /// `(start, start + byte length of the matched text)`.
    fn position(&self) -> (usize, usize);
}
15
16impl<TypeEnum> ToString for Token<TypeEnum>
17where
18 TypeEnum: Copy + Debug,
19{
20 fn to_string(&self) -> String {
21 self.value.clone()
22 }
23}
24
25impl<TypeEnum> ValidToken for Token<TypeEnum>
26where
27 TypeEnum: Copy + Debug,
28{
29 fn position(&self) -> (usize, usize) {
30 (self.position, self.position + self.value.len())
31 }
32}
33
34pub struct Matcher<TypeEnum> {
36 pub(crate) matchers: Vec<(regex::Regex, TypeEnum)>,
37 pub(crate) ignored: Vec<regex::Regex>,
38}
39
40impl<TypeEnum: Copy> Matcher<TypeEnum> {
41 pub fn build(regexes: Vec<(String, TypeEnum)>, ignored: Vec<String>) -> Self {
43 Self {
44 matchers: regexes
45 .iter()
46 .map(|(regex, type_)| (regex::Regex::new(regex.as_str()).unwrap(), type_.clone()))
47 .collect(),
48 ignored: ignored
49 .iter()
50 .map(|regex| regex::Regex::new(regex.as_str()).unwrap())
51 .collect(),
52 }
53 }
54}
55
56pub trait BuildableMatcher<TypeEnum> {
58 fn new() -> Matcher<TypeEnum>;
60}
61
62pub trait ValidMatcher {
63 type TokenType: ValidToken;
64
65 fn try_match(&self, query: &str, position: usize) -> Option<Self::TokenType>;
66}
67
68impl<TypeEnum> ValidMatcher for Matcher<TypeEnum>
69where
70 TypeEnum: Copy + Debug,
71{
72 type TokenType = Token<TypeEnum>;
73
74 fn try_match(&self, query: &str, position: usize) -> Option<Self::TokenType> {
75 for (regex, type_) in &self.matchers {
76 let m = regex.find(&query);
77 if let Some(token) = m {
78 return Some(Token {
79 value: String::from(token.as_str()),
80 position,
81 type_: type_.clone(),
82 });
83 }
84 }
85
86 for regex in &self.ignored {
87 let m = regex.find(&query);
88 if let Some(token) = m {
89 let token = token.as_str();
90
91 return self.try_match(
92 &String::from(query.strip_prefix(token).unwrap()),
93 position + token.len(),
94 );
95 }
96 }
97
98 None
99 }
100}