1#![cfg(any(target_os = "windows", target_os = "linux"))]
2#![allow(internal_features)]
3#![feature(fmt_internals)]
4
5mod column_line;
6mod constants;
7mod matcher;
8mod platform_const;
9
10use std::any::Any;
11
12use anyhow::{Ok, Result};
13use column_line::*;
14use constants::*;
15use derive_more::derive::Display;
16pub use matcher::*;
17use regex::Regex;
18
/// A single lexeme produced by [`Tokenizer::start`]: the matched text plus
/// the position of its first character in the original input.
#[derive(Clone)]
pub struct Token {
    /// The exact substring that a matcher consumed.
    pub val: String,
    /// Line of the token's first character, as reported by `LineColLookup`
    /// (presumably 1-based — confirm against the `column_line` module).
    pub line: usize,
    /// Column of the token's first character; same lookup and base as `line`.
    pub column: usize,
}
25
26impl ::std::fmt::Debug for Token {
27 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28 write!(
29 f,
30 "Token {{value: {:#?}, line: {:#?}, column: {:#?}}}",
31 self.val, self.line, self.column
32 )
33 }
34}
35
36impl ::std::fmt::Display for Token {
37 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38 <Self as ::std::fmt::Debug>::fmt(self, f)
39 }
40}
41
/// A configurable tokenizer: patterns are registered as [`Matcher`]s and then
/// applied in registration order to split the input into [`Token`]s.
pub struct Tokenizer {
    // Tried in order on each step; the first pattern that matches wins.
    matchers: Vec<Matcher>,
    // The full input text. NOTE(review): despite the name this is an owned
    // String, not an iterator — consider renaming; confirm with module owners.
    str_iter: String,
}
46
47impl Tokenizer {
48 pub fn new(s: impl ToString) -> Self {
49 Self {
50 matchers: Default::default(),
51 str_iter: s.to_string(),
52 }
53 }
54
55 pub fn add_str_pat<T: ToString>(&mut self, src: T) {
56 self.matchers.push(src.to_string().into());
57 }
58
59 pub fn add_str_pattern_array<T: ToString, const N: usize>(&mut self, src: [T; N]) {
60 for s in src {
61 self.add_str_pat(s);
62 }
63 }
64
65 pub fn add_str_pattern_vec<T: ToString>(&mut self, src: Vec<T>) {
66 for s in src {
67 self.add_str_pat(s);
68 }
69 }
70
71 pub fn add_pattern_array<T: MatcherTrait + Any, const N: usize>(&mut self, src: [T; N]) {
72 for s in src {
73 self.add_pat(s);
74 }
75 }
76
77 pub fn add_pattern_vec<T: MatcherTrait + Any>(&mut self, src: Vec<T>) {
78 for s in src {
79 self.add_pat(s);
80 }
81 }
82
83 pub fn add_pat<T: MatcherTrait + Any>(&mut self, src: T) {
84 self.matchers.push(src.into());
85 }
86
87 pub fn add_regex_pat(&mut self, src: impl ToString) -> Result<()> {
88 self.matchers
89 .push(Regex::new(src.to_string().as_str())?.into());
90 Ok(())
91 }
92
93 pub fn add_regex_pattern_array<T: ToString, const N: usize>(
94 &mut self,
95 src: [T; N],
96 ) -> Result<()> {
97 for s in src {
98 self.add_regex_pat(s)?;
99 }
100 Ok(())
101 }
102
103 pub fn add_regex_pattern_vec<T: ToString>(&mut self, src: Vec<T>) -> Result<()> {
104 for s in src {
105 self.add_regex_pat(s)?;
106 }
107 Ok(())
108 }
109
110 pub fn add_ws_pat(&mut self) {
111 self.matchers.push(WHITE_SPACE_REGEX.into());
112 }
113
114 pub(crate) fn add_common_pat(&mut self, src: Matcher) {
115 self.matchers.push(src);
116 }
117
118 pub fn start(&mut self) -> Result<Vec<Token>> {
119 let mut res = Vec::with_capacity(self.matchers.len());
120 let lookup = LineColLookup::new(&self.str_iter);
121 let mut current_str = self.str_iter.clone();
122 let mut current_index = 0;
123 loop {
124 let mut matched = false;
125 for reg in &self.matchers {
126 if let Some(s) = reg.get(¤t_str) {
127 current_str = current_str[s.len()..].to_owned();
128 let (line, column) = lookup.get(current_index);
129 current_index += s.len();
130 res.push(Token {
131 val: s,
132 line,
133 column,
134 });
135 matched = true;
136 break;
137 } else {
138 continue;
139 }
140 }
141 if !matched {
142 return Err(TokenizerError::AllMatchersMatchNothing.into());
143 }
144 if current_str.len() == 0 {
145 break;
146 }
147 }
148 Ok(res)
149 }
150}
151
/// Errors produced by [`Tokenizer::start`].
///
/// `Display` comes from `derive_more` (prints the variant name);
/// `thiserror::Error` supplies the `std::error::Error` impl on top of it.
#[derive(Debug, Display, thiserror::Error)]
pub enum TokenizerError {
    /// No registered matcher matched at the current input position.
    AllMatchersMatchNothing,
}
156
157#[inline]
158pub fn build_tokenizer<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Tokenizer {
159 let mut tokenizer = Tokenizer::new(src);
160 for v in val {
161 tokenizer.add_common_pat(v);
162 }
163 tokenizer
164}
165
166#[inline]
167pub fn to_tokens<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Result<Vec<Token>> {
168 let mut tokenizer = build_tokenizer(val, src);
169 tokenizer.start()
170}
171
172#[inline]
173pub fn to_tokens_without_ws<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Result<Vec<Token>> {
174 let mut tokenizer = build_tokenizer(val, src);
175 Ok(filter_white_spaces(tokenizer.start()?))
176}
177
178#[inline]
179pub fn filter_white_spaces(val: Vec<Token>) -> Vec<Token> {
180 val.iter()
181 .filter(|x| !WHITE_SPACE_REGEX.is_match(&x.val))
182 .map(|x| x.clone())
183 .collect()
184}
185
#[cfg(test)]
mod tests {
    use regex::Regex;

    use super::*;

    // Smoke test: tokenizes a minimal class declaration. Matcher order is
    // significant — "class" and whitespace are tried before the identifier
    // regex (which appears to use a nested character class to accept runs of
    // non-'{', non-whitespace characters — confirm against the regex crate's
    // class-union syntax). No assertions; output is inspected via dbg!.
    #[test]
    #[allow(unused_must_use)]
    fn t1() {
        let src = "
class Test {
}";
        let tokens = filter_white_spaces(
            to_tokens(
                vec![
                    "class".into(),
                    WHITE_SPACE_REGEX.clone().into(),
                    Regex::new(r"\A[.[^\{\s]]+").unwrap().into(),
                    "{".into(),
                    "}".into(),
                ],
                src,
            )
            .unwrap(),
        );
        dbg!(tokens);
    }
    // Smoke test: a string-literal pattern anchored at the start (\A) that
    // is intended to accept escaped quotes inside the literal. No assertions;
    // output is inspected via dbg!.
    #[test]
    #[allow(unused_must_use)]
    fn test_string_parse() {
        let src = r#""sudsier\" asdf \"""#;
        let mut tokenizer = Tokenizer::new(src);
        tokenizer.add_ws_pat();
        tokenizer.add_regex_pat(r#"\A"[[.[^"]]\\"]*""#).unwrap();
        dbg!(tokenizer.start());
    }
}