great_tokenizer/
lib.rs

1#![cfg(any(target_os = "windows", target_os = "linux"))]
2#![allow(internal_features)]
3#![feature(fmt_internals)]
4
5mod column_line;
6mod constants;
7mod matcher;
8mod platform_const;
9
10use std::any::Any;
11
12use anyhow::{Ok, Result};
13use column_line::*;
14use constants::*;
15use derive_more::derive::Display;
16pub use matcher::*;
17use regex::Regex;
18
/// A single lexeme produced by [`Tokenizer::start`]: the matched text plus
/// the position of its first character in the original input.
#[derive(Clone)]
pub struct Token {
    // The exact substring that a matcher consumed.
    pub val: String,
    // Line of the token's first character (numbering base is whatever
    // `LineColLookup::get` returns — presumably 1-based; TODO confirm).
    pub line: usize,
    // Column of the token's first character within `line`.
    pub column: usize,
}
25
26impl ::std::fmt::Debug for Token {
27    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28        write!(
29            f,
30            "Token {{value: {:#?}, line: {:#?}, column: {:#?}}}",
31            self.val, self.line, self.column
32        )
33    }
34}
35
36impl ::std::fmt::Display for Token {
37    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
38        <Self as ::std::fmt::Debug>::fmt(self, f)
39    }
40}
41
/// Splits an input string into [`Token`]s by repeatedly trying an ordered
/// list of matchers against the unconsumed remainder.
pub struct Tokenizer {
    // Patterns tried in registration order on every step; the first one
    // that matches wins.
    matchers: Vec<Matcher>,
    // The full input text. NOTE(review): despite the name this is the owned
    // source `String`, not an iterator.
    str_iter: String,
}
46
47impl Tokenizer {
48    pub fn new(s: impl ToString) -> Self {
49        Self {
50            matchers: Default::default(),
51            str_iter: s.to_string(),
52        }
53    }
54
55    pub fn add_str_pat<T: ToString>(&mut self, src: T) {
56        self.matchers.push(src.to_string().into());
57    }
58
59    pub fn add_str_pattern_array<T: ToString, const N: usize>(&mut self, src: [T; N]) {
60        for s in src {
61            self.add_str_pat(s);
62        }
63    }
64
65    pub fn add_str_pattern_vec<T: ToString>(&mut self, src: Vec<T>) {
66        for s in src {
67            self.add_str_pat(s);
68        }
69    }
70
71    pub fn add_pattern_array<T: MatcherTrait + Any, const N: usize>(&mut self, src: [T; N]) {
72        for s in src {
73            self.add_pat(s);
74        }
75    }
76
77    pub fn add_pattern_vec<T: MatcherTrait + Any>(&mut self, src: Vec<T>) {
78        for s in src {
79            self.add_pat(s);
80        }
81    }
82
83    pub fn add_pat<T: MatcherTrait + Any>(&mut self, src: T) {
84        self.matchers.push(src.into());
85    }
86
87    pub fn add_regex_pat(&mut self, src: impl ToString) -> Result<()> {
88        self.matchers
89            .push(Regex::new(src.to_string().as_str())?.into());
90        Ok(())
91    }
92
93    pub fn add_regex_pattern_array<T: ToString, const N: usize>(
94        &mut self,
95        src: [T; N],
96    ) -> Result<()> {
97        for s in src {
98            self.add_regex_pat(s)?;
99        }
100        Ok(())
101    }
102
103    pub fn add_regex_pattern_vec<T: ToString>(&mut self, src: Vec<T>) -> Result<()> {
104        for s in src {
105            self.add_regex_pat(s)?;
106        }
107        Ok(())
108    }
109
110    pub fn add_ws_pat(&mut self) {
111        self.matchers.push(WHITE_SPACE_REGEX.into());
112    }
113
114    pub(crate) fn add_common_pat(&mut self, src: Matcher) {
115        self.matchers.push(src);
116    }
117
118    pub fn start(&mut self) -> Result<Vec<Token>> {
119        let mut res = Vec::with_capacity(self.matchers.len());
120        let lookup = LineColLookup::new(&self.str_iter);
121        let mut current_str = self.str_iter.clone();
122        let mut current_index = 0;
123        loop {
124            let mut matched = false;
125            for reg in &self.matchers {
126                if let Some(s) = reg.get(&current_str) {
127                    current_str = current_str[s.len()..].to_owned();
128                    let (line, column) = lookup.get(current_index);
129                    current_index += s.len();
130                    res.push(Token {
131                        val: s,
132                        line,
133                        column,
134                    });
135                    matched = true;
136                    break;
137                } else {
138                    continue;
139                }
140            }
141            if !matched {
142                return Err(TokenizerError::AllMatchersMatchNothing.into());
143            }
144            if current_str.len() == 0 {
145                break;
146            }
147        }
148        Ok(res)
149    }
150}
151
/// Errors returned by [`Tokenizer::start`].
///
/// `Display` comes from the `derive_more` derive (it prints the variant
/// name), while `thiserror::Error` supplies the `std::error::Error` impl
/// on top of that existing `Display`.
#[derive(Debug, Display, thiserror::Error)]
pub enum TokenizerError {
    /// Tokenization stalled: none of the registered matchers matched any
    /// prefix of the remaining input.
    AllMatchersMatchNothing,
}
156
157#[inline]
158pub fn build_tokenizer<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Tokenizer {
159    let mut tokenizer = Tokenizer::new(src);
160    for v in val {
161        tokenizer.add_common_pat(v);
162    }
163    tokenizer
164}
165
166#[inline]
167pub fn to_tokens<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Result<Vec<Token>> {
168    let mut tokenizer = build_tokenizer(val, src);
169    tokenizer.start()
170}
171
172#[inline]
173pub fn to_tokens_without_ws<TSrc: ToString>(val: Vec<Matcher>, src: TSrc) -> Result<Vec<Token>> {
174    let mut tokenizer = build_tokenizer(val, src);
175    Ok(filter_white_spaces(tokenizer.start()?))
176}
177
178#[inline]
179pub fn filter_white_spaces(val: Vec<Token>) -> Vec<Token> {
180    val.iter()
181        .filter(|x| !WHITE_SPACE_REGEX.is_match(&x.val))
182        .map(|x| x.clone())
183        .collect()
184}
185
#[cfg(test)]
mod tests {
    use regex::Regex;

    use super::*;

    /// Smoke test: tokenizes a tiny class declaration using literal,
    /// whitespace, and identifier-ish patterns, then strips the
    /// whitespace tokens. Output is only inspected via `dbg!`.
    #[test]
    #[allow(unused_must_use)]
    fn t1() {
        let src = "
class Test {
}";
        let tokens = filter_white_spaces(
            to_tokens(
                vec![
                    "class".into(),
                    WHITE_SPACE_REGEX.clone().into(),
                    // Anchored (\A) run of: literal '.' or any char that is
                    // neither '{' nor whitespace — a regex-crate nested
                    // character class. Intended to grab identifiers like `Test`.
                    Regex::new(r"\A[.[^\{\s]]+").unwrap().into(),
                    "{".into(),
                    "}".into(),
                ],
                src,
            )
            .unwrap(),
        );
        dbg!(tokens);
    }
    /// Checks that an anchored string-literal pattern can consume a quoted
    /// string containing escaped quotes; result is only inspected via `dbg!`.
    #[test]
    #[allow(unused_must_use)]
    fn test_string_parse() {
        let src = r#""sudsier\" asdf \"""#;
        let mut tokenizer = Tokenizer::new(src);
        tokenizer.add_ws_pat();
        // Anchored double-quoted string: the class mixes non-'"' chars with
        // the \" escape sequence (nested-class syntax) — TODO confirm it
        // rejects an unescaped closing quote mid-string as intended.
        tokenizer.add_regex_pat(r#"\A"[[.[^"]]\\"]*""#).unwrap();
        dbg!(tokenizer.start());
    }
}