1use crate::error::ParserErrorKind;
2use lazy_static::lazy_static;
3use regex::Regex;
4use std::borrow::Cow;
5use std::collections::VecDeque;
6use std::fmt;
7use std::fmt::Display;
8
/// Literal tag that opens a region in which wikitext markup is not interpreted.
static NOWIKI_OPEN: &str = "<nowiki>";
/// Literal tag that closes a `<nowiki>` region.
static NOWIKI_CLOSE: &str = "</nowiki>";
11
lazy_static! {
    // Matches the earliest occurrence of any special wikitext marker:
    // {{, }}, [[, ]], =, |, ', newline, :, ;, *, #, <nowiki>, </nowiki>.
    // `Tokenizer::next` uses it to find where a plain-text run ends.
    static ref TEXT_REGEX: Regex = Regex::new(&format!(
        "(\\{{\\{{|\\}}\\}}|\\[\\[|\\]\\]|=|\\||'|\n|:|;|\\*|#|{NOWIKI_OPEN}|{NOWIKI_CLOSE})"
    ))
    .unwrap();
}
18
/// A single lexical token of wikitext input.
///
/// `Text` borrows from the input where possible (hence the `Cow`); every other
/// variant corresponds to a fixed marker string (see [`Token::to_str`]).
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token<'a> {
    /// A run of plain text containing none of the special markers.
    Text(Cow<'a, str>),
    /// `=`
    Equals,
    /// `{{`
    DoubleOpenBrace,
    /// `}}`
    DoubleCloseBrace,
    /// `[[`
    DoubleOpenBracket,
    /// `]]`
    DoubleCloseBracket,
    /// `<nowiki>`
    NoWikiOpen,
    /// `</nowiki>`
    NoWikiClose,
    /// `|`
    VerticalBar,
    /// `'`
    Apostrophe,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `*`
    Star,
    /// `#`
    Sharp,
    /// A line break (`\n`).
    Newline,
    /// End of input; emitted indefinitely once the input is exhausted.
    Eof,
}
38
/// A 1-based line/column location within the tokenizer's input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TextPosition {
    /// 1-based line number.
    pub line: usize,
    /// 1-based column number.
    pub column: usize,
}

impl TextPosition {
    /// Creates a position at the given 1-based `line` and `column`.
    pub fn new(line: usize, column: usize) -> Self {
        TextPosition { line, column }
    }
}

impl Default for TextPosition {
    /// The start of the input: line 1, column 1.
    fn default() -> Self {
        TextPosition::new(1, 1)
    }
}
60
61#[derive(Clone, Debug)]
62pub struct PositionAwareStrIterator<'input> {
63 input: &'input str,
64 position: TextPosition,
65}
66
67impl<'input> PositionAwareStrIterator<'input> {
68 pub fn new<'input_argument: 'input>(input: &'input_argument str) -> Self {
69 Self {
70 input,
71 position: Default::default(),
72 }
73 }
74
75 pub fn remaining_input(&self) -> &'input str {
76 self.input
77 }
78
79 pub fn advance_until(&mut self, limit: usize) {
80 let mut cumulative_advancement = 0;
81 while cumulative_advancement < limit {
82 cumulative_advancement += self.advance_one();
83 }
84 assert_eq!(cumulative_advancement, limit);
85 }
86
87 pub fn advance_one(&mut self) -> usize {
88 assert!(!self.input.is_empty());
89 if self.input.starts_with('\n') {
90 self.position.line += 1;
91 self.position.column = 1;
92 } else {
93 self.position.column += 1;
94 }
95
96 if let Some((offset, _)) = self.input.char_indices().nth(1) {
97 self.input = &self.input[offset..];
98 offset
99 } else {
100 let offset = self.input.len();
101 self.input = &self.input[offset..];
102 offset
103 }
104 }
105
106 pub fn is_at_start(&self) -> bool {
108 self.position == Default::default()
109 }
110}
111
/// Streaming tokenizer that yields [`Token`]s from a wikitext string.
pub struct Tokenizer<'input> {
    // Tracks both the remaining input and the current line/column position.
    input: PositionAwareStrIterator<'input>,
}
115
116impl<'input> Tokenizer<'input> {
117 pub fn new<'input_argument: 'input>(input: &'input_argument str) -> Self {
118 Self {
119 input: PositionAwareStrIterator::new(input),
120 }
121 }
122
123 #[allow(unused)]
124 pub fn tokenize_all(&mut self) -> Vec<Token<'input>> {
125 let mut tokens = Vec::new();
126 while tokens.last() != Some(&Token::Eof) {
127 tokens.push(self.next());
128 }
129 tokens
130 }
131
132 pub fn next<'token, 'this>(&'this mut self) -> Token<'token>
133 where
134 'input: 'token + 'this,
135 {
136 let input = self.input.remaining_input();
137 if input.is_empty() {
138 Token::Eof
139 } else if input.starts_with(r"{{") {
140 self.input.advance_until(2);
141 Token::DoubleOpenBrace
142 } else if input.starts_with(r"}}") {
143 self.input.advance_until(2);
144 Token::DoubleCloseBrace
145 } else if input.starts_with("[[") {
146 self.input.advance_until(2);
147 Token::DoubleOpenBracket
148 } else if input.starts_with("]]") {
149 self.input.advance_until(2);
150 Token::DoubleCloseBracket
151 } else if input.starts_with(NOWIKI_OPEN) {
152 self.input.advance_until(NOWIKI_OPEN.len());
153 Token::NoWikiOpen
154 } else if input.starts_with(NOWIKI_CLOSE) {
155 self.input.advance_until(NOWIKI_CLOSE.len());
156 Token::NoWikiClose
157 } else if input.starts_with('=') {
158 self.input.advance_one();
159 Token::Equals
160 } else if input.starts_with('|') {
161 self.input.advance_one();
162 Token::VerticalBar
163 } else if input.starts_with('\'') {
164 self.input.advance_one();
165 Token::Apostrophe
166 } else if input.starts_with('\n') {
167 self.input.advance_one();
168 Token::Newline
169 } else if input.starts_with(':') {
170 self.input.advance_one();
171 Token::Colon
172 } else if input.starts_with(';') {
173 self.input.advance_one();
174 Token::Semicolon
175 } else if input.starts_with('*') {
176 self.input.advance_one();
177 Token::Star
178 } else if input.starts_with('#') {
179 self.input.advance_one();
180 Token::Sharp
181 } else if let Some(regex_match) = TEXT_REGEX.find(input) {
182 let result = Token::Text(input[..regex_match.start()].into());
183 self.input.advance_until(regex_match.start());
184 result
185 } else {
186 let result = Token::Text(self.input.remaining_input().into());
187 self.input.advance_until(input.len());
188 result
189 }
190 }
191
192 #[allow(unused)]
194 pub fn is_at_start(&self) -> bool {
195 self.input.is_at_start()
196 }
197}
198
/// Wrapper around [`Tokenizer`] that adds arbitrary-distance lookahead.
pub struct MultipeekTokenizer<'tokenizer> {
    tokenizer: Tokenizer<'tokenizer>,
    // Lookahead buffer: tokens already produced by `tokenizer` but not yet
    // consumed via `next`, each paired with the position where it started.
    peek: VecDeque<(Token<'tokenizer>, TextPosition)>,
    // Set on the first call to `next`; backs `is_at_start`.
    next_was_called: bool,
}
204
205impl<'tokenizer> MultipeekTokenizer<'tokenizer> {
206 pub fn new(tokenizer: Tokenizer<'tokenizer>) -> Self {
207 Self {
208 tokenizer,
209 peek: VecDeque::new(),
210 next_was_called: false,
211 }
212 }
213
214 pub fn next<'token>(&mut self) -> (Token<'token>, TextPosition)
215 where
216 'tokenizer: 'token,
217 {
218 self.next_was_called = true;
219 if let Some((token, text_position)) = self.peek.pop_front() {
220 (token, text_position)
221 } else {
222 let text_position = self.tokenizer.input.position;
223 (self.tokenizer.next(), text_position)
224 }
225 }
226
227 pub fn peek(&mut self, distance: usize) -> &(Token, TextPosition) {
228 while self.peek.len() < distance + 1 {
229 let text_position = self.tokenizer.input.position;
230 self.peek.push_back((self.tokenizer.next(), text_position));
231 }
232 &self.peek[distance]
233 }
234
235 pub fn repeek(&self, distance: usize) -> Option<&(Token, TextPosition)> {
239 self.peek.get(distance)
240 }
241
242 pub fn expect(&mut self, token: &Token) -> crate::error::Result<()> {
243 let (next, text_position) = self.next();
244 if &next == token {
245 Ok(())
246 } else {
247 Err(ParserErrorKind::UnexpectedToken {
248 expected: token.to_string(),
249 actual: next.to_string(),
250 }
251 .into_parser_error(text_position))
252 }
253 }
254
255 #[allow(unused)]
257 pub fn is_at_start(&self) -> bool {
258 !self.next_was_called
259 }
260}
261
262impl<'token> Display for Token<'token> {
263 fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
264 write!(fmt, "{}", self.to_str())
265 }
266}
267
impl Token<'_> {
    /// Returns the literal source text of this token.
    ///
    /// For `Text` this is the contained text itself; for `Eof` it is the
    /// placeholder `"<EOF>"`; every other variant maps to its fixed marker.
    pub fn to_str(&self) -> &str {
        match self {
            Token::Text(text) => text,
            Token::Equals => "=",
            Token::DoubleOpenBrace => "{{",
            Token::DoubleCloseBrace => "}}",
            Token::DoubleOpenBracket => "[[",
            Token::DoubleCloseBracket => "]]",
            Token::NoWikiOpen => NOWIKI_OPEN,
            Token::NoWikiClose => NOWIKI_CLOSE,
            Token::VerticalBar => "|",
            Token::Apostrophe => "'",
            Token::Newline => "\n",
            Token::Colon => ":",
            Token::Semicolon => ";",
            Token::Star => "*",
            Token::Sharp => "#",
            Token::Eof => "<EOF>",
        }
    }
}
290
#[cfg(test)]
mod tests {
    use crate::tokenizer::{Token, Tokenizer};

    /// Exercises brace/equals tokenization and the plain-text fallback,
    /// including unpaired `}`/`[`/`{` characters that must stay inside
    /// `Text` tokens rather than becoming markers.
    #[test]
    fn simple() {
        let input = "{{==a= v}} }} } edf } } [ {";
        let mut tokenizer = Tokenizer::new(input);
        let tokens = tokenizer.tokenize_all();
        assert_eq!(
            tokens.as_slice(),
            [
                Token::DoubleOpenBrace,
                Token::Equals,
                Token::Equals,
                Token::Text("a".into()),
                Token::Equals,
                Token::Text(" v".into()),
                Token::DoubleCloseBrace,
                Token::Text(" ".into()),
                Token::DoubleCloseBrace,
                Token::Text(" } edf } } [ {".into()),
                Token::Eof,
            ]
        );
    }
}