//! wikitext_parser/tokenizer.rs — tokenizer for wikitext input.

1use crate::error::ParserErrorKind;
2use lazy_static::lazy_static;
3use regex::Regex;
4use std::borrow::Cow;
5use std::collections::VecDeque;
6use std::fmt;
7use std::fmt::Display;
8
// Literal tags delimiting verbatim (unparsed) regions of wikitext.
static NOWIKI_OPEN: &str = "<nowiki>";
static NOWIKI_CLOSE: &str = "</nowiki>";

lazy_static! {
    // Matches the earliest occurrence of any special token ({{, }}, [[, ]],
    // =, |, ', newline, :, ;, *, # or a nowiki tag). `Tokenizer::next` uses
    // the match position to find where a plain-text run ends; everything
    // before the match is text.
    static ref TEXT_REGEX: Regex = Regex::new(&format!(
        "(\\{{\\{{|\\}}\\}}|\\[\\[|\\]\\]|=|\\||'|\n|:|;|\\*|#|{NOWIKI_OPEN}|{NOWIKI_CLOSE})"
    ))
    .unwrap();
}
18
/// A lexical token produced by the [`Tokenizer`].
///
/// Each variant except [`Token::Text`] and [`Token::Eof`] corresponds to a
/// fixed piece of wikitext markup; see [`Token::to_str`] for the exact text.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Token<'a> {
    /// A run of plain text containing none of the special markup below.
    Text(Cow<'a, str>),
    /// `=`
    Equals,
    /// `{{`
    DoubleOpenBrace,
    /// `}}`
    DoubleCloseBrace,
    /// `[[`
    DoubleOpenBracket,
    /// `]]`
    DoubleCloseBracket,
    /// `<nowiki>`
    NoWikiOpen,
    /// `</nowiki>`
    NoWikiClose,
    /// `|`
    VerticalBar,
    /// `'`
    Apostrophe,
    /// `:`
    Colon,
    /// `;`
    Semicolon,
    /// `*`
    Star,
    /// `#`
    Sharp,
    /// `\n`
    Newline,
    /// End of input; emitted forever once the input is exhausted.
    Eof,
}
38
/// A position in a text.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct TextPosition {
    /// One-based line number.
    pub line: usize,
    /// One-based column number.
    pub column: usize,
}

impl TextPosition {
    /// Create a new text position at the given `line` and `column`.
    pub fn new(line: usize, column: usize) -> Self {
        Self { line, column }
    }
}

impl Default for TextPosition {
    /// The starting position of a text: line 1, column 1.
    fn default() -> Self {
        Self { line: 1, column: 1 }
    }
}

/// A cursor over a `&str` that tracks the current line/column while advancing.
#[derive(Clone, Debug)]
pub struct PositionAwareStrIterator<'input> {
    // The not-yet-consumed suffix of the original input.
    input: &'input str,
    // Position of the first character of `input` within the original text.
    position: TextPosition,
}

impl<'input> PositionAwareStrIterator<'input> {
    /// Create a cursor over `input`, positioned at line 1, column 1.
    pub fn new<'input_argument: 'input>(input: &'input_argument str) -> Self {
        Self {
            input,
            position: Default::default(),
        }
    }

    /// Returns the remaining (unconsumed) input.
    pub fn remaining_input(&self) -> &'input str {
        self.input
    }

    /// Advance by exactly `limit` bytes.
    ///
    /// # Panics
    /// Panics if `limit` exceeds the remaining input length or does not fall
    /// on a UTF-8 character boundary.
    pub fn advance_until(&mut self, limit: usize) {
        let mut advanced = 0;
        while advanced < limit {
            advanced += self.advance_one();
        }
        // `limit` must land exactly on a character boundary.
        assert_eq!(advanced, limit);
    }

    /// Advance past the next character, updating the line/column position,
    /// and return the number of bytes consumed (the character's UTF-8 width).
    ///
    /// # Panics
    /// Panics if the input is already exhausted.
    pub fn advance_one(&mut self) -> usize {
        let character = self
            .input
            .chars()
            .next()
            .expect("cannot advance past the end of the input");

        if character == '\n' {
            self.position.line += 1;
            self.position.column = 1;
        } else {
            self.position.column += 1;
        }

        // `len_utf8` is the byte width of the character we just consumed, so
        // slicing at it is always a valid character boundary.
        let width = character.len_utf8();
        self.input = &self.input[width..];
        width
    }

    /// Returns `true` if the tokenizer has not yet been advanced.
    pub fn is_at_start(&self) -> bool {
        self.position == Default::default()
    }
}
111
/// Splits wikitext input into a stream of [`Token`]s, tracking the current
/// line/column via a [`PositionAwareStrIterator`].
pub struct Tokenizer<'input> {
    input: PositionAwareStrIterator<'input>,
}

impl<'input> Tokenizer<'input> {
    /// Create a tokenizer over `input`, starting at line 1, column 1.
    pub fn new<'input_argument: 'input>(input: &'input_argument str) -> Self {
        Self {
            input: PositionAwareStrIterator::new(input),
        }
    }

    /// Consume the entire input and return all tokens, including the
    /// terminating [`Token::Eof`].
    #[allow(unused)]
    pub fn tokenize_all(&mut self) -> Vec<Token<'input>> {
        let mut tokens = Vec::new();
        while tokens.last() != Some(&Token::Eof) {
            tokens.push(self.next());
        }
        tokens
    }

    /// Return the next token and advance past it.
    /// Once the input is exhausted, returns [`Token::Eof`] on every call.
    ///
    /// NOTE: the branch order below matters — multi-character tokens
    /// (`{{`, `}}`, `[[`, `]]`, nowiki tags) must be checked before the
    /// single-character ones, and the regex fallback relies on every regex
    /// alternative being handled by one of the branches above it.
    pub fn next<'token, 'this>(&'this mut self) -> Token<'token>
    where
        'input: 'token + 'this,
    {
        let input = self.input.remaining_input();
        if input.is_empty() {
            Token::Eof
        } else if input.starts_with(r"{{") {
            self.input.advance_until(2);
            Token::DoubleOpenBrace
        } else if input.starts_with(r"}}") {
            self.input.advance_until(2);
            Token::DoubleCloseBrace
        } else if input.starts_with("[[") {
            self.input.advance_until(2);
            Token::DoubleOpenBracket
        } else if input.starts_with("]]") {
            self.input.advance_until(2);
            Token::DoubleCloseBracket
        } else if input.starts_with(NOWIKI_OPEN) {
            self.input.advance_until(NOWIKI_OPEN.len());
            Token::NoWikiOpen
        } else if input.starts_with(NOWIKI_CLOSE) {
            self.input.advance_until(NOWIKI_CLOSE.len());
            Token::NoWikiClose
        } else if input.starts_with('=') {
            self.input.advance_one();
            Token::Equals
        } else if input.starts_with('|') {
            self.input.advance_one();
            Token::VerticalBar
        } else if input.starts_with('\'') {
            self.input.advance_one();
            Token::Apostrophe
        } else if input.starts_with('\n') {
            self.input.advance_one();
            Token::Newline
        } else if input.starts_with(':') {
            self.input.advance_one();
            Token::Colon
        } else if input.starts_with(';') {
            self.input.advance_one();
            Token::Semicolon
        } else if input.starts_with('*') {
            self.input.advance_one();
            Token::Star
        } else if input.starts_with('#') {
            self.input.advance_one();
            Token::Sharp
        } else if let Some(regex_match) = TEXT_REGEX.find(input) {
            // A special token occurs later in the input; everything before it
            // is one text token. The match cannot start at offset zero here,
            // because every regex alternative was already handled above.
            let result = Token::Text(input[..regex_match.start()].into());
            self.input.advance_until(regex_match.start());
            result
        } else {
            // No special token remains; the rest of the input is plain text.
            let result = Token::Text(self.input.remaining_input().into());
            self.input.advance_until(input.len());
            result
        }
    }

    /// Returns `true` if the tokenizer has not yet been advanced.
    #[allow(unused)]
    pub fn is_at_start(&self) -> bool {
        self.input.is_at_start()
    }
}
198
/// A [`Tokenizer`] wrapper that supports peeking arbitrarily far ahead by
/// buffering tokens (together with their starting positions) in a queue.
pub struct MultipeekTokenizer<'tokenizer> {
    tokenizer: Tokenizer<'tokenizer>,
    // Tokens already produced by `tokenizer` but not yet consumed via `next`,
    // front = the token `next` would return.
    peek: VecDeque<(Token<'tokenizer>, TextPosition)>,
    // Set once `next` has been called; backs `is_at_start`.
    next_was_called: bool,
}

impl<'tokenizer> MultipeekTokenizer<'tokenizer> {
    /// Wrap `tokenizer` with an empty peek buffer.
    pub fn new(tokenizer: Tokenizer<'tokenizer>) -> Self {
        Self {
            tokenizer,
            peek: VecDeque::new(),
            next_was_called: false,
        }
    }

    /// Consume and return the next token together with its starting position.
    /// Buffered peeked tokens are drained before the underlying tokenizer is
    /// advanced further.
    pub fn next<'token>(&mut self) -> (Token<'token>, TextPosition)
    where
        'tokenizer: 'token,
    {
        self.next_was_called = true;
        if let Some((token, text_position)) = self.peek.pop_front() {
            (token, text_position)
        } else {
            // Record the position before tokenizing so it points at the
            // token's first character.
            let text_position = self.tokenizer.input.position;
            (self.tokenizer.next(), text_position)
        }
    }

    /// Peek `distance` tokens ahead (0 = the token `next` would return next),
    /// filling the buffer from the underlying tokenizer as needed.
    pub fn peek(&mut self, distance: usize) -> &(Token, TextPosition) {
        while self.peek.len() < distance + 1 {
            let text_position = self.tokenizer.input.position;
            self.peek.push_back((self.tokenizer.next(), text_position));
        }
        &self.peek[distance]
    }

    /// Peeks a position inside the current peek buffer.
    /// Returns `None` if that position has not been filled by a prior call to
    /// [`Self::peek`] (or was consumed again by [`Self::next`]).
    /// This is useful because it does not require a mutable reference to self.
    pub fn repeek(&self, distance: usize) -> Option<&(Token, TextPosition)> {
        self.peek.get(distance)
    }

    /// Consume the next token; `Ok(())` if it equals `token`, otherwise an
    /// `UnexpectedToken` parser error carrying the token's position.
    pub fn expect(&mut self, token: &Token) -> crate::error::Result<()> {
        let (next, text_position) = self.next();
        if &next == token {
            Ok(())
        } else {
            Err(ParserErrorKind::UnexpectedToken {
                expected: token.to_string(),
                actual: next.to_string(),
            }
            .into_parser_error(text_position))
        }
    }

    /// Returns `true` if the tokenizer has not yet been advanced.
    #[allow(unused)]
    pub fn is_at_start(&self) -> bool {
        !self.next_was_called
    }
}
261
262impl<'token> Display for Token<'token> {
263    fn fmt(&self, fmt: &mut fmt::Formatter) -> Result<(), fmt::Error> {
264        write!(fmt, "{}", self.to_str())
265    }
266}
267
impl Token<'_> {
    /// Returns the token's textual representation as it appears in wikitext
    /// source. [`Token::Eof`] has no source text and is rendered as `"<EOF>"`.
    pub fn to_str(&self) -> &str {
        match self {
            Token::Text(text) => text,
            Token::Equals => "=",
            Token::DoubleOpenBrace => "{{",
            Token::DoubleCloseBrace => "}}",
            Token::DoubleOpenBracket => "[[",
            Token::DoubleCloseBracket => "]]",
            Token::NoWikiOpen => NOWIKI_OPEN,
            Token::NoWikiClose => NOWIKI_CLOSE,
            Token::VerticalBar => "|",
            Token::Apostrophe => "'",
            Token::Newline => "\n",
            Token::Colon => ":",
            Token::Semicolon => ";",
            Token::Star => "*",
            Token::Sharp => "#",
            Token::Eof => "<EOF>",
        }
    }
}
290
#[cfg(test)]
mod tests {
    use crate::tokenizer::{Token, Tokenizer};

    /// Smoke test: braces and equals signs become their own tokens, while
    /// runs of characters with no special meaning (including lone `{`, `}`,
    /// `[`) merge into single `Text` tokens, terminated by `Eof`.
    #[test]
    fn simple() {
        let input = "{{==a=  v}} }} } edf } } [ {";
        let mut tokenizer = Tokenizer::new(input);
        let tokens = tokenizer.tokenize_all();
        assert_eq!(
            tokens.as_slice(),
            [
                Token::DoubleOpenBrace,
                Token::Equals,
                Token::Equals,
                Token::Text("a".into()),
                Token::Equals,
                Token::Text("  v".into()),
                Token::DoubleCloseBrace,
                Token::Text(" ".into()),
                Token::DoubleCloseBrace,
                Token::Text(" } edf } } [ {".into()),
                Token::Eof,
            ]
        );
    }
}