// note_mark/layer/lexer.rs

1//! Lexer for the Markdown syntax.
2//!
//! This module contains the lexer for the Markdown syntax. At this stage,
//! adjacent text characters are joined into single tokens, and the spaces and
//! tabs that make up whitespace-only lines are removed.
5
6use peekmore::{PeekMore, PeekMoreIterator};
7use std::iter::Peekable;
8
9use crate::model::token::*;
10
11/// Split a string into tokens.
12///
13/// This returns an iterator that yields tokens. This iterator has static
14/// lifetime, though tokens refer to the input string.
15pub fn lex(input: &'_ str) -> impl Iterator<Item = Token> + '_ {
16    let lexer = Lexer::new(input);
17    let lexer = TextJoiner::new(lexer);
18    SpaceCutter::new(lexer)
19}
20
/// Raw tokenizer: maps each character of the input to a token (with the
/// two-character sequences `\r\n` and backslash escapes handled specially).
struct Lexer<'a> {
    /// The full text being tokenized.
    input: &'a str,
    /// Progress through `input`, advanced by each emitted token's UTF-8
    /// byte length.
    cursor: usize,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        Self { input, cursor: 0 }
    }
}
31
32impl<'a> Iterator for Lexer<'a> {
33    type Item = Token;
34
35    fn next(&mut self) -> Option<Self::Item> {
36        let mut chars = self.input.char_indices().skip(self.cursor).peekable();
37
38        let (kind, start, len) = if let Some((index, c)) = chars.next() {
39            let len = c.len_utf8();
40
41            let (kind, len) = match c {
42                '#' => (TokenKind::Pound, len),
43                '*' => (TokenKind::Star, len),
44                ':' => (TokenKind::Colon, len),
45                '`' => (TokenKind::Backquote, len),
46                '>' => (TokenKind::Gt, len),
47                '-' => (TokenKind::Hyphen, len),
48                '|' => (TokenKind::VerticalBar, len),
49                '.' => (TokenKind::Dot, len),
50                '(' => (TokenKind::OpenParen, len),
51                ')' => (TokenKind::CloseParen, len),
52                '{' => (TokenKind::OpenBrace, len),
53                '}' => (TokenKind::CloseBrace, len),
54                '[' => (TokenKind::OpenBracket, len),
55                ']' => (TokenKind::CloseBracket, len),
56                ' ' => (TokenKind::Space, len),
57                '\t' => (TokenKind::Tab, len),
58                '\n' => (TokenKind::Break, len),
59                '\r' => {
60                    if let Some((_, c2)) = chars.next_if(|(_, c2)| c2 == &'\n') {
61                        (TokenKind::Break, len + c2.len_utf8())
62                    } else {
63                        (TokenKind::Text, len)
64                    }
65                }
66                '\\' => {
67                    if let Some((_, c2)) = chars.next_if(|(_, c2)| {
68                        matches!(
69                            c2,
70                            '#' | '*'
71                                | ':'
72                                | '`'
73                                | '>'
74                                | '-'
75                                | '|'
76                                | '.'
77                                | '('
78                                | ')'
79                                | '{'
80                                | '}'
81                                | '['
82                                | ']'
83                                | '\\'
84                        )
85                    }) {
86                        self.cursor += len + c2.len_utf8();
87                        return Some(Token {
88                            kind: TokenKind::Text,
89                            start: index + len,
90                            len: c2.len_utf8(),
91                        });
92                    } else {
93                        (TokenKind::Text, len)
94                    }
95                }
96                _ => (TokenKind::Text, len),
97            };
98
99            (kind, index, len)
100        } else {
101            return None;
102        };
103
104        self.cursor += len;
105
106        Some(Token { kind, start, len })
107    }
108}
109
/// Iterator adapter that merges runs of consecutive `Text` tokens produced
/// by the inner iterator into single tokens.
struct TextJoiner<T: Iterator<Item = Token>> {
    // Peekable so the joiner can inspect the next token without consuming
    // it when the run of text ends.
    iter: Peekable<T>,
}

impl<T: Iterator<Item = Token>> TextJoiner<T> {
    /// Wraps the given token iterator.
    fn new(iter: T) -> Self {
        Self {
            iter: iter.peekable(),
        }
    }
}
121
122impl<T: Iterator<Item = Token>> Iterator for TextJoiner<T> {
123    type Item = Token;
124
125    fn next(&mut self) -> Option<Self::Item> {
126        let mut token = self.iter.next()?;
127
128        if token.kind == TokenKind::Text {
129            while let Some(next) = self.iter.peek() {
130                if next.kind == TokenKind::Text {
131                    token.len += next.len;
132                    self.iter.next();
133                } else {
134                    break;
135                }
136            }
137        }
138
139        Some(token)
140    }
141}
142
/// Iterator adapter that drops the spaces and tabs making up a
/// whitespace-only line (between two breaks, or between a break and the
/// end of input).
struct SpaceCutter<T: Iterator<Item = Token>> {
    // `PeekMoreIterator` provides multi-element lookahead (`peek_nth`),
    // needed to scan across a whole run of whitespace tokens without
    // consuming them.
    iter: PeekMoreIterator<T>,
}

impl<T: Iterator<Item = Token>> SpaceCutter<T> {
    /// Wraps the given token iterator.
    fn new(iter: T) -> Self {
        Self {
            iter: iter.peekmore(),
        }
    }
}
154
impl<T: Iterator<Item = Token>> Iterator for SpaceCutter<T> {
    type Item = Token;

    /// Yields the next token; after a `Break` it looks ahead over a run of
    /// `Space`/`Tab` tokens and silently consumes that run when it ends at
    /// another `Break` or at the end of input (i.e. the run would form a
    /// whitespace-only line).
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.iter.next()?;

        use TokenKind::*;

        // Only scan ahead after a `Break` whose immediate successor exists
        // and is not itself a `Break` (consecutive breaks pass through).
        if token.kind == Break
            && self.iter.peek().is_some()
            && self.iter.peek().unwrap().kind != Break
        {
            for n in 0.. {
                if let Some(nth) = self.iter.peek_nth(n) {
                    match nth.kind {
                        // Still inside the whitespace run: keep scanning.
                        Space | Tab => continue,
                        Break => {
                            // The run is a blank line: consume the n
                            // whitespace tokens (positions 0..n), leaving
                            // the `Break` queued. `n >= 1` here — the guard
                            // above rules out a `Break` at position 0 — so
                            // `n - 1` cannot underflow.
                            self.iter.nth(n - 1).unwrap();
                            break;
                        }
                        // Real content follows: keep the whitespace.
                        _ => break,
                    }
                } else {
                    // Input ended inside the whitespace run: discard it.
                    // `n >= 1` here because `peek()` was `Some` above.
                    self.iter.nth(n - 1).unwrap();
                    break;
                }
            }
        }

        Some(token)
    }
}
187
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lexer() {
        // A heading: punctuation and space tokens, one Text token per
        // character of the word.
        let mut lexer = Lexer::new("## Hello\n");

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Pound);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Pound);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Space);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Break);
        assert_eq!(lexer.next(), None);

        // A CRLF pair collapses into a single break...
        let mut lexer = Lexer::new("\r\n");

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Break);
        assert_eq!(lexer.next(), None);

        // ...but a lone carriage return is plain text.
        let mut lexer = Lexer::new("\r");

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next(), None);

        // An escaped `#` is emitted as text; the backslash itself is not.
        let mut lexer = Lexer::new(r"\# Q");

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Space);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next(), None);
    }

    // Renamed from `test_text_jointer` (typo).
    #[test]
    fn test_text_joiner() {
        let mut lexer = TextJoiner::new(Lexer::new("## Hello Q\n"));

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Pound);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Pound);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Space);
        // "Hello" is a single joined token covering five bytes.
        assert_eq!(
            lexer.next().unwrap(),
            Token {
                kind: TokenKind::Text,
                start: 3,
                len: 5
            }
        );
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Space);
        assert_eq!(
            lexer.next().unwrap(),
            Token {
                kind: TokenKind::Text,
                start: 9,
                len: 1
            }
        );
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Break);
        assert_eq!(lexer.next(), None);
    }

    #[test]
    fn test_space_cutter() {
        // The whitespace-only line between "ABC" and "DEF" loses its two
        // spaces, while both breaks survive.
        let mut lexer = SpaceCutter::new(Lexer::new("ABC\n  \nDEF"));

        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Break);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Break);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        assert_eq!(lexer.next().unwrap().kind, TokenKind::Text);
        // The iterator must then be exhausted.
        assert_eq!(lexer.next(), None);
    }
}