// oak_markdown/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the Markdown language.
3pub mod token_type;
4
5mod block;
6mod inline;
7mod list;
8
9use crate::{language::MarkdownLanguage, lexer::token_type::MarkdownTokenType};
10use oak_core::{Lexer, LexerCache, LexerState, TextEdit, errors::OakError, lexer::LexOutput, source::Source};
11
/// Crate-local shorthand: the shared `LexerState` specialized to the Markdown language.
pub(crate) type State<'a, S> = LexerState<'a, S, MarkdownLanguage>;
13
/// Lexer for Markdown language.
#[derive(Clone, Debug)]
pub struct MarkdownLexer<'config> {
    // Borrowed language configuration; the `allow_*` flags on it gate which
    // Markdown extensions (math, footnotes, tables, ...) the lexer recognizes.
    config: &'config MarkdownLanguage,
}
19
impl<'config> MarkdownLexer<'config> {
    /// Creates a new MarkdownLexer with the given configuration.
    pub fn new(config: &'config MarkdownLanguage) -> Self {
        Self { config }
    }

    /// Main dispatch loop: peeks one character at a time and routes to the
    /// specialized `lex_*` helpers (defined in the sibling `block`, `inline`,
    /// and `list` modules). Each helper returns `true` when it consumed input
    /// and emitted a token; on `false` the loop falls through to the next
    /// candidate and ultimately to `lex_special_char` / `lex_text`.
    fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
        while state.not_at_end() {
            // Remember where this iteration started so we can detect a
            // no-progress (dead-lock) iteration at the bottom of the loop.
            let safe_point = state.get_position();

            if let Some(ch) = state.peek() {
                match ch {
                    ' ' | '\t' => {
                        if self.config.allow_indented_code_blocks && self.lex_indented_code_block(state) {
                            continue;
                        }
                        self.skip_whitespace(state);
                    }
                    '\n' | '\r' => {
                        self.lex_newline(state);
                    }
                    '$' if self.config.allow_math => {
                        if self.lex_math(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '^' can open either a footnote reference or a superscript;
                    // footnotes are tried first when both extensions are enabled.
                    '^' if self.config.allow_subscript || self.config.allow_footnotes => {
                        if self.config.allow_footnotes && self.lex_footnote(state) {
                            continue;
                        }
                        if self.config.allow_subscript && self.lex_sub_superscript(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '#' => {
                        if self.config.allow_headings && self.lex_heading(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // Backtick: fenced code block takes priority over inline code.
                    '`' => {
                        if self.config.allow_fenced_code_blocks && self.lex_code_block(state) {
                            continue;
                        }
                        if self.lex_inline_code(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '~' may open a tilde-fenced code block, strikethrough, or subscript.
                    // NOTE(review): unlike the '`' arm, the code-block attempt here is not
                    // gated on `allow_fenced_code_blocks` — confirm whether that is intended.
                    '~' => {
                        if self.lex_code_block(state) {
                            continue;
                        }
                        if self.config.allow_strikethrough && self.lex_strikethrough(state) {
                            continue;
                        }
                        if self.config.allow_subscript && self.lex_sub_superscript(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '*' / '_': thematic break, bullet list marker, emphasis,
                    // then abbreviation, in that priority order.
                    '*' | '_' => {
                        if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        if self.lex_emphasis(state) {
                            continue;
                        }
                        if self.config.allow_abbreviations && self.lex_abbreviation(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '-': front matter fence (---), thematic break, or bullet marker.
                    '-' => {
                        if self.config.allow_front_matter && self.lex_front_matter(state) {
                            continue;
                        }
                        if self.config.allow_horizontal_rules && self.lex_horizontal_rule(state) {
                            continue;
                        }
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '+' => {
                        if self.config.allow_lists && self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '!' only matters as the start of an image (`![...]`).
                    '!' => {
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // '[': task-list checkbox (`[x]`) is tried before a link.
                    '[' => {
                        if self.config.allow_task_lists && self.lex_task_marker(state) {
                            continue;
                        }
                        if self.lex_link_or_image(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    '>' => {
                        if self.config.allow_blockquotes && self.lex_blockquote(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    ':' => {
                        if self.config.allow_definition_lists && self.lex_definition_description(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // NOTE(review): this arm is redundant — when `allow_tables` is false,
                    // '|' falls through to the `']' | '(' | ...` arm below, which calls
                    // the exact same `lex_special_char`. Harmless, but dead weight.
                    '|' if self.config.allow_tables => {
                        self.lex_special_char(state);
                    }
                    // Leading digit: candidate ordered-list marker ("1." / "1)").
                    // NOTE(review): every other list-marker call in this match is gated
                    // on `self.config.allow_lists`, but this one is not — confirm whether
                    // `lex_list_marker` re-checks the flag internally.
                    '0'..='9' => {
                        if self.lex_list_marker(state) {
                            continue;
                        }
                        self.lex_text(state);
                    }
                    '<' => {
                        if self.config.allow_html && self.lex_html_tag(state) {
                            continue;
                        }
                        if self.config.allow_xml && self.lex_xml_tag(state) {
                            continue;
                        }
                        self.lex_special_char(state);
                    }
                    // Remaining punctuation that is always a single-character token.
                    ']' | '(' | ')' | '|' | '.' | '\\' => {
                        self.lex_special_char(state);
                    }
                    // Anything else is plain text; if even `lex_text` refuses the
                    // character, consume it as a one-character Error token so the
                    // loop always makes progress.
                    _ => {
                        if self.lex_text(state) {
                            continue;
                        }
                        let start_pos = state.get_position();
                        state.advance(ch.len_utf8());
                        state.add_token(MarkdownTokenType::Error, start_pos, state.get_position());
                    }
                }
            }

            // Safety net: if no arm advanced the position this iteration,
            // force progress so the lexer cannot loop forever.
            state.advance_if_dead_lock(safe_point)
        }
        Ok(())
    }

    /// Skips whitespace
    ///
    /// Consumes a run of spaces/tabs and emits one `Whitespace` token.
    /// Returns `true` if at least one character was consumed.
    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            if ch == ' ' || ch == '\t' {
                state.advance(ch.len_utf8());
            }
            else {
                break;
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownTokenType::Whitespace, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Handles newlines
    ///
    /// Emits a single `Newline` token for `\n`, `\r\n`, or a lone `\r`.
    /// Returns `false` when the current character is not a line break.
    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some('\n') = state.peek() {
            state.advance(1);
            state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
            true
        }
        else if let Some('\r') = state.peek() {
            state.advance(1);
            // Fold a following '\n' into the same token (CRLF).
            if let Some('\n') = state.peek() {
                state.advance(1);
            }
            state.add_token(MarkdownTokenType::Newline, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Handles HTML tags or comments.
    fn lex_html_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        self.lex_any_tag(state, MarkdownTokenType::HtmlTag, MarkdownTokenType::HtmlComment)
    }

    /// Handles XML tags or comments.
    fn lex_xml_tag<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        self.lex_any_tag(state, MarkdownTokenType::XmlTag, MarkdownTokenType::XmlComment)
    }

    /// Common tag handling logic.
    ///
    /// Recognizes `<!-- ... -->` comments (emitted as `comment_kind`) and
    /// `<...>` tags (emitted as `tag_kind`), skipping `>` inside quoted
    /// attribute values. On an unterminated tag the position is rolled back
    /// to `start_pos` and `false` is returned.
    fn lex_any_tag<S: Source + ?Sized>(&self, state: &mut State<S>, tag_kind: MarkdownTokenType, comment_kind: MarkdownTokenType) -> bool {
        let start_pos = state.get_position();

        if let Some('<') = state.peek() {
            state.advance(1);

            if let Some('!') = state.peek() {
                // Lookahead for "!--" (the offsets are safe here: '!' and '-'
                // are single-byte ASCII).
                if state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('-') {
                    state.advance(3);
                    let mut found_end = false;
                    // Scan until the closing "-->".
                    while let Some(ch) = state.peek() {
                        if ch == '-' && state.source().get_char_at(state.get_position() + 1) == Some('-') && state.source().get_char_at(state.get_position() + 2) == Some('>') {
                            state.advance(3);
                            found_end = true;
                            break;
                        }
                        state.advance(ch.len_utf8());
                    }
                    if found_end {
                        state.add_token(comment_kind, start_pos, state.get_position());
                        return true;
                    }
                    // Unterminated comment: fall through and retry as a plain tag.
                }
            }

            let mut found_end = false;
            // Tracks the active quote character while inside an attribute string.
            let mut in_string = None;

            while let Some(ch) = state.peek() {
                if let Some(quote) = in_string {
                    if ch == quote {
                        in_string = None;
                    }
                }
                else {
                    if ch == '>' {
                        state.advance(1);
                        found_end = true;
                        break;
                    }
                    else if ch == '"' || ch == '\'' {
                        in_string = Some(ch);
                    }
                }
                state.advance(ch.len_utf8());
            }

            if found_end {
                state.add_token(tag_kind, start_pos, state.get_position());
                true
            }
            else {
                // No closing '>': undo all consumption so other rules can try.
                state.set_position(start_pos);
                false
            }
        }
        else {
            false
        }
    }

    /// Lexes special characters.
    ///
    /// Emits a one-character punctuation token for known Markdown
    /// metacharacters; returns `false` (consuming nothing) otherwise.
    fn lex_special_char<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let token_kind = match ch {
                '[' => MarkdownTokenType::LBracket,
                ']' => MarkdownTokenType::RBracket,
                '(' => MarkdownTokenType::LParen,
                ')' => MarkdownTokenType::RParen,
                '<' => MarkdownTokenType::Less,
                '>' => MarkdownTokenType::Greater,
                '*' => MarkdownTokenType::Asterisk,
                '_' => MarkdownTokenType::Underscore,
                '`' => MarkdownTokenType::Backtick,
                '~' => MarkdownTokenType::Tilde,
                '#' => MarkdownTokenType::Hash,
                '|' => MarkdownTokenType::Pipe,
                '-' => MarkdownTokenType::Dash,
                '+' => MarkdownTokenType::Plus,
                '.' => MarkdownTokenType::Dot,
                ':' => MarkdownTokenType::Colon,
                '!' => MarkdownTokenType::Exclamation,
                '\\' => MarkdownTokenType::Escape,
                '$' => MarkdownTokenType::Dollar,
                '^' => MarkdownTokenType::Caret,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

    /// Lexes automatic links (HTTP/HTTPS URLs).
    ///
    /// Scans ahead without consuming; only commits (via `set_position`) when
    /// a plausible URL is found.
    fn lex_auto_link<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Literal prefix match: "http" ...
        if state.source().get_char_at(start_pos) == Some('h') && state.source().get_char_at(start_pos + 1) == Some('t') && state.source().get_char_at(start_pos + 2) == Some('t') && state.source().get_char_at(start_pos + 3) == Some('p') {
            let mut pos = start_pos + 4;
            // ... optional 's' ...
            if state.source().get_char_at(pos) == Some('s') {
                pos += 1;
            }

            // ... then "://".
            if state.source().get_char_at(pos) == Some(':') && state.source().get_char_at(pos + 1) == Some('/') && state.source().get_char_at(pos + 2) == Some('/') {
                pos += 3;

                // Consume the URL body: alphanumerics plus common URL punctuation.
                while pos < state.source().length() {
                    if let Some(ch) = state.source().get_char_at(pos) {
                        if ch.is_alphanumeric() || ch == '-' || ch == '_' || ch == '.' || ch == '/' || ch == '?' || ch == '=' || ch == '&' || ch == '#' || ch == '%' {
                            pos += 1;
                        }
                        else {
                            break;
                        }
                    }
                    else {
                        break;
                    }
                }

                // NOTE(review): this length check is asymmetric — "http://" is 7
                // characters, so a bare "http://" (pos == start_pos + 7) is rejected,
                // but a bare "https://" (pos == start_pos + 8) is accepted as an
                // AutoLink with an empty host. Confirm whether both should require
                // at least one host character.
                if pos > start_pos + 7 {
                    state.set_position(pos);
                    state.add_token(MarkdownTokenType::AutoLink, start_pos, pos);
                    return true;
                }
            }
        }

        false
    }

    /// Lexes plain text.
    ///
    /// Tries an auto-link first, then consumes a run of characters up to the
    /// next Markdown metacharacter or whitespace and emits a `Text` token.
    fn lex_text<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        if self.lex_auto_link(state) {
            return true;
        }

        let start_pos = state.get_position();

        while let Some(ch) = state.peek() {
            match ch {
                // Stop on anything that another rule in `run` dispatches on.
                ' ' | '\t' | '\n' | '\r' | '#' | '*' | '_' | '`' | '~' | '[' | ']' | '(' | ')' | '<' | '>' | '|' | '-' | '+' | '.' | ':' | '!' | '\\' | '$' | '^' => break,
                _ => {
                    state.advance(ch.len_utf8());
                }
            }
        }

        if state.get_position() > start_pos {
            state.add_token(MarkdownTokenType::Text, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
}
398
399impl<'config> Lexer<MarkdownLanguage> for MarkdownLexer<'config> {
400    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MarkdownLanguage>) -> LexOutput<MarkdownLanguage> {
401        let mut state = State::new(text);
402        let result = self.run(&mut state);
403        if result.is_ok() {
404            state.add_eof();
405        }
406        state.finish_with_cache(result, cache)
407    }
408}
409
410impl<'config> MarkdownLexer<'config> {
411    /// Runs the lexer on the given source and returns the output.
412    pub fn lex_internal<'a, S: Source + ?Sized>(&self, source: &'a S) -> LexOutput<MarkdownLanguage> {
413        let mut state = State::new(source);
414        let result = self.run(&mut state);
415        state.finish(result)
416    }
417}