harper_core/parsers/markdown.rs

use std::collections::VecDeque;

use serde::{Deserialize, Serialize};

use super::{Parser, PlainEnglish};
use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};

/// A parser that wraps the [`PlainEnglish`] parser, allowing one to parse
/// CommonMark files.
///
/// Will ignore code blocks and tables.
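///
/// A minimal usage sketch (the `harper_core::parsers` import paths are assumed
/// here, so the snippet is marked `ignore` rather than compiled as a doctest):
///
/// ```ignore
/// use harper_core::parsers::{Markdown, MarkdownOptions, StrParser};
///
/// // Parse a small CommonMark snippet into Harper tokens.
/// let tokens = Markdown::new(MarkdownOptions::default()).parse_str("A *bold* claim.");
/// assert!(!tokens.is_empty());
/// ```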
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    options: MarkdownOptions,
}

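/// Options controlling how [`Markdown`] treats specific constructs.
///
/// When `ignore_link_title` is `true`, the text inside Markdown links is
/// emitted as [`TokenKind::Unlintable`] instead of being passed to the
/// English parser.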
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    pub ignore_link_title: bool,
}

// Clippy rule excepted because this can easily be expanded later
#[allow(clippy::derivable_impls)]
impl Default for MarkdownOptions {
    fn default() -> Self {
        Self {
            ignore_link_title: false,
        }
    }
}

impl Markdown {
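    /// Construct a parser that behaves according to the given [`MarkdownOptions`].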
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Remove hidden Wikilink target text.
    ///
    /// That is, the text to the left of the pipe operator:
    ///
    /// ```markdown
    /// [[Target text|Display Text]]
    /// ```
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();

        for pipe_idx in tokens.iter_pipe_indices() {
            if pipe_idx < 2 {
                continue;
            }

            // Locate preceding `[[`
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Locate succeeding `]]`
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

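            // Only queue tokens for removal when both `[[` and `]]` were
            // found on the same line as the pipe.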
            if let Some(open_bracket_idx) = open_bracket {
                if let Some(close_bracket_idx) = close_bracket {
                    to_remove.extend(open_bracket_idx..=pipe_idx);
                    to_remove.push_back(close_bracket_idx);
                    to_remove.push_back(close_bracket_idx + 1);
                }
            }
        }

        tokens.remove_indices(to_remove);
    }

    /// Remove the brackets from Wikilinks without pipe operators.
    /// For Wikilinks _with_ pipe operators, see [`Self::remove_hidden_wikilink_tokens`].
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        let mut open_brackets = None;

        let mut cursor = 0;

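        // Walk the token stream, remembering the index of the most recent
        // unmatched `[[`. When the matching `]]` appears before the next
        // newline, queue both bracket pairs for removal.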
        loop {
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}

impl Parser for Markdown {
    /// This implementation is quite gross to look at, but it works.
    /// If any issues arise, it would likely help to refactor this out first.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;

        let source_str: String = source.iter().collect();
        let md_parser = pulldown_cmark::Parser::new_ext(
            &source_str,
            pulldown_cmark::Options::all()
                .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
        );

        let mut tokens = Vec::new();

        let mut traversed_bytes = 0;
        let mut traversed_chars = 0;

        let mut stack = Vec::new();

        // NOTE: the range spits out __byte__ indices, not char indices.
        // This is why we keep track above.
        for (event, range) in md_parser.into_offset_iter() {
            if range.start > traversed_bytes {
                traversed_chars += source_str[traversed_bytes..range.start].chars().count();
                traversed_bytes = range.start;
            }

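            // Map each Markdown event onto Harper tokens. Content that cannot
            // be meaningfully linted (code, math, raw HTML) becomes a single
            // `Unlintable` token covering its source span.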
            match event {
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(traversed_chars, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(traversed_chars, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
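                // A list start is recorded as a zero-length double-newline
                // token, separating the list from any preceding content.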
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    tokens.push(Token {
                        span: Span::new_with_len(traversed_chars, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
                }
                pulldown_cmark::Event::Start(tag) => stack.push(tag),
                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    tokens.push(Token {
                        // We cannot use `traversed_chars` here, as it will still point to the
                        // first character of the `Event` at this point. Instead, we use the
                        // position of the previous token's last character. This ensures the
                        // paragraph break is placed at the end of the content, not its beginning.
                        // For more info, see: https://github.com/Automattic/harper/pull/1239.
                        span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
                        kind: TokenKind::ParagraphBreak,
                    });
                    stack.pop();
                }
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
                pulldown_cmark::Event::InlineMath(code)
                | pulldown_cmark::Event::DisplayMath(code)
                | pulldown_cmark::Event::Code(code) => {
                    let chunk_len = code.chars().count();

                    tokens.push(Token {
                        span: Span::new_with_len(traversed_chars, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
                pulldown_cmark::Event::Text(text) => {
                    let chunk_len = text.chars().count();

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
                                span: Span::new_with_len(traversed_chars, text.chars().count()),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
                                span: Span::new_with_len(traversed_chars, text.chars().count()),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
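                        // Only text whose innermost enclosing tag carries prose
                        // is handed to the English parser; text under any other
                        // tag is skipped entirely.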
                        if !(matches!(tag, Tag::Paragraph)
                            || matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
                            || matches!(tag, Tag::Emphasis)
                            || matches!(tag, Tag::Strong)
                            || matches!(tag, Tag::Strikethrough))
                        {
                            continue;
                        }
                    }

                    let mut new_tokens =
                        english_parser.parse(&source[traversed_chars..traversed_chars + chunk_len]);

                    new_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(traversed_chars));

                    tokens.append(&mut new_tokens);
                }
                // TODO: Support via `harper-html`
                pulldown_cmark::Event::Html(_content)
                | pulldown_cmark::Event::InlineHtml(_content) => {
                    let size = _content.chars().count();
                    tokens.push(Token {
                        span: Span::new_with_len(traversed_chars, size),
                        kind: TokenKind::Unlintable,
                    });
                }
                _ => (),
            }
        }

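        // If the final token is a break but the source itself does not end
        // with a newline, drop it so the output does not end with a break the
        // source never contained.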
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        Self::remove_hidden_wikilink_tokens(&mut tokens);
        Self::remove_wikilink_brackets(&mut tokens);

        tokens
    }
}

#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::Markdown;
    use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};

    #[test]
    fn survives_emojis() {
        let source = r"🤷.";

        Markdown::default().parse_str(source);
    }

    /// Check whether the Markdown parser will emit a breaking newline
    /// at the end of each input.
    ///
    /// It should _not_ do this.
    #[test]
    fn ends_with_newline() {
        let source = "This is a test.";

        let tokens = Markdown::default().parse_str(source);
        assert_ne!(tokens.len(), 0);
        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    #[test]
    fn math_becomes_unlintable() {
        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";

        let tokens = Markdown::default().parse_str(source);
        assert_eq!(
            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
            vec![
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Punctuation(Punctuation::Period)
            ]
        )
    }

    #[test]
    fn hidden_wikilink_text() {
        let source = r"[[this is hidden|this is not]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
            ]
        ))
    }

    #[test]
    fn just_pipe() {
        let source = r"|";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[TokenKind::Punctuation(Punctuation::Pipe)]
        ))
    }

    #[test]
    fn empty_wikilink_text() {
        let source = r"[[|]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[]))
    }

    #[test]
    fn improper_wikilink_text() {
        let source = r"this is shown|this is also shown]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Pipe),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::CloseSquare),
                TokenKind::Punctuation(Punctuation::CloseSquare),
            ]
        ))
    }

    #[test]
    fn normal_wikilink() {
        let source = r"[[Wikilink]]";
        let tokens = Markdown::default().parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
    }

    #[test]
    fn html_is_unlintable() {
        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
        let tokens = Markdown::default().parse_str(source);
        assert_eq!(tokens.iter_unlintables().count(), 1);
    }

    #[test]
    fn link_title_unlintable() {
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
    }

    #[test]
    fn issue_194() {
        let source = r"<http://localhost:9093>";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
    }

    #[test]
    fn respects_link_title_config() {
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));

        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: false,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Hyphen),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::ForwardSlash),
                TokenKind::Word(_)
            ]
        ));
    }

    /// Test that code blocks are immediately followed by a paragraph break.
    #[test]
    fn issue_880() {
        let source = r#"
Paragraph.

```
Code block
```
Paragraph.
        "#;
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
                TokenKind::ParagraphBreak,
                TokenKind::Unlintable,
                TokenKind::ParagraphBreak,
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
            ]
        ))
    }

    /// Helps ensure that ending tokens (like `ParagraphBreak`) don't get erroneously placed at
    /// the beginning of a sentence. This kind of behavior can cause crashes, as seen in
    /// [#1181](https://github.com/Automattic/harper/issues/1181).
    #[test]
    fn no_end_token_incorrectly_ending_at_zero() {
        let source = "Something\n";
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }
}