harper_core/parsers/
markdown.rs

use std::collections::VecDeque;

use serde::{Deserialize, Serialize};

use super::{Parser, PlainEnglish};
use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};

/// A parser that wraps the [`PlainEnglish`] parser, allowing one to parse
/// CommonMark files.
///
/// Will ignore code blocks.
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    options: MarkdownOptions,
}

#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    pub ignore_link_title: bool,
}

// Clippy lint allowed because this struct can easily be expanded later
#[allow(clippy::derivable_impls)]
impl Default for MarkdownOptions {
    fn default() -> Self {
        Self {
            ignore_link_title: false,
        }
    }
}

impl Markdown {
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Remove hidden Wikilink target text.
    ///
    /// That is, the text to the left of the pipe:
    ///
    /// ```markdown
    /// [[Target text|Display Text]]
    /// ```
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();

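        // For each pipe token, scan left for an opening `[[` and right for a closing `]]` on the same line.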
        for pipe_idx in tokens.iter_pipe_indices() {
            if pipe_idx < 2 {
                continue;
            }

            // Locate preceding `[[`
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Locate succeeding `]]`
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

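            // Only remove tokens when the pipe is enclosed by both `[[` and `]]` on the same line.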
            if let Some(open_bracket_idx) = open_bracket
                && let Some(close_bracket_idx) = close_bracket
            {
                to_remove.extend(open_bracket_idx..=pipe_idx);
                to_remove.push_back(close_bracket_idx);
                to_remove.push_back(close_bracket_idx + 1);
            }
        }

        tokens.remove_indices(to_remove);
    }

    /// Remove the brackets from Wikilinks without pipe operators.
    /// For Wikilinks that contain pipe operators, see [`Self::remove_hidden_wikilink_tokens`].
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        let mut open_brackets = None;

        let mut cursor = 0;

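        // Pair each `[[` with the next `]]` on the same line, queueing all four bracket tokens for removal.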
        loop {
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}

impl Parser for Markdown {
    /// This implementation is quite gross to look at, but it works.
    /// If any issues arise, it would likely help to refactor this out first.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;

        let source_str: String = source.iter().collect();
        let md_parser = pulldown_cmark::Parser::new_ext(
            &source_str,
            pulldown_cmark::Options::all()
                .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
        );

        let mut tokens = Vec::new();

        // Build a mapping from the inner parser's byte-based indexing to Harper's char-based
        // indexing.
        let mut byte_to_char = vec![0; source_str.len() + 1];
        let mut char_index = 0;
        let mut byte_idx = 0;
        for ch in source_str.chars() {
            let char_len = ch.len_utf8();
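            // Every byte of a multi-byte character maps to the same char index.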
            for _ in 0..char_len {
                byte_to_char[byte_idx] = char_index;
                byte_idx += 1;
            }
            char_index += 1;
        }
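        // Also map the one-past-the-end byte index, so ranges ending at the end of input resolve correctly.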
        byte_to_char[source_str.len()] = char_index;

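        // Tracks the currently open Markdown tags so `Text` events can be handled according to their enclosing container.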
        let mut stack = Vec::new();

        // NOTE: the range spits out __byte__ indices, not char indices.
        // This is why we keep track above.
        for (event, range) in md_parser.into_offset_iter() {
            let span_start = byte_to_char[range.start];
            let span_end = byte_to_char[range.end];

            match event {
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
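                // Open a list with a zero-width double-newline token so it is separated from the preceding content.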
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
                }
                pulldown_cmark::Event::Start(tag) => stack.push(tag),
                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    tokens.push(Token {
                        // We cannot use `span_start` here, as it will still point to the
                        // first character of the `Event` at this point. Instead, we use the
                        // position of the previous token's last character. This ensures the
                        // paragraph break is placed at the end of the content, not its beginning.
                        // For more info, see: https://github.com/Automattic/harper/pull/1239.
                        span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
                        kind: TokenKind::ParagraphBreak,
                    });
                    stack.pop();
                }
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
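                // Inline code and math are emitted as single unlintable tokens.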
                pulldown_cmark::Event::InlineMath(_)
                | pulldown_cmark::Event::DisplayMath(_)
                | pulldown_cmark::Event::Code(_) => {
                    let chunk_len = span_end - span_start;

                    tokens.push(Token {
                        span: Span::new_with_len(span_start, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
                pulldown_cmark::Event::Text(_text) => {
                    let chunk_len = span_end - span_start;

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
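                        // Only lint text inside containers that hold prose; skip everything else.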
                        if !(matches!(tag, Tag::Paragraph)
                            || (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
                            || matches!(tag, Tag::Emphasis)
                            || matches!(tag, Tag::Strong)
                            || matches!(tag, Tag::Strikethrough))
                        {
                            continue;
                        }
                    }

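                    // Parse the visible text with the plain-English parser, then shift the resulting spans into document coordinates.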
                    let mut new_tokens = english_parser.parse(&source[span_start..span_end]);

                    new_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(span_start));

                    tokens.append(&mut new_tokens);
                }
                // TODO: Support via `harper-html`
                pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
                    let size = span_end - span_start;
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, size),
                        kind: TokenKind::Unlintable,
                    });
                }
                _ => (),
            }
        }

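        // Drop a trailing break token when the source itself doesn't end with a newline.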
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        Self::remove_hidden_wikilink_tokens(&mut tokens);
        Self::remove_wikilink_brackets(&mut tokens);

        tokens
    }
}

#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::Markdown;
    use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};

    #[test]
    fn survives_emojis() {
        let source = r"🤷.";

        Markdown::default().parse_str(source);
    }

    /// Check whether the Markdown parser will emit a breaking newline
    /// at the end of each input.
    ///
    /// It should _not_ do this.
    #[test]
    fn ends_with_newline() {
        let source = "This is a test.";

        let tokens = Markdown::default().parse_str(source);
        assert_ne!(tokens.len(), 0);
        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    #[test]
    fn math_becomes_unlintable() {
        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";

        let tokens = Markdown::default().parse_str(source);
        assert_eq!(
            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
            vec![
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Punctuation(Punctuation::Period)
            ]
        )
    }

    #[test]
    fn hidden_wikilink_text() {
        let source = r"[[this is hidden|this is not]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
            ]
        ))
    }

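    /// A lone pipe with no surrounding Wikilink brackets should be left untouched.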
    #[test]
    fn just_pipe() {
        let source = r"|";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[TokenKind::Punctuation(Punctuation::Pipe)]
        ))
    }

    #[test]
    fn empty_wikilink_text() {
        let source = r"[[|]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[]))
    }

    #[test]
    fn improper_wikilink_text() {
        let source = r"this is shown|this is also shown]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Pipe),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::CloseSquare),
                TokenKind::Punctuation(Punctuation::CloseSquare),
            ]
        ))
    }

    #[test]
    fn normal_wikilink() {
        let source = r"[[Wikilink]]";
        let tokens = Markdown::default().parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
    }

    #[test]
    fn html_is_unlintable() {
        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
        let tokens = Markdown::default().parse_str(source);
        assert_eq!(tokens.iter_unlintables().count(), 1);
    }

    #[test]
    fn link_title_unlintable() {
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
    }

    #[test]
    fn issue_194() {
        let source = r"<http://localhost:9093>";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
    }

    #[test]
    fn respects_link_title_config() {
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));

        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: false,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Hyphen),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::ForwardSlash),
                TokenKind::Word(_)
            ]
        ));
    }

    /// Test that code blocks are immediately followed by a paragraph break.
    #[test]
    fn issue_880() {
        let source = r#"
Paragraph.

```
Code block
```
Paragraph.
        "#;
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
                TokenKind::ParagraphBreak,
                TokenKind::Unlintable,
                TokenKind::ParagraphBreak,
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
            ]
        ))
    }

    /// Helps ensure that ending tokens (like `ParagraphBreak`) don't get erroneously placed at
    /// the beginning of a sentence. This kind of behavior can cause crashes, as seen in
    /// [#1181](https://github.com/Automattic/harper/issues/1181).
    #[test]
    fn no_end_token_incorrectly_ending_at_zero() {
        let source = "Something\n";
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }

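    /// Ensure this pathological Wikilink-like input doesn't hang the parser.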
    #[test]
    fn hang() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("[[#|]]:A]");
    }

    #[test]
    fn hang2() {
        // This seems to only be a Java-specific problem...
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("//{@j");
    }
}