harper_core/parsers/markdown.rs

1use std::collections::VecDeque;
2
3use serde::{Deserialize, Serialize};
4
5use super::{Parser, PlainEnglish};
6use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};
7
/// A parser that wraps the [`PlainEnglish`] parser that allows one to parse
/// CommonMark files.
///
/// Will ignore code blocks and tables.
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    // Controls how link titles are treated; see [`MarkdownOptions`].
    options: MarkdownOptions,
}
16
/// Configuration for the [`Markdown`] parser.
///
/// Marked `#[non_exhaustive]` so new options can be added without breaking
/// downstream constructors.
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    /// When `true`, the visible text of a Markdown link is marked unlintable
    /// instead of being parsed as English.
    pub ignore_link_title: bool,
}
22
23// Clippy rule excepted because this can easily be expanded later
24#[allow(clippy::derivable_impls)]
25impl Default for MarkdownOptions {
26    fn default() -> Self {
27        Self {
28            ignore_link_title: false,
29        }
30    }
31}
32
impl Markdown {
    /// Construct a [`Markdown`] parser with the provided [`MarkdownOptions`].
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Remove hidden Wikilink target text.
    ///
    /// As in the stuff to the left of the pipe operator:
    ///
    /// ```markdown
    /// [[Target text|Display Text]]
    /// ```
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        // Token indices queued for removal once every pipe has been inspected.
        let mut to_remove = VecDeque::new();

        for pipe_idx in tokens.iter_pipe_indices() {
            // A `[[` occupies two tokens, so a pipe earlier than index 2 cannot
            // be preceded by one.
            if pipe_idx < 2 {
                continue;
            }

            // Locate preceding `[[`, scanning leftward from just before the pipe.
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                // Stop at a line boundary: the Wikilink must sit on a single line.
                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Locate succeeding `]]`, scanning rightward from just after the pipe.
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                // Same single-line restriction as above.
                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

            // Only a fully-formed `[[target|display]]` qualifies: drop the `[[`,
            // the hidden target text, the pipe itself, and the two `]` tokens.
            if let Some(open_bracket_idx) = open_bracket
                && let Some(close_bracket_idx) = close_bracket
            {
                to_remove.extend(open_bracket_idx..=pipe_idx);
                to_remove.push_back(close_bracket_idx);
                to_remove.push_back(close_bracket_idx + 1);
            }
        }

        tokens.remove_indices(to_remove);
    }

    /// Remove the brackets from Wikilinks without pipe operators.
    /// For __those__ Wikilinks, see [`Self::remove_hidden_wikilink_tokens`]
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        // Index of the most recent unmatched `[[`, if any.
        let mut open_brackets = None;

        let mut cursor = 0;

        loop {
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                // A newline invalidates the pending `[[`: Wikilinks are single-line.
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    // Queue both tokens of the opening and closing bracket pairs.
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}
148
impl Parser for Markdown {
    /// This implementation is quite gross to look at, but it works.
    /// If any issues arise, it would likely help to refactor this out first.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;

        let source_str: String = source.iter().collect();
        let md_parser = pulldown_cmark::Parser::new_ext(
            &source_str,
            pulldown_cmark::Options::all()
                .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
        );

        let mut tokens = Vec::new();

        // Build a mapping from the inner parser's byte-based indexing to Harper's char-based
        // indexing. Every byte of a multi-byte character maps to that character's index; the
        // extra trailing slot handles event ranges that end at `source_str.len()`.
        let mut byte_to_char = vec![0; source_str.len() + 1];
        let mut char_index = 0;
        let mut byte_idx = 0;
        for ch in source_str.chars() {
            let char_len = ch.len_utf8();
            for _ in 0..char_len {
                byte_to_char[byte_idx] = char_index;
                byte_idx += 1;
            }
            char_index += 1;
        }
        byte_to_char[source_str.len()] = char_index;

        // Currently-open Markdown tags; text events are interpreted in the
        // context of the innermost (last) enclosing tag.
        let mut stack = Vec::new();

        // NOTE: the range spits out __byte__ indices, not char indices.
        // This is why we keep track above.
        for (event, range) in md_parser.into_offset_iter() {
            let span_start = byte_to_char[range.start];
            let span_end = byte_to_char[range.end];

            match event {
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    // Separate a list from preceding content with a zero-width break.
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
                }
                pulldown_cmark::Event::Start(tag) => {
                    // Headings get an explicit zero-width marker token.
                    if matches!(tag, pulldown_cmark::Tag::Heading { .. }) {
                        tokens.push(Token {
                            span: Span::new_with_len(span_start, 0),
                            kind: TokenKind::HeadingStart,
                        });
                    }

                    stack.push(tag)
                }
                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    tokens.push(Token {
                        // We cannot use `span_start` here, as it will still point to the
                        // first character of the `Event` at this point. Instead, we use the
                        // position of the previous token's last character. This ensures the
                        // paragraph break is placed at the end of the content, not its beginning.
                        // For more info, see: https://github.com/Automattic/harper/pull/1239.
                        span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
                        kind: TokenKind::ParagraphBreak,
                    });
                    stack.pop();
                }
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
                // Inline code and math are opaque to the grammar checker.
                pulldown_cmark::Event::InlineMath(_)
                | pulldown_cmark::Event::DisplayMath(_)
                | pulldown_cmark::Event::Code(_) => {
                    let chunk_len = span_end - span_start;

                    tokens.push(Token {
                        span: Span::new_with_len(span_start, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
                pulldown_cmark::Event::Text(_text) => {
                    let chunk_len = span_end - span_start;

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        // Code block contents are never linted.
                        // NOTE: `continue` here skips to the next event of the outer loop.
                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),

                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Link titles become unlintable only when configured to.
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Text in any context outside this allowlist (e.g. tables'
                        // structural parts) is skipped entirely.
                        if !(matches!(tag, Tag::Paragraph)
                            || (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
                            || matches!(tag, Tag::Emphasis)
                            || matches!(tag, Tag::Strong)
                            || matches!(tag, Tag::Strikethrough))
                        {
                            continue;
                        }
                    }

                    // Ordinary prose: delegate to the English parser, then shift the
                    // resulting spans from chunk-relative to document-relative indices.
                    let mut new_tokens = english_parser.parse(&source[span_start..span_end]);

                    new_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(span_start));

                    tokens.append(&mut new_tokens);
                }
                // TODO: Support via `harper-html`
                pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
                    let size = span_end - span_start;
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, size),
                        kind: TokenKind::Unlintable,
                    });
                }
                _ => (),
            }
        }

        // Drop a trailing break token that does not correspond to an actual
        // newline character in the input.
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        Self::remove_hidden_wikilink_tokens(&mut tokens);
        Self::remove_wikilink_brackets(&mut tokens);

        tokens
    }
}
317
318#[cfg(test)]
319mod tests {
320    use super::super::StrParser;
321    use super::Markdown;
322    use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};
323
324    #[test]
325    fn survives_emojis() {
326        let source = r"🤷.";
327
328        Markdown::default().parse_str(source);
329    }
330
331    /// Check whether the Markdown parser will emit a breaking newline
332    /// at the end of each input.
333    ///
334    /// It should _not_ do this.
335    #[test]
336    fn ends_with_newline() {
337        let source = "This is a test.";
338
339        let tokens = Markdown::default().parse_str(source);
340        assert_ne!(tokens.len(), 0);
341        assert!(!tokens.last().unwrap().kind.is_newline());
342    }
343
344    #[test]
345    fn math_becomes_unlintable() {
346        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";
347
348        let tokens = Markdown::default().parse_str(source);
349        assert_eq!(
350            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
351            vec![
352                TokenKind::Unlintable,
353                TokenKind::Space(1),
354                TokenKind::Unlintable,
355                TokenKind::Space(1),
356                TokenKind::Unlintable,
357                TokenKind::Punctuation(Punctuation::Period)
358            ]
359        )
360    }
361
362    #[test]
363    fn hidden_wikilink_text() {
364        let source = r"[[this is hidden|this is not]]";
365
366        let tokens = Markdown::default().parse_str(source);
367
368        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
369
370        assert!(matches!(
371            token_kinds.as_slice(),
372            &[
373                TokenKind::Word(_),
374                TokenKind::Space(1),
375                TokenKind::Word(_),
376                TokenKind::Space(1),
377                TokenKind::Word(_),
378            ]
379        ))
380    }
381
382    #[test]
383    fn just_pipe() {
384        let source = r"|";
385
386        let tokens = Markdown::default().parse_str(source);
387
388        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
389
390        dbg!(&token_kinds);
391
392        assert!(matches!(
393            token_kinds.as_slice(),
394            &[TokenKind::Punctuation(Punctuation::Pipe)]
395        ))
396    }
397
398    #[test]
399    fn empty_wikilink_text() {
400        let source = r"[[|]]";
401
402        let tokens = Markdown::default().parse_str(source);
403
404        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
405
406        dbg!(&token_kinds);
407
408        assert!(matches!(token_kinds.as_slice(), &[]))
409    }
410
411    #[test]
412    fn improper_wikilink_text() {
413        let source = r"this is shown|this is also shown]]";
414
415        let tokens = Markdown::default().parse_str(source);
416
417        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
418
419        dbg!(&token_kinds);
420
421        assert!(matches!(
422            token_kinds.as_slice(),
423            &[
424                TokenKind::Word(_),
425                TokenKind::Space(1),
426                TokenKind::Word(_),
427                TokenKind::Space(1),
428                TokenKind::Word(_),
429                TokenKind::Punctuation(Punctuation::Pipe),
430                TokenKind::Word(_),
431                TokenKind::Space(1),
432                TokenKind::Word(_),
433                TokenKind::Space(1),
434                TokenKind::Word(_),
435                TokenKind::Space(1),
436                TokenKind::Word(_),
437                TokenKind::Punctuation(Punctuation::CloseSquare),
438                TokenKind::Punctuation(Punctuation::CloseSquare),
439            ]
440        ))
441    }
442
443    #[test]
444    fn normal_wikilink() {
445        let source = r"[[Wikilink]]";
446        let tokens = Markdown::default().parse_str(source);
447        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
448
449        dbg!(&token_kinds);
450
451        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
452    }
453
454    #[test]
455    fn html_is_unlintable() {
456        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
457        let tokens = Markdown::default().parse_str(source);
458        assert_eq!(tokens.iter_unlintables().count(), 1);
459    }
460
461    #[test]
462    fn link_title_unlintable() {
463        let parser = Markdown::new(MarkdownOptions {
464            ignore_link_title: true,
465            ..MarkdownOptions::default()
466        });
467        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
468        let tokens = parser.parse_str(source);
469        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
470
471        dbg!(&token_kinds);
472
473        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
474    }
475
476    #[test]
477    fn issue_194() {
478        let source = r"<http://localhost:9093>";
479        let parser = Markdown::new(MarkdownOptions {
480            ignore_link_title: true,
481            ..MarkdownOptions::default()
482        });
483        let token_kinds = parser
484            .parse_str(source)
485            .iter()
486            .map(|t| t.kind.clone())
487            .collect::<Vec<_>>();
488
489        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
490    }
491
492    #[test]
493    fn respects_link_title_config() {
494        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
495        let parser = Markdown::new(MarkdownOptions {
496            ignore_link_title: true,
497            ..MarkdownOptions::default()
498        });
499        let token_kinds = parser
500            .parse_str(source)
501            .iter()
502            .map(|t| t.kind.clone())
503            .collect::<Vec<_>>();
504
505        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
506
507        let parser = Markdown::new(MarkdownOptions {
508            ignore_link_title: false,
509            ..MarkdownOptions::default()
510        });
511        let token_kinds = parser
512            .parse_str(source)
513            .iter()
514            .map(|t| t.kind.clone())
515            .collect::<Vec<_>>();
516
517        dbg!(&token_kinds);
518
519        assert!(matches!(
520            token_kinds.as_slice(),
521            &[
522                TokenKind::Word(_),
523                TokenKind::Punctuation(Punctuation::Hyphen),
524                TokenKind::Word(_),
525                TokenKind::Punctuation(Punctuation::ForwardSlash),
526                TokenKind::Word(_)
527            ]
528        ));
529    }
530
531    /// Test that code blocks are immediately followed by a paragraph break.
532    #[test]
533    fn issue_880() {
534        let source = r#"
535Paragraph.
536
537```
538Code block
539```
540Paragraph.
541        "#;
542        let parser = Markdown::new(MarkdownOptions::default());
543        let tokens = parser.parse_str(source);
544        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();
545
546        dbg!(&token_kinds);
547
548        assert!(matches!(
549            token_kinds.as_slice(),
550            &[
551                TokenKind::Word(_),
552                TokenKind::Punctuation(_),
553                TokenKind::ParagraphBreak,
554                TokenKind::Unlintable,
555                TokenKind::ParagraphBreak,
556                TokenKind::Word(_),
557                TokenKind::Punctuation(_),
558            ]
559        ))
560    }
561
562    /// Helps ensure that ending tokens (like `ParagraphBreak`) don't get erroneously placed at
563    /// the beginning of a sentence. This kind of behavior can cause crashes, as seen in
564    /// [#1181](https://github.com/Automattic/harper/issues/1181).
565    #[test]
566    fn no_end_token_incorrectly_ending_at_zero() {
567        let source = "Something\n";
568        let parser = Markdown::new(MarkdownOptions::default());
569        let tokens = parser.parse_str(source);
570        assert_ne!(tokens.last().unwrap().span.end, 0);
571    }
572
573    #[test]
574    fn hang() {
575        let opts = MarkdownOptions::default();
576        let parser = Markdown::new(opts);
577        let _res = parser.parse_str("[[#|]]:A]");
578    }
579
580    #[test]
581    fn hang2() {
582        // This seems to only be a java specific problem...
583        let opts = MarkdownOptions::default();
584        let parser = Markdown::new(opts);
585        let _res = parser.parse_str("//{@j");
586    }
587
588    #[test]
589    fn simple_headings_are_marked() {
590        let opts = MarkdownOptions::default();
591        let parser = Markdown::new(opts);
592        let tokens = parser.parse_str("# This is a simple heading");
593
594        assert_eq!(tokens.iter_heading_starts().count(), 1);
595        assert_eq!(tokens.iter_headings().count(), 1);
596    }
597
598    #[test]
599    fn multiple_headings_are_marked() {
600        let opts = MarkdownOptions::default();
601        let parser = Markdown::new(opts);
602        let tokens = parser.parse_str(
603            r#"# This is a simple heading
604
605## This is a second simple heading"#,
606        );
607
608        assert_eq!(tokens.iter_heading_starts().count(), 2);
609        assert_eq!(tokens.iter_headings().count(), 2);
610    }
611}