zed_util/markdown.rs
1use std::fmt::{Display, Formatter};
2
/// Owned text that is known to be markdown source.
///
/// The wrapper adds no behavior of its own; it only tags the contained `String` so that markdown
/// is not confused with plain text.
#[derive(Debug, Clone)]
pub struct MarkdownString(pub String);

impl Display for MarkdownString {
    /// Writes the wrapped markdown verbatim. No escaping is applied, and width / fill flags on
    /// the formatter are not honored.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
12
/// Escapes markdown special characters in markdown text blocks. Markdown code blocks follow
/// different rules and `MarkdownInlineCode` or `MarkdownCodeBlock` should be used in that case.
///
/// The escaping happens when the value is formatted through its `Display` implementation; the
/// wrapper itself only borrows the text to escape.
///
/// Also escapes the following markdown extensions:
///
/// * `^` for superscripts
/// * `$` for inline math
/// * `~` for strikethrough
///
/// Escape of some characters is unnecessary, because while they are involved in markdown syntax,
/// the other characters involved are escaped:
///
/// * `!`, `]`, `(`, and `)` are used in link syntax, but `[` is escaped so these are parsed as
///   plaintext.
///
/// * `;` is used in HTML entity syntax, but `&` is escaped, so they are parsed as plaintext.
///
/// TODO: There is one escape this doesn't do currently. Period after numbers at the start of the
/// line (`[0-9]*\.`) should also be escaped to avoid it being interpreted as a list item.
// `Debug`/`Clone` match the sibling `MarkdownString`; `Copy` is free for a borrowed `&str` wrapper.
#[derive(Debug, Clone, Copy)]
pub struct MarkdownEscaped<'a>(pub &'a str);
33
/// Implements `Display` to format markdown inline code (wrapped in backticks), handling code that
/// contains backticks and spaces. Every whitespace character is replaced by a single ' '
/// (consecutive whitespace is not collapsed). For text that does not contain whitespace other
/// than ' ', this escaping roundtrips through pulldown-cmark.
///
/// When used in tables, `|` should be escaped like `\|` in the text provided to this function.
// `Debug`/`Clone` match the sibling `MarkdownString`; `Copy` is free for a borrowed `&str` wrapper.
#[derive(Debug, Clone, Copy)]
pub struct MarkdownInlineCode<'a>(pub &'a str);
41
/// Implements `Display` to format markdown code blocks, wrapped in 3 or more backticks as needed.
// `Debug`/`Clone` match the sibling `MarkdownString`; `Copy` is free since both fields are `&str`.
#[derive(Debug, Clone, Copy)]
pub struct MarkdownCodeBlock<'a> {
    /// Text written directly after the opening fence, on the same line. May be empty.
    pub tag: &'a str,
    /// Contents of the code block, written verbatim between the opening and closing fences.
    pub text: &'a str,
}
47
48impl Display for MarkdownEscaped<'_> {
49    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
50        let mut start_of_unescaped = None;
51        for (ix, c) in self.0.char_indices() {
52            match c {
53                // Always escaped.
54                '\\' | '`' | '*' | '_' | '[' | '^' | '$' | '~' | '&' |
55                // TODO: these only need to be escaped when they are the first non-whitespace
56                // character of the line of a block. There should probably be both an `escape_block`
57                // which does this and an `escape_inline` method which does not escape these.
58                '#' | '+' | '=' | '-' => {
59                    match start_of_unescaped {
60                        None => {}
61                        Some(start_of_unescaped) => {
62                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
63                        }
64                    }
65                    write!(formatter, "\\")?;
66                    // Can include this char in the "unescaped" text since a
67                    // backslash was just emitted.
68                    start_of_unescaped = Some(ix);
69                }
70                // Escaped since `<` is used in opening HTML tags. `<` is used since Markdown
71                // supports HTML entities, and this allows the text to be used directly in HTML.
72                '<' => {
73                    match start_of_unescaped {
74                        None => {}
75                        Some(start_of_unescaped) => {
76                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
77                        }
78                    }
79                    write!(formatter, "<")?;
80                    start_of_unescaped = None;
81                }
82                // Escaped since `>` is used for blockquotes. `>` is used since Markdown supports
83                // HTML entities, and this allows the text to be used directly in HTML.
84                '>' => {
85                    match start_of_unescaped {
86                        None => {}
87                        Some(start_of_unescaped) => {
88                            write!(formatter, "{}", &self.0[start_of_unescaped..ix])?;
89                        }
90                    }
91                    write!(formatter, ">")?;
92                    start_of_unescaped = None;
93                }
94                _ => {
95                    if start_of_unescaped.is_none() {
96                        start_of_unescaped = Some(ix);
97                    }
98                }
99            }
100        }
101        if let Some(start_of_unescaped) = start_of_unescaped {
102            write!(formatter, "{}", &self.0[start_of_unescaped..])?;
103        }
104        Ok(())
105    }
106}
107
108impl Display for MarkdownInlineCode<'_> {
109    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
110        // Apache License 2.0, same as this crate.
111        //
112        // Copied from `pulldown-cmark-to-cmark-20.0.0` with modifications:
113        //
114        // * Handling of all whitespace. pulldown-cmark-to-cmark is anticipating
115        // `Code` events parsed by pulldown-cmark.
116        //
117        // https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L290
118
119        let mut all_whitespace = true;
120        let text = self
121            .0
122            .chars()
123            .map(|c| {
124                if c.is_whitespace() {
125                    ' '
126                } else {
127                    all_whitespace = false;
128                    c
129                }
130            })
131            .collect::<String>();
132
133        // When inline code has leading and trailing ' ' characters, additional space is needed
134        // to escape it, unless all characters are space.
135        if all_whitespace {
136            write!(formatter, "`{text}`")
137        } else {
138            // More backticks are needed to delimit the inline code than the maximum number of
139            // backticks in a consecutive run.
140            let backticks = "`".repeat(count_max_consecutive_chars(&text, '`') + 1);
141            let space = match text.as_bytes() {
142                &[b'`', ..] | &[.., b'`'] => " ", // Space needed to separate backtick.
143                &[b' ', .., b' '] => " ",         // Space needed to escape inner space.
144                _ => "",                          // No space needed.
145            };
146            write!(formatter, "{backticks}{space}{text}{space}{backticks}")
147        }
148    }
149}
150
151impl Display for MarkdownCodeBlock<'_> {
152    fn fmt(&self, formatter: &mut Formatter<'_>) -> std::fmt::Result {
153        let tag = self.tag;
154        let text = self.text;
155        let backticks = "`".repeat(3.max(count_max_consecutive_chars(text, '`') + 1));
156        write!(formatter, "{backticks}{tag}\n{text}\n{backticks}\n")
157    }
158}
159
// Adapted from `pulldown-cmark-to-cmark-20.0.0` with changed names.
// https://github.com/Byron/pulldown-cmark-to-cmark/blob/3c850de2d3d1d79f19ca5f375e1089a653cf3ff7/src/lib.rs#L1063
// Apache License 2.0, same as this code.
//
// Returns the length, in characters, of the longest run of consecutive `search` characters in
// `text`, or 0 when `search` does not occur at all.
fn count_max_consecutive_chars(text: &str, search: char) -> usize {
    // Splitting on every character that is *not* `search` leaves exactly the (possibly empty)
    // runs of `search`; the longest of those is the answer.
    text.split(|ch: char| ch != search)
        .map(|run| run.chars().count())
        .max()
        .unwrap_or(0)
}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a multi-line markdown document through `MarkdownEscaped` and checks that every
    /// construct (headings, links, emphasis, code, lists, math, HTML entities) is escaped.
    #[test]
    fn test_markdown_escaped() {
        let input = r#"
        # Heading

        Another heading
        ===

        Another heading variant
        ---

        Paragraph with [link](https://example.com) and `code`, *emphasis*, and ~strikethrough~.

        ```
        code block
        ```

        List with varying leaders:
          - Item 1
          * Item 2
          + Item 3

        Some math:  $`\sqrt{3x-1}+(1+x)^2`$

        HTML entity: &nbsp;
        "#;

        // The entity must be spelled out in the source: its `&` is what gets backslash-escaped.
        let expected = r#"
        \# Heading

        Another heading
        \=\=\=

        Another heading variant
        \-\-\-

        Paragraph with \[link](https://example.com) and \`code\`, \*emphasis\*, and \~strikethrough\~.

        \`\`\`
        code block
        \`\`\`

        List with varying leaders:
          \- Item 1
          \* Item 2
          \+ Item 3

        Some math:  \$\`\\sqrt{3x\-1}\+(1\+x)\^2\`\$

        HTML entity: \&nbsp;
        "#;

        assert_eq!(MarkdownEscaped(input).to_string(), expected);
    }

    /// Checks delimiter length and space padding of inline code for whitespace-only text, plain
    /// text, and text containing backticks.
    #[test]
    fn test_markdown_inline_code() {
        assert_eq!(MarkdownInlineCode(" ").to_string(), "` `");
        assert_eq!(MarkdownInlineCode("text").to_string(), "`text`");
        assert_eq!(MarkdownInlineCode("text ").to_string(), "`text `");
        assert_eq!(MarkdownInlineCode(" text ").to_string(), "`  text  `");
        assert_eq!(MarkdownInlineCode("`").to_string(), "`` ` ``");
        assert_eq!(MarkdownInlineCode("``").to_string(), "``` `` ```");
        assert_eq!(MarkdownInlineCode("`text`").to_string(), "`` `text` ``");
        assert_eq!(
            MarkdownInlineCode("some `text` no leading or trailing backticks").to_string(),
            "``some `text` no leading or trailing backticks``"
        );
    }

    /// Checks that the longest backtick run wins regardless of where it appears in the text.
    #[test]
    fn test_count_max_consecutive_chars() {
        assert_eq!(
            count_max_consecutive_chars("``a```b``", '`'),
            3,
            "the highest seen consecutive segment of backticks counts"
        );
        assert_eq!(
            count_max_consecutive_chars("```a``b`", '`'),
            3,
            "it can't be downgraded later"
        );
    }
}
268}