mini_markdown/
lib.rs

1pub mod lexer;
2pub mod iter;
3use std::fmt;
4use crate::lexer::*;
5use crate::iter::MiniIter;
6
/// Characters accepted in a URL scheme: ASCII letters, ASCII digits, `+`, `.` and `-`.
///
/// See <https://spec.commonmark.org/0.30/#scheme>.
static COMMONMARK_SCHEME_ASCII: [char; 65] = [
    // Upper-case letters
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // Lower-case letters
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // Digits
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
    // Punctuation
    '+', '.', '-',
];
12
13
/// Error returned when input is rejected during validation.
#[derive(Debug)]
pub(crate) struct SanitizationError<'a> {
    /// The text that was rejected.
    pub(crate) content: &'a str,
}
18
19#[derive(Debug, PartialEq, Eq)]
20pub struct ValidURL<'a>{
21    content: &'a str,
22    scheme: Option<Scheme<'a>>,
23}
24
25impl <'a> ValidURL<'a>{
26        fn fmt_unsafe(&self) -> String{
27            let amp_replace_content = self.content.replace('&', "&amp;");
28        match &self.scheme {
29            None => {return format!("http:{}", amp_replace_content)},
30            Some(Scheme::Email(_s)) => {return format!("{}", amp_replace_content)},
31            Some(s) => {return format!("{}:{}", s, amp_replace_content)},
32        }
33    }
34}
35
36
37impl fmt::Display for ValidURL<'_>{
38    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result{
39        match &self.scheme {
40            None => {return write!(f, "http:{}", percent_encode(self.content).replace('&', "&amp;"))},
41            Some(s) => {return write!(f, "{}:{}", s, percent_encode(self.content).replace('&', "&amp;"))},
42        }
43    }
44}
45
/// URL scheme attached to a [`ValidURL`]; every variant stores the scheme text.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum Scheme<'a> {
    Http(&'a str),
    Email(&'a str),
    Irc(&'a str),
    Other(&'a str),
}

impl fmt::Display for Scheme<'_> {
    /// Writes the stored scheme text, whatever the variant.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Scheme::Http(text)
            | Scheme::Email(text)
            | Scheme::Irc(text)
            | Scheme::Other(text) => f.write_str(text),
        }
    }
}
64
65/// Convert source markdown to an ordered vector of tokens
66pub fn lex<'a>(source: &'a str, ignore: &[char]) -> Vec<Token<'a>>{
67    let mut char_iter = MiniIter::new(source);
68    let mut tokens = Vec::new();
69    while char_iter.peek().is_some(){
70        match char_iter.peek().unwrap(){
71            "#" if !ignore.contains(&'#') => {
72                match lex_heading(&mut char_iter) {
73                    Ok(t) => tokens.push(t),
74                    Err(e) => push_str(&mut tokens, e.content),
75                }
76            },
77            "*" | "_" => {
78                match lex_asterisk_underscore(&mut char_iter) {
79                    Ok(t) => tokens.push(t),
80                    Err(e) => push_str(&mut tokens, e.content),
81                }
82            },
83            "~" => {
84                match lex_tilde(&mut char_iter) {
85                    Ok(t) => tokens.push(t),
86                    Err(e) => push_str(&mut tokens, e.content),
87                }
88            },
89            "-" | "+" => {
90                match lex_plus_minus(&mut char_iter) {
91                    Ok(t) => tokens.push(t),
92                    Err(e) => push_str(&mut tokens, e.content),
93                }
94            },
95            " " | "\t" => {
96                match lex_tabs_spaces(&mut char_iter, &tokens) {
97                    Ok(t) => tokens.push(t),
98                    Err(e) => push_str(&mut tokens, e.content),
99                }
100            },
101            "`" => {
102                match lex_backticks(&mut char_iter) {
103                    Ok(t) => tokens.push(t),
104                    Err(e) => push_str(&mut tokens, e.content),
105                }
106            },
107            "\n" => {
108                match lex_newlines(&mut char_iter, &tokens) {
109                    Ok(t) => tokens.push(t),
110                    Err(e) => push_str(&mut tokens, e.content),
111                }
112            },
113            ">" => {
114                match lex_blockquotes(&mut char_iter) {
115                    Ok(t) => {
116                        tokens.push(t);
117                        },
118                    Err(e) => push_str(&mut tokens, e.content),
119                }
120            },
121            "!" => {
122                match lex_images(&mut char_iter) {
123                    Ok(t) => tokens.push(t),
124                    Err(e) => push_str(&mut tokens, e.content),
125                }
126            },
127            "[" => {
128                match lex_links(&mut char_iter) {
129                    Ok(t) => tokens.push(t),
130                    Err(e) => push_str(&mut tokens, e.content),
131                }
132            },
133            "<" => {
134                match lex_side_carrot(&mut char_iter) {
135                    Ok(t) => tokens.push(t),
136                    Err(e) => push_str(&mut tokens, e.content),
137                }
138            },
139            "|" => {
140                match lex_pipes(&mut char_iter) {
141                    Ok(t) => tokens.push(t),
142                    Err(e) => push_str(&mut tokens, e.content),
143                }
144            },
145            "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "0" => {
146                match lex_numbers(&mut char_iter) {
147                    Ok(t) => tokens.push(t),
148                    Err(e) => push_str(&mut tokens, e.content),
149                }
150            }
151            // Parse "\" to escape a markdown control character
152            "\\" => {
153                char_iter.next();
154                if char_iter.peek() == Some(&"#"){
155                    let hashes = char_iter.consume_while_case_holds(&|c| c == "#").unwrap_or("");
156                    push_str(&mut tokens, hashes);
157                } else if char_iter.peek().is_some(){
158                    push_str(&mut tokens, char_iter.next().unwrap());
159                }
160            }
161            _ => {
162                push_str(&mut tokens, char_iter.next().unwrap());
163            },
164        }
165    }
166    tokens
167}
168
169/// Parse tokens to produce safe html output
170pub fn parse(tokens: &[Token]) -> String {
171    let mut html = String::with_capacity(tokens.len()*100);
172    let mut in_task_list = false;
173    let mut in_ordered_list = false;
174    let mut in_unordered_list = false;
175    let mut in_paragraph = false;
176    let mut in_code = false;
177    let mut quote_level = 0;
178    let mut references = Vec::new();
179    let mut token_iter = tokens.iter().peekable();
180
181    while token_iter.peek().is_some(){
182        let token = token_iter.next().unwrap();
183
184        // Handle multi-liners
185        match token {
186            Token::Plaintext(t) if t.trim().is_empty() => {}, //Ignore empty plaintext tokens 
187            Token::Tab | Token::DoubleTab => {},
188            Token::OrderedListEntry(_) | Token::UnorderedListEntry(_) | Token::Newline if in_ordered_list | in_unordered_list => {},
189            Token::TaskListItem(_, _)  | Token::Newline if in_task_list => {},
190            _ if in_ordered_list => {
191                in_ordered_list = false;
192                html.push_str("</ol>\n");
193                if !in_paragraph {
194                    in_paragraph = true;
195                    html.push_str("<p>") 
196                }
197            },
198            _ if in_unordered_list => {
199                in_unordered_list = false;
200                html.push_str("</ul>\n");
201                if !in_paragraph {
202                    in_paragraph = true;
203                    html.push_str("<p>") 
204                }
205            },
206            _ if in_task_list => {
207                in_task_list = false;
208                html.push_str("</ul>\n");
209                if !in_paragraph {
210                    in_paragraph = true;
211                    html.push_str("<p>") 
212                }
213            },
214            Token::Code(_) if !in_code => {
215                html.push_str("<pre><code>");
216                in_code = true;
217            },
218            
219            Token::BlockQuote(_, _) | Token::Newline if quote_level > 0 => {},
220            Token::CodeBlock(_, _) | Token::Newline | Token::Header(_, _, _) if in_paragraph => {
221                in_paragraph = false;
222                html.push_str("</p>\n")
223            },
224            Token::Plaintext(_) | Token::Italic(_) | Token::Bold(_) | Token::BoldItalic(_) | Token::Strikethrough(_) | Token::Link(_, _, _) if !in_paragraph => {
225                for _i in 0..quote_level {
226                        html.push_str("</blockquote>");
227                        quote_level-=1;
228                }
229                in_paragraph = true;
230                html.push_str("<p>")
231            },
232            _ => {}
233        }
234        // Add content
235        match token {
236            Token::Plaintext(t) => {
237                let mut t: String = t.to_string();
238                if t.trim().is_empty() {continue}
239                
240                // Trim trailing whitespace after a \n
241                match t.rfind('\n') {
242                    None => {},
243                    Some(n_index) => {
244                        let (_before, after) = t.split_at(n_index);
245                        if after.chars().all(|c| c.is_whitespace()) {
246                            t = t.trim_end_matches(after).to_string();
247                        } 
248                    }
249                }
250
251                // Handle references
252                if t.contains("[^") && t.contains("]") {
253                    let plaintext_tokens = t.split("[^");
254                    let mut s = String::new();
255                    let mut count = 1;
256                    for tok in plaintext_tokens {
257                        if tok.trim_end().ends_with("]") {
258                            let tok = tok.trim_end().trim_end_matches(']');
259                            s.push_str(format!(
260                                "<sup id=\"fnref:{reference}\" role=\"doc-noteref\"><a href=\"#fn:{reference}\" class=\"footnote\" rel=\"footnote\">{ref_count}</a></sup>", 
261                                reference = sanitize_display_text(tok), 
262                                ref_count = count).as_str());
263                            count+=1;
264                        } else {s.push_str(tok)}
265                    }
266                    html.push_str(&s);
267                } else {
268                    html.push_str(&sanitize_display_text(t.trim_start_matches('\n')))
269                }
270            },
271            Token::Header(l, t, lbl) => {
272                match lbl {
273                    Some(lbl_text) => html.push_str(format!("<h{level} id=\"{id}\">{text}</h{level}>\n", 
274                        level=l, 
275                        text=t, 
276                        id=sanitize_display_text(&lbl_text.replace(" ", "-")))
277                        .as_str()),
278                    None => html.push_str(format!("<h{level}>{text}</h{level}>\n", 
279                        level=l, 
280                        text=t)
281                        .as_str()),
282                };
283            },
284            Token::TaskListItem(c,t) => {
285                if in_task_list == false {
286                    in_task_list = true;
287                    html.push_str("<ul class=\"contains-task-list\">")
288                }
289                match c {
290                    TaskBox::Checked => {
291                        html.push_str(format!("<li class=\"task-list-item\"><input type=\"checkbox\" class=\"task-list-item-checkbox\" checked=\"\">{}</li>", sanitize_display_text(t)).as_str())
292
293                    },
294                    TaskBox::Unchecked => {
295                        html.push_str(format!("<li class=\"task-list-item\"><input type=\"checkbox\" class=\"task-list-item-checkbox\">{}</li>", sanitize_display_text(t)).as_str())
296                    }
297                }
298            },
299            Token::UnorderedListEntry(toks) => {
300                if in_unordered_list == false {
301                    in_unordered_list = true;
302                    html.push_str("<ul>\n")
303                }
304
305                html.push_str(format!("<li>").as_str());
306                if toks.into_iter().all(|t| matches!(t, Token::Plaintext(_))) {html.push_str(format!("\n").as_str());}
307                for token in toks.iter() {
308                    match token {
309                        Token::Plaintext(text) if text.starts_with("\t\t") => {
310                            html.push_str(&render(&text[1..].trim_start_matches(" ")).replace("<pre><code>", "<pre><code>  "));  
311                        },
312                        Token::Plaintext(text) => {
313                            let text = &render(&text.trim_start_matches(" ")).replace("<pre><code>", "<pre><code>  ");
314                            html.push_str(text);
315                        },
316                        _ => {},
317                    }
318                }
319                html.push_str(format!("</li>\n").as_str());
320            },
321            Token::OrderedListEntry(t) => {
322                if in_ordered_list == false {
323                    in_ordered_list = true;
324                    html.push_str(format!("<ol>\n").as_str())
325                }
326                html.push_str(format!("<li>\n{}</li>\n", sanitize_display_text(t)).as_str())
327            },
328            Token::Newline => {},
329            Token::Tab => {html.push('\t')},
330            Token::DoubleTab => {html.push_str("\t\t")},
331            Token::Italic(t) => {html.push_str(format!("<em>{}</em>", sanitize_display_text(t)).as_str())},
332            Token::Bold(t) => {html.push_str(format!("<strong>{}</strong>", sanitize_display_text(t)).as_str())},
333            Token::BoldItalic(t) => {html.push_str(format!("<strong><em>{}</em></strong>", sanitize_display_text(t)).as_str())},
334            Token::LineBreak => {html.push_str("<br>")},
335            Token::HorizontalRule => {html.push_str("<hr />\n")},
336            Token::Strikethrough(t) => {html.push_str(format!("<strike>{}</strike>", sanitize_display_text(t)).as_str())},
337            Token::Code(t) => {
338                html.push_str(format!("{}", sanitize_display_text(t)).as_str())},
339            Token::CodeBlock(t, lang) => {
340                html.push_str("<pre>");
341                match lang.as_str() {
342                    "" => html.push_str(format!("<code>{}</code>", sanitize_display_text(t)).as_str()),
343                    _ => html.push_str(format!(
344                        "<div class=\"language-{} highlighter-rouge\"><div class=\"highlight\"><pre class=\"highlight\"><code>{}</code></div></div>",
345                        sanitize_display_text(lang), 
346                        sanitize_display_text(t)
347                        ).as_str()),
348                };
349                html.push_str("</pre>");
350            },
351            Token::BlockQuote(l, t) => {
352                if in_paragraph {
353                    html.push_str("</p>");
354                    in_paragraph = false;
355                }
356                match quote_level {
357                    _ if l == &quote_level => {},
358                    _ if l < &quote_level => {
359                        let diff = quote_level - l;
360                        quote_level = *l;
361                        for _i in 0..diff {
362                            html.push_str("</blockquote>");
363                        }
364                    },
365                    _ if l > &quote_level => {
366                        let diff = l - quote_level;
367                        quote_level = *l;
368                        for _i in 0..diff {
369                            html.push_str("<blockquote>\n");
370                        }
371                    },
372                    _ => {},
373                }
374                if !t.is_empty(){
375                    html.push_str(
376                        &render(&sanitize_display_text(&t.trim_start_matches(" "))).replace("\t", "  ")
377                        );
378                }
379            },
380            Token::Image(l, t) => {
381                match (l, t) {
382                    (l, None) if l.trim() == "" => {html.push_str("<p><img src=\"data:,\"></p>")}
383                    (l, Some(t)) if l.trim() == "" => {html.push_str(format!("<p><img src=\"data:,\" alt=\"{text}\"></p>", text=sanitize_display_text(t)).as_str())}
384                    (l, None) => {html.push_str(format!("<p><img src=\"{link}\"> referrerpolicy=\"no-referrer\"></p>", link=l).as_str())}
385                    (l, Some(t)) => {html.push_str(format!("<p><img src=\"{link}\" alt=\"{text}\" referrerpolicy=\"no-referrer\"></p>", link=l, text=sanitize_display_text(t)).as_str())}
386                }
387                
388            },
389            Token::Link(l, t, ht) => {
390                match (t, ht){
391                    (Some(t), Some(ht)) => html.push_str(format!("<a href=>\"{link}\" title=\"{hover}\">{text}</a>", link=l, text=sanitize_display_text(t), hover=ht).as_str()),
392                    (Some(t), None) => html.push_str(format!("<a href=\"{link}\">{text}</a>", link=l, text=sanitize_display_text(t)).as_str()),
393                    (None, Some(ht)) => html.push_str(format!("<a href=\"{link}\" title=\"{hover}\">{link}</a>", link=l, hover=sanitize_display_text(ht)).as_str()),
394                    (None, None) => html.push_str(format!("<a href=\"{link}\">{display}</a>", link=l, display=l.fmt_unsafe()).as_str()),
395                }
396            },
397            Token::Detail(summary, inner_tokens) => {
398                if in_paragraph {
399                    html.push_str("</p>\n");
400                    in_paragraph = false;
401                }
402                let inner_html = parse(inner_tokens);
403                html.push_str(format!("<details>\n<summary>{sum}</summary>\n{in_html}\n</details>", sum=sanitize_display_text(summary), in_html=inner_html).as_str());
404            },
405            Token::Table(headings, rows) => {
406                if  headings.len() != rows[0].len() {continue}
407                html.push_str("<table class=\"table table-bordered\">\n\t<thead>\n\t<tr>\n");
408                for h in headings.into_iter() {
409                    html.push_str(format!("\t\t<th style=\"text-align: {align}\">{heading}</th>", heading=sanitize_display_text(&h.1), align=h.0).as_str());
410                }
411                html.push_str("\t</tr>\n\t</thead>\n\t<tbody>");
412                for row in rows.iter(){
413                    html.push_str("\n\t<tr>");
414                    for elem in row.iter(){
415                        let mut row_string = String::new();
416                        for token in elem.1.iter() {
417                           match token {
418                            Token::Plaintext(s) => row_string.push_str(&sanitize_display_text(&s)),
419                            Token::Italic(t) => {row_string.push_str(format!("<em>{}</em>", sanitize_display_text(t)).as_str())},
420                            Token::Bold(t) => {row_string.push_str(format!("<strong>{}</strong>", sanitize_display_text(t)).as_str())},
421                            Token::BoldItalic(t) => {row_string.push_str(format!("<strong><em>{}</em></strong>", sanitize_display_text(t)).as_str())},
422                            Token::LineBreak => {row_string.push_str("<br>")},
423                            Token::HorizontalRule => {row_string.push_str("<hr />")},
424                            Token::Strikethrough(t) => {row_string.push_str(format!("<strike>{}</strike>", sanitize_display_text(t)).as_str())},
425                            _ => row_string.push_str(&parse(&elem.1))
426                            } 
427                        }
428                        html.push_str(format!("\n\t\t<td style=\"text-align: {align}\">{row_text}</td>", align=elem.0, row_text=row_string).as_str());
429                    }
430                    html.push_str("\n\t</tr>");
431                }
432                html.push_str("\n\t</tbody>\n</table>");
433            },
434            Token::Footnote(ref_id, text) => {
435                references.push((ref_id, text));
436            },
437        }
438    }
439
440    // Close out any open tags
441    if in_paragraph {
442        html.push_str("</p>\n");
443    }
444    if in_task_list | in_unordered_list {
445        html.push_str("</ul>");
446    }
447    if in_ordered_list {
448        html.push_str("</ol>");
449    }
450    if quote_level > 0 {
451        for _i in (0..quote_level).rev(){
452            html.push_str("</blockquote>\n");
453        }
454    }
455    if in_code && !matches!(token_iter.peek(), Some(Token::Code(_))) {
456        match html.chars().last().unwrap() {
457            '\n' => {},
458            _ => {html.push('\n')},
459        }
460        html.push_str("</code></pre>");
461    }
462
463    // Add references
464    if references.len() > 0{
465        html.push_str("<div class=\"footnotes\" role=\"doc-endnotes\">\n");
466        html.push_str("\t<ol>\n");
467        for reference in references.iter(){
468            html.push_str("\t\t<li id=\"fn:1\" role=\"doc-endnote\">");
469            html.push_str(format!("\t\t\t<p>{ref_text}<a href=\"#fnref:{ref_count}\" class=\"reversefootnote\" role=\"doc-backlink\">↩</a></p>", 
470                ref_count=sanitize_display_text(reference.0), 
471                ref_text=sanitize_display_text(reference.1)).as_str());
472            html.push_str("\t\t</li>");
473        }
474        html.push_str("\t</ol>\n");
475        html.push_str("</div>\n");
476    }
477    if html.chars().last().unwrap_or(' ') != '\n' {
478        html.push('\n');
479    }
480    html
481}
482
483/// Render HTML from a source markdown string
484/// Output is sanitized to prevent script injection
485pub fn render(source: &str) -> String {
486    parse(&lex(source, &[]))
487}
488
489pub(crate) fn render_ignore(source: &str, ignore: &[char]) -> String {
490    parse(&lex(source, ignore))
491}
492
/// Replace potentially unsafe characters with html entities
///
/// Works in a single pass over `source`. Backslashes are dropped rather than escaped;
/// every other character not listed below is copied unchanged.
pub(crate) fn sanitize_display_text(source: &str) -> String {
    let mut escaped = String::with_capacity(source.len());
    for ch in source.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&apos;",
            '[' => "&lbrack;",
            ']' => "&rbrack;",
            '{' => "&lbrace;",
            '}' => "&rbrace;",
            '|' => "&mid;",
            '\\' => "",
            // NOTE(review): `&tilde;` is the HTML name for U+02DC (small tilde), not `~`.
            '~' => "&tilde;",
            ')' => "&#41;",
            '(' => "&#40;",
            plain => {
                escaped.push(plain);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}
510
/// Percent-encode a fixed set of characters in `source`.
///
/// Encoded characters: `%`, `#`, `[`, `]`, `!`, `$`, `'`, `(`, `)`, `*`, space and `\`.
/// Everything else is copied unchanged.
pub(crate) fn percent_encode(source: &str) -> String {
    let mut encoded = String::with_capacity(source.len());
    for ch in source.chars() {
        let replacement = match ch {
            '%' => "%25",
            '#' => "%23",
            '[' => "%5B",
            ']' => "%5D",
            '!' => "%21",
            '$' => "%24",
            '\'' => "%27",
            '(' => "%28",
            ')' => "%29",
            '*' => "%2A",
            ' ' => "%20",
            '\\' => "%5C",
            untouched => {
                encoded.push(untouched);
                continue;
            }
        };
        encoded.push_str(replacement);
    }
    encoded
}
525
526pub(crate) fn validate_link(source: &str) -> Result<ValidURL, SanitizationError> {
527    if !source.is_ascii() || source.contains(char::is_whitespace) { // https://www.rfc-editor.org/rfc/rfc3986#section-2
528        return Err(SanitizationError{content: source})
529    }
530    let (scheme, path) = source.split_at(source.find(':').unwrap_or(0));
531    if scheme.to_lowercase() == "javascript" || !scheme.is_ascii() {
532        return Err(SanitizationError{content: source})
533    }
534    if scheme.to_lowercase() == "data" && !path.starts_with(":image/"){
535        return Err(SanitizationError{content: source})
536    }
537    if scheme.len() != 0 && ( scheme.len() < 2 || scheme.len() > 32 ) {
538        return Err(SanitizationError{content: source})
539    }
540
541    // Scheme defined here https://spec.commonmark.org/0.30/#scheme
542    // char set in COMMONMARK_SCHEME_ASCII. 2 to 32 chars followed by `:`
543    let source_scheme = {
544        let parts: Vec<_> = source.split(":").collect();
545        if source.contains(':')
546            && parts[0].chars().all(|c| COMMONMARK_SCHEME_ASCII.contains(&c))
547            && parts[0].len() >= 2
548            && parts[0].len() <= 32 {
549                match parts[0] {
550                    "http" => Some(Scheme::Http(parts[0])),
551                    "mailto" => Some(Scheme::Email(parts[0])),
552                    "irc" => Some(Scheme::Irc(parts[0])),
553                    _ => Some(Scheme::Other(parts[0]))
554                }
555            } else {None}
556    };
557
558    //Check for mail links
559    if source.contains('@') && source.matches('@').count() == 1 && !source.contains('\\') {
560        if source_scheme.is_some() {
561            return Ok(ValidURL{scheme: Some(source_scheme.unwrap_or(Scheme::Email("mailto"))), content: &source.split(":").last().unwrap()})   
562        }
563        return Ok(ValidURL{scheme: Some(source_scheme.unwrap_or(Scheme::Email("mailto"))), content: &source})
564    }
565    if source.contains('@') && source.matches('@').count() == 1 && source.contains('\\') {
566        return Err(SanitizationError{content: source})
567    }
568
569    match source_scheme {
570        Some(Scheme::Http(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Http(s))})},
571        Some(Scheme::Email(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Email(s))})},
572        Some(Scheme::Irc(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Irc(s))})},
573        Some(Scheme::Other(s)) => Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Other(s))}),
574        None => Ok(ValidURL{content: source, scheme: None}),
575    }
576    
577}