1pub mod lexer;
2pub mod iter;
3use std::fmt;
4use crate::lexer::*;
5use crate::iter::MiniIter;
6
/// Characters accepted in a URL scheme name: ASCII letters, ASCII digits,
/// and the punctuation `+`, `.` and `-`.
///
/// `validate_link` checks every character before the first `:` against this
/// table. NOTE(review): the name suggests the set comes from the CommonMark
/// definition of a URI scheme — confirm against the spec.
static COMMONMARK_SCHEME_ASCII: [char; 65] = [ 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z',
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
    '1','2','3','4','5','6','7','8','9','0',
    '+','.','-'];
12
13
/// Error returned by `validate_link` when a link is rejected.
#[derive(Debug)]
pub(crate) struct SanitizationError<'a>{
    /// The link text that was rejected, exactly as it was passed in.
    pub(crate) content: &'a str,
}
18
/// A link that passed `validate_link`.
///
/// The `Display` impl writes it as `scheme:content`, falling back to the
/// `http` scheme when `scheme` is `None`.
#[derive(Debug, PartialEq, Eq)]
pub struct ValidURL<'a>{
    /// Link text written after the `scheme:` prefix when the URL is rendered.
    content: &'a str,
    /// Scheme recognised by `validate_link`, if any.
    scheme: Option<Scheme<'a>>,
}
24
25impl <'a> ValidURL<'a>{
26 fn fmt_unsafe(&self) -> String{
27 let amp_replace_content = self.content.replace('&', "&");
28 match &self.scheme {
29 None => {return format!("http:{}", amp_replace_content)},
30 Some(Scheme::Email(_s)) => {return format!("{}", amp_replace_content)},
31 Some(s) => {return format!("{}:{}", s, amp_replace_content)},
32 }
33 }
34}
35
36
37impl fmt::Display for ValidURL<'_>{
38 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result{
39 match &self.scheme {
40 None => {return write!(f, "http:{}", percent_encode(self.content).replace('&', "&"))},
41 Some(s) => {return write!(f, "{}:{}", s, percent_encode(self.content).replace('&', "&"))},
42 }
43 }
44}
45
/// URL scheme recognised by `validate_link`.
///
/// Every variant carries the scheme name as a string slice; `Display`
/// prints that slice unchanged.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum Scheme<'a>{
    /// The `http` scheme.
    Http(&'a str),
    /// The `mailto` scheme (also used for bare e-mail addresses).
    Email(&'a str),
    /// The `irc` scheme.
    Irc(&'a str),
    /// Any other scheme name.
    Other(&'a str),
}

impl fmt::Display for Scheme<'_> {
    /// Writes the stored scheme name, whichever variant holds it.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Scheme::Http(name)
            | Scheme::Email(name)
            | Scheme::Irc(name)
            | Scheme::Other(name) => f.write_str(name),
        }
    }
}
64
65pub fn lex<'a>(source: &'a str, ignore: &[char]) -> Vec<Token<'a>>{
67 let mut char_iter = MiniIter::new(source);
68 let mut tokens = Vec::new();
69 while char_iter.peek().is_some(){
70 match char_iter.peek().unwrap(){
71 "#" if !ignore.contains(&'#') => {
72 match lex_heading(&mut char_iter) {
73 Ok(t) => tokens.push(t),
74 Err(e) => push_str(&mut tokens, e.content),
75 }
76 },
77 "*" | "_" => {
78 match lex_asterisk_underscore(&mut char_iter) {
79 Ok(t) => tokens.push(t),
80 Err(e) => push_str(&mut tokens, e.content),
81 }
82 },
83 "~" => {
84 match lex_tilde(&mut char_iter) {
85 Ok(t) => tokens.push(t),
86 Err(e) => push_str(&mut tokens, e.content),
87 }
88 },
89 "-" | "+" => {
90 match lex_plus_minus(&mut char_iter) {
91 Ok(t) => tokens.push(t),
92 Err(e) => push_str(&mut tokens, e.content),
93 }
94 },
95 " " | "\t" => {
96 match lex_tabs_spaces(&mut char_iter, &tokens) {
97 Ok(t) => tokens.push(t),
98 Err(e) => push_str(&mut tokens, e.content),
99 }
100 },
101 "`" => {
102 match lex_backticks(&mut char_iter) {
103 Ok(t) => tokens.push(t),
104 Err(e) => push_str(&mut tokens, e.content),
105 }
106 },
107 "\n" => {
108 match lex_newlines(&mut char_iter, &tokens) {
109 Ok(t) => tokens.push(t),
110 Err(e) => push_str(&mut tokens, e.content),
111 }
112 },
113 ">" => {
114 match lex_blockquotes(&mut char_iter) {
115 Ok(t) => {
116 tokens.push(t);
117 },
118 Err(e) => push_str(&mut tokens, e.content),
119 }
120 },
121 "!" => {
122 match lex_images(&mut char_iter) {
123 Ok(t) => tokens.push(t),
124 Err(e) => push_str(&mut tokens, e.content),
125 }
126 },
127 "[" => {
128 match lex_links(&mut char_iter) {
129 Ok(t) => tokens.push(t),
130 Err(e) => push_str(&mut tokens, e.content),
131 }
132 },
133 "<" => {
134 match lex_side_carrot(&mut char_iter) {
135 Ok(t) => tokens.push(t),
136 Err(e) => push_str(&mut tokens, e.content),
137 }
138 },
139 "|" => {
140 match lex_pipes(&mut char_iter) {
141 Ok(t) => tokens.push(t),
142 Err(e) => push_str(&mut tokens, e.content),
143 }
144 },
145 "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" | "0" => {
146 match lex_numbers(&mut char_iter) {
147 Ok(t) => tokens.push(t),
148 Err(e) => push_str(&mut tokens, e.content),
149 }
150 }
151 "\\" => {
153 char_iter.next();
154 if char_iter.peek() == Some(&"#"){
155 let hashes = char_iter.consume_while_case_holds(&|c| c == "#").unwrap_or("");
156 push_str(&mut tokens, hashes);
157 } else if char_iter.peek().is_some(){
158 push_str(&mut tokens, char_iter.next().unwrap());
159 }
160 }
161 _ => {
162 push_str(&mut tokens, char_iter.next().unwrap());
163 },
164 }
165 }
166 tokens
167}
168
169pub fn parse(tokens: &[Token]) -> String {
171 let mut html = String::with_capacity(tokens.len()*100);
172 let mut in_task_list = false;
173 let mut in_ordered_list = false;
174 let mut in_unordered_list = false;
175 let mut in_paragraph = false;
176 let mut in_code = false;
177 let mut quote_level = 0;
178 let mut references = Vec::new();
179 let mut token_iter = tokens.iter().peekable();
180
181 while token_iter.peek().is_some(){
182 let token = token_iter.next().unwrap();
183
184 match token {
186 Token::Plaintext(t) if t.trim().is_empty() => {}, Token::Tab | Token::DoubleTab => {},
188 Token::OrderedListEntry(_) | Token::UnorderedListEntry(_) | Token::Newline if in_ordered_list | in_unordered_list => {},
189 Token::TaskListItem(_, _) | Token::Newline if in_task_list => {},
190 _ if in_ordered_list => {
191 in_ordered_list = false;
192 html.push_str("</ol>\n");
193 if !in_paragraph {
194 in_paragraph = true;
195 html.push_str("<p>")
196 }
197 },
198 _ if in_unordered_list => {
199 in_unordered_list = false;
200 html.push_str("</ul>\n");
201 if !in_paragraph {
202 in_paragraph = true;
203 html.push_str("<p>")
204 }
205 },
206 _ if in_task_list => {
207 in_task_list = false;
208 html.push_str("</ul>\n");
209 if !in_paragraph {
210 in_paragraph = true;
211 html.push_str("<p>")
212 }
213 },
214 Token::Code(_) if !in_code => {
215 html.push_str("<pre><code>");
216 in_code = true;
217 },
218
219 Token::BlockQuote(_, _) | Token::Newline if quote_level > 0 => {},
220 Token::CodeBlock(_, _) | Token::Newline | Token::Header(_, _, _) if in_paragraph => {
221 in_paragraph = false;
222 html.push_str("</p>\n")
223 },
224 Token::Plaintext(_) | Token::Italic(_) | Token::Bold(_) | Token::BoldItalic(_) | Token::Strikethrough(_) | Token::Link(_, _, _) if !in_paragraph => {
225 for _i in 0..quote_level {
226 html.push_str("</blockquote>");
227 quote_level-=1;
228 }
229 in_paragraph = true;
230 html.push_str("<p>")
231 },
232 _ => {}
233 }
234 match token {
236 Token::Plaintext(t) => {
237 let mut t: String = t.to_string();
238 if t.trim().is_empty() {continue}
239
240 match t.rfind('\n') {
242 None => {},
243 Some(n_index) => {
244 let (_before, after) = t.split_at(n_index);
245 if after.chars().all(|c| c.is_whitespace()) {
246 t = t.trim_end_matches(after).to_string();
247 }
248 }
249 }
250
251 if t.contains("[^") && t.contains("]") {
253 let plaintext_tokens = t.split("[^");
254 let mut s = String::new();
255 let mut count = 1;
256 for tok in plaintext_tokens {
257 if tok.trim_end().ends_with("]") {
258 let tok = tok.trim_end().trim_end_matches(']');
259 s.push_str(format!(
260 "<sup id=\"fnref:{reference}\" role=\"doc-noteref\"><a href=\"#fn:{reference}\" class=\"footnote\" rel=\"footnote\">{ref_count}</a></sup>",
261 reference = sanitize_display_text(tok),
262 ref_count = count).as_str());
263 count+=1;
264 } else {s.push_str(tok)}
265 }
266 html.push_str(&s);
267 } else {
268 html.push_str(&sanitize_display_text(t.trim_start_matches('\n')))
269 }
270 },
271 Token::Header(l, t, lbl) => {
272 match lbl {
273 Some(lbl_text) => html.push_str(format!("<h{level} id=\"{id}\">{text}</h{level}>\n",
274 level=l,
275 text=t,
276 id=sanitize_display_text(&lbl_text.replace(" ", "-")))
277 .as_str()),
278 None => html.push_str(format!("<h{level}>{text}</h{level}>\n",
279 level=l,
280 text=t)
281 .as_str()),
282 };
283 },
284 Token::TaskListItem(c,t) => {
285 if in_task_list == false {
286 in_task_list = true;
287 html.push_str("<ul class=\"contains-task-list\">")
288 }
289 match c {
290 TaskBox::Checked => {
291 html.push_str(format!("<li class=\"task-list-item\"><input type=\"checkbox\" class=\"task-list-item-checkbox\" checked=\"\">{}</li>", sanitize_display_text(t)).as_str())
292
293 },
294 TaskBox::Unchecked => {
295 html.push_str(format!("<li class=\"task-list-item\"><input type=\"checkbox\" class=\"task-list-item-checkbox\">{}</li>", sanitize_display_text(t)).as_str())
296 }
297 }
298 },
299 Token::UnorderedListEntry(toks) => {
300 if in_unordered_list == false {
301 in_unordered_list = true;
302 html.push_str("<ul>\n")
303 }
304
305 html.push_str(format!("<li>").as_str());
306 if toks.into_iter().all(|t| matches!(t, Token::Plaintext(_))) {html.push_str(format!("\n").as_str());}
307 for token in toks.iter() {
308 match token {
309 Token::Plaintext(text) if text.starts_with("\t\t") => {
310 html.push_str(&render(&text[1..].trim_start_matches(" ")).replace("<pre><code>", "<pre><code> "));
311 },
312 Token::Plaintext(text) => {
313 let text = &render(&text.trim_start_matches(" ")).replace("<pre><code>", "<pre><code> ");
314 html.push_str(text);
315 },
316 _ => {},
317 }
318 }
319 html.push_str(format!("</li>\n").as_str());
320 },
321 Token::OrderedListEntry(t) => {
322 if in_ordered_list == false {
323 in_ordered_list = true;
324 html.push_str(format!("<ol>\n").as_str())
325 }
326 html.push_str(format!("<li>\n{}</li>\n", sanitize_display_text(t)).as_str())
327 },
328 Token::Newline => {},
329 Token::Tab => {html.push('\t')},
330 Token::DoubleTab => {html.push_str("\t\t")},
331 Token::Italic(t) => {html.push_str(format!("<em>{}</em>", sanitize_display_text(t)).as_str())},
332 Token::Bold(t) => {html.push_str(format!("<strong>{}</strong>", sanitize_display_text(t)).as_str())},
333 Token::BoldItalic(t) => {html.push_str(format!("<strong><em>{}</em></strong>", sanitize_display_text(t)).as_str())},
334 Token::LineBreak => {html.push_str("<br>")},
335 Token::HorizontalRule => {html.push_str("<hr />\n")},
336 Token::Strikethrough(t) => {html.push_str(format!("<strike>{}</strike>", sanitize_display_text(t)).as_str())},
337 Token::Code(t) => {
338 html.push_str(format!("{}", sanitize_display_text(t)).as_str())},
339 Token::CodeBlock(t, lang) => {
340 html.push_str("<pre>");
341 match lang.as_str() {
342 "" => html.push_str(format!("<code>{}</code>", sanitize_display_text(t)).as_str()),
343 _ => html.push_str(format!(
344 "<div class=\"language-{} highlighter-rouge\"><div class=\"highlight\"><pre class=\"highlight\"><code>{}</code></div></div>",
345 sanitize_display_text(lang),
346 sanitize_display_text(t)
347 ).as_str()),
348 };
349 html.push_str("</pre>");
350 },
351 Token::BlockQuote(l, t) => {
352 if in_paragraph {
353 html.push_str("</p>");
354 in_paragraph = false;
355 }
356 match quote_level {
357 _ if l == "e_level => {},
358 _ if l < "e_level => {
359 let diff = quote_level - l;
360 quote_level = *l;
361 for _i in 0..diff {
362 html.push_str("</blockquote>");
363 }
364 },
365 _ if l > "e_level => {
366 let diff = l - quote_level;
367 quote_level = *l;
368 for _i in 0..diff {
369 html.push_str("<blockquote>\n");
370 }
371 },
372 _ => {},
373 }
374 if !t.is_empty(){
375 html.push_str(
376 &render(&sanitize_display_text(&t.trim_start_matches(" "))).replace("\t", " ")
377 );
378 }
379 },
380 Token::Image(l, t) => {
381 match (l, t) {
382 (l, None) if l.trim() == "" => {html.push_str("<p><img src=\"data:,\"></p>")}
383 (l, Some(t)) if l.trim() == "" => {html.push_str(format!("<p><img src=\"data:,\" alt=\"{text}\"></p>", text=sanitize_display_text(t)).as_str())}
384 (l, None) => {html.push_str(format!("<p><img src=\"{link}\"> referrerpolicy=\"no-referrer\"></p>", link=l).as_str())}
385 (l, Some(t)) => {html.push_str(format!("<p><img src=\"{link}\" alt=\"{text}\" referrerpolicy=\"no-referrer\"></p>", link=l, text=sanitize_display_text(t)).as_str())}
386 }
387
388 },
389 Token::Link(l, t, ht) => {
390 match (t, ht){
391 (Some(t), Some(ht)) => html.push_str(format!("<a href=>\"{link}\" title=\"{hover}\">{text}</a>", link=l, text=sanitize_display_text(t), hover=ht).as_str()),
392 (Some(t), None) => html.push_str(format!("<a href=\"{link}\">{text}</a>", link=l, text=sanitize_display_text(t)).as_str()),
393 (None, Some(ht)) => html.push_str(format!("<a href=\"{link}\" title=\"{hover}\">{link}</a>", link=l, hover=sanitize_display_text(ht)).as_str()),
394 (None, None) => html.push_str(format!("<a href=\"{link}\">{display}</a>", link=l, display=l.fmt_unsafe()).as_str()),
395 }
396 },
397 Token::Detail(summary, inner_tokens) => {
398 if in_paragraph {
399 html.push_str("</p>\n");
400 in_paragraph = false;
401 }
402 let inner_html = parse(inner_tokens);
403 html.push_str(format!("<details>\n<summary>{sum}</summary>\n{in_html}\n</details>", sum=sanitize_display_text(summary), in_html=inner_html).as_str());
404 },
405 Token::Table(headings, rows) => {
406 if headings.len() != rows[0].len() {continue}
407 html.push_str("<table class=\"table table-bordered\">\n\t<thead>\n\t<tr>\n");
408 for h in headings.into_iter() {
409 html.push_str(format!("\t\t<th style=\"text-align: {align}\">{heading}</th>", heading=sanitize_display_text(&h.1), align=h.0).as_str());
410 }
411 html.push_str("\t</tr>\n\t</thead>\n\t<tbody>");
412 for row in rows.iter(){
413 html.push_str("\n\t<tr>");
414 for elem in row.iter(){
415 let mut row_string = String::new();
416 for token in elem.1.iter() {
417 match token {
418 Token::Plaintext(s) => row_string.push_str(&sanitize_display_text(&s)),
419 Token::Italic(t) => {row_string.push_str(format!("<em>{}</em>", sanitize_display_text(t)).as_str())},
420 Token::Bold(t) => {row_string.push_str(format!("<strong>{}</strong>", sanitize_display_text(t)).as_str())},
421 Token::BoldItalic(t) => {row_string.push_str(format!("<strong><em>{}</em></strong>", sanitize_display_text(t)).as_str())},
422 Token::LineBreak => {row_string.push_str("<br>")},
423 Token::HorizontalRule => {row_string.push_str("<hr />")},
424 Token::Strikethrough(t) => {row_string.push_str(format!("<strike>{}</strike>", sanitize_display_text(t)).as_str())},
425 _ => row_string.push_str(&parse(&elem.1))
426 }
427 }
428 html.push_str(format!("\n\t\t<td style=\"text-align: {align}\">{row_text}</td>", align=elem.0, row_text=row_string).as_str());
429 }
430 html.push_str("\n\t</tr>");
431 }
432 html.push_str("\n\t</tbody>\n</table>");
433 },
434 Token::Footnote(ref_id, text) => {
435 references.push((ref_id, text));
436 },
437 }
438 }
439
440 if in_paragraph {
442 html.push_str("</p>\n");
443 }
444 if in_task_list | in_unordered_list {
445 html.push_str("</ul>");
446 }
447 if in_ordered_list {
448 html.push_str("</ol>");
449 }
450 if quote_level > 0 {
451 for _i in (0..quote_level).rev(){
452 html.push_str("</blockquote>\n");
453 }
454 }
455 if in_code && !matches!(token_iter.peek(), Some(Token::Code(_))) {
456 match html.chars().last().unwrap() {
457 '\n' => {},
458 _ => {html.push('\n')},
459 }
460 html.push_str("</code></pre>");
461 }
462
463 if references.len() > 0{
465 html.push_str("<div class=\"footnotes\" role=\"doc-endnotes\">\n");
466 html.push_str("\t<ol>\n");
467 for reference in references.iter(){
468 html.push_str("\t\t<li id=\"fn:1\" role=\"doc-endnote\">");
469 html.push_str(format!("\t\t\t<p>{ref_text}<a href=\"#fnref:{ref_count}\" class=\"reversefootnote\" role=\"doc-backlink\">↩</a></p>",
470 ref_count=sanitize_display_text(reference.0),
471 ref_text=sanitize_display_text(reference.1)).as_str());
472 html.push_str("\t\t</li>");
473 }
474 html.push_str("\t</ol>\n");
475 html.push_str("</div>\n");
476 }
477 if html.chars().last().unwrap_or(' ') != '\n' {
478 html.push('\n');
479 }
480 html
481}
482
483pub fn render(source: &str) -> String {
486 parse(&lex(source, &[]))
487}
488
489pub(crate) fn render_ignore(source: &str, ignore: &[char]) -> String {
490 parse(&lex(source, ignore))
491}
492
/// Escapes text so it can be placed in HTML output as display text.
///
/// HTML-significant characters (`&`, `<`, `>`, `"`, `'`) and characters that
/// carry Markdown meaning (`[`, `]`, `{`, `}`, `|`, `~`, `(`, `)`) are
/// replaced with HTML character references. Backslashes are removed. All
/// other characters are copied unchanged.
///
/// Each input character is translated exactly once, so the `&` that starts a
/// generated reference is never escaped a second time.
pub(crate) fn sanitize_display_text(source: &str) -> String {
    let mut escaped = String::with_capacity(source.len());
    for ch in source.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            '[' => escaped.push_str("&lbrack;"),
            ']' => escaped.push_str("&rbrack;"),
            '{' => escaped.push_str("&lbrace;"),
            '}' => escaped.push_str("&rbrace;"),
            '|' => escaped.push_str("&mid;"),
            // Backslashes are dropped rather than escaped.
            '\\' => {},
            '~' => escaped.push_str("&tilde;"),
            ')' => escaped.push_str("&#41;"),
            '(' => escaped.push_str("&#40;"),
            other => escaped.push(other),
        }
    }
    escaped
}
510
/// Percent-encodes the characters of `source` that are unsafe inside a URL
/// written into an HTML attribute.
///
/// Encoded characters: `%`, `#`, `[`, `]`, `!`, `$`, `'`, `(`, `)`, `*`,
/// space and `\`, plus `"`, `<` and `>` so that a URL can never terminate
/// the surrounding `href="…"`/`src="…"` attribute or open a new tag.
/// Everything else is copied unchanged.
///
/// Each input character is translated exactly once, so the `%` introduced by
/// an escape is never encoded again.
pub(crate) fn percent_encode(source: &str) -> String {
    let mut encoded = String::with_capacity(source.len());
    for ch in source.chars() {
        match ch {
            '%' => encoded.push_str("%25"),
            '#' => encoded.push_str("%23"),
            '[' => encoded.push_str("%5B"),
            ']' => encoded.push_str("%5D"),
            '!' => encoded.push_str("%21"),
            '$' => encoded.push_str("%24"),
            '\'' => encoded.push_str("%27"),
            '(' => encoded.push_str("%28"),
            ')' => encoded.push_str("%29"),
            '*' => encoded.push_str("%2A"),
            ' ' => encoded.push_str("%20"),
            '\\' => encoded.push_str("%5C"),
            // Attribute and tag delimiters: never emit them raw.
            '"' => encoded.push_str("%22"),
            '<' => encoded.push_str("%3C"),
            '>' => encoded.push_str("%3E"),
            other => encoded.push(other),
        }
    }
    encoded
}
525
526pub(crate) fn validate_link(source: &str) -> Result<ValidURL, SanitizationError> {
527 if !source.is_ascii() || source.contains(char::is_whitespace) { return Err(SanitizationError{content: source})
529 }
530 let (scheme, path) = source.split_at(source.find(':').unwrap_or(0));
531 if scheme.to_lowercase() == "javascript" || !scheme.is_ascii() {
532 return Err(SanitizationError{content: source})
533 }
534 if scheme.to_lowercase() == "data" && !path.starts_with(":image/"){
535 return Err(SanitizationError{content: source})
536 }
537 if scheme.len() != 0 && ( scheme.len() < 2 || scheme.len() > 32 ) {
538 return Err(SanitizationError{content: source})
539 }
540
541 let source_scheme = {
544 let parts: Vec<_> = source.split(":").collect();
545 if source.contains(':')
546 && parts[0].chars().all(|c| COMMONMARK_SCHEME_ASCII.contains(&c))
547 && parts[0].len() >= 2
548 && parts[0].len() <= 32 {
549 match parts[0] {
550 "http" => Some(Scheme::Http(parts[0])),
551 "mailto" => Some(Scheme::Email(parts[0])),
552 "irc" => Some(Scheme::Irc(parts[0])),
553 _ => Some(Scheme::Other(parts[0]))
554 }
555 } else {None}
556 };
557
558 if source.contains('@') && source.matches('@').count() == 1 && !source.contains('\\') {
560 if source_scheme.is_some() {
561 return Ok(ValidURL{scheme: Some(source_scheme.unwrap_or(Scheme::Email("mailto"))), content: &source.split(":").last().unwrap()})
562 }
563 return Ok(ValidURL{scheme: Some(source_scheme.unwrap_or(Scheme::Email("mailto"))), content: &source})
564 }
565 if source.contains('@') && source.matches('@').count() == 1 && source.contains('\\') {
566 return Err(SanitizationError{content: source})
567 }
568
569 match source_scheme {
570 Some(Scheme::Http(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Http(s))})},
571 Some(Scheme::Email(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Email(s))})},
572 Some(Scheme::Irc(s)) => {Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Irc(s))})},
573 Some(Scheme::Other(s)) => Ok(ValidURL{content: source.strip_prefix(s).unwrap_or("").strip_prefix(":").unwrap_or(""), scheme: Some(Scheme::Other(s))}),
574 None => Ok(ValidURL{content: source, scheme: None}),
575 }
576
577}