1use std::collections::VecDeque;
2
3use serde::{Deserialize, Serialize};
4
5use super::{Parser, PlainEnglish};
6use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};
7
/// A [`Parser`] for Markdown documents.
///
/// Delegates prose spans to [`PlainEnglish`] while mapping Markdown
/// structure (code, math, HTML, links, breaks) onto the appropriate
/// token kinds.
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    // Behavior toggles; see [`MarkdownOptions`].
    options: MarkdownOptions,
}
16
/// Configuration for the [`Markdown`] parser.
///
/// Marked `#[non_exhaustive]`: construct with
/// `MarkdownOptions { field: …, ..MarkdownOptions::default() }` so new
/// options can be added without breaking callers.
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    // When true, the display text of links is emitted as `Unlintable`
    // instead of being parsed as English.
    pub ignore_link_title: bool,
}
22
// NOTE(review): this impl is currently derivable (hence the allow), but is
// presumably written out so defaults stay explicit as fields are added to
// this `#[non_exhaustive]` struct — confirm before replacing with a derive.
#[allow(clippy::derivable_impls)]
impl Default for MarkdownOptions {
    fn default() -> Self {
        Self {
            // Link titles are linted by default.
            ignore_link_title: false,
        }
    }
}
32
impl Markdown {
    /// Build a Markdown parser with the given options.
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Remove the hidden (target) portion of piped wikilinks, i.e. the
    /// `[[target|` prefix and the trailing `]]` of `[[target|display]]`,
    /// leaving only the display-text tokens.
    ///
    /// For each pipe token, scans backward for an opening `[[` and forward
    /// for a closing `]]`, stopping at newlines in both directions; tokens
    /// are only removed when both brackets are found on the pipe's line.
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        // Indices of tokens to delete, collected before any mutation.
        let mut to_remove = VecDeque::new();

        for pipe_idx in tokens.iter_pipe_indices() {
            // A pipe needs at least `[[` before it to be part of a wikilink.
            if pipe_idx < 2 {
                continue;
            }

            // Scan backward from just before the pipe for `[[`.
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                // Look at adjacent pairs so double brackets can be matched.
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                // Wikilinks do not span lines.
                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Scan forward from just after the pipe for `]]`.
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

            // Only a fully-formed `[[…|…]]` triggers removal.
            if let Some(open_bracket_idx) = open_bracket
                && let Some(close_bracket_idx) = close_bracket
            {
                // Drop everything from `[[` through the pipe, plus `]]`.
                to_remove.extend(open_bracket_idx..=pipe_idx);
                to_remove.push_back(close_bracket_idx);
                to_remove.push_back(close_bracket_idx + 1);
            }
        }

        // NOTE(review): with multiple pipes inside one bracket pair the
        // collected indices can overlap/duplicate — assumes `remove_indices`
        // tolerates that; confirm against `VecExt`.
        tokens.remove_indices(to_remove);
    }

    /// Remove the surrounding `[[` / `]]` of plain wikilinks such as
    /// `[[Page]]`, leaving the inner text tokens intact.
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        // Index of the currently unmatched `[[`, if any.
        let mut open_brackets = None;

        let mut cursor = 0;

        loop {
            // Pairwise walk so `[[` and `]]` can be recognized.
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                // An unmatched `[[` does not carry across lines.
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    // Remove both opening and both closing bracket tokens.
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}
148
149impl Parser for Markdown {
150 fn parse(&self, source: &[char]) -> Vec<Token> {
153 let english_parser = PlainEnglish;
154
155 let source_str: String = source.iter().collect();
156 let md_parser = pulldown_cmark::Parser::new_ext(
157 &source_str,
158 pulldown_cmark::Options::all()
159 .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
160 );
161
162 let mut tokens = Vec::new();
163
164 let mut traversed_bytes = 0;
165 let mut traversed_chars = 0;
166
167 let mut stack = Vec::new();
168
169 for (event, range) in md_parser.into_offset_iter() {
172 if range.start > traversed_bytes {
173 traversed_chars += source_str[traversed_bytes..range.start].chars().count();
174 traversed_bytes = range.start;
175 }
176
177 match event {
178 pulldown_cmark::Event::SoftBreak => {
179 tokens.push(Token {
180 span: Span::new_with_len(traversed_chars, 1),
181 kind: TokenKind::Newline(1),
182 });
183 }
184 pulldown_cmark::Event::HardBreak => {
185 tokens.push(Token {
186 span: Span::new_with_len(traversed_chars, 1),
187 kind: TokenKind::Newline(2),
188 });
189 }
190 pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
191 tokens.push(Token {
192 span: Span::new_with_len(traversed_chars, 0),
193 kind: TokenKind::Newline(2),
194 });
195 stack.push(pulldown_cmark::Tag::List(v));
196 }
197 pulldown_cmark::Event::Start(tag) => stack.push(tag),
198 pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
199 | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
200 | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
201 | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
202 | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
203 tokens.push(Token {
204 span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
210 kind: TokenKind::ParagraphBreak,
211 });
212 stack.pop();
213 }
214 pulldown_cmark::Event::End(_) => {
215 stack.pop();
216 }
217 pulldown_cmark::Event::InlineMath(code)
218 | pulldown_cmark::Event::DisplayMath(code)
219 | pulldown_cmark::Event::Code(code) => {
220 let chunk_len = code.chars().count();
221
222 tokens.push(Token {
223 span: Span::new_with_len(traversed_chars, chunk_len),
224 kind: TokenKind::Unlintable,
225 });
226 }
227 pulldown_cmark::Event::Text(text) => {
228 let chunk_len = text.chars().count();
229
230 if let Some(tag) = stack.last() {
231 use pulldown_cmark::Tag;
232
233 if matches!(tag, Tag::CodeBlock(..)) {
234 tokens.push(Token {
235 span: Span::new_with_len(traversed_chars, text.chars().count()),
236 kind: TokenKind::Unlintable,
237 });
238 continue;
239 }
240 if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
241 tokens.push(Token {
242 span: Span::new_with_len(traversed_chars, text.chars().count()),
243 kind: TokenKind::Unlintable,
244 });
245 continue;
246 }
247 if !(matches!(tag, Tag::Paragraph)
248 || matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title
249 || matches!(tag, Tag::Heading { .. })
250 || matches!(tag, Tag::Item)
251 || matches!(tag, Tag::TableCell)
252 || matches!(tag, Tag::Emphasis)
253 || matches!(tag, Tag::Strong)
254 || matches!(tag, Tag::Strikethrough))
255 {
256 continue;
257 }
258 }
259
260 let mut new_tokens =
261 english_parser.parse(&source[traversed_chars..traversed_chars + chunk_len]);
262
263 new_tokens
264 .iter_mut()
265 .for_each(|token| token.span.push_by(traversed_chars));
266
267 tokens.append(&mut new_tokens);
268 }
269 pulldown_cmark::Event::Html(_content)
271 | pulldown_cmark::Event::InlineHtml(_content) => {
272 let size = _content.chars().count();
273 tokens.push(Token {
274 span: Span::new_with_len(traversed_chars, size),
275 kind: TokenKind::Unlintable,
276 });
277 }
278 _ => (),
279 }
280 }
281
282 if matches!(
283 tokens.last(),
284 Some(Token {
285 kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
286 ..
287 })
288 ) && source.last() != Some(&'\n')
289 {
290 tokens.pop();
291 }
292
293 Self::remove_hidden_wikilink_tokens(&mut tokens);
294 Self::remove_wikilink_brackets(&mut tokens);
295
296 tokens
297 }
298}
299
#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::Markdown;
    use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};

    // Multi-byte characters must not panic or mis-slice the source.
    #[test]
    fn survives_emojis() {
        let source = r"🤷.";

        Markdown::default().parse_str(source);
    }

    // A source without a trailing newline must not end in a newline token.
    #[test]
    fn ends_with_newline() {
        let source = "This is a test.";

        let tokens = Markdown::default().parse_str(source);
        assert_ne!(tokens.len(), 0);
        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    // Inline math segments each become a single `Unlintable` token.
    #[test]
    fn math_becomes_unlintable() {
        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";

        let tokens = Markdown::default().parse_str(source);
        assert_eq!(
            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
            vec![
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Punctuation(Punctuation::Period)
            ]
        )
    }

    // The target half of a piped wikilink is stripped; only the display
    // text remains.
    #[test]
    fn hidden_wikilink_text() {
        let source = r"[[this is hidden|this is not]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
            ]
        ))
    }

    // A lone pipe is not part of a wikilink and must survive parsing.
    #[test]
    fn just_pipe() {
        let source = r"|";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[TokenKind::Punctuation(Punctuation::Pipe)]
        ))
    }

    // A wikilink with no display text disappears entirely.
    #[test]
    fn empty_wikilink_text() {
        let source = r"[[|]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[]))
    }

    // Without an opening `[[`, nothing is hidden: pipe and brackets are
    // kept as ordinary punctuation.
    #[test]
    fn improper_wikilink_text() {
        let source = r"this is shown|this is also shown]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Pipe),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::CloseSquare),
                TokenKind::Punctuation(Punctuation::CloseSquare),
            ]
        ))
    }

    // Brackets of an unpiped wikilink are removed, keeping the inner word.
    #[test]
    fn normal_wikilink() {
        let source = r"[[Wikilink]]";
        let tokens = Markdown::default().parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
    }

    // Inline HTML like `<ctrl-g>` yields exactly one unlintable token.
    #[test]
    fn html_is_unlintable() {
        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
        let tokens = Markdown::default().parse_str(source);
        assert_eq!(tokens.iter_unlintables().count(), 1);
    }

    // With `ignore_link_title`, the whole link collapses to one
    // unlintable token.
    #[test]
    fn link_title_unlintable() {
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
    }

    // Regression: autolinks (`<http://…>`) must also be ignored when link
    // titles are ignored.
    #[test]
    fn issue_194() {
        let source = r"<http://localhost:9093>";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
    }

    // The same source parses differently depending on `ignore_link_title`.
    #[test]
    fn respects_link_title_config() {
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));

        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: false,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Hyphen),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::ForwardSlash),
                TokenKind::Word(_)
            ]
        ));
    }

    // Regression: a fenced code block between paragraphs produces a single
    // `Unlintable` bounded by paragraph breaks.
    #[test]
    fn issue_880() {
        let source = r#"
Paragraph.

```
Code block
```
Paragraph.
        "#;
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
                TokenKind::ParagraphBreak,
                TokenKind::Unlintable,
                TokenKind::ParagraphBreak,
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
            ]
        ))
    }

    // Regression: the synthesized end-of-document token must not have a
    // zero-length span at position 0.
    #[test]
    fn no_end_token_incorrectly_ending_at_zero() {
        let source = "Something\n";
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }
}
554}