1use std::collections::VecDeque;
2
3use serde::{Deserialize, Serialize};
4
5use super::{Parser, PlainEnglish};
6use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};
7
/// A [`Parser`] that strips Markdown syntax from a document, delegating the
/// tokenization of the remaining prose runs to [`PlainEnglish`].
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    // Behavior switches; see [`MarkdownOptions`].
    options: MarkdownOptions,
}
16
/// Configuration for how [`Markdown`] treats particular Markdown constructs.
///
/// Marked `#[non_exhaustive]`: more options may be added without a breaking
/// change, so construct it via `Default` + struct-update syntax.
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    /// When `true`, the visible text of a link is emitted as a single
    /// unlintable token instead of being parsed as English.
    pub ignore_link_title: bool,
}
22
23#[allow(clippy::derivable_impls)]
25impl Default for MarkdownOptions {
26 fn default() -> Self {
27 Self {
28 ignore_link_title: false,
29 }
30 }
31}
32
impl Markdown {
    /// Construct a Markdown parser with the given options.
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Remove the hidden target portion of Obsidian-style wikilinks of the
    /// form `[[target|label]]`: the `[[`, the target text, and the `|` are
    /// removed along with the closing `]]`, leaving only the label tokens.
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();

        for pipe_idx in tokens.iter_pipe_indices() {
            // An opening `[[` needs at least two tokens before the pipe.
            if pipe_idx < 2 {
                continue;
            }

            // Walk left from the pipe looking for `[[`, giving up at a
            // newline or the start of the token stream.
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Walk right from the pipe looking for `]]`, giving up at a
            // newline or the end of the token stream.
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

            // Only a fully delimited `[[…|…]]` on a single line counts as a
            // wikilink; otherwise every token is left untouched.
            if let Some(open_bracket_idx) = open_bracket
                && let Some(close_bracket_idx) = close_bracket
            {
                // Everything from `[[` through the pipe, plus both `]`s.
                to_remove.extend(open_bracket_idx..=pipe_idx);
                to_remove.push_back(close_bracket_idx);
                to_remove.push_back(close_bracket_idx + 1);
            }
        }

        tokens.remove_indices(to_remove);
    }

    /// Strip the `[[` / `]]` delimiter pairs from the remaining (un-piped)
    /// wikilinks so their inner text is linted as ordinary prose.
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        // Index of the most recent unmatched `[[`, if any.
        let mut open_brackets = None;

        let mut cursor = 0;

        loop {
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                // A wikilink cannot span lines; forget the opener at a newline.
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    // Queue both delimiter pairs for removal.
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}
148
impl Parser for Markdown {
    /// Parse Markdown `source` into tokens: prose runs are tokenized by
    /// [`PlainEnglish`], while code, math, and HTML become unlintable tokens.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;

        let source_str: String = source.iter().collect();
        // Smart punctuation is disabled so emitted spans keep a 1:1
        // relationship with the raw source characters.
        let md_parser = pulldown_cmark::Parser::new_ext(
            &source_str,
            pulldown_cmark::Options::all()
                .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
        );

        let mut tokens = Vec::new();

        // pulldown_cmark reports byte offsets, but `Span` is char-indexed;
        // build a byte-index → char-index lookup table.
        let mut byte_to_char = vec![0; source_str.len() + 1];
        let mut char_index = 0;
        let mut byte_idx = 0;
        for ch in source_str.chars() {
            let char_len = ch.len_utf8();
            // Every byte of a multi-byte char maps to the same char index.
            for _ in 0..char_len {
                byte_to_char[byte_idx] = char_index;
                byte_idx += 1;
            }
            char_index += 1;
        }
        // Support exclusive end offsets equal to the source length.
        byte_to_char[source_str.len()] = char_index;

        // Currently-open Markdown tags, innermost last.
        let mut stack = Vec::new();

        for (event, range) in md_parser.into_offset_iter() {
            let span_start = byte_to_char[range.start];
            let span_end = byte_to_char[range.end];

            match event {
                // Single newline inside a paragraph.
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                // Forced line break.
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
                // Lists are separated from preceding content by a
                // zero-length double newline.
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
                }
                pulldown_cmark::Event::Start(tag) => {
                    // Headings get a zero-length marker token so lints can
                    // locate heading boundaries.
                    if matches!(tag, pulldown_cmark::Tag::Heading { .. }) {
                        tokens.push(Token {
                            span: Span::new_with_len(span_start, 0),
                            kind: TokenKind::HeadingStart,
                        });
                    }

                    stack.push(tag)
                }
                // Block-level elements end with a zero-length paragraph break
                // anchored at the end of the previous token.
                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    tokens.push(Token {
                        span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
                        kind: TokenKind::ParagraphBreak,
                    });
                    stack.pop();
                }
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
                // Math and inline code are opaque to the linter.
                pulldown_cmark::Event::InlineMath(_)
                | pulldown_cmark::Event::DisplayMath(_)
                | pulldown_cmark::Event::Code(_) => {
                    let chunk_len = span_end - span_start;

                    tokens.push(Token {
                        span: Span::new_with_len(span_start, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
                pulldown_cmark::Event::Text(_text) => {
                    let chunk_len = span_end - span_start;

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        // Text inside fenced/indented code blocks is never linted.
                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),

                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Optionally treat link titles as unlintable.
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Only text inside these prose-bearing tags is linted;
                        // text in any other context is skipped entirely.
                        if !(matches!(tag, Tag::Paragraph)
                            || (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
                            || matches!(tag, Tag::Emphasis)
                            || matches!(tag, Tag::Strong)
                            || matches!(tag, Tag::Strikethrough))
                        {
                            continue;
                        }
                    }

                    // Tokenize the chunk as plain English, then shift its
                    // spans back into whole-document coordinates.
                    let mut new_tokens = english_parser.parse(&source[span_start..span_end]);

                    new_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(span_start));

                    tokens.append(&mut new_tokens);
                }
                // HTML is passed through as unlintable.
                pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
                    let size = span_end - span_start;
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, size),
                        kind: TokenKind::Unlintable,
                    });
                }
                _ => (),
            }
        }

        // Drop a trailing break token that doesn't correspond to an actual
        // trailing newline in the source.
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        // Post-process wikilinks once the full token stream exists.
        Self::remove_hidden_wikilink_tokens(&mut tokens);
        Self::remove_wikilink_brackets(&mut tokens);

        tokens
    }
}
317
318#[cfg(test)]
319mod tests {
320 use super::super::StrParser;
321 use super::Markdown;
322 use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};
323
    // Multi-byte characters (emoji) must not break the byte→char span mapping.
    #[test]
    fn survives_emojis() {
        let source = r"🤷.";

        Markdown::default().parse_str(source);
    }
330
    // A source with no trailing newline must not end in a newline token
    // (the parser pops the synthetic trailing break).
    #[test]
    fn ends_with_newline() {
        let source = "This is a test.";

        let tokens = Markdown::default().parse_str(source);
        assert_ne!(tokens.len(), 0);
        assert!(!tokens.last().unwrap().kind.is_newline());
    }
343
    // Each inline-math chunk becomes a single Unlintable token; surrounding
    // spaces and punctuation are still tokenized normally.
    #[test]
    fn math_becomes_unlintable() {
        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";

        let tokens = Markdown::default().parse_str(source);
        assert_eq!(
            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
            vec![
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Punctuation(Punctuation::Period)
            ]
        )
    }
361
    // Only the label (text after `|`) of a piped wikilink survives parsing.
    #[test]
    fn hidden_wikilink_text() {
        let source = r"[[this is hidden|this is not]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
            ]
        ))
    }
381
    // A lone pipe is not a wikilink and must be left intact.
    #[test]
    fn just_pipe() {
        let source = r"|";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[TokenKind::Punctuation(Punctuation::Pipe)]
        ))
    }
397
    // A wikilink with empty target and label (`[[|]]`) collapses to nothing.
    #[test]
    fn empty_wikilink_text() {
        let source = r"[[|]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[]))
    }
410
    // Without an opening `[[`, the pipe and stray `]]` are kept as ordinary
    // punctuation and all the text remains visible.
    #[test]
    fn improper_wikilink_text() {
        let source = r"this is shown|this is also shown]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Pipe),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::CloseSquare),
                TokenKind::Punctuation(Punctuation::CloseSquare),
            ]
        ))
    }
442
    // An un-piped wikilink keeps its inner word; only the brackets vanish.
    #[test]
    fn normal_wikilink() {
        let source = r"[[Wikilink]]";
        let tokens = Markdown::default().parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
    }
453
    // Inline HTML (here `<ctrl-g>`) becomes exactly one Unlintable token.
    #[test]
    fn html_is_unlintable() {
        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
        let tokens = Markdown::default().parse_str(source);
        assert_eq!(tokens.iter_unlintables().count(), 1);
    }
460
    // With `ignore_link_title: true`, a link's visible text collapses into a
    // single Unlintable token.
    #[test]
    fn link_title_unlintable() {
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
    }
475
    // Regression test: an autolink (`<http://…>`) must become a single
    // Unlintable token rather than being torn into prose tokens.
    #[test]
    fn issue_194() {
        let source = r"<http://localhost:9093>";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
    }
491
    // The same link source parses to Unlintable when `ignore_link_title` is
    // set, and to normal prose tokens when it is not.
    #[test]
    fn respects_link_title_config() {
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));

        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: false,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Hyphen),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::ForwardSlash),
                TokenKind::Word(_)
            ]
        ));
    }
530
    // Regression test: a fenced code block between two paragraphs must parse
    // as one Unlintable token framed by paragraph breaks.
    #[test]
    fn issue_880() {
        let source = r#"
Paragraph.

```
Code block
```
Paragraph.
    "#;
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        dbg!(&token_kinds);

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
                TokenKind::ParagraphBreak,
                TokenKind::Unlintable,
                TokenKind::ParagraphBreak,
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
            ]
        ))
    }
561
    // Regression test: the final token's span must not collapse to end = 0
    // when the source ends with a newline.
    #[test]
    fn no_end_token_incorrectly_ending_at_zero() {
        let source = "Something\n";
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }
572
    // Regression test: this pathological wikilink-like input previously made
    // the parser loop forever; it only needs to terminate.
    #[test]
    fn hang() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("[[#|]]:A]");
    }
579
    // Regression test: another input that previously hung the parser; the
    // test passes as long as parsing terminates.
    #[test]
    fn hang2() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("//{@j");
    }
587
    // A single `#` heading produces one HeadingStart marker and one heading.
    #[test]
    fn simple_headings_are_marked() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let tokens = parser.parse_str("# This is a simple heading");

        assert_eq!(tokens.iter_heading_starts().count(), 1);
        assert_eq!(tokens.iter_headings().count(), 1);
    }
597
598 #[test]
599 fn multiple_headings_are_marked() {
600 let opts = MarkdownOptions::default();
601 let parser = Markdown::new(opts);
602 let tokens = parser.parse_str(
603 r#"# This is a simple heading
604
605## This is a second simple heading"#,
606 );
607
608 assert_eq!(tokens.iter_heading_starts().count(), 2);
609 assert_eq!(tokens.iter_headings().count(), 2);
610 }
611}