1use std::collections::VecDeque;
2
3use serde::{Deserialize, Serialize};
4
5use super::{Parser, PlainEnglish};
6use crate::{Span, Token, TokenKind, TokenStringExt, VecExt};
7
/// A [`Parser`] for Markdown documents.
///
/// Wraps [`PlainEnglish`]: Markdown structure (as understood by
/// `pulldown_cmark`) is walked, lintable text runs are handed to the English
/// parser, and code/math/HTML become unlintable tokens.
#[derive(Default, Clone, Debug, Copy)]
pub struct Markdown {
    // Behavior switches; see [`MarkdownOptions`].
    options: MarkdownOptions,
}
16
/// Configuration for the [`Markdown`] parser.
///
/// Marked `#[non_exhaustive]` so new options can be added without breaking
/// downstream constructors (use `..MarkdownOptions::default()`).
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub struct MarkdownOptions {
    /// When `true`, the visible title text of links (e.g. `[title](url)`)
    /// is emitted as a single unlintable token instead of being parsed as
    /// English.
    pub ignore_link_title: bool,
}
22
23#[allow(clippy::derivable_impls)]
25impl Default for MarkdownOptions {
26 fn default() -> Self {
27 Self {
28 ignore_link_title: false,
29 }
30 }
31}
32
impl Markdown {
    /// Create a parser using the given [`MarkdownOptions`].
    pub fn new(options: MarkdownOptions) -> Self {
        Self { options }
    }

    /// Erase the hidden (left-hand) half of Obsidian-style wikilinks.
    ///
    /// For `[[this is hidden|this is not]]`, everything from the opening
    /// `[[` through the `|` — plus the closing `]]` — is removed, leaving
    /// only the visible text. Incomplete constructions (no `[[` before or
    /// no `]]` after the pipe, on the same line) are left untouched.
    fn remove_hidden_wikilink_tokens(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();

        for pipe_idx in tokens.iter_pipe_indices() {
            // A wikilink needs at least `[[` before the pipe.
            if pipe_idx < 2 {
                continue;
            }

            // Scan left from just before the pipe for an adjacent `[` `[`
            // pair, stopping at a newline or the start of the stream.
            let mut cursor = pipe_idx - 2;
            let mut open_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_open_square() && b.kind.is_open_square() {
                    open_bracket = Some(cursor);
                    break;
                } else if cursor == 0 {
                    break;
                } else {
                    cursor -= 1;
                }
            }

            // Scan right from just after the pipe for an adjacent `]` `]`
            // pair, again stopping at a newline or the end of the stream.
            cursor = pipe_idx + 1;
            let mut close_bracket = None;

            loop {
                let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                    break;
                };

                if a.kind.is_newline() {
                    break;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    close_bracket = Some(cursor);
                    break;
                } else {
                    cursor += 1;
                }
            }

            // Only a complete `[[ hidden | shown ]]` qualifies: drop the
            // `[[ … |` prefix and both closing brackets.
            if let Some(open_bracket_idx) = open_bracket
                && let Some(close_bracket_idx) = close_bracket
            {
                to_remove.extend(open_bracket_idx..=pipe_idx);
                to_remove.push_back(close_bracket_idx);
                to_remove.push_back(close_bracket_idx + 1);
            }
        }

        tokens.remove_indices(to_remove);
    }

    /// Remove the `[[` / `]]` bracket tokens around wikilink text that
    /// remains visible (e.g. `[[Wikilink]]` becomes just `Wikilink`).
    fn remove_wikilink_brackets(tokens: &mut Vec<Token>) {
        let mut to_remove = VecDeque::new();
        // Index of the most recent unmatched `[[`, if any.
        let mut open_brackets = None;

        let mut cursor = 0;

        loop {
            let Some((a, b)) = tokens.get(cursor).zip(tokens.get(cursor + 1)) else {
                break;
            };

            if let Some(open_brackets_idx) = open_brackets {
                // A wikilink may not span lines: an intervening newline
                // cancels the pending open brackets.
                if a.kind.is_newline() {
                    open_brackets = None;
                    cursor += 1;
                    continue;
                }

                if a.kind.is_close_square() && b.kind.is_close_square() {
                    // Matched: schedule both bracket pairs for removal,
                    // keeping the text between them.
                    to_remove.push_back(open_brackets_idx);
                    to_remove.push_back(open_brackets_idx + 1);

                    to_remove.push_back(cursor);
                    to_remove.push_back(cursor + 1);

                    open_brackets = None;
                }
            } else if a.kind.is_open_square() && b.kind.is_open_square() {
                open_brackets = Some(cursor);
            }

            cursor += 1;
        }

        tokens.remove_indices(to_remove);
    }
}
148
impl Parser for Markdown {
    /// Parse Markdown `source` into tokens.
    ///
    /// Runs of plain text are delegated to [`PlainEnglish`]; code blocks,
    /// inline code, math, and raw HTML become [`TokenKind::Unlintable`];
    /// Markdown block structure is rendered as newline/paragraph-break
    /// tokens. Wikilink cleanup passes run at the end.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;

        let source_str: String = source.iter().collect();
        let md_parser = pulldown_cmark::Parser::new_ext(
            &source_str,
            // NOTE(review): smart punctuation would rewrite characters;
            // presumably it is disabled so event offsets stay aligned with
            // the raw source — TODO confirm.
            pulldown_cmark::Options::all()
                .difference(pulldown_cmark::Options::ENABLE_SMART_PUNCTUATION),
        );

        let mut tokens = Vec::new();

        // pulldown_cmark reports byte offsets, but our `Span`s are indexed
        // by char. Build a lookup from every byte offset (including the
        // one-past-the-end offset) to the index of the char containing it.
        let mut byte_to_char = vec![0; source_str.len() + 1];
        let mut char_index = 0;
        let mut byte_idx = 0;
        for ch in source_str.chars() {
            let char_len = ch.len_utf8();
            for _ in 0..char_len {
                byte_to_char[byte_idx] = char_index;
                byte_idx += 1;
            }
            char_index += 1;
        }
        byte_to_char[source_str.len()] = char_index;

        // Stack of currently open Markdown tags; the innermost tag decides
        // how `Text` events are treated.
        let mut stack = Vec::new();

        for (event, range) in md_parser.into_offset_iter() {
            // Translate the event's byte range into char indices.
            let span_start = byte_to_char[range.start];
            let span_end = byte_to_char[range.end];

            match event {
                pulldown_cmark::Event::SoftBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(1),
                    });
                }
                pulldown_cmark::Event::HardBreak => {
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 1),
                        kind: TokenKind::Newline(2),
                    });
                }
                pulldown_cmark::Event::Start(pulldown_cmark::Tag::List(v)) => {
                    // Separate a list from whatever precedes it with a
                    // zero-width double newline.
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, 0),
                        kind: TokenKind::Newline(2),
                    });
                    stack.push(pulldown_cmark::Tag::List(v));
                }
                pulldown_cmark::Event::Start(tag) => stack.push(tag),
                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Item)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::CodeBlock)
                | pulldown_cmark::Event::End(pulldown_cmark::TagEnd::TableCell) => {
                    // Close block-level elements with a zero-width paragraph
                    // break anchored at the end of the last emitted token
                    // (or 0 if nothing has been emitted yet).
                    tokens.push(Token {
                        span: Span::new_with_len(tokens.last().map_or(0, |last| last.span.end), 0),
                        kind: TokenKind::ParagraphBreak,
                    });
                    stack.pop();
                }
                pulldown_cmark::Event::End(_) => {
                    stack.pop();
                }
                // Math and inline code are never lintable English.
                pulldown_cmark::Event::InlineMath(_)
                | pulldown_cmark::Event::DisplayMath(_)
                | pulldown_cmark::Event::Code(_) => {
                    let chunk_len = span_end - span_start;

                    tokens.push(Token {
                        span: Span::new_with_len(span_start, chunk_len),
                        kind: TokenKind::Unlintable,
                    });
                }
                pulldown_cmark::Event::Text(_text) => {
                    let chunk_len = span_end - span_start;

                    if let Some(tag) = stack.last() {
                        use pulldown_cmark::Tag;

                        // Text inside a code block is unlintable.
                        if matches!(tag, Tag::CodeBlock(..)) {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),

                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Link titles are unlintable when configured so.
                        if matches!(tag, Tag::Link { .. }) && self.options.ignore_link_title {
                            tokens.push(Token {
                                span: Span::new_with_len(span_start, chunk_len),
                                kind: TokenKind::Unlintable,
                            });
                            continue;
                        }
                        // Only text inside these containers is parsed as
                        // English; anything else is skipped entirely.
                        if !(matches!(tag, Tag::Paragraph)
                            || (matches!(tag, Tag::Link { .. }) && !self.options.ignore_link_title)
                            || matches!(tag, Tag::Heading { .. })
                            || matches!(tag, Tag::Item)
                            || matches!(tag, Tag::TableCell)
                            || matches!(tag, Tag::Emphasis)
                            || matches!(tag, Tag::Strong)
                            || matches!(tag, Tag::Strikethrough))
                        {
                            continue;
                        }
                    }

                    // Parse the chunk as English, then shift the resulting
                    // spans from chunk-relative to document coordinates.
                    let mut new_tokens = english_parser.parse(&source[span_start..span_end]);

                    new_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(span_start));

                    tokens.append(&mut new_tokens);
                }
                // Raw HTML is opaque to the linter.
                pulldown_cmark::Event::Html(_) | pulldown_cmark::Event::InlineHtml(_) => {
                    let size = span_end - span_start;
                    tokens.push(Token {
                        span: Span::new_with_len(span_start, size),
                        kind: TokenKind::Unlintable,
                    });
                }
                _ => (),
            }
        }

        // Block-element handling can leave a trailing break token; drop it
        // unless the source genuinely ends with a newline.
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        Self::remove_hidden_wikilink_tokens(&mut tokens);
        Self::remove_wikilink_brackets(&mut tokens);

        tokens
    }
}
308
#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::Markdown;
    use crate::{Punctuation, TokenKind, TokenStringExt, parsers::markdown::MarkdownOptions};

    /// Multi-byte characters must not break the byte→char span mapping.
    #[test]
    fn survives_emojis() {
        let source = r"🤷.";

        Markdown::default().parse_str(source);
    }

    /// A trailing break token is kept only when the source itself ends
    /// with a newline.
    #[test]
    fn ends_with_newline() {
        let source = "This is a test.";

        let tokens = Markdown::default().parse_str(source);
        assert_ne!(tokens.len(), 0);
        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    #[test]
    fn math_becomes_unlintable() {
        let source = r"$\Katex$ $\text{is}$ $\text{great}$.";

        let tokens = Markdown::default().parse_str(source);
        assert_eq!(
            tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>(),
            vec![
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Space(1),
                TokenKind::Unlintable,
                TokenKind::Punctuation(Punctuation::Period)
            ]
        )
    }

    /// The hidden half of `[[hidden|shown]]` is stripped along with the
    /// brackets, leaving only the visible words.
    #[test]
    fn hidden_wikilink_text() {
        let source = r"[[this is hidden|this is not]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
            ]
        ))
    }

    /// A lone pipe must not be treated as a wikilink separator.
    #[test]
    fn just_pipe() {
        let source = r"|";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[TokenKind::Punctuation(Punctuation::Pipe)]
        ))
    }

    /// `[[|]]` has no visible text, so nothing should remain.
    #[test]
    fn empty_wikilink_text() {
        let source = r"[[|]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[]))
    }

    /// Without an opening `[[`, the pipe and closing brackets stay visible.
    #[test]
    fn improper_wikilink_text() {
        let source = r"this is shown|this is also shown]]";

        let tokens = Markdown::default().parse_str(source);

        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Pipe),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Space(1),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::CloseSquare),
                TokenKind::Punctuation(Punctuation::CloseSquare),
            ]
        ))
    }

    /// A pipe-less wikilink keeps its text but loses its brackets.
    #[test]
    fn normal_wikilink() {
        let source = r"[[Wikilink]]";
        let tokens = Markdown::default().parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Word(_)]))
    }

    #[test]
    fn html_is_unlintable() {
        let source = r"The range of inputs from <ctrl-g> to ctrl-z";
        let tokens = Markdown::default().parse_str(source);
        assert_eq!(tokens.iter_unlintables().count(), 1);
    }

    #[test]
    fn link_title_unlintable() {
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]))
    }

    /// Autolinks (`<http://…>`) should collapse to a single unlintable token.
    #[test]
    fn issue_194() {
        let source = r"<http://localhost:9093>";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));
    }

    /// The same link source flips between unlintable and parsed-as-English
    /// depending on `ignore_link_title`.
    #[test]
    fn respects_link_title_config() {
        let source = r"[elijah-potter/harper](https://github.com/elijah-potter/harper)";
        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: true,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(token_kinds.as_slice(), &[TokenKind::Unlintable]));

        let parser = Markdown::new(MarkdownOptions {
            ignore_link_title: false,
            ..MarkdownOptions::default()
        });
        let token_kinds = parser
            .parse_str(source)
            .iter()
            .map(|t| t.kind.clone())
            .collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::Hyphen),
                TokenKind::Word(_),
                TokenKind::Punctuation(Punctuation::ForwardSlash),
                TokenKind::Word(_)
            ]
        ));
    }

    /// Code blocks must be bracketed by paragraph breaks, not merged into
    /// the surrounding paragraphs.
    #[test]
    fn issue_880() {
        let source = r#"
Paragraph.

```
Code block
```
Paragraph.
    "#;
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        let token_kinds = tokens.iter().map(|t| t.kind.clone()).collect::<Vec<_>>();

        assert!(matches!(
            token_kinds.as_slice(),
            &[
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
                TokenKind::ParagraphBreak,
                TokenKind::Unlintable,
                TokenKind::ParagraphBreak,
                TokenKind::Word(_),
                TokenKind::Punctuation(_),
            ]
        ))
    }

    /// The zero-width break token emitted at block ends must anchor at the
    /// previous token's end, not at offset zero.
    #[test]
    fn no_end_token_incorrectly_ending_at_zero() {
        let source = "Something\n";
        let parser = Markdown::new(MarkdownOptions::default());
        let tokens = parser.parse_str(source);
        assert_ne!(tokens.last().unwrap().span.end, 0);
    }

    /// Regression: this input previously caused the parser to loop forever.
    #[test]
    fn hang() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("[[#|]]:A]");
    }

    /// Regression: another input that previously hung the parser.
    #[test]
    fn hang2() {
        let opts = MarkdownOptions::default();
        let parser = Markdown::new(opts);
        let _res = parser.parse_str("//{@j");
    }
}