simple_markdown_parser/
lib.rs

1#![doc = include_str!("../README.md")]
2
3pub mod extras;
4pub mod utilities;
5
6/// Markdown block element
7#[derive(Debug, Copy, Clone, PartialEq, Eq)]
8pub enum MarkdownElement<'a> {
9    Heading {
10        level: u8,
11        text: RawText<'a>,
12    },
13    Quote(RawMarkdown<'a>),
14    Paragraph(RawText<'a>),
15    ListItem {
16        level: u8,
17        text: RawText<'a>,
18    },
19    // TODO
20    Table(Table<'a>),
21    // TODO modifiers
22    CodeBlock {
23        language: &'a str,
24        code: &'a str,
25    },
26    LaTeXBlock {
27        script: &'a str,
28    },
29    CommandBlock(CommandBlock<'a>),
30    /// Inside `%%` (from Obsidan)
31    CommentBlock(&'a str),
32    /// Includes HTML comments
33    // TODO how much to do here
34    HTMLElement(&'a str),
35    // TODO at start?
36    Frontmatter(&'a str),
37    HorizontalRule,
38    // Media {
39    //     alt: &'a str,
40    //     link: Option<&'a str>,
41    //     source: &'a str,
42    // },
43    Footnote,
44    Empty,
45}
46
47impl MarkdownElement<'_> {
48    #[must_use]
49    pub fn as_markdown(&self) -> String {
50        match self {
51            Self::Heading { level, text } => {
52                let mut s = "#".repeat(*level as usize);
53                s.push_str(text.0);
54                s.push(' ');
55                s
56            }
57            Self::ListItem { level, text } => {
58                let mut s = "\t".repeat(*level as usize);
59                s.push_str("- ");
60                s.push_str(text.0);
61                s
62            }
63            Self::CodeBlock { language, code } => {
64                format!("```{language}\n{code}```")
65                // let mut s = "```".to_owned();
66                // s.push_str(language);
67                // s.push_str("\n");
68                // s.push_str("```");
69                // s
70            }
71            Self::Paragraph(text) => text.0.to_owned(),
72            Self::Quote(text) => {
73                format!("> {text}", text = text.0)
74            }
75            Self::Empty => String::new(),
76            item => format!("TODO {item:?}"),
77        }
78    }
79
80    /// Paragraph text like elements
81    #[must_use]
82    pub fn inner_paragraph_raw(&self) -> Option<&str> {
83        if let MarkdownElement::Paragraph(text) = self {
84            Some(text.0)
85        } else if let MarkdownElement::Quote(text) = self {
86            // TODO these can be sometimes made up of elements
87            Some(text.0)
88        } else {
89            None
90        }
91    }
92
93    #[must_use]
94    pub fn parts_like(&self) -> Option<RawText> {
95        if let MarkdownElement::Heading { text, .. }
96        | MarkdownElement::Paragraph(text)
97        | MarkdownElement::ListItem { level: _, text } = self
98        {
99            Some(*text)
100        } else if let MarkdownElement::Quote(text) = self {
101            // TODO these can be sometimes made up of elements
102            Some(RawText(text.0))
103        } else {
104            None
105        }
106    }
107
108    #[allow(clippy::match_same_arms)]
109    #[must_use]
110    pub fn debug_without_text(&self) -> String {
111        match self {
112            MarkdownElement::Heading { level, text: _ } => {
113                format!("Heading {{ level: {level} }}")
114            }
115            MarkdownElement::Quote(_) => "Quote".to_owned(),
116            MarkdownElement::Paragraph(_) => "Paragraph".to_owned(),
117            MarkdownElement::ListItem { level, text: _ } => {
118                format!("ListItem {{ level: {level} }}")
119            }
120            MarkdownElement::Table(_table) => "Table".to_owned(),
121            MarkdownElement::CodeBlock { language, code: _ } => format!("CodeBlock ({language})"),
122            MarkdownElement::LaTeXBlock { script: _ } => "LaTeXBlock {{ .. }}".to_owned(),
123            MarkdownElement::CommandBlock(_) => "CommandBlock".to_owned(),
124            MarkdownElement::CommentBlock(_) => "CommentBlock".to_owned(),
125            MarkdownElement::HTMLElement(_) => "HTMLElement".to_owned(),
126            MarkdownElement::Frontmatter(_) => "Frontmatter".to_owned(),
127            MarkdownElement::HorizontalRule => "HorizontalRule".to_owned(),
128            MarkdownElement::Footnote => "Footnote".to_owned(),
129            MarkdownElement::Empty => "Empty".to_owned(),
130        }
131    }
132}
133
134/// (unsplit) Text inside markdown item
135#[derive(Debug, Copy, Clone, PartialEq, Eq)]
136pub struct RawText<'a>(pub &'a str);
137
138impl<'a> RawText<'a> {
139    #[must_use]
140    pub fn parts(&self) -> PartsIterator<'a> {
141        PartsIterator::new(self.0)
142    }
143
144    #[must_use]
145    pub fn no_decoration(&self) -> String {
146        let mut s = String::new();
147        for part in PartsIterator::new(self.0) {
148            s.push_str(part.no_decoration());
149        }
150        s
151    }
152}
153
154/// Some are prefixes, some are wrapped
155#[derive(Debug, Copy, Clone, PartialEq, Eq)]
156pub enum MarkdownTextElement<'a> {
157    Plain(&'a str),
158    /// `*hi*` or `_hi_`
159    Italic(&'a str),
160    /// `**hi**` or `__hi__`
161    Bold(&'a str),
162    /// **_hi_**
163    BoldAndItalic(&'a str),
164    /// `` `code` ``
165    Code(&'a str),
166    /// `~~gone~~`
167    StrikeThrough(&'a str),
168    /// `:emoji:`
169    Emoji(&'a str),
170    /// `$\sin$`
171    Latex(&'a str),
172    /// `{something}` TODO WIP
173    Expression(&'a str),
174    /// `==hightlighted==`
175    Highlight(&'a str),
176    /// `^superscript^`
177    Superscript(&'a str),
178    /// `~subscript~` (unfortuantly not _)
179    Subscript(&'a str),
180    /// `#item`
181    Tag(&'a str),
182    /// `[on](to)`
183    Link {
184        /// TODO not great but..
185        on: RawText<'a>,
186        to: &'a str,
187    },
188    /// `![alt](source)`
189    Media {
190        alt: &'a str,
191        source: &'a str,
192    },
193}
194
195impl<'a> MarkdownTextElement<'a> {
196    #[must_use]
197    pub fn no_decoration(&self) -> &'a str {
198        match self {
199            MarkdownTextElement::Plain(i)
200            | MarkdownTextElement::Bold(i)
201            | MarkdownTextElement::Italic(i)
202            | MarkdownTextElement::BoldAndItalic(i)
203            | MarkdownTextElement::Code(i)
204            | MarkdownTextElement::StrikeThrough(i)
205            | MarkdownTextElement::Emoji(i)
206            | MarkdownTextElement::Latex(i)
207            | MarkdownTextElement::Highlight(i)
208            | MarkdownTextElement::Subscript(i)
209            | MarkdownTextElement::Superscript(i)
210            | MarkdownTextElement::Tag(i) => i,
211            MarkdownTextElement::Expression(_) | MarkdownTextElement::Media { .. } => "",
212            MarkdownTextElement::Link { on: _, to: _ } => {
213                eprintln!("TODO no decoration link");
214                ""
215            }
216        }
217    }
218}
219
220// TODO want to do in main loop
221#[allow(clippy::needless_lifetimes)]
222fn decide<'a>(item: &'a str) -> MarkdownElement<'a> {
223    let item = item.trim();
224    if item.starts_with('#') {
225        let level = item.chars().take_while(|c| *c == '#').count();
226        MarkdownElement::Heading {
227            level: level.try_into().expect("deep header"),
228            text: RawText(item[level..].trim()),
229        }
230    } else if let Some(item) = item.strip_prefix('>') {
231        MarkdownElement::Quote(RawMarkdown(item))
232    } else if let "---" = item {
233        MarkdownElement::HorizontalRule
234    } else if let Some(item) = item.trim_start().strip_prefix('-') {
235        // TODO one or the other
236        let level = item.chars().take_while(|c| *c == '\t' || *c == ' ').count();
237        MarkdownElement::ListItem {
238            level: level.try_into().expect("deep list item"),
239            text: RawText(item.trim()),
240        }
241    } else if item.is_empty() {
242        MarkdownElement::Empty
243    } else {
244        MarkdownElement::Paragraph(RawText(item))
245    }
246}
247
/// Options for [`parse_with_options`]
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)]
pub struct ParseOptions {
    /// When `true`, blank lines are reported as [`MarkdownElement::Empty`] instead of being
    /// skipped. Defaults to `false`.
    pub include_new_lines: bool,
}
252
253/// # Errors
254/// errors for unclosed blocks
255pub fn parse<'a>(on: &'a str, cb: impl FnMut(MarkdownElement<'a>)) -> Result<(), ()> {
256    parse_with_options(on, &ParseOptions::default(), cb)
257}
258
/// Trims `on`, removes `left` from its start and `right` from its end and returns the
/// trimmed text in between. Returns `None` when either delimiter is missing.
pub fn strip_surrounds<'a>(on: &'a str, left: &str, right: &str) -> Option<&'a str> {
    let without_left = on.trim().strip_prefix(left)?;
    let between = without_left.strip_suffix(right)?;
    Some(between.trim())
}
265
266/// Parse source using callback
267/// # Errors
268/// errors for unclosed blocks
269#[allow(clippy::result_unit_err, clippy::too_many_lines)]
270pub fn parse_with_options<'a>(
271    on: &'a str,
272    options: &ParseOptions,
273    mut cb: impl FnMut(MarkdownElement<'a>),
274) -> Result<(), ()> {
275    let mut since_new_line = 0;
276    let mut start = 0;
277
278    // Some => in_code
279    let mut current_code_language = None;
280
281    let mut current_command_and_arguments: Option<(&str, &str)> = None;
282
283    let mut in_frontmatter = false;
284    let mut in_table = false;
285    let mut in_latex_block = false;
286    let mut in_markdown_comment = false;
287
288    for (idx, chr) in on.char_indices() {
289        if let '\n' = chr {
290            let line = &on[since_new_line..idx];
291
292            if current_code_language.is_some() {
293                if let "```" = line.trim() {
294                    cb(MarkdownElement::CodeBlock {
295                        language: current_code_language.take().unwrap(),
296                        code: &on[start..since_new_line],
297                    });
298                    start = idx + 1;
299                }
300                since_new_line = idx + 1;
301                continue;
302            }
303
304            if let Some((current_command, arguments)) = current_command_and_arguments {
305                if let Some(command_line) = strip_surrounds(line, "{%", "%}") {
306                    if command_line
307                        .trim()
308                        .strip_prefix('/')
309                        .is_some_and(|command| current_command == command)
310                    {
311                        cb(MarkdownElement::CommandBlock(CommandBlock {
312                            name: current_command,
313                            arguments,
314                            inner: RawMarkdown(&on[start..since_new_line]),
315                        }));
316                        current_command_and_arguments = None;
317                        start = idx + 1;
318                    }
319                }
320                since_new_line = idx + 1;
321                continue;
322            }
323
324            if in_latex_block {
325                if let "$$" = line.trim() {
326                    cb(MarkdownElement::LaTeXBlock {
327                        script: on[start..since_new_line].trim(),
328                    });
329                    in_latex_block = false;
330                    start = idx + 1;
331                }
332                since_new_line = idx + 1;
333                continue;
334            }
335
336            if in_markdown_comment {
337                if line.trim().ends_with("%%") {
338                    cb(MarkdownElement::CommentBlock(
339                        on[start..since_new_line].trim(),
340                    ));
341                    in_markdown_comment = false;
342                    start = idx + 1;
343                }
344                since_new_line = idx + 1;
345                continue;
346            }
347
348            if in_table {
349                if !line.ends_with('|') {
350                    cb(MarkdownElement::Table(Table(&on[start..since_new_line])));
351                    in_table = false;
352                    start = idx + 1;
353                }
354                since_new_line = idx + 1;
355                continue;
356            }
357
358            let is_horizontal_rule = "---" == line.trim();
359
360            if in_frontmatter {
361                if is_horizontal_rule {
362                    cb(MarkdownElement::Frontmatter(&on[start..since_new_line]));
363                    in_frontmatter = false;
364                }
365                since_new_line = idx + 1;
366                continue;
367            }
368
369            since_new_line = idx + 1;
370
371            if let Some(rest) = line.trim().strip_prefix("```") {
372                // TODO other motifiers here
373                let language = rest.trim_end();
374                current_code_language = Some(language);
375            } else if let "$$" = line.trim() {
376                in_latex_block = true;
377            } else if let Some(line) = line.trim_start().strip_prefix("%%") {
378                if let Some(out) = line.trim_end().strip_suffix("%%") {
379                    cb(MarkdownElement::CommentBlock(out.trim()));
380                } else {
381                    in_markdown_comment = true;
382                }
383            } else if start == 0 && is_horizontal_rule {
384                in_frontmatter = true;
385            } else if let Some(command_line) = strip_surrounds(line, "{%", "%}") {
386                current_command_and_arguments =
387                    Some(command_line.split_once(' ').unwrap_or((command_line, "")));
388            } else {
389                let result = decide(line);
390                let to_add = !matches!(
391                    (options.include_new_lines, result),
392                    (false, MarkdownElement::Empty)
393                );
394                if to_add {
395                    cb(result);
396                }
397            }
398
399            start = since_new_line;
400        }
401    }
402
403    if current_code_language.is_some() {
404        eprintln!("TODO error {current_code_language:?}");
405        // todo!("error here");
406    } else if in_latex_block {
407        eprintln!("TODO unclosed latex block");
408    }
409
410    if in_table {
411        cb(MarkdownElement::Table(Table(&on[start..since_new_line])));
412    } else {
413        let line = &on[start..];
414        let result = decide(line);
415        let to_add = !matches!(
416            (options.include_new_lines, result),
417            (false, MarkdownElement::Empty)
418        );
419        if to_add {
420            cb(result);
421        }
422    }
423
424    Ok(())
425}
426
/// Work in progress abstraction for iterating over markdown text sections giving decoration (bold, links, etc) information
/// TODO WIP
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct PartsIterator<'a> {
    /// Text being split into parts
    on: &'a str,
    /// Byte offset into `on` of the first character that has not been consumed yet
    last: usize,
    // Each flag below records that the matching opening marker has been consumed and its
    // closing marker is still outstanding
    in_tag: bool,
    pub in_bold: bool,
    pub in_italic: bool,
    in_code: bool,
    in_latex: bool,
    in_emoji: bool,
    in_link: bool,
    in_chevron_link: bool,
    in_media: bool,
    in_expression: bool,
}

impl<'a> PartsIterator<'a> {
    /// Creates an iterator positioned at the start of `on` with no decoration open.
    #[must_use]
    pub fn new(on: &'a str) -> Self {
        Self {
            on,
            last: 0,
            in_tag: false,
            in_bold: false,
            in_italic: false,
            in_emoji: false,
            in_code: false,
            in_latex: false,
            in_link: false,
            in_chevron_link: false,
            in_media: false,
            in_expression: false,
        }
    }
}
464
465impl<'a> Iterator for PartsIterator<'a> {
466    type Item = MarkdownTextElement<'a>;
467
468    #[allow(clippy::too_many_lines)]
469    fn next(&mut self) -> Option<Self::Item> {
470        if self.last >= self.on.len() {
471            None
472        } else {
473            let mut link_text_end: Option<usize> = None;
474            let mut bracket_depth: usize = 0;
475
476            let mut range = &self.on[self.last..];
477            let mut iterator = range.char_indices();
478
479            while let Some((idx, chr)) = iterator.next() {
480                if self.in_link || self.in_media {
481                    if let Some(link_text_end) = link_text_end {
482                        if idx == link_text_end + 1 {
483                            if chr != '(' {
484                                if self.in_link {
485                                    self.last += idx;
486                                    self.in_link = false;
487                                    return Some(MarkdownTextElement::Link {
488                                        on: RawText(&range[..link_text_end]),
489                                        to: "",
490                                    });
491                                }
492                                panic!("media parsing broken {chr}");
493                            }
494                        } else if let ')' = chr {
495                            let in_brackets = &range[..link_text_end];
496                            let in_parenthesis = &range[link_text_end + "](".len()..idx];
497                            let element = if self.in_link {
498                                self.in_link = false;
499                                MarkdownTextElement::Link {
500                                    on: RawText(in_brackets),
501                                    to: in_parenthesis,
502                                }
503                            } else {
504                                self.in_media = false;
505                                MarkdownTextElement::Media {
506                                    alt: in_brackets,
507                                    source: in_parenthesis,
508                                }
509                            };
510
511                            self.last += idx + 1;
512                            return Some(element);
513                        }
514                    } else if let ']' = chr {
515                        if let Some(reduced_depth) = bracket_depth.checked_sub(1) {
516                            bracket_depth = reduced_depth;
517                        } else {
518                            link_text_end = Some(idx);
519                        }
520                    } else if let '[' = chr {
521                        bracket_depth += 1;
522                    }
523
524                    continue;
525                }
526
527                // TODO escaped stuff etc
528                if self.in_code {
529                    if let '`' = chr {
530                        self.last += idx + 1;
531                        self.in_code = false;
532                        return Some(MarkdownTextElement::Code(&range[..idx]));
533                    }
534                    continue;
535                }
536                // TODO escaped stuff etc
537                if let (true, '$') = (self.in_latex, chr) {
538                    self.last += idx + 1;
539                    self.in_latex = false;
540                    return Some(MarkdownTextElement::Latex(&range[..idx]));
541                }
542                // TODO escaped stuff etc
543                if let (true, ':') = (self.in_emoji, chr) {
544                    self.last += idx + 1;
545                    self.in_emoji = false;
546                    return Some(MarkdownTextElement::Emoji(&range[..idx]));
547                }
548                // TODO escaped stuff etc
549                if let (true, '}') = (self.in_expression, chr) {
550                    self.last += idx + 1;
551                    self.in_expression = false;
552                    return Some(MarkdownTextElement::Expression(&range[..idx]));
553                }
554                // TODO escaped stuff etc
555                if let (true, '>') = (self.in_chevron_link, chr) {
556                    self.last += idx + 1;
557                    self.in_chevron_link = false;
558                    let inner = &range[..idx];
559                    return Some(MarkdownTextElement::Link {
560                        // presentation as same as link
561                        on: RawText(inner),
562                        to: inner,
563                    });
564                }
565
566                if self.in_tag && chr.is_whitespace() {
567                    self.last += idx + 1;
568                    self.in_tag = false;
569                    return Some(MarkdownTextElement::Tag(&range[..idx]));
570                }
571
572                macro_rules! yield_current {
573                    () => {{
574                        let item = &range[..idx];
575                        if !item.is_empty() {
576                            return Some(MarkdownTextElement::Plain(item));
577                        }
578                        // Reset
579                        range = &self.on[self.last..];
580                        iterator = range.char_indices();
581                    }};
582                }
583
584                match chr {
585                    '`' => {
586                        self.last += idx + 1;
587                        self.in_code = true;
588                        yield_current!();
589                    }
590                    '$' => {
591                        self.last += idx + 1;
592                        self.in_latex = true;
593                        yield_current!();
594                    }
595                    '{' => {
596                        self.last += idx + 1;
597                        self.in_expression = true;
598                        yield_current!();
599                    }
600                    ':' if range[(idx + 1)..]
601                        .chars()
602                        .next()
603                        .is_some_and(char::is_alphanumeric) =>
604                    {
605                        // TODO check next is not whitespace etc
606                        self.last += idx + 1;
607                        self.in_emoji = true;
608                        yield_current!();
609                    }
610                    '#' => {
611                        self.last += idx + 1;
612                        self.in_tag = true;
613                        yield_current!();
614                    }
615                    '<' if range[idx..]
616                        .chars()
617                        .next()
618                        .is_some_and(char::is_alphanumeric) =>
619                    {
620                        self.last += idx + 1;
621                        self.in_chevron_link = true;
622                        yield_current!();
623                    }
624                    '!' if range[idx..].starts_with("![") => {
625                        self.last += idx + "![".len();
626                        self.in_media = true;
627                        yield_current!();
628                    }
629                    '[' => {
630                        self.last += idx + '['.len_utf8();
631                        self.in_link = true;
632                        yield_current!();
633                    }
634                    '*' | '_' => {
635                        let start = &range[idx..];
636                        if start.starts_with("**") || start.starts_with("__") {
637                            self.last += idx + 2;
638                            self.in_bold = !self.in_bold;
639                            if self.in_bold {
640                                yield_current!();
641                            } else {
642                                return Some(MarkdownTextElement::Bold(&range[..idx]));
643                            }
644                        } else {
645                            self.last += idx + 1;
646                            self.in_italic = !self.in_italic;
647                            if self.in_italic {
648                                yield_current!();
649                            } else {
650                                return Some(MarkdownTextElement::Italic(&range[..idx]));
651                            }
652                        }
653                    }
654                    _ => {}
655                }
656            }
657
658            self.last = self.on.len();
659            if range.is_empty() {
660                None
661            } else {
662                // TODO errors left overs. But also others such as tags etc
663                Some(MarkdownTextElement::Plain(range))
664            }
665        }
666    }
667}
668
/// Unparsed markdown source text (used in this file for quote bodies and command block contents)
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct RawMarkdown<'a>(pub &'a str);
671
672// #[cfg(target_family = "wasm")]
673// #[wasm_bindgen]
674// impl RawMarkdown<'_> {
675//     #[must_use]
676//     #[cfg(target_family = "wasm")]
677//     #[wasm_bindgen]
678//     pub fn to_html(&self, emitter: Option<crate::extras::emit::FeatureEmitterWASM>) -> String {
679//         crate::extras::emit::markdown_to_html_string(&self.markdown_content, emitter)
680//     }
681// }
682
683#[derive(Debug, Copy, Clone, PartialEq, Eq)]
684pub struct Table<'a>(pub(crate) &'a str);
685
686impl<'a> Table<'a> {
687    pub fn rows(&self) -> impl Iterator<Item = TableRow<'a>> {
688        let mut lines = self.0.lines();
689        let header = lines.next().expect("no heading (empty table)");
690        std::iter::once(TableRow(header)).chain(lines.skip(1).map(TableRow))
691    }
692}
693
694#[derive(Debug, Copy, Clone, PartialEq, Eq)]
695pub struct TableRow<'a>(pub(crate) &'a str);
696
697impl<'a> TableRow<'a> {
698    pub fn cells(&self) -> impl Iterator<Item = RawText<'a>> {
699        let inner = &self.0[1..(self.0.len() - 1)];
700        inner.split('|').map(RawText)
701    }
702}
703#[derive(Debug, Copy, Clone, PartialEq, Eq)]
704pub struct CommandBlock<'a> {
705    pub name: &'a str,
706    pub arguments: &'a str,
707    pub inner: RawMarkdown<'a>,
708}
709
710impl<'a> CommandBlock<'a> {
711    #[must_use]
712    #[allow(clippy::collapsible_else_if)]
713    pub fn arguments(&self) -> Vec<(&'a str, &'a str)> {
714        let mut arguments = Vec::new();
715        let mut key: Option<&str> = None;
716        let mut start = 0;
717        let mut in_string = false;
718
719        for (idx, chr) in self.arguments.char_indices() {
720            if let Some(current_key) = key {
721                let value = self.arguments[start..idx].trim();
722                if let (' ', false, false) = (chr, in_string, value.is_empty()) {
723                    arguments.push((current_key, value));
724                    start = idx;
725                    key = None;
726                } else if let '"' = chr {
727                    in_string = !in_string;
728                }
729            } else {
730                if let '=' = chr {
731                    let key_acc = &self.arguments[start..idx];
732                    key = Some(key_acc.trim());
733                    start = idx + 1;
734                }
735            }
736        }
737        if let Some(current_key) = key {
738            if in_string {
739                eprintln!("missing '\"'");
740            }
741            let value = self.arguments[start..].trim();
742            arguments.push((current_key, value));
743        }
744
745        arguments
746    }
747}