Skip to main content

basalt_tui/note_editor/
parser.rs

1use std::ops::{Deref, DerefMut};
2
3use pulldown_cmark::{CodeBlockKind, Event, Options, Tag, TagEnd};
4
5use crate::note_editor::{
6    ast::{self, Node, SourceRange, TaskKind},
7    rich_text::{RichText, Style, TextSegment},
8};
9
10pub struct Parser<'a>(pulldown_cmark::TextMergeWithOffset<'a, pulldown_cmark::OffsetIter<'a>>);
11
12impl<'a> Deref for Parser<'a> {
13    type Target = pulldown_cmark::TextMergeWithOffset<'a, pulldown_cmark::OffsetIter<'a>>;
14    fn deref(&self) -> &Self::Target {
15        &self.0
16    }
17}
18
19impl DerefMut for Parser<'_> {
20    fn deref_mut(&mut self) -> &mut Self::Target {
21        &mut self.0
22    }
23}
24
25impl<'a> Iterator for Parser<'a> {
26    type Item = (Event<'a>, SourceRange<usize>);
27    fn next(&mut self) -> Option<Self::Item> {
28        self.deref_mut().next()
29    }
30}
31
32#[derive(Clone, Debug, PartialEq, Default)]
33pub struct ParserState {
34    task_kind: Vec<ast::TaskKind>,
35    item_kind: Vec<ast::ItemKind>,
36}
37
38impl<'a> Parser<'a> {
39    /// Creates a new [`Parser`] from a Markdown input string.
40    ///
41    /// The parser uses [`pulldown_cmark::Parser::new_ext`] with [`Options::all()`] and
42    /// [`pulldown_cmark::TextMergeWithOffset`] internally.
43    ///
44    /// The offset is required to know where the node appears in the provided source text.
45    pub fn new(text: &'a str) -> Self {
46        let mut options = Options::all();
47
48        // Smart punctuation is excluded because it converts ASCII characters (e.g. ", ') to
49        // multi-byte Unicode (“, ‘), making the rendered text longer than the source, causing the
50        // source offset to overlap in some cases and causing unexpected behavior.
51        //
52        // TODO: Holistic approach to support smart punctation. Potentially need to do this on a
53        // different layer of the app to only do the smart punctuation effect visually using
54        // virtual elements or such, but keeping the original source content unchanged.
55        options.remove(Options::ENABLE_SMART_PUNCTUATION);
56
57        let parser = pulldown_cmark::TextMergeWithOffset::new(
58            pulldown_cmark::Parser::new_ext(text, options).into_offset_iter(),
59        );
60
61        Self(parser)
62    }
63
64    pub fn parse(mut self) -> Vec<Node> {
65        let mut result = Vec::new();
66        let mut state = ParserState::default();
67
68        while let Some((event, _)) = self.next() {
69            match event {
70                Event::Start(tag) if Self::is_container_tag(&tag) => {
71                    if let Some(node) = self.parse_container(tag, &mut state) {
72                        result.push(node);
73                    }
74                }
75                _ => {}
76            }
77        }
78
79        result
80    }
81
82    pub fn parse_container(&mut self, tag: Tag, state: &mut ParserState) -> Option<Node> {
83        let mut nodes = Vec::new();
84        let mut text_segments = Vec::new();
85        let mut inline_styles = Vec::new();
86
87        match tag {
88            Tag::List(Some(start)) => {
89                state.item_kind.push(ast::ItemKind::Ordered(start));
90            }
91            Tag::List(..) => {
92                state.item_kind.push(ast::ItemKind::Unordered);
93            }
94            _ => {}
95        };
96
97        while let Some((event, source_range)) = self.next() {
98            match event {
99                Event::Start(inner_tag) if Self::is_container_tag(&inner_tag) => {
100                    if let Some(node) = self.parse_container(inner_tag, state) {
101                        nodes.push(node);
102                    }
103                }
104
105                Event::Start(inner_tag) if Self::is_inline_tag(&inner_tag) => {
106                    if let Some(style) = Self::tag_to_style(&inner_tag) {
107                        inline_styles.push(style);
108                    }
109                }
110
111                Event::TaskListMarker(checked) => {
112                    state.task_kind.push(if checked {
113                        TaskKind::Checked
114                    } else {
115                        TaskKind::Unchecked
116                    });
117                }
118
119                Event::Code(text) => {
120                    let text_segment = TextSegment::styled(&text, Style::Code);
121                    text_segments.push(text_segment);
122                }
123
124                Event::Text(text) => {
125                    let mut text_segment = TextSegment::plain(&text);
126                    inline_styles.iter().for_each(|style| {
127                        text_segment.add_style(style);
128                    });
129                    text_segments.push(text_segment);
130                }
131
132                Event::SoftBreak => {
133                    let text_segment = TextSegment::empty_line();
134                    text_segments.push(text_segment);
135                }
136
137                Event::End(tag_end) if Self::tags_match(&tag, &tag_end) => {
138                    let text = if !text_segments.is_empty() {
139                        RichText::from(text_segments)
140                    } else {
141                        RichText::empty()
142                    };
143
144                    return match tag {
145                        Tag::Heading { level, .. } => Some(Node::Heading {
146                            level: level.into(),
147                            text,
148                            source_range,
149                        }),
150                        Tag::Item => {
151                            // This is required since in block quotes list items are considered
152                            // "tight", thus the text is not stored in a paragraph directly.
153                            // TODO: Think if wrapping this into a paragraph is a good idea or not.
154                            // Potentially storing a RichText here is better.
155                            if !text.is_empty() {
156                                nodes.insert(
157                                    0,
158                                    Node::Paragraph {
159                                        text,
160                                        source_range: source_range.clone(),
161                                    },
162                                );
163                            }
164
165                            let item = if let Some(kind) = state.task_kind.pop() {
166                                Some(Node::Task {
167                                    kind,
168                                    nodes,
169                                    source_range,
170                                })
171                            } else {
172                                Some(Node::Item {
173                                    kind: state
174                                        .item_kind
175                                        .last()
176                                        .cloned()
177                                        .unwrap_or(ast::ItemKind::Unordered),
178                                    nodes,
179                                    source_range,
180                                })
181                            };
182
183                            if let Some(ast::ItemKind::Ordered(start)) = state.item_kind.last_mut()
184                            {
185                                *start += 1;
186                            };
187
188                            item
189                        }
190                        Tag::List(..) => {
191                            state.item_kind.pop();
192
193                            Some(Node::List {
194                                nodes,
195                                source_range,
196                            })
197                        }
198                        Tag::CodeBlock(kind) => Some(Node::CodeBlock {
199                            lang: match kind {
200                                CodeBlockKind::Fenced(lang) => Some(lang.to_string()),
201                                _ => None,
202                            },
203                            text,
204                            source_range,
205                        }),
206                        Tag::BlockQuote(kind) => Some(Node::BlockQuote {
207                            kind: kind.map(|kind| kind.into()),
208                            nodes,
209                            source_range,
210                        }),
211                        Tag::Paragraph => Some(Node::Paragraph { text, source_range }),
212                        _ => None,
213                    };
214                }
215                _ => {}
216            }
217        }
218
219        None
220    }
221
222    fn is_container_tag(tag: &Tag) -> bool {
223        matches!(
224            tag,
225            Tag::Paragraph
226                | Tag::Item
227                | Tag::List(..)
228                | Tag::BlockQuote(..)
229                | Tag::CodeBlock(..)
230                | Tag::Heading { .. }
231        )
232    }
233
234    fn is_inline_tag(tag: &Tag) -> bool {
235        matches!(tag, Tag::Emphasis | Tag::Strong | Tag::Strikethrough)
236    }
237
238    fn tags_match(start: &Tag, end: &TagEnd) -> bool {
239        fn tag_to_end(tag: &Tag) -> Option<TagEnd> {
240            match tag {
241                Tag::Heading { level, .. } => Some(TagEnd::Heading(*level)),
242                Tag::List(ordered) => Some(TagEnd::List(ordered.is_some())),
243                Tag::Item => Some(TagEnd::Item),
244                Tag::BlockQuote(kind) => Some(TagEnd::BlockQuote(*kind)),
245                Tag::CodeBlock(..) => Some(TagEnd::CodeBlock),
246                Tag::Paragraph => Some(TagEnd::Paragraph),
247                _ => None,
248            }
249        }
250
251        if let Some(start) = tag_to_end(start) {
252            std::mem::discriminant(&start) == std::mem::discriminant(end)
253        } else {
254            false
255        }
256    }
257
258    fn tag_to_style(tag: &Tag) -> Option<Style> {
259        match tag {
260            Tag::Emphasis => Some(Style::Emphasis),
261            Tag::Strong => Some(Style::Strong),
262            Tag::Strikethrough => Some(Style::Strikethrough),
263            _ => None,
264        }
265    }
266}
267
268pub fn from_str(text: &str) -> Vec<Node> {
269    Parser::new(text).parse()
270}
271
#[cfg(test)]
mod tests {
    use indoc::indoc;
    use insta::assert_snapshot;

    use super::*;

    /// Snapshot-tests the parser: each case is rendered as the Markdown input followed by the
    /// s-expression form of the parsed nodes, and stored under the case name.
    #[test]
    fn test_parser() {
        // (snapshot name, Markdown input)
        let cases = [
            (
                "paragraphs",
                indoc! { r#"## Paragraphs
                To create paragraphs in Markdown, use a **blank line** to separate blocks of text. Each block of text separated by a blank line is treated as a distinct paragraph.

                This is a paragraph.

                This is another paragraph.

                A blank line between lines of text creates separate paragraphs. This is the default behavior in Markdown.
                "#},
            ),
            (
                "headings",
                indoc! { r#"## Headings
                To create a heading, add up to six `#` symbols before your heading text. The number of `#` symbols determines the size of the heading.

                # This is a heading 1
                ## This is a heading 2
                ### This is a heading 3
                #### This is a heading 4
                ##### This is a heading 5
                ###### This is a heading 6
                "#},
            ),
            (
                "lists",
                indoc! { r#"## Lists
                You can create an unordered list by adding a `-`, `*`, or `+` before the text.

                - First list item
                - Second list item
                - Third list item

                To create an ordered list, start each line with a number followed by a `.` or `)` symbol.

                1. First list item
                2. Second list item
                3. Third list item

                1) First list item
                2) Second list item
                3) Third list item
                "#},
            ),
            (
                "lists_line_breaks",
                indoc! { r#"## Lists with line breaks
                You can use line breaks within an ordered list without altering the numbering.

                1. First list item

                2. Second list item
                3. Third list item

                4. Fourth list item
                5. Fifth list item
                6. Sixth list item
                "#},
            ),
            (
                "task_lists",
                indoc! { r#"## Task lists
                To create a task list, start each list item with a hyphen and space followed by `[ ]`.

                - [x] This is a completed task.
                - [ ] This is an incomplete task.

                You can toggle a task in Reading view by selecting the checkbox.

                > [!tip]
                > You can use any character inside the brackets to mark it as complete.
                >
                > - [x] Milk
                > - [?] Eggs
                > - [-] Eggs
                "#},
            ),
            (
                "nesting_lists",
                indoc! { r#"## Nesting lists
                You can nest any type of list—ordered, unordered, or task lists—under any other type of list.

                To create a nested list, indent one or more list items. You can mix list types within a nested structure:

                1. First list item
                   1. Ordered nested list item
                2. Second list item
                   - Unordered nested list item
                "#},
            ),
            (
                "nesting_task_lists",
                indoc! { r#"## Nesting task lists
                Similarly, you can create a nested task list by indenting one or more list items:

                - [ ] Task item 1
                  - [ ] Subtask 1
                - [ ] Task item 2
                  - [ ] Subtask 2
                "#},
            ),
            // TODO: Implement horizontal rule
            // (
            //     "horizontal_rule",
            //     indoc! { r#"## Horizontal rule
            //     You can use three or more stars `***`, hyphens `---`, or underscore `___` on its own line to add a horizontal bar. You can also separate symbols using spaces.
            //
            //     ***
            //     ****
            //     * * *
            //     ---
            //     ----
            //     - - -
            //     ___
            //     ____
            //     _ _ _
            //     "#},
            // ),
            (
                "code_blocks",
                indoc! { r#"## Code blocks
                To format code as a block, enclose it with three backticks or three tildes.

                ```md
                cd ~/Desktop
                ```

                You can also create a code block by indenting the text using `Tab` or 4 blank spaces.

                    cd ~/Desktop

                "#},
            ),
            (
                "code_syntax_highlighting_in_blocks",
                indoc! { r#"## Code syntax highlighting in blocks
                You can add syntax highlighting to a code block, by adding a language code after the first set of backticks.

                ```js
                function fancyAlert(arg) {
                  if(arg) {
                    $.facebox({div:'#foo'})
                  }
                }
                ```
                "#},
            ),
        ];

        for (name, text) in cases {
            assert_snapshot!(
                name,
                format!(
                    "{}\n ---\n\n{}",
                    text,
                    ast::nodes_to_sexp(&from_str(text), 0)
                )
            );
        }
    }
}