html_to_pulldown_cmark_events/
lib.rs

1use pulldown_cmark::{CodeBlockKind, CowStr, Event, LinkType, Tag};
2use scraper::{
3    node::{Element, Text},
4    ElementRef, Html, Node, Selector,
5};
6
/// A lone line feed. A text node whose content equals this once trailing spaces
/// are trimmed is treated as whitespace between elements rather than as content.
const CRTL: &str = "\n";
8
9pub fn parser(raw: impl AsRef<str>, events: &mut Vec<Event<'_>>) {
10    let html = Html::parse_fragment(raw.as_ref());
11
12    parse_block(events, *html.root_element());
13}
14
15fn parse_block(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>) {
16    for node in parent.children() {
17        // blocks
18        match node.value() {
19            Node::Element(elem) => {
20                let name = elem.name();
21                match name {
22                    "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
23                        let level = atoi::ascii_to_digit::<usize>(name.as_bytes()[1]).unwrap();
24                        let tag = Tag::Heading(level.try_into().unwrap(), None, Vec::new());
25                        events.push(Event::Start(tag.clone()));
26
27                        parse_inline(events, node, false);
28
29                        events.push(Event::End(tag));
30                    }
31                    "p" => {
32                        let tag = Tag::Paragraph;
33                        events.push(Event::Start(tag.clone()));
34
35                        parse_inline(events, node, false);
36
37                        events.push(Event::End(tag));
38                    }
39                    "img" => {
40                        let mut attrs = elem
41                            .attrs()
42                            .filter(|a| a.0 == "src" || a.0 == "alt")
43                            .collect::<Vec<_>>();
44
45                        attrs.sort_by_key(|attr| attr.0);
46
47                        if attrs.is_empty() {
48                            continue;
49                        }
50
51                        let (src, alt) = (
52                            attrs[0].1.to_string(),
53                            if attrs.len() == 1 {
54                                String::new()
55                            } else {
56                                attrs[1].1.to_string()
57                            },
58                        );
59
60                        let tag = Tag::Image(LinkType::Inline, src.into(), alt.into());
61                        events.push(Event::Start(tag.clone()));
62                        events.push(Event::End(tag));
63                    }
64                    "blockquote" => {
65                        let tag = Tag::BlockQuote;
66                        events.push(Event::Start(tag.clone()));
67
68                        parse_block(events, node);
69
70                        events.push(Event::End(tag));
71                    }
72                    "ol" | "ul" => {
73                        parse_list(events, node, name.starts_with('o').then_some(1));
74                    }
75                    "br" => {
76                        events.push(Event::HardBreak);
77                    }
78                    "hr" => {
79                        events.push(Event::Rule);
80                    }
81                    "pre" => {
82                        parse_code(events, elem, node);
83                    }
84                    // "code" => {}
85                    // foot
86                    _ => {}
87                }
88            }
89            Node::Text(Text { text }) if text.trim_end_matches(' ') == CRTL => {
90                // events.push(Event::SoftBreak)
91            }
92            _ => {}
93        }
94    }
95}
96
97fn parse_code(events: &mut Vec<Event<'_>>, elem: &Element, node: ego_tree::NodeRef<'_, Node>) {
98    let mut kind = CodeBlockKind::Indented;
99    let elem_ref = ElementRef::wrap(node).unwrap();
100    let mut text = String::new();
101    elem_ref.text().collect::<Vec<_>>().iter().for_each(|s| {
102        text.push_str(s);
103    });
104
105    if let Some(k) = elem
106        .classes()
107        .find_map(|name| name.split_once("language-"))
108        .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim_end().into())))
109        .or_else(|| {
110            elem.attrs()
111                .find(|attr| attr.0 == "data-lang")
112                .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim().into())))
113        })
114    {
115        kind = k;
116    } else {
117        let selector = Selector::parse("code").unwrap();
118        if let Some(k) = elem_ref.select(&selector).next().and_then(|e| {
119            let elem = e.value();
120            elem.classes()
121                .find_map(|name| name.split_once("language-"))
122                .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim_end().into())))
123                .or_else(|| {
124                    elem.attrs()
125                        .find(|attr| attr.0 == "data-lang")
126                        .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim().into())))
127                })
128        }) {
129            kind = k;
130        }
131    }
132
133    // TODO: https://shiki.matsu.io/
134
135    let tag = Tag::CodeBlock(kind);
136    events.push(Event::Start(tag.clone()));
137
138    events.push(Event::Text(CowStr::Boxed(text.into())));
139
140    events.push(Event::End(tag));
141}
142
143/// Netest structs
144fn parse_list(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>, kind: Option<u64>) {
145    let tag = Tag::List(kind);
146    events.push(Event::Start(tag.clone()));
147
148    for node in parent.children() {
149        if matches!(node.value(), Node::Element(elem) if elem.name() == "li") {
150            let tag = Tag::Item;
151            events.push(Event::Start(tag.clone()));
152
153            let trim = false;
154            for node in node.children() {
155                match node.value() {
156                    Node::Element(elem) => {
157                        let name = elem.name();
158                        match name {
159                            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
160                                let level =
161                                    atoi::ascii_to_digit::<usize>(name.as_bytes()[1]).unwrap();
162                                let tag = Tag::Heading(level.try_into().unwrap(), None, Vec::new());
163                                events.push(Event::Start(tag.clone()));
164
165                                parse_inline(events, node, trim);
166
167                                events.push(Event::End(tag));
168                            }
169                            "p" => {
170                                let tag = Tag::Paragraph;
171                                events.push(Event::Start(tag.clone()));
172
173                                parse_inline(events, node, trim);
174
175                                events.push(Event::End(tag));
176                            }
177                            "img" => {
178                                let mut attrs = elem
179                                    .attrs()
180                                    .filter(|a| a.0 == "src" || a.0 == "alt")
181                                    .collect::<Vec<_>>();
182
183                                attrs.sort_by_key(|attr| attr.0);
184
185                                if attrs.is_empty() {
186                                    continue;
187                                }
188
189                                let (src, alt) = (
190                                    attrs[0].1.to_string(),
191                                    if attrs.len() == 1 {
192                                        String::new()
193                                    } else {
194                                        attrs[1].1.to_string()
195                                    },
196                                );
197
198                                let tag = Tag::Image(LinkType::Inline, src.into(), alt.into());
199                                events.push(Event::Start(tag.clone()));
200                                events.push(Event::End(tag));
201                            }
202                            "blockquote" => {
203                                let tag = Tag::BlockQuote;
204                                events.push(Event::Start(tag.clone()));
205
206                                parse_block(events, node);
207
208                                events.push(Event::End(tag));
209                            }
210                            "ol" | "ul" => {
211                                parse_list(events, node, name.starts_with('o').then_some(1));
212                            }
213                            "br" => {
214                                events.push(Event::HardBreak);
215                            }
216                            "hr" => {
217                                events.push(Event::Rule);
218                            }
219                            "pre" => {
220                                parse_code(events, elem, node);
221                            }
222                            // "code" => {}
223                            // foot
224                            k => {
225                                let (start, end) = match k {
226                                    // Link
227                                    "a" => {
228                                        let mut attrs = elem
229                                            .attrs()
230                                            .filter(|a| a.0 == "href" || a.0 == "title")
231                                            .collect::<Vec<_>>();
232
233                                        attrs.sort_by_key(|attr| attr.0);
234
235                                        if attrs.is_empty() {
236                                            continue;
237                                        }
238
239                                        let (href, title) = (
240                                            attrs[0].1.to_string(),
241                                            if attrs.len() == 1 {
242                                                String::new()
243                                            } else {
244                                                attrs[1].1.to_string()
245                                            },
246                                        );
247
248                                        let tag =
249                                            Tag::Link(LinkType::Inline, href.into(), title.into());
250                                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
251                                    }
252                                    // Blod
253                                    "strong" => {
254                                        let tag = Tag::Strong;
255                                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
256                                    }
257                                    // Italic
258                                    "em" => {
259                                        let tag = Tag::Emphasis;
260                                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
261                                    }
262                                    // Strikethrough
263                                    "del" => {
264                                        let tag = Tag::Strikethrough;
265                                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
266                                    }
267                                    // Inline Code
268                                    "code" => (
269                                        node.first_child()
270                                            .and_then(|node| node.value().as_text())
271                                            .map(|text| {
272                                                let text = text.to_string();
273                                                Event::Code(CowStr::Boxed(if trim {
274                                                    text.trim().into()
275                                                } else {
276                                                    text.into()
277                                                }))
278                                            }),
279                                        None,
280                                    ),
281                                    // Subscript
282                                    // "sub" => {},
283                                    // Superscript
284                                    // "sup" => {},
285                                    _ => (None, None),
286                                };
287
288                                if let Some(e) = start {
289                                    events.push(e);
290                                }
291
292                                if let Some(e) = end {
293                                    parse_inline(events, node, trim);
294
295                                    events.push(e);
296                                }
297                            }
298                        }
299                    }
300                    Node::Text(Text { text }) if text.trim_end_matches(' ') != CRTL => {
301                        events.push(Event::Text(CowStr::Boxed(text.to_string().into())));
302                    }
303                    _ => {}
304                }
305            }
306            events.push(Event::End(tag));
307        }
308    }
309
310    events.push(Event::End(tag));
311}
312
313fn parse_inline(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>, trim: bool) {
314    for node in parent.children() {
315        match node.value() {
316            Node::Element(elem) => {
317                let (start, end) = match elem.name() {
318                    // Link
319                    "a" => {
320                        let mut attrs = elem
321                            .attrs()
322                            .filter(|a| a.0 == "href" || a.0 == "title")
323                            .collect::<Vec<_>>();
324
325                        attrs.sort_by_key(|attr| attr.0);
326
327                        if attrs.is_empty() {
328                            continue;
329                        }
330
331                        let (href, title) = (
332                            attrs[0].1.to_string(),
333                            if attrs.len() == 1 {
334                                String::new()
335                            } else {
336                                attrs[1].1.to_string()
337                            },
338                        );
339
340                        let tag = Tag::Link(LinkType::Inline, href.into(), title.into());
341                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
342                    }
343                    // Blod
344                    "strong" => {
345                        let tag = Tag::Strong;
346                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
347                    }
348                    // Italic
349                    "em" => {
350                        let tag = Tag::Emphasis;
351                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
352                    }
353                    // Strikethrough
354                    "del" => {
355                        let tag = Tag::Strikethrough;
356                        (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
357                    }
358                    // Inline Code
359                    "code" => (
360                        node.first_child()
361                            .and_then(|node| node.value().as_text())
362                            .map(|text| {
363                                let text = text.to_string();
364                                Event::Code(CowStr::Boxed(if trim {
365                                    text.trim().into()
366                                } else {
367                                    text.into()
368                                }))
369                            }),
370                        None,
371                    ),
372                    // Subscript
373                    // "sub" => {},
374                    // Superscript
375                    // "sup" => {},
376                    _ => (None, None),
377                };
378
379                if let Some(e) = start {
380                    events.push(e);
381                }
382
383                if let Some(e) = end {
384                    parse_inline(events, node, trim);
385
386                    events.push(e);
387                }
388            }
389            Node::Text(Text { text }) => {
390                events.push(Event::Text(CowStr::Boxed(
391                    text.replace('\n', " ").to_string().into(),
392                )));
393            }
394            _ => {}
395        }
396    }
397}