html_to_pulldown_cmark_events/
lib.rs1use pulldown_cmark::{CodeBlockKind, CowStr, Event, LinkType, Tag};
2use scraper::{
3 node::{Element, Text},
4 ElementRef, Html, Node, Selector,
5};
6
/// A lone line feed. Text nodes equal to this (ignoring trailing spaces) are
/// treated as formatting whitespace between elements rather than as content.
const CRTL: &str = "\n";
8
9pub fn parser(raw: impl AsRef<str>, events: &mut Vec<Event<'_>>) {
10 let html = Html::parse_fragment(raw.as_ref());
11
12 parse_block(events, *html.root_element());
13}
14
15fn parse_block(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>) {
16 for node in parent.children() {
17 match node.value() {
19 Node::Element(elem) => {
20 let name = elem.name();
21 match name {
22 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
23 let level = atoi::ascii_to_digit::<usize>(name.as_bytes()[1]).unwrap();
24 let tag = Tag::Heading(level.try_into().unwrap(), None, Vec::new());
25 events.push(Event::Start(tag.clone()));
26
27 parse_inline(events, node, false);
28
29 events.push(Event::End(tag));
30 }
31 "p" => {
32 let tag = Tag::Paragraph;
33 events.push(Event::Start(tag.clone()));
34
35 parse_inline(events, node, false);
36
37 events.push(Event::End(tag));
38 }
39 "img" => {
40 let mut attrs = elem
41 .attrs()
42 .filter(|a| a.0 == "src" || a.0 == "alt")
43 .collect::<Vec<_>>();
44
45 attrs.sort_by_key(|attr| attr.0);
46
47 if attrs.is_empty() {
48 continue;
49 }
50
51 let (src, alt) = (
52 attrs[0].1.to_string(),
53 if attrs.len() == 1 {
54 String::new()
55 } else {
56 attrs[1].1.to_string()
57 },
58 );
59
60 let tag = Tag::Image(LinkType::Inline, src.into(), alt.into());
61 events.push(Event::Start(tag.clone()));
62 events.push(Event::End(tag));
63 }
64 "blockquote" => {
65 let tag = Tag::BlockQuote;
66 events.push(Event::Start(tag.clone()));
67
68 parse_block(events, node);
69
70 events.push(Event::End(tag));
71 }
72 "ol" | "ul" => {
73 parse_list(events, node, name.starts_with('o').then_some(1));
74 }
75 "br" => {
76 events.push(Event::HardBreak);
77 }
78 "hr" => {
79 events.push(Event::Rule);
80 }
81 "pre" => {
82 parse_code(events, elem, node);
83 }
84 _ => {}
87 }
88 }
89 Node::Text(Text { text }) if text.trim_end_matches(' ') == CRTL => {
90 }
92 _ => {}
93 }
94 }
95}
96
97fn parse_code(events: &mut Vec<Event<'_>>, elem: &Element, node: ego_tree::NodeRef<'_, Node>) {
98 let mut kind = CodeBlockKind::Indented;
99 let elem_ref = ElementRef::wrap(node).unwrap();
100 let mut text = String::new();
101 elem_ref.text().collect::<Vec<_>>().iter().for_each(|s| {
102 text.push_str(s);
103 });
104
105 if let Some(k) = elem
106 .classes()
107 .find_map(|name| name.split_once("language-"))
108 .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim_end().into())))
109 .or_else(|| {
110 elem.attrs()
111 .find(|attr| attr.0 == "data-lang")
112 .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim().into())))
113 })
114 {
115 kind = k;
116 } else {
117 let selector = Selector::parse("code").unwrap();
118 if let Some(k) = elem_ref.select(&selector).next().and_then(|e| {
119 let elem = e.value();
120 elem.classes()
121 .find_map(|name| name.split_once("language-"))
122 .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim_end().into())))
123 .or_else(|| {
124 elem.attrs()
125 .find(|attr| attr.0 == "data-lang")
126 .map(|(_, lang)| CodeBlockKind::Fenced(CowStr::Boxed(lang.trim().into())))
127 })
128 }) {
129 kind = k;
130 }
131 }
132
133 let tag = Tag::CodeBlock(kind);
136 events.push(Event::Start(tag.clone()));
137
138 events.push(Event::Text(CowStr::Boxed(text.into())));
139
140 events.push(Event::End(tag));
141}
142
143fn parse_list(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>, kind: Option<u64>) {
145 let tag = Tag::List(kind);
146 events.push(Event::Start(tag.clone()));
147
148 for node in parent.children() {
149 if matches!(node.value(), Node::Element(elem) if elem.name() == "li") {
150 let tag = Tag::Item;
151 events.push(Event::Start(tag.clone()));
152
153 let trim = false;
154 for node in node.children() {
155 match node.value() {
156 Node::Element(elem) => {
157 let name = elem.name();
158 match name {
159 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
160 let level =
161 atoi::ascii_to_digit::<usize>(name.as_bytes()[1]).unwrap();
162 let tag = Tag::Heading(level.try_into().unwrap(), None, Vec::new());
163 events.push(Event::Start(tag.clone()));
164
165 parse_inline(events, node, trim);
166
167 events.push(Event::End(tag));
168 }
169 "p" => {
170 let tag = Tag::Paragraph;
171 events.push(Event::Start(tag.clone()));
172
173 parse_inline(events, node, trim);
174
175 events.push(Event::End(tag));
176 }
177 "img" => {
178 let mut attrs = elem
179 .attrs()
180 .filter(|a| a.0 == "src" || a.0 == "alt")
181 .collect::<Vec<_>>();
182
183 attrs.sort_by_key(|attr| attr.0);
184
185 if attrs.is_empty() {
186 continue;
187 }
188
189 let (src, alt) = (
190 attrs[0].1.to_string(),
191 if attrs.len() == 1 {
192 String::new()
193 } else {
194 attrs[1].1.to_string()
195 },
196 );
197
198 let tag = Tag::Image(LinkType::Inline, src.into(), alt.into());
199 events.push(Event::Start(tag.clone()));
200 events.push(Event::End(tag));
201 }
202 "blockquote" => {
203 let tag = Tag::BlockQuote;
204 events.push(Event::Start(tag.clone()));
205
206 parse_block(events, node);
207
208 events.push(Event::End(tag));
209 }
210 "ol" | "ul" => {
211 parse_list(events, node, name.starts_with('o').then_some(1));
212 }
213 "br" => {
214 events.push(Event::HardBreak);
215 }
216 "hr" => {
217 events.push(Event::Rule);
218 }
219 "pre" => {
220 parse_code(events, elem, node);
221 }
222 k => {
225 let (start, end) = match k {
226 "a" => {
228 let mut attrs = elem
229 .attrs()
230 .filter(|a| a.0 == "href" || a.0 == "title")
231 .collect::<Vec<_>>();
232
233 attrs.sort_by_key(|attr| attr.0);
234
235 if attrs.is_empty() {
236 continue;
237 }
238
239 let (href, title) = (
240 attrs[0].1.to_string(),
241 if attrs.len() == 1 {
242 String::new()
243 } else {
244 attrs[1].1.to_string()
245 },
246 );
247
248 let tag =
249 Tag::Link(LinkType::Inline, href.into(), title.into());
250 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
251 }
252 "strong" => {
254 let tag = Tag::Strong;
255 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
256 }
257 "em" => {
259 let tag = Tag::Emphasis;
260 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
261 }
262 "del" => {
264 let tag = Tag::Strikethrough;
265 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
266 }
267 "code" => (
269 node.first_child()
270 .and_then(|node| node.value().as_text())
271 .map(|text| {
272 let text = text.to_string();
273 Event::Code(CowStr::Boxed(if trim {
274 text.trim().into()
275 } else {
276 text.into()
277 }))
278 }),
279 None,
280 ),
281 _ => (None, None),
286 };
287
288 if let Some(e) = start {
289 events.push(e);
290 }
291
292 if let Some(e) = end {
293 parse_inline(events, node, trim);
294
295 events.push(e);
296 }
297 }
298 }
299 }
300 Node::Text(Text { text }) if text.trim_end_matches(' ') != CRTL => {
301 events.push(Event::Text(CowStr::Boxed(text.to_string().into())));
302 }
303 _ => {}
304 }
305 }
306 events.push(Event::End(tag));
307 }
308 }
309
310 events.push(Event::End(tag));
311}
312
313fn parse_inline(events: &mut Vec<Event<'_>>, parent: ego_tree::NodeRef<'_, Node>, trim: bool) {
314 for node in parent.children() {
315 match node.value() {
316 Node::Element(elem) => {
317 let (start, end) = match elem.name() {
318 "a" => {
320 let mut attrs = elem
321 .attrs()
322 .filter(|a| a.0 == "href" || a.0 == "title")
323 .collect::<Vec<_>>();
324
325 attrs.sort_by_key(|attr| attr.0);
326
327 if attrs.is_empty() {
328 continue;
329 }
330
331 let (href, title) = (
332 attrs[0].1.to_string(),
333 if attrs.len() == 1 {
334 String::new()
335 } else {
336 attrs[1].1.to_string()
337 },
338 );
339
340 let tag = Tag::Link(LinkType::Inline, href.into(), title.into());
341 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
342 }
343 "strong" => {
345 let tag = Tag::Strong;
346 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
347 }
348 "em" => {
350 let tag = Tag::Emphasis;
351 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
352 }
353 "del" => {
355 let tag = Tag::Strikethrough;
356 (Some(Event::Start(tag.clone())), Some(Event::End(tag)))
357 }
358 "code" => (
360 node.first_child()
361 .and_then(|node| node.value().as_text())
362 .map(|text| {
363 let text = text.to_string();
364 Event::Code(CowStr::Boxed(if trim {
365 text.trim().into()
366 } else {
367 text.into()
368 }))
369 }),
370 None,
371 ),
372 _ => (None, None),
377 };
378
379 if let Some(e) = start {
380 events.push(e);
381 }
382
383 if let Some(e) = end {
384 parse_inline(events, node, trim);
385
386 events.push(e);
387 }
388 }
389 Node::Text(Text { text }) => {
390 events.push(Event::Text(CowStr::Boxed(
391 text.replace('\n', " ").to_string().into(),
392 )));
393 }
394 _ => {}
395 }
396 }
397}