1use crate::entities::ListStyle;
2
/// A run of inline text together with the character formatting applied to it.
///
/// `Default` yields an empty, unformatted span without a link. Equality is
/// structural, so spans can be compared directly in assertions.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSpan {
    /// The text of the run.
    pub text: String,
    /// Whether the run is bold.
    pub bold: bool,
    /// Whether the run is italic.
    pub italic: bool,
    /// Whether the run is underlined.
    pub underline: bool,
    /// Whether the run is struck through.
    pub strikeout: bool,
    /// Whether the run is code (inline code or code-block text).
    pub code: bool,
    /// Link target when the run is part of a hyperlink.
    pub link_href: Option<String>,
}
14
/// A block-level unit (paragraph, heading, list item or code block) produced by
/// the parsers in this module.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline runs that make up the block's content, in order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level (1 through 6) when the block is a heading, `None` otherwise.
    pub heading_level: Option<i64>,
    /// List style when the block is a list item, `None` otherwise.
    pub list_style: Option<ListStyle>,
    /// Whether the block is a code block.
    pub is_code_block: bool,
}
23
24pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
27 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
28
29 let options =
30 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
31 let parser = Parser::new_ext(markdown, options);
32
33 let mut blocks: Vec<ParsedBlock> = Vec::new();
34 let mut current_spans: Vec<ParsedSpan> = Vec::new();
35 let mut current_heading: Option<i64> = None;
36 let mut current_list_style: Option<ListStyle> = None;
37 let mut is_code_block = false;
38 let mut in_block = false;
39
40 let mut bold = false;
42 let mut italic = false;
43 let mut strikeout = false;
44 let mut link_href: Option<String> = None;
45
46 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
48
49 for event in parser {
50 match event {
51 Event::Start(Tag::Paragraph) => {
52 in_block = true;
53 current_heading = None;
54 is_code_block = false;
55 }
56 Event::End(TagEnd::Paragraph) => {
57 if !current_spans.is_empty() || in_block {
58 blocks.push(ParsedBlock {
59 spans: std::mem::take(&mut current_spans),
60 heading_level: current_heading.take(),
61 list_style: current_list_style.clone(),
62 is_code_block: false,
63 });
64 }
65 in_block = false;
66 current_list_style = None;
67 }
68 Event::Start(Tag::Heading { level, .. }) => {
69 in_block = true;
70 current_heading = Some(heading_level_to_i64(level));
71 is_code_block = false;
72 }
73 Event::End(TagEnd::Heading(_)) => {
74 blocks.push(ParsedBlock {
75 spans: std::mem::take(&mut current_spans),
76 heading_level: current_heading.take(),
77 list_style: None,
78 is_code_block: false,
79 });
80 in_block = false;
81 }
82 Event::Start(Tag::List(ordered)) => {
83 let style = if ordered.is_some() {
84 Some(ListStyle::Decimal)
85 } else {
86 Some(ListStyle::Disc)
87 };
88 list_stack.push(style);
89 }
90 Event::End(TagEnd::List(_)) => {
91 list_stack.pop();
92 }
93 Event::Start(Tag::Item) => {
94 in_block = true;
95 current_list_style = list_stack.last().cloned().flatten();
96 }
97 Event::End(TagEnd::Item) => {
98 if !current_spans.is_empty() {
101 blocks.push(ParsedBlock {
102 spans: std::mem::take(&mut current_spans),
103 heading_level: None,
104 list_style: current_list_style.clone(),
105 is_code_block: false,
106 });
107 }
108 in_block = false;
109 current_list_style = None;
110 }
111 Event::Start(Tag::CodeBlock(_)) => {
112 in_block = true;
113 is_code_block = true;
114 }
115 Event::End(TagEnd::CodeBlock) => {
116 blocks.push(ParsedBlock {
117 spans: std::mem::take(&mut current_spans),
118 heading_level: None,
119 list_style: None,
120 is_code_block: true,
121 });
122 in_block = false;
123 is_code_block = false;
124 }
125 Event::Start(Tag::Emphasis) => {
126 italic = true;
127 }
128 Event::End(TagEnd::Emphasis) => {
129 italic = false;
130 }
131 Event::Start(Tag::Strong) => {
132 bold = true;
133 }
134 Event::End(TagEnd::Strong) => {
135 bold = false;
136 }
137 Event::Start(Tag::Strikethrough) => {
138 strikeout = true;
139 }
140 Event::End(TagEnd::Strikethrough) => {
141 strikeout = false;
142 }
143 Event::Start(Tag::Link { dest_url, .. }) => {
144 link_href = Some(dest_url.to_string());
145 }
146 Event::End(TagEnd::Link) => {
147 link_href = None;
148 }
149 Event::Text(text) => {
150 if !in_block {
151 in_block = true;
153 }
154 current_spans.push(ParsedSpan {
155 text: text.to_string(),
156 bold,
157 italic,
158 underline: false,
159 strikeout,
160 code: is_code_block,
161 link_href: link_href.clone(),
162 });
163 }
164 Event::Code(text) => {
165 if !in_block {
166 in_block = true;
167 }
168 current_spans.push(ParsedSpan {
169 text: text.to_string(),
170 bold,
171 italic,
172 underline: false,
173 strikeout,
174 code: true,
175 link_href: link_href.clone(),
176 });
177 }
178 Event::SoftBreak => {
179 current_spans.push(ParsedSpan {
181 text: " ".to_string(),
182 bold,
183 italic,
184 underline: false,
185 strikeout,
186 code: false,
187 link_href: link_href.clone(),
188 });
189 }
190 Event::HardBreak => {
191 if !current_spans.is_empty() || in_block {
193 blocks.push(ParsedBlock {
194 spans: std::mem::take(&mut current_spans),
195 heading_level: current_heading.take(),
196 list_style: current_list_style.clone(),
197 is_code_block,
198 });
199 }
200 }
201 _ => {}
202 }
203 }
204
205 if !current_spans.is_empty() {
207 blocks.push(ParsedBlock {
208 spans: std::mem::take(&mut current_spans),
209 heading_level: current_heading,
210 list_style: current_list_style,
211 is_code_block,
212 });
213 }
214
215 if blocks.is_empty() {
217 blocks.push(ParsedBlock {
218 spans: vec![ParsedSpan {
219 text: String::new(),
220 ..Default::default()
221 }],
222 heading_level: None,
223 list_style: None,
224 is_code_block: false,
225 });
226 }
227
228 blocks
229}
230
231fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
232 use pulldown_cmark::HeadingLevel;
233 match level {
234 HeadingLevel::H1 => 1,
235 HeadingLevel::H2 => 2,
236 HeadingLevel::H3 => 3,
237 HeadingLevel::H4 => 4,
238 HeadingLevel::H5 => 5,
239 HeadingLevel::H6 => 6,
240 }
241}
242
243use scraper::Node;
246
247pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
248 use scraper::Html;
249
250 let fragment = Html::parse_fragment(html);
251 let mut blocks: Vec<ParsedBlock> = Vec::new();
252
253 let root = fragment.root_element();
255
256 #[derive(Clone, Default)]
257 struct FmtState {
258 bold: bool,
259 italic: bool,
260 underline: bool,
261 strikeout: bool,
262 code: bool,
263 link_href: Option<String>,
264 }
265
266 const MAX_RECURSION_DEPTH: usize = 256;
267
268 fn walk_node(
269 node: ego_tree::NodeRef<Node>,
270 state: &FmtState,
271 blocks: &mut Vec<ParsedBlock>,
272 current_list_style: &Option<ListStyle>,
273 depth: usize,
274 ) {
275 if depth > MAX_RECURSION_DEPTH {
276 return;
277 }
278 match node.value() {
279 Node::Element(el) => {
280 let tag = el.name();
281 let mut new_state = state.clone();
282 let mut new_list_style = current_list_style.clone();
283
284 let is_block_tag = matches!(
286 tag,
287 "p" | "div"
288 | "h1"
289 | "h2"
290 | "h3"
291 | "h4"
292 | "h5"
293 | "h6"
294 | "li"
295 | "pre"
296 | "br"
297 | "blockquote"
298 );
299
300 match tag {
302 "b" | "strong" => new_state.bold = true,
303 "i" | "em" => new_state.italic = true,
304 "u" | "ins" => new_state.underline = true,
305 "s" | "del" | "strike" => new_state.strikeout = true,
306 "code" => new_state.code = true,
307 "a" => {
308 if let Some(href) = el.attr("href") {
309 new_state.link_href = Some(href.to_string());
310 }
311 }
312 "ul" => {
313 new_list_style = Some(ListStyle::Disc);
314 }
315 "ol" => {
316 new_list_style = Some(ListStyle::Decimal);
317 }
318 _ => {}
319 }
320
321 let heading_level = match tag {
323 "h1" => Some(1),
324 "h2" => Some(2),
325 "h3" => Some(3),
326 "h4" => Some(4),
327 "h5" => Some(5),
328 "h6" => Some(6),
329 _ => None,
330 };
331
332 let is_code_block = tag == "pre";
333
334 if tag == "br" {
335 blocks.push(ParsedBlock {
337 spans: vec![ParsedSpan {
338 text: String::new(),
339 ..Default::default()
340 }],
341 heading_level: None,
342 list_style: None,
343 is_code_block: false,
344 });
345 return;
346 }
347
348 if is_block_tag && tag != "br" {
349 let mut spans: Vec<ParsedSpan> = Vec::new();
351 collect_inline_spans(node, &new_state, &mut spans, &new_list_style, blocks, depth + 1);
352
353 let list_style_for_block = if tag == "li" {
354 new_list_style.clone()
355 } else {
356 None
357 };
358
359 if !spans.is_empty() || heading_level.is_some() {
360 blocks.push(ParsedBlock {
361 spans,
362 heading_level,
363 list_style: list_style_for_block,
364 is_code_block,
365 });
366 }
367 } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
368 for child in node.children() {
370 walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
371 }
372 } else {
373 for child in node.children() {
375 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
376 }
377 }
378 }
379 Node::Text(text) => {
380 let t = text.text.to_string();
381 let trimmed = t.trim();
382 if !trimmed.is_empty() {
383 blocks.push(ParsedBlock {
385 spans: vec![ParsedSpan {
386 text: trimmed.to_string(),
387 bold: state.bold,
388 italic: state.italic,
389 underline: state.underline,
390 strikeout: state.strikeout,
391 code: state.code,
392 link_href: state.link_href.clone(),
393 }],
394 heading_level: None,
395 list_style: None,
396 is_code_block: false,
397 });
398 }
399 }
400 _ => {
401 for child in node.children() {
403 walk_node(child, state, blocks, current_list_style, depth + 1);
404 }
405 }
406 }
407 }
408
409 fn collect_inline_spans(
413 node: ego_tree::NodeRef<Node>,
414 state: &FmtState,
415 spans: &mut Vec<ParsedSpan>,
416 current_list_style: &Option<ListStyle>,
417 blocks: &mut Vec<ParsedBlock>,
418 depth: usize,
419 ) {
420 if depth > MAX_RECURSION_DEPTH {
421 return;
422 }
423 for child in node.children() {
424 match child.value() {
425 Node::Text(text) => {
426 let t = text.text.to_string();
427 if !t.is_empty() {
428 spans.push(ParsedSpan {
429 text: t,
430 bold: state.bold,
431 italic: state.italic,
432 underline: state.underline,
433 strikeout: state.strikeout,
434 code: state.code,
435 link_href: state.link_href.clone(),
436 });
437 }
438 }
439 Node::Element(el) => {
440 let tag = el.name();
441 let mut new_state = state.clone();
442
443 match tag {
444 "b" | "strong" => new_state.bold = true,
445 "i" | "em" => new_state.italic = true,
446 "u" | "ins" => new_state.underline = true,
447 "s" | "del" | "strike" => new_state.strikeout = true,
448 "code" => new_state.code = true,
449 "a" => {
450 if let Some(href) = el.attr("href") {
451 new_state.link_href = Some(href.to_string());
452 }
453 }
454 _ => {}
455 }
456
457 let nested_block = matches!(
459 tag,
460 "p" | "div"
461 | "h1"
462 | "h2"
463 | "h3"
464 | "h4"
465 | "h5"
466 | "h6"
467 | "li"
468 | "pre"
469 | "blockquote"
470 | "ul"
471 | "ol"
472 );
473
474 if tag == "br" {
475 spans.push(ParsedSpan {
478 text: String::new(),
479 ..Default::default()
480 });
481 } else if nested_block {
482 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
484 } else {
485 collect_inline_spans(child, &new_state, spans, current_list_style, blocks, depth + 1);
487 }
488 }
489 _ => {}
490 }
491 }
492 }
493
494 let initial_state = FmtState::default();
495 for child in root.children() {
496 walk_node(child, &initial_state, &mut blocks, &None, 0);
497 }
498
499 if blocks.is_empty() {
501 blocks.push(ParsedBlock {
502 spans: vec![ParsedSpan {
503 text: String::new(),
504 ..Default::default()
505 }],
506 heading_level: None,
507 list_style: None,
508 is_code_block: false,
509 });
510 }
511
512 blocks
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain and bold runs of one paragraph end up as separate spans of one block.
    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown("Hello **world**");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        let plain = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// An ATX heading yields a single block carrying its level.
    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown("# Title");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    /// Bullet items become blocks with the disc list style.
    #[test]
    fn test_parse_markdown_list() {
        let parsed = parse_markdown("- item1\n- item2");
        assert!(parsed.len() >= 2);
        for item in &parsed[..2] {
            assert_eq!(item.list_style, Some(ListStyle::Disc));
        }
    }

    /// Bold markup inside an HTML paragraph is reflected on the span.
    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// Sibling paragraphs are emitted as separate blocks.
    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let parsed = parse_html("<p>A</p><p>B</p>");
        assert_eq!(parsed.len(), 2);
    }

    /// An `h2` element yields a level-2 heading block.
    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(2));
    }

    /// Items of an unordered HTML list carry the disc list style.
    #[test]
    fn test_parse_html_list() {
        let parsed = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
    }

    /// A fenced code block is flagged on both the block and its span.
    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);

        let code_block = &parsed[0];
        assert!(code_block.is_code_block);
        assert!(code_block.spans[0].code);
    }

    /// Triple asterisks set bold and italic on the same span.
    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown("***bold italic***");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Link text keeps its destination in `link_href`.
    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// Empty Markdown still produces one block with one empty span.
    #[test]
    fn test_parse_markdown_empty() {
        let parsed = parse_markdown("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Empty HTML still produces one block with one empty span.
    #[test]
    fn test_parse_html_empty() {
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Nested `b`/`i` elements set both flags on the inner text.
    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Anchor text keeps its `href` in `link_href`.
    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// Items of an ordered HTML list carry the decimal list style.
    #[test]
    fn test_parse_html_ordered_list() {
        let parsed = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// Items of an ordered Markdown list carry the decimal list style.
    #[test]
    fn test_parse_markdown_ordered_list() {
        let parsed = parse_markdown("1. first\n2. second");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// A blockquote between two paragraphs contributes at least one block of its own.
    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }
}