1use crate::entities::ListStyle;
2
/// A run of inline text together with the formatting flags that apply to it.
///
/// Both `parse_markdown` and `parse_html` produce these; one block holds one
/// span per contiguous piece of text that shares the same formatting.
#[derive(Debug, Clone, Default)]
pub struct ParsedSpan {
    /// The text content of the span (may be empty, e.g. for placeholder spans).
    pub text: String,
    /// Set for Markdown strong emphasis and for HTML `<b>` / `<strong>`.
    pub bold: bool,
    /// Set for Markdown emphasis and for HTML `<i>` / `<em>`.
    pub italic: bool,
    /// Set for HTML `<u>` / `<ins>`; the Markdown parser always leaves it `false`.
    pub underline: bool,
    /// Set for Markdown strikethrough and for HTML `<s>` / `<del>` / `<strike>`.
    pub strikeout: bool,
    /// Set for inline code, for text inside a Markdown code block, and for HTML `<code>`.
    pub code: bool,
    /// Link destination when the span sits inside a link, otherwise `None`.
    pub link_href: Option<String>,
}
14
/// One block-level unit (paragraph, heading, list item or code block) produced
/// by `parse_markdown` or `parse_html`.
#[derive(Debug, Clone)]
pub struct ParsedBlock {
    /// Inline content of the block, in document order.
    pub spans: Vec<ParsedSpan>,
    /// Heading level `1..=6` when the block is a heading, otherwise `None`.
    pub heading_level: Option<i64>,
    /// List marker style when the block is a list item
    /// (`ListStyle::Disc` for unordered, `ListStyle::Decimal` for ordered lists).
    pub list_style: Option<ListStyle>,
    /// `true` when the block came from a Markdown code block or an HTML `<pre>`.
    pub is_code_block: bool,
}
23
24pub fn parse_markdown(markdown: &str) -> Vec<ParsedBlock> {
27 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
28
29 let options =
30 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_TASKLISTS;
31 let parser = Parser::new_ext(markdown, options);
32
33 let mut blocks: Vec<ParsedBlock> = Vec::new();
34 let mut current_spans: Vec<ParsedSpan> = Vec::new();
35 let mut current_heading: Option<i64> = None;
36 let mut current_list_style: Option<ListStyle> = None;
37 let mut is_code_block = false;
38 let mut in_block = false;
39
40 let mut bold = false;
42 let mut italic = false;
43 let mut strikeout = false;
44 let mut link_href: Option<String> = None;
45
46 let mut list_stack: Vec<Option<ListStyle>> = Vec::new();
48
49 for event in parser {
50 match event {
51 Event::Start(Tag::Paragraph) => {
52 in_block = true;
53 current_heading = None;
54 is_code_block = false;
55 }
56 Event::End(TagEnd::Paragraph) => {
57 if !current_spans.is_empty() || in_block {
58 blocks.push(ParsedBlock {
59 spans: std::mem::take(&mut current_spans),
60 heading_level: current_heading.take(),
61 list_style: current_list_style.clone(),
62 is_code_block: false,
63 });
64 }
65 in_block = false;
66 current_list_style = None;
67 }
68 Event::Start(Tag::Heading { level, .. }) => {
69 in_block = true;
70 current_heading = Some(heading_level_to_i64(level));
71 is_code_block = false;
72 }
73 Event::End(TagEnd::Heading(_)) => {
74 blocks.push(ParsedBlock {
75 spans: std::mem::take(&mut current_spans),
76 heading_level: current_heading.take(),
77 list_style: None,
78 is_code_block: false,
79 });
80 in_block = false;
81 }
82 Event::Start(Tag::List(ordered)) => {
83 let style = if ordered.is_some() {
84 Some(ListStyle::Decimal)
85 } else {
86 Some(ListStyle::Disc)
87 };
88 list_stack.push(style);
89 }
90 Event::End(TagEnd::List(_)) => {
91 list_stack.pop();
92 }
93 Event::Start(Tag::Item) => {
94 in_block = true;
95 current_list_style = list_stack.last().cloned().flatten();
96 }
97 Event::End(TagEnd::Item) => {
98 if !current_spans.is_empty() {
101 blocks.push(ParsedBlock {
102 spans: std::mem::take(&mut current_spans),
103 heading_level: None,
104 list_style: current_list_style.clone(),
105 is_code_block: false,
106 });
107 }
108 in_block = false;
109 current_list_style = None;
110 }
111 Event::Start(Tag::CodeBlock(_)) => {
112 in_block = true;
113 is_code_block = true;
114 }
115 Event::End(TagEnd::CodeBlock) => {
116 blocks.push(ParsedBlock {
117 spans: std::mem::take(&mut current_spans),
118 heading_level: None,
119 list_style: None,
120 is_code_block: true,
121 });
122 in_block = false;
123 is_code_block = false;
124 }
125 Event::Start(Tag::Emphasis) => {
126 italic = true;
127 }
128 Event::End(TagEnd::Emphasis) => {
129 italic = false;
130 }
131 Event::Start(Tag::Strong) => {
132 bold = true;
133 }
134 Event::End(TagEnd::Strong) => {
135 bold = false;
136 }
137 Event::Start(Tag::Strikethrough) => {
138 strikeout = true;
139 }
140 Event::End(TagEnd::Strikethrough) => {
141 strikeout = false;
142 }
143 Event::Start(Tag::Link { dest_url, .. }) => {
144 link_href = Some(dest_url.to_string());
145 }
146 Event::End(TagEnd::Link) => {
147 link_href = None;
148 }
149 Event::Text(text) => {
150 if !in_block {
151 in_block = true;
153 }
154 current_spans.push(ParsedSpan {
155 text: text.to_string(),
156 bold,
157 italic,
158 underline: false,
159 strikeout,
160 code: is_code_block,
161 link_href: link_href.clone(),
162 });
163 }
164 Event::Code(text) => {
165 if !in_block {
166 in_block = true;
167 }
168 current_spans.push(ParsedSpan {
169 text: text.to_string(),
170 bold,
171 italic,
172 underline: false,
173 strikeout,
174 code: true,
175 link_href: link_href.clone(),
176 });
177 }
178 Event::SoftBreak => {
179 current_spans.push(ParsedSpan {
181 text: " ".to_string(),
182 bold,
183 italic,
184 underline: false,
185 strikeout,
186 code: false,
187 link_href: link_href.clone(),
188 });
189 }
190 Event::HardBreak => {
191 if !current_spans.is_empty() || in_block {
193 blocks.push(ParsedBlock {
194 spans: std::mem::take(&mut current_spans),
195 heading_level: current_heading.take(),
196 list_style: current_list_style.clone(),
197 is_code_block,
198 });
199 }
200 }
201 _ => {}
202 }
203 }
204
205 if !current_spans.is_empty() {
207 blocks.push(ParsedBlock {
208 spans: std::mem::take(&mut current_spans),
209 heading_level: current_heading,
210 list_style: current_list_style,
211 is_code_block,
212 });
213 }
214
215 if blocks.is_empty() {
217 blocks.push(ParsedBlock {
218 spans: vec![ParsedSpan {
219 text: String::new(),
220 ..Default::default()
221 }],
222 heading_level: None,
223 list_style: None,
224 is_code_block: false,
225 });
226 }
227
228 blocks
229}
230
231fn heading_level_to_i64(level: pulldown_cmark::HeadingLevel) -> i64 {
232 use pulldown_cmark::HeadingLevel;
233 match level {
234 HeadingLevel::H1 => 1,
235 HeadingLevel::H2 => 2,
236 HeadingLevel::H3 => 3,
237 HeadingLevel::H4 => 4,
238 HeadingLevel::H5 => 5,
239 HeadingLevel::H6 => 6,
240 }
241}
242
243use scraper::Node;
246
247pub fn parse_html(html: &str) -> Vec<ParsedBlock> {
248 use scraper::Html;
249
250 let fragment = Html::parse_fragment(html);
251 let mut blocks: Vec<ParsedBlock> = Vec::new();
252
253 let root = fragment.root_element();
255
256 #[derive(Clone, Default)]
257 struct FmtState {
258 bold: bool,
259 italic: bool,
260 underline: bool,
261 strikeout: bool,
262 code: bool,
263 link_href: Option<String>,
264 }
265
266 const MAX_RECURSION_DEPTH: usize = 256;
267
268 fn walk_node(
269 node: ego_tree::NodeRef<Node>,
270 state: &FmtState,
271 blocks: &mut Vec<ParsedBlock>,
272 current_list_style: &Option<ListStyle>,
273 depth: usize,
274 ) {
275 if depth > MAX_RECURSION_DEPTH {
276 return;
277 }
278 match node.value() {
279 Node::Element(el) => {
280 let tag = el.name();
281 let mut new_state = state.clone();
282 let mut new_list_style = current_list_style.clone();
283
284 let is_block_tag = matches!(
286 tag,
287 "p" | "div"
288 | "h1"
289 | "h2"
290 | "h3"
291 | "h4"
292 | "h5"
293 | "h6"
294 | "li"
295 | "pre"
296 | "br"
297 | "blockquote"
298 );
299
300 match tag {
302 "b" | "strong" => new_state.bold = true,
303 "i" | "em" => new_state.italic = true,
304 "u" | "ins" => new_state.underline = true,
305 "s" | "del" | "strike" => new_state.strikeout = true,
306 "code" => new_state.code = true,
307 "a" => {
308 if let Some(href) = el.attr("href") {
309 new_state.link_href = Some(href.to_string());
310 }
311 }
312 "ul" => {
313 new_list_style = Some(ListStyle::Disc);
314 }
315 "ol" => {
316 new_list_style = Some(ListStyle::Decimal);
317 }
318 _ => {}
319 }
320
321 let heading_level = match tag {
323 "h1" => Some(1),
324 "h2" => Some(2),
325 "h3" => Some(3),
326 "h4" => Some(4),
327 "h5" => Some(5),
328 "h6" => Some(6),
329 _ => None,
330 };
331
332 let is_code_block = tag == "pre";
333
334 if tag == "br" {
335 blocks.push(ParsedBlock {
337 spans: vec![ParsedSpan {
338 text: String::new(),
339 ..Default::default()
340 }],
341 heading_level: None,
342 list_style: None,
343 is_code_block: false,
344 });
345 return;
346 }
347
348 if is_block_tag && tag != "br" {
349 let mut spans: Vec<ParsedSpan> = Vec::new();
351 collect_inline_spans(
352 node,
353 &new_state,
354 &mut spans,
355 &new_list_style,
356 blocks,
357 depth + 1,
358 );
359
360 let list_style_for_block = if tag == "li" {
361 new_list_style.clone()
362 } else {
363 None
364 };
365
366 if !spans.is_empty() || heading_level.is_some() {
367 blocks.push(ParsedBlock {
368 spans,
369 heading_level,
370 list_style: list_style_for_block,
371 is_code_block,
372 });
373 }
374 } else if matches!(tag, "ul" | "ol" | "table" | "thead" | "tbody" | "tr") {
375 for child in node.children() {
377 walk_node(child, &new_state, blocks, &new_list_style, depth + 1);
378 }
379 } else {
380 for child in node.children() {
382 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
383 }
384 }
385 }
386 Node::Text(text) => {
387 let t = text.text.to_string();
388 let trimmed = t.trim();
389 if !trimmed.is_empty() {
390 blocks.push(ParsedBlock {
392 spans: vec![ParsedSpan {
393 text: trimmed.to_string(),
394 bold: state.bold,
395 italic: state.italic,
396 underline: state.underline,
397 strikeout: state.strikeout,
398 code: state.code,
399 link_href: state.link_href.clone(),
400 }],
401 heading_level: None,
402 list_style: None,
403 is_code_block: false,
404 });
405 }
406 }
407 _ => {
408 for child in node.children() {
410 walk_node(child, state, blocks, current_list_style, depth + 1);
411 }
412 }
413 }
414 }
415
416 fn collect_inline_spans(
420 node: ego_tree::NodeRef<Node>,
421 state: &FmtState,
422 spans: &mut Vec<ParsedSpan>,
423 current_list_style: &Option<ListStyle>,
424 blocks: &mut Vec<ParsedBlock>,
425 depth: usize,
426 ) {
427 if depth > MAX_RECURSION_DEPTH {
428 return;
429 }
430 for child in node.children() {
431 match child.value() {
432 Node::Text(text) => {
433 let t = text.text.to_string();
434 if !t.is_empty() {
435 spans.push(ParsedSpan {
436 text: t,
437 bold: state.bold,
438 italic: state.italic,
439 underline: state.underline,
440 strikeout: state.strikeout,
441 code: state.code,
442 link_href: state.link_href.clone(),
443 });
444 }
445 }
446 Node::Element(el) => {
447 let tag = el.name();
448 let mut new_state = state.clone();
449
450 match tag {
451 "b" | "strong" => new_state.bold = true,
452 "i" | "em" => new_state.italic = true,
453 "u" | "ins" => new_state.underline = true,
454 "s" | "del" | "strike" => new_state.strikeout = true,
455 "code" => new_state.code = true,
456 "a" => {
457 if let Some(href) = el.attr("href") {
458 new_state.link_href = Some(href.to_string());
459 }
460 }
461 _ => {}
462 }
463
464 let nested_block = matches!(
466 tag,
467 "p" | "div"
468 | "h1"
469 | "h2"
470 | "h3"
471 | "h4"
472 | "h5"
473 | "h6"
474 | "li"
475 | "pre"
476 | "blockquote"
477 | "ul"
478 | "ol"
479 );
480
481 if tag == "br" {
482 spans.push(ParsedSpan {
485 text: String::new(),
486 ..Default::default()
487 });
488 } else if nested_block {
489 walk_node(child, &new_state, blocks, current_list_style, depth + 1);
491 } else {
492 collect_inline_spans(
494 child,
495 &new_state,
496 spans,
497 current_list_style,
498 blocks,
499 depth + 1,
500 );
501 }
502 }
503 _ => {}
504 }
505 }
506 }
507
508 let initial_state = FmtState::default();
509 for child in root.children() {
510 walk_node(child, &initial_state, &mut blocks, &None, 0);
511 }
512
513 if blocks.is_empty() {
515 blocks.push(ParsedBlock {
516 spans: vec![ParsedSpan {
517 text: String::new(),
518 ..Default::default()
519 }],
520 heading_level: None,
521 list_style: None,
522 is_code_block: false,
523 });
524 }
525
526 blocks
527}
528
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain and bold text in one paragraph end up as separate spans of one block.
    #[test]
    fn test_parse_markdown_simple_paragraph() {
        let parsed = parse_markdown("Hello **world**");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        let plain = spans
            .iter()
            .find(|span| span.text.contains("Hello"))
            .unwrap();
        assert!(!plain.bold);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// An ATX heading yields one block carrying its level and text.
    #[test]
    fn test_parse_markdown_heading() {
        let parsed = parse_markdown("# Title");
        assert_eq!(parsed.len(), 1);

        let heading = &parsed[0];
        assert_eq!(heading.heading_level, Some(1));
        assert_eq!(heading.spans[0].text, "Title");
    }

    /// Items of a bullet list are tagged with the disc style.
    #[test]
    fn test_parse_markdown_list() {
        let parsed = parse_markdown("- item1\n- item2");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
        assert_eq!(parsed[1].list_style, Some(ListStyle::Disc));
    }

    /// `<b>` inside a paragraph marks its text as bold.
    #[test]
    fn test_parse_html_simple() {
        let parsed = parse_html("<p>Hello <b>world</b></p>");
        assert_eq!(parsed.len(), 1);

        let spans = &parsed[0].spans;
        assert!(spans.len() >= 2);

        let strong = spans.iter().find(|span| span.text == "world").unwrap();
        assert!(strong.bold);
    }

    /// Sibling paragraphs become separate blocks.
    #[test]
    fn test_parse_html_multiple_paragraphs() {
        let parsed = parse_html("<p>A</p><p>B</p>");
        assert_eq!(parsed.len(), 2);
    }

    /// `<h2>` yields a level-2 heading block.
    #[test]
    fn test_parse_html_heading() {
        let parsed = parse_html("<h2>Subtitle</h2>");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].heading_level, Some(2));
    }

    /// Items of a `<ul>` are tagged with the disc style.
    #[test]
    fn test_parse_html_list() {
        let parsed = parse_html("<ul><li>one</li><li>two</li></ul>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Disc));
    }

    /// A fenced code block is flagged on both the block and its span.
    #[test]
    fn test_parse_markdown_code_block() {
        let parsed = parse_markdown("```\nfn main() {}\n```");
        assert_eq!(parsed.len(), 1);

        let code = &parsed[0];
        assert!(code.is_code_block);
        assert!(code.spans[0].code);
    }

    /// Triple asterisks apply bold and italic at the same time.
    #[test]
    fn test_parse_markdown_nested_formatting() {
        let parsed = parse_markdown("***bold italic***");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Link text keeps its destination URL.
    #[test]
    fn test_parse_markdown_link() {
        let parsed = parse_markdown("[click](http://example.com)");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// Empty Markdown still produces one block with an empty span.
    #[test]
    fn test_parse_markdown_empty() {
        let parsed = parse_markdown("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Empty HTML still produces one block with an empty span.
    #[test]
    fn test_parse_html_empty() {
        let parsed = parse_html("");
        assert_eq!(parsed.len(), 1);
        assert!(parsed[0].spans[0].text.is_empty());
    }

    /// Nested `<b><i>` applies bold and italic at the same time.
    #[test]
    fn test_parse_html_nested_formatting() {
        let parsed = parse_html("<p><b><i>bold italic</i></b></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert!(first.bold);
        assert!(first.italic);
    }

    /// Anchor text keeps its `href`.
    #[test]
    fn test_parse_html_link() {
        let parsed = parse_html("<p><a href=\"http://example.com\">click</a></p>");
        assert_eq!(parsed.len(), 1);

        let first = &parsed[0].spans[0];
        assert_eq!(first.text, "click");
        assert_eq!(first.link_href, Some("http://example.com".to_string()));
    }

    /// Items of an `<ol>` are tagged with the decimal style.
    #[test]
    fn test_parse_html_ordered_list() {
        let parsed = parse_html("<ol><li>first</li><li>second</li></ol>");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// Items of a numbered Markdown list are tagged with the decimal style.
    #[test]
    fn test_parse_markdown_ordered_list() {
        let parsed = parse_markdown("1. first\n2. second");
        assert!(parsed.len() >= 2);
        assert_eq!(parsed[0].list_style, Some(ListStyle::Decimal));
    }

    /// A blockquote between paragraphs contributes its own block.
    #[test]
    fn test_parse_html_blockquote_nested() {
        let parsed = parse_html("<p>before</p><blockquote>quoted</blockquote><p>after</p>");
        assert!(parsed.len() >= 3);
    }
}