webfetch/convert/
markdown.rs1use ego_tree::NodeRef;
6use scraper::node::Node;
7use scraper::Html;
8use url::Url;
9
10use crate::extract;
11
12fn resolve(href: &str, base: &Option<Url>) -> String {
13 match base {
14 Some(b) => b
15 .join(href)
16 .map(|u| u.to_string())
17 .unwrap_or_else(|_| href.to_string()),
18 None => href.to_string(),
19 }
20}
21
22fn walk(node: NodeRef<Node>, out: &mut String, base: &Option<Url>) {
23 match node.value() {
24 Node::Text(t) => out.push_str(&t[..]),
25 Node::Element(el) => {
26 let name = el.name();
27 if super::is_skippable(name) {
28 return;
29 }
30
31 let prefix = match name {
32 "h1" => Some("\n# "),
33 "h2" => Some("\n## "),
34 "h3" => Some("\n### "),
35 "h4" => Some("\n#### "),
36 "h5" => Some("\n##### "),
37 "h6" => Some("\n###### "),
38 "li" => Some("\n- "),
39 "blockquote" => Some("\n> "),
40 _ => None,
41 };
42
43 if name == "br" {
44 out.push('\n');
45 return;
46 }
47
48 if name == "a" {
49 let mut inner = String::new();
50 for child in node.children() {
51 walk(child, &mut inner, base);
52 }
53 let inner = inner.trim().to_string();
54 match el.attr("href") {
55 Some(href) if !href.trim().is_empty() && !href.starts_with('#') => {
56 out.push_str(&format!("[{}]({})", inner, resolve(href, base)));
57 }
58 _ => out.push_str(&inner),
59 }
60 return;
61 }
62
63 if name == "code" {
64 let mut inner = String::new();
65 for child in node.children() {
66 walk(child, &mut inner, base);
67 }
68 out.push_str(&format!("`{}`", inner.trim()));
69 return;
70 }
71
72 if let Some(p) = prefix {
73 out.push_str(p);
74 }
75 for child in node.children() {
76 walk(child, out, base);
77 }
78 if matches!(
79 name,
80 "p" | "div" | "section" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
81 ) {
82 out.push('\n');
83 }
84 }
85 _ => {}
86 }
87}
88
89pub fn html_to_markdown(html: &str, base_url: &str) -> String {
90 let doc = Html::parse_document(html);
91 let root = match extract::content_root(&doc) {
92 Some(el) => el,
93 None => return String::new(),
94 };
95 let base = Url::parse(base_url).ok();
96 let mut out = String::new();
97 for child in root.children() {
98 walk(child, &mut out, &base);
99 }
100 out
101}