Skip to main content

webfetch/convert/
markdown.rs

1//! Markdown conversion. Unlike the text path, markdown keeps links inline as
2//! `[text](url)` for maximum fidelity — the right trade-off when the consumer
3//! wants a faithful, re-renderable document rather than minimal tokens.
4
5use ego_tree::NodeRef;
6use scraper::node::Node;
7use scraper::Html;
8use url::Url;
9
10use crate::extract;
11
12fn resolve(href: &str, base: &Option<Url>) -> String {
13    match base {
14        Some(b) => b
15            .join(href)
16            .map(|u| u.to_string())
17            .unwrap_or_else(|_| href.to_string()),
18        None => href.to_string(),
19    }
20}
21
22fn walk(node: NodeRef<Node>, out: &mut String, base: &Option<Url>) {
23    match node.value() {
24        Node::Text(t) => out.push_str(&t[..]),
25        Node::Element(el) => {
26            let name = el.name();
27            if super::is_skippable(name) {
28                return;
29            }
30
31            let prefix = match name {
32                "h1" => Some("\n# "),
33                "h2" => Some("\n## "),
34                "h3" => Some("\n### "),
35                "h4" => Some("\n#### "),
36                "h5" => Some("\n##### "),
37                "h6" => Some("\n###### "),
38                "li" => Some("\n- "),
39                "blockquote" => Some("\n> "),
40                _ => None,
41            };
42
43            if name == "br" {
44                out.push('\n');
45                return;
46            }
47
48            if name == "a" {
49                let mut inner = String::new();
50                for child in node.children() {
51                    walk(child, &mut inner, base);
52                }
53                let inner = inner.trim().to_string();
54                match el.attr("href") {
55                    Some(href) if !href.trim().is_empty() && !href.starts_with('#') => {
56                        out.push_str(&format!("[{}]({})", inner, resolve(href, base)));
57                    }
58                    _ => out.push_str(&inner),
59                }
60                return;
61            }
62
63            if name == "code" {
64                let mut inner = String::new();
65                for child in node.children() {
66                    walk(child, &mut inner, base);
67                }
68                out.push_str(&format!("`{}`", inner.trim()));
69                return;
70            }
71
72            if let Some(p) = prefix {
73                out.push_str(p);
74            }
75            for child in node.children() {
76                walk(child, out, base);
77            }
78            if matches!(
79                name,
80                "p" | "div" | "section" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6"
81            ) {
82                out.push('\n');
83            }
84        }
85        _ => {}
86    }
87}
88
89pub fn html_to_markdown(html: &str, base_url: &str) -> String {
90    let doc = Html::parse_document(html);
91    let root = match extract::content_root(&doc) {
92        Some(el) => el,
93        None => return String::new(),
94    };
95    let base = Url::parse(base_url).ok();
96    let mut out = String::new();
97    for child in root.children() {
98        walk(child, &mut out, &base);
99    }
100    out
101}