Skip to main content

webfetch/convert/
text.rs

1//! Plain-text conversion with **reference-style URL preservation**.
2//!
3//! Links are not stripped to their domain, nor expanded inline. Instead each
4//! distinct URL is assigned a stable index and the anchor text is followed by
5//! a compact `[N]` marker. The full URLs are collected into a reference list
6//! that callers can append to the output or expose separately, so the agent
7//! sees `[1]` inline (≈1 token) but can still recover the exact link.
8
9use std::collections::HashMap;
10
11use ego_tree::NodeRef;
12use scraper::node::Node;
13use scraper::{ElementRef, Html};
14use url::Url;
15
16use crate::extract;
17use crate::types::UrlReference;
18
19struct RefCollector {
20    /// Maps a resolved URL to its assigned reference index (for de-duplication).
21    seen: HashMap<String, usize>,
22    references: Vec<UrlReference>,
23    base: Option<Url>,
24}
25
26impl RefCollector {
27    fn new(base_url: &str) -> Self {
28        Self {
29            seen: HashMap::new(),
30            references: Vec::new(),
31            base: Url::parse(base_url).ok(),
32        }
33    }
34
35    /// Resolve a possibly-relative href against the page's base URL.
36    fn resolve(&self, href: &str) -> Option<String> {
37        let href = href.trim();
38        if href.is_empty() || href.starts_with('#') {
39            return None;
40        }
41        if href.starts_with("javascript:") || href.starts_with("mailto:") {
42            return None;
43        }
44        match &self.base {
45            Some(base) => base.join(href).ok().map(|u| u.to_string()),
46            None => Url::parse(href).ok().map(|u| u.to_string()),
47        }
48    }
49
50    /// Return the reference index for a URL, assigning a new one if unseen.
51    fn index_for(&mut self, url: String, text: &str) -> usize {
52        if let Some(idx) = self.seen.get(&url) {
53            return *idx;
54        }
55        let idx = self.references.len() + 1;
56        self.seen.insert(url.clone(), idx);
57        self.references.push(UrlReference {
58            index: idx,
59            url,
60            text: text.trim().to_string(),
61        });
62        idx
63    }
64}
65
/// Report whether `name` is one of the HTML tag names this module treats as
/// block-level, i.e. rendered on its own line.
///
/// The comparison is exact and case-sensitive; unknown or empty names yield
/// `false`.
fn is_block(name: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "p",
        "div",
        "section",
        "article",
        "header",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "ul",
        "ol",
        "table",
        "tr",
        "blockquote",
        "pre",
        "figure",
        "aside",
        "nav",
        "main",
    ];

    BLOCK_TAGS.contains(&name)
}
93
94fn walk(node: NodeRef<Node>, out: &mut String, refs: &mut RefCollector) {
95    match node.value() {
96        Node::Text(t) => out.push_str(&t[..]),
97        Node::Element(el) => {
98            let name = el.name();
99            if super::is_skippable(name) {
100                return;
101            }
102
103            if name == "br" {
104                out.push('\n');
105                return;
106            }
107
108            if name == "a" {
109                // Collect the anchor's inner text first.
110                let mut inner = String::new();
111                for child in node.children() {
112                    walk(child, &mut inner, refs);
113                }
114                let inner = inner.trim().to_string();
115                out.push_str(&inner);
116                if let Some(href) = el.attr("href") {
117                    if let Some(resolved) = refs.resolve(href) {
118                        let idx = refs.index_for(resolved, &inner);
119                        out.push_str(&format!(" [{}]", idx));
120                    }
121                }
122                return;
123            }
124
125            let block = is_block(name);
126            if block && !out.ends_with('\n') && !out.is_empty() {
127                out.push('\n');
128            }
129            for child in node.children() {
130                walk(child, out, refs);
131            }
132            if block && !out.ends_with('\n') {
133                out.push('\n');
134            }
135        }
136        _ => {}
137    }
138}
139
140/// Convert an HTML document to reference-style plain text.
141///
142/// Returns the body text (with inline `[N]` markers) and the ordered list of
143/// references. The returned text does **not** include the rendered
144/// "References:" block — see [`render_references`] to append it.
145pub fn html_to_text_with_refs(html: &str, base_url: &str) -> (String, Vec<UrlReference>) {
146    let doc = Html::parse_document(html);
147    let root: ElementRef = match extract::content_root(&doc) {
148        Some(el) => el,
149        None => return (String::new(), Vec::new()),
150    };
151
152    let mut refs = RefCollector::new(base_url);
153    let mut out = String::new();
154    for child in root.children() {
155        walk(child, &mut out, &mut refs);
156    }
157    (out, refs.references)
158}
159
160/// Render a reference list into the canonical block appended to text output.
161/// Thin wrapper over [`crate::refs::render_block`].
162pub fn render_references(references: &[UrlReference]) -> String {
163    crate::refs::render_block(references)
164}