1use std::collections::HashMap;
10
11use ego_tree::NodeRef;
12use scraper::node::Node;
13use scraper::{ElementRef, Html};
14use url::Url;
15
16use crate::extract;
17use crate::types::UrlReference;
18
19struct RefCollector {
20 seen: HashMap<String, usize>,
22 references: Vec<UrlReference>,
23 base: Option<Url>,
24}
25
26impl RefCollector {
27 fn new(base_url: &str) -> Self {
28 Self {
29 seen: HashMap::new(),
30 references: Vec::new(),
31 base: Url::parse(base_url).ok(),
32 }
33 }
34
35 fn resolve(&self, href: &str) -> Option<String> {
37 let href = href.trim();
38 if href.is_empty() || href.starts_with('#') {
39 return None;
40 }
41 if href.starts_with("javascript:") || href.starts_with("mailto:") {
42 return None;
43 }
44 match &self.base {
45 Some(base) => base.join(href).ok().map(|u| u.to_string()),
46 None => Url::parse(href).ok().map(|u| u.to_string()),
47 }
48 }
49
50 fn index_for(&mut self, url: String, text: &str) -> usize {
52 if let Some(idx) = self.seen.get(&url) {
53 return *idx;
54 }
55 let idx = self.references.len() + 1;
56 self.seen.insert(url.clone(), idx);
57 self.references.push(UrlReference {
58 index: idx,
59 url,
60 text: text.trim().to_string(),
61 });
62 idx
63 }
64}
65
/// Reports whether `name` is one of the element names this module renders on
/// its own line(s).
///
/// The comparison is exact and case-sensitive.
fn is_block(name: &str) -> bool {
    /// Element names treated as block-level.
    const BLOCK_TAGS: &[&str] = &[
        "p",
        "div",
        "section",
        "article",
        "header",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "ul",
        "ol",
        "table",
        "tr",
        "blockquote",
        "pre",
        "figure",
        "aside",
        "nav",
        "main",
    ];

    BLOCK_TAGS.iter().any(|&tag| tag == name)
}
93
94fn walk(node: NodeRef<Node>, out: &mut String, refs: &mut RefCollector) {
95 match node.value() {
96 Node::Text(t) => out.push_str(&t[..]),
97 Node::Element(el) => {
98 let name = el.name();
99 if super::is_skippable(name) {
100 return;
101 }
102
103 if name == "br" {
104 out.push('\n');
105 return;
106 }
107
108 if name == "a" {
109 let mut inner = String::new();
111 for child in node.children() {
112 walk(child, &mut inner, refs);
113 }
114 let inner = inner.trim().to_string();
115 out.push_str(&inner);
116 if let Some(href) = el.attr("href") {
117 if let Some(resolved) = refs.resolve(href) {
118 let idx = refs.index_for(resolved, &inner);
119 out.push_str(&format!(" [{}]", idx));
120 }
121 }
122 return;
123 }
124
125 let block = is_block(name);
126 if block && !out.ends_with('\n') && !out.is_empty() {
127 out.push('\n');
128 }
129 for child in node.children() {
130 walk(child, out, refs);
131 }
132 if block && !out.ends_with('\n') {
133 out.push('\n');
134 }
135 }
136 _ => {}
137 }
138}
139
140pub fn html_to_text_with_refs(html: &str, base_url: &str) -> (String, Vec<UrlReference>) {
146 let doc = Html::parse_document(html);
147 let root: ElementRef = match extract::content_root(&doc) {
148 Some(el) => el,
149 None => return (String::new(), Vec::new()),
150 };
151
152 let mut refs = RefCollector::new(base_url);
153 let mut out = String::new();
154 for child in root.children() {
155 walk(child, &mut out, &mut refs);
156 }
157 (out, refs.references)
158}
159
160pub fn render_references(references: &[UrlReference]) -> String {
163 crate::refs::render_block(references)
164}