use std::collections::HashMap;
use ego_tree::NodeRef;
use scraper::node::Node;
use scraper::{ElementRef, Html};
use url::Url;
use crate::extract;
use crate::types::UrlReference;
/// Accumulates the distinct link targets encountered while rendering a
/// document and hands out a stable, 1-based index for each one.
struct RefCollector {
/// Resolved URL -> index previously assigned to it, used to de-duplicate.
seen: HashMap<String, usize>,
/// References in the order their URLs were first seen.
references: Vec<UrlReference>,
/// Parsed base URL used to resolve relative hrefs; `None` when the
/// supplied base string did not parse.
base: Option<Url>,
}
impl RefCollector {
    /// Creates an empty collector.
    ///
    /// `base_url` is parsed once up front; if it is not a valid absolute URL
    /// the collector keeps no base and only absolute hrefs can be resolved.
    fn new(base_url: &str) -> Self {
        Self {
            seen: HashMap::new(),
            references: Vec::new(),
            base: Url::parse(base_url).ok(),
        }
    }

    /// Turns an `href` attribute value into an absolute URL string.
    ///
    /// Returns `None` for empty hrefs, same-page fragments (`#...`), hrefs
    /// that cannot be parsed or joined onto the base, and `javascript:` /
    /// `mailto:` links.
    fn resolve(&self, href: &str) -> Option<String> {
        let href = href.trim();
        if href.is_empty() || href.starts_with('#') {
            return None;
        }
        let resolved = match &self.base {
            Some(base) => base.join(href),
            None => Url::parse(href),
        }
        .ok()?;
        // Filter on the *parsed* scheme rather than on the raw prefix: URL
        // schemes are case-insensitive and the parser normalises them to
        // lower case, so `JavaScript:` / `MAILTO:` are rejected as well.
        if matches!(resolved.scheme(), "javascript" | "mailto") {
            return None;
        }
        Some(resolved.to_string())
    }

    /// Returns the 1-based reference index for `url`, registering it on first
    /// sight.
    ///
    /// A URL that was already registered keeps its original index and its
    /// original text; `text` is only stored (trimmed) for a new URL.
    fn index_for(&mut self, url: String, text: &str) -> usize {
        if let Some(&existing) = self.seen.get(&url) {
            return existing;
        }
        // Indices are 1-based so they read naturally as footnote markers.
        let idx = self.references.len() + 1;
        // The URL is owned by both the lookup map and the reference entry.
        self.seen.insert(url.clone(), idx);
        self.references.push(UrlReference {
            index: idx,
            url,
            text: text.trim().to_string(),
        });
        idx
    }
}
/// Reports whether `name` is one of the element names rendered on their own
/// line(s). The comparison is exact, so callers must pass the name in lower
/// case.
fn is_block(name: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "p",
        "div",
        "section",
        "article",
        "header",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "li",
        "ul",
        "ol",
        "table",
        "tr",
        "blockquote",
        "pre",
        "figure",
        "aside",
        "nav",
        "main",
    ];
    BLOCK_TAGS.iter().any(|tag| *tag == name)
}
/// Recursively renders `node` and its descendants as plain text into `out`.
///
/// * Text nodes are appended verbatim.
/// * Elements for which `super::is_skippable` returns true are dropped
///   together with their subtree.
/// * `<br>` becomes a newline.
/// * `<a>` is rendered as its trimmed inner text, followed by a ` [n]` marker
///   when its `href` resolves to a reference in `refs`.
/// * Block elements (see `is_block`) are surrounded by line breaks; any other
///   element is rendered inline.
/// * All other node kinds produce no output.
fn walk(node: NodeRef<Node>, out: &mut String, refs: &mut RefCollector) {
    let element = match node.value() {
        Node::Text(text) => {
            out.push_str(&text[..]);
            return;
        }
        Node::Element(element) => element,
        _ => return,
    };

    let tag = element.name();
    if super::is_skippable(tag) {
        return;
    }

    match tag {
        "br" => out.push('\n'),
        "a" => {
            // Render the link body into a scratch buffer so it can be trimmed
            // and reused as the reference's label.
            let mut buffer = String::new();
            for child in node.children() {
                walk(child, &mut buffer, refs);
            }
            let label = buffer.trim();
            out.push_str(label);

            let target = element.attr("href").and_then(|href| refs.resolve(href));
            if let Some(url) = target {
                let number = refs.index_for(url, label);
                out.push_str(&format!(" [{}]", number));
            }
        }
        _ => {
            let block = is_block(tag);
            // Start a block on a fresh line, but never emit a leading newline
            // into an empty buffer.
            if block && !(out.is_empty() || out.ends_with('\n')) {
                out.push('\n');
            }
            for child in node.children() {
                walk(child, out, refs);
            }
            // Terminate the block so following content starts on a new line.
            if block && !out.ends_with('\n') {
                out.push('\n');
            }
        }
    }
}
/// Converts an HTML document to plain text with numbered link references.
///
/// The document is parsed, its content root is selected with
/// `extract::content_root`, and every child of that root is rendered.
/// Link hrefs are resolved against `base_url`.
///
/// Returns the rendered text together with the collected references, in
/// order of first appearance. When no content root is found, both the text
/// and the reference list are empty.
pub fn html_to_text_with_refs(html: &str, base_url: &str) -> (String, Vec<UrlReference>) {
    let document = Html::parse_document(html);
    let mut collector = RefCollector::new(base_url);
    let mut text = String::new();

    let content: Option<ElementRef> = extract::content_root(&document);
    if let Some(root) = content {
        for child in root.children() {
            walk(child, &mut text, &mut collector);
        }
    }

    (text, collector.references)
}
/// Renders `references` as a text block.
///
/// Thin wrapper that delegates to `crate::refs::render_block`; the output
/// format is whatever that function produces.
pub fn render_references(references: &[UrlReference]) -> String {
crate::refs::render_block(references)
}