use super::main::HTMLElement;
use crate::dom::node::Node;
use std::sync::OnceLock;
/// Lazily initialised regex shared across calls. In the code visible in this file it is
/// initialised by `HTMLElement::structured_text` with the pattern `\s{2,}` (a run of two or more
/// whitespace characters) and used there to collapse such runs into a single space.
static WHITESPACE_REGEX: OnceLock<regex::Regex> = OnceLock::new();
impl HTMLElement {
pub fn structured_text(&self) -> String {
use std::collections::HashSet;
use std::sync::OnceLock;
static BLOCK: OnceLock<HashSet<&'static str>> = OnceLock::new();
let block = BLOCK.get_or_init(|| {
[
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"details",
"dialog",
"dd",
"div",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"table",
"td",
"tr",
"address",
"article",
"aside",
"blockquote",
"br",
"hr",
"li",
"main",
"nav",
"ol",
"p",
"pre",
"section",
"ul",
]
.into_iter()
.collect()
});
#[derive(Default)]
struct LineBlock {
parts: Vec<String>,
prepend_ws: bool,
}
let mut blocks: Vec<LineBlock> = vec![LineBlock::default()];
fn dfs(
cur: &HTMLElement,
block: &std::collections::HashSet<&'static str>,
) -> Vec<LineBlock> {
let tag = cur.name();
let is_block =
!cur.is_root() && (block.contains(tag) || block.contains(&tag.to_lowercase()[..]));
let children = &cur.children;
let mut acc: Vec<LineBlock> = Vec::new();
let mut current = LineBlock::default();
for child in children {
match child {
Node::Element(e) => {
let cname = e.name();
let child_block =
block.contains(cname) || block.contains(&cname.to_lowercase()[..]);
if child_block && !current.parts.is_empty() {
acc.push(current);
current = LineBlock::default();
}
let sub_blocks = dfs(e, block); for (i, sb) in sub_blocks.into_iter().enumerate() {
if i == 0 {
if current.prepend_ws && !sb.parts.is_empty() {
current.parts.push(format!(" {}", sb.parts.join("")));
current.prepend_ws = false;
} else {
current.parts.extend(sb.parts);
}
if sb.prepend_ws {
current.prepend_ws = true;
}
} else {
acc.push(current);
current = sb; }
}
if child_block && !current.parts.is_empty() {
acc.push(current);
current = LineBlock::default();
}
}
Node::Text(t0) => {
if t0.is_whitespace() {
current.prepend_ws = true;
continue;
}
let mut tc = t0.clone();
let txt = tc.trimmed_text().to_string();
if current.prepend_ws {
current.parts.push(format!(" {}", txt));
current.prepend_ws = false;
} else {
current.parts.push(txt);
}
}
Node::Comment(_) => {}
}
}
if !current.parts.is_empty() {
acc.push(current);
}
if is_block {
acc.push(LineBlock::default());
}
acc
}
let mut collected = dfs(self, block);
blocks.append(&mut collected);
blocks
.into_iter()
.filter(|b| !b.parts.is_empty())
.map(|b| {
let joined = b.parts.join("");
WHITESPACE_REGEX
.get_or_init(|| regex::Regex::new(r"\s{2,}").unwrap())
.replace_all(&joined, " ")
.to_string()
})
.collect::<Vec<_>>()
.join("\n")
.trim_end()
.to_string()
}
}