use std::borrow::Cow;
use std::collections::BTreeMap;
use crate::converter::prescan::PrescanReport;
use crate::options::ConversionOptions;
use crate::text;
/// Builds a YAML-style frontmatter block from the metadata found in the
/// document's `<head>` section.
///
/// `report.head_range` is used to slice the head markup out of `html`; that
/// fragment is re-wrapped in a minimal `<html><head>…</head></html>` shell and
/// parsed on its own.
///
/// Returns `None` when:
/// - `options.extract_metadata` is disabled,
/// - the prescan report carries no head range,
/// - the range does not denote a valid slice of `html`,
/// - the head fragment is empty or cannot be parsed,
/// - no metadata entries were collected.
pub fn extract_frontmatter(
    html: &str,
    report: &PrescanReport,
    options: &ConversionOptions,
) -> Option<String> {
    if !options.extract_metadata {
        return None;
    }

    let head_range = report.head_range.as_ref()?;
    // `str::get` yields `None` for out-of-bounds, inverted, and
    // non-char-boundary ranges, whereas direct indexing would panic on the
    // latter two even after an `end <= len` check.
    let head_content = html.get(head_range.clone())?;
    if head_content.is_empty() {
        return None;
    }

    let wrapped = format!("<html><head>{head_content}</head></html>");
    let dom = tl::parse(&wrapped, tl::ParserOptions::default()).ok()?;
    let parser = dom.parser();

    let metadata = extract_metadata_from_dom(&dom, parser);
    if metadata.is_empty() {
        return None;
    }
    Some(format_metadata_frontmatter(&metadata))
}
/// Collects metadata entries from the first `<head>` element reachable from
/// the DOM's top-level nodes.
///
/// Only direct tag children of the head are inspected; `<title>`, `<base>`,
/// `<meta>` and `<link>` are dispatched to their extractors and every other
/// tag is ignored. An empty map is returned when no head element is found.
fn extract_metadata_from_dom(
    dom: &crate::tl_types::Dom,
    parser: &crate::tl_types::Parser,
) -> BTreeMap<String, String> {
    let mut metadata = BTreeMap::new();

    let located = dom
        .children()
        .iter()
        .find_map(|top_level| find_head_node(top_level, parser));
    let Some(head_handle) = located else {
        return metadata;
    };
    let Some(tl::Node::Tag(head_tag)) = head_handle.get(parser) else {
        return metadata;
    };

    let head_children = head_tag.children();
    for handle in head_children.top().iter() {
        // Text and comment nodes carry no metadata; skip anything that is not a tag.
        let Some(tl::Node::Tag(element)) = handle.get(parser) else {
            continue;
        };

        let tag_name = normalize_tag_name(element.name().as_utf8_str());
        match &*tag_name {
            "title" => extract_title(element, parser, &mut metadata),
            "base" => extract_base(element, &mut metadata),
            "meta" => extract_meta(element, &mut metadata),
            "link" => extract_link(element, &mut metadata),
            _ => {}
        }
    }

    metadata
}
/// Depth-first search for a `<head>` tag starting at `node_handle`.
///
/// Returns the handle of the first tag whose lowercased name is `head`
/// (the node itself is checked before its children), or `None` when the
/// handle does not resolve to a tag or no such descendant exists.
fn find_head_node(
    node_handle: &tl::NodeHandle,
    parser: &crate::tl_types::Parser,
) -> Option<tl::NodeHandle> {
    let Some(tl::Node::Tag(tag)) = node_handle.get(parser) else {
        return None;
    };

    if normalize_tag_name(tag.name().as_utf8_str()) == "head" {
        return Some(*node_handle);
    }

    let children = tag.children();
    let found = children
        .top()
        .iter()
        .find_map(|child| find_head_node(child, parser));
    found
}
fn extract_title(
tag: &tl::HTMLTag,
parser: &crate::tl_types::Parser,
metadata: &mut BTreeMap<String, String>,
) {
let children = tag.children();
if let Some(first_child) = children.top().iter().next()
&& let Some(text_node) = first_child.get(parser)
&& let tl::Node::Raw(bytes) = text_node
{
let title = text::normalize_whitespace(&bytes.as_utf8_str())
.trim()
.to_string();
if !title.is_empty() {
metadata.insert("title".to_string(), title);
}
}
}
fn extract_base(tag: &tl::HTMLTag, metadata: &mut BTreeMap<String, String>) {
if let Some(href_attr) = tag.attributes().get("href")
&& let Some(href_bytes) = href_attr
{
let href = href_bytes.as_utf8_str().to_string();
if !href.is_empty() {
metadata.insert("base-href".to_string(), href);
}
}
}
fn extract_meta(tag: &tl::HTMLTag, metadata: &mut BTreeMap<String, String>) {
let mut name_attr: Option<String> = None;
let mut property_attr: Option<String> = None;
let mut http_equiv_attr: Option<String> = None;
let mut content_attr: Option<String> = None;
if let Some(attr) = tag.attributes().get("name")
&& let Some(bytes) = attr
{
name_attr = Some(bytes.as_utf8_str().to_string());
}
if let Some(attr) = tag.attributes().get("property")
&& let Some(bytes) = attr
{
property_attr = Some(bytes.as_utf8_str().to_string());
}
if let Some(attr) = tag.attributes().get("http-equiv")
&& let Some(bytes) = attr
{
http_equiv_attr = Some(bytes.as_utf8_str().to_string());
}
if let Some(attr) = tag.attributes().get("content")
&& let Some(bytes) = attr
{
content_attr = Some(bytes.as_utf8_str().to_string());
}
if let Some(content) = content_attr {
if let Some(name) = name_attr {
let key = format!("meta-{}", name.to_lowercase());
metadata.insert(key, content);
} else if let Some(property) = property_attr {
let key = format!("meta-{}", property.to_lowercase().replace(':', "-"));
metadata.insert(key, content);
} else if let Some(http_equiv) = http_equiv_attr {
let key = format!("meta-{}", http_equiv.to_lowercase());
metadata.insert(key, content);
}
}
}
fn extract_link(tag: &tl::HTMLTag, metadata: &mut BTreeMap<String, String>) {
let mut rel_attr: Option<String> = None;
let mut href_attr: Option<String> = None;
if let Some(attr) = tag.attributes().get("rel")
&& let Some(bytes) = attr
{
rel_attr = Some(bytes.as_utf8_str().to_string());
}
if let Some(attr) = tag.attributes().get("href")
&& let Some(bytes) = attr
{
href_attr = Some(bytes.as_utf8_str().to_string());
}
if let (Some(rel), Some(href)) = (rel_attr, href_attr) {
let rel_lower = rel.to_lowercase();
match rel_lower.as_str() {
"canonical" => {
metadata.insert("canonical".to_string(), href);
}
"author" | "license" | "alternate" => {
metadata.insert(format!("link-{rel_lower}"), href);
}
_ => {}
}
}
}
/// Renders `metadata` as a YAML-style frontmatter block.
///
/// The output starts and ends with a `---` line, contains one `key: value`
/// line per entry in the map's (sorted) iteration order, and is followed by a
/// blank line.
///
/// A value is emitted as a double-quoted scalar when it
/// - contains `:`, `#`, `[`, `]`, a line feed or a carriage return, or
/// - starts with a character YAML treats as an indicator at the beginning of a
///   scalar (`"`, `'`, `!`, `&`, `*`, `|`, `>`, `%`, `@`, `` ` ``, `{`, `}`, `,`).
///
/// Inside quoted values, backslashes, double quotes and line breaks are
/// escaped so every entry stays on a single line and cannot terminate the
/// frontmatter block early. All other values are written verbatim.
fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    /// Decides whether `value` must be written as a double-quoted scalar.
    fn needs_quotes(value: &str) -> bool {
        let unsafe_anywhere =
            value.contains(|c: char| matches!(c, ':' | '#' | '[' | ']' | '\n' | '\r'));
        let unsafe_leading = value.starts_with(|c: char| {
            matches!(
                c,
                '"' | '\'' | '!' | '&' | '*' | '|' | '>' | '%' | '@' | '`' | '{' | '}' | ','
            )
        });
        unsafe_anywhere || unsafe_leading
    }

    /// Escapes `value` for use inside a double-quoted scalar.
    fn escape_quoted(value: &str) -> String {
        let mut escaped = String::with_capacity(value.len() + 2);
        for c in value.chars() {
            match c {
                '\\' => escaped.push_str("\\\\"),
                '"' => escaped.push_str("\\\""),
                // A raw line break would split the entry across lines.
                '\n' => escaped.push_str("\\n"),
                '\r' => escaped.push_str("\\r"),
                other => escaped.push(other),
            }
        }
        escaped
    }

    debug_assert!(!metadata.is_empty());

    let mut lines = Vec::with_capacity(metadata.len() + 2);
    lines.push("---".to_string());
    for (key, value) in metadata {
        if needs_quotes(value) {
            let escaped = escape_quoted(value);
            lines.push(format!("{key}: \"{escaped}\""));
        } else {
            lines.push(format!("{key}: {value}"));
        }
    }
    lines.push("---".to_string());

    lines.join("\n") + "\n\n"
}
/// Returns `raw` with ASCII uppercase letters lowered.
///
/// Input without any ASCII uppercase byte is handed back untouched (a
/// borrowed `Cow` stays borrowed); otherwise an owned, lowercased string is
/// returned. Non-ASCII characters are never modified.
fn normalize_tag_name(raw: Cow<'_, str>) -> Cow<'_, str> {
    let has_upper = raw.bytes().any(|byte| byte.is_ascii_uppercase());
    if !has_upper {
        return raw;
    }

    match raw {
        Cow::Borrowed(name) => Cow::Owned(name.to_ascii_lowercase()),
        Cow::Owned(mut name) => {
            // Reuse the existing allocation when we already own the buffer.
            name.make_ascii_lowercase();
            Cow::Owned(name)
        }
    }
}