use once_cell::sync::Lazy;
use regex::Regex;
static PERMALINK_TEXT_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[#ΒΆΒ§π\s]*$").unwrap());
static H1_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?si)<h1[^>]*>(.*?)</h1>").unwrap());
static HEADING_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?si)(<h[1-6][^>]*>)(.*?)(</h[1-6]>)").unwrap());
static ANCHOR_IN_HEADING_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r##"(?si)<a\s+[^>]*href="#[^"]*"[^>]*>(.*?)</a>"##).unwrap());
pub fn standardize_headings(html: &str, title: Option<&str>) -> String {
let mut output = html.to_string();
let mut first_h1 = true;
output = H1_RE.replace_all(&output, |caps: ®ex::Captures| {
let inner = caps[1].to_string();
let text = strip_html_tags(&inner);
let text_normalized = normalize_title_text(&text);
if first_h1 {
first_h1 = false;
if let Some(t) = title {
let title_normalized = normalize_title_text(t);
if text_normalized == title_normalized {
return String::new();
}
}
}
format!("<h2>{}</h2>", inner)
}).to_string();
output = strip_permalink_anchors(&output);
output
}
fn strip_permalink_anchors(html: &str) -> String {
HEADING_RE.replace_all(html, |caps: ®ex::Captures| {
let open_tag = &caps[1];
let inner = &caps[2];
let close_tag = &caps[3];
let cleaned_inner = ANCHOR_IN_HEADING_RE.replace_all(inner, |acaps: ®ex::Captures| {
let link_text = acaps.get(1).map(|m| m.as_str()).unwrap_or("");
if link_text.trim().is_empty() || PERMALINK_TEXT_RE.is_match(link_text.trim()) {
String::new()
} else {
link_text.to_string()
}
});
format!("{}{}{}", open_tag, cleaned_inner, close_tag)
}).to_string()
}
static HTML_TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"<[^>]+>").unwrap());
fn strip_html_tags(html: &str) -> String {
HTML_TAG_RE.replace_all(html, "").to_string()
}
fn normalize_title_text(text: &str) -> String {
text.split_whitespace()
.collect::<Vec<_>>()
.join(" ")
.to_lowercase()
.trim_matches(|c: char| c.is_ascii_punctuation())
.to_string()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_remove_matching_h1() {
let html = r#"<h1>My Article Title</h1><p>Content</p>"#;
let result = standardize_headings(html, Some("My Article Title"));
assert!(!result.contains("<h1>"));
assert!(result.contains("Content"));
}
#[test]
fn test_rename_h1_to_h2() {
let html = r#"<h1>Other Heading</h1><p>Content</p>"#;
let result = standardize_headings(html, Some("Different Title"));
assert!(result.contains("<h2>Other Heading</h2>"));
}
#[test]
fn test_strip_permalink() {
let html = r##"<h2>Heading <a href="#heading" class="anchor">#</a></h2>"##;
let result = standardize_headings(html, None);
assert!(!result.contains("<a"));
assert!(result.contains("Heading"));
}
#[test]
fn test_keep_real_links_in_headings() {
let html = r#"<h2>See <a href="https://example.com">this</a></h2>"#;
let result = standardize_headings(html, None);
assert!(result.contains("https://example.com"));
}
}