use once_cell::sync::Lazy;
use regex::Regex;
use scraper::{Html, Selector};
use super::languages::{is_known_language, normalize_language};
/// Matches a whole class token of the form `language-<name>` (case-insensitive)
/// and captures `<name>`.
static LANGUAGE_CLASS_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^language-(.+)$").unwrap());
/// Matches a whole class token of the form `lang-<name>` (case-insensitive)
/// and captures `<name>`.
static LANG_CLASS_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^lang-(.+)$").unwrap());
/// Matches a whole class token of the form `highlight-source-<name>`
/// (case-insensitive) and captures `<name>`.
static HIGHLIGHT_SOURCE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)^highlight-source-(.+)$").unwrap());
/// Unanchored: finds `brush:` followed by optional whitespace and captures the
/// run of word characters after it (case-insensitive).
static BRUSH_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?i)brush:\s*(\w+)").unwrap());
/// Multi-line mode: at the start of a line, optional whitespace, one or more
/// digits, then exactly one whitespace character or `|`.
static LINE_NUMBER_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(?m)^\s*\d+[\s|]").unwrap());
/// Matches a run of three or more consecutive `\n` characters.
static MULTI_NEWLINE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\n{3,}").unwrap());
/// Rewrites recognised code-block markup in `html` into canonical
/// `<pre><code>` blocks and returns the updated HTML string.
///
/// The input is parsed as a fragment and scanned in a fixed order: rehype
/// figures, `div.highlight` wrappers, highlight tables, `pre.shiki`, and
/// finally every remaining `<pre>`. Each hit is recorded as a pair of
/// (serialised original element, canonical replacement); the pairs are then
/// applied to a copy of the input with `replacen(.., 1)`, in discovery order.
///
/// Replacement is textual, so an element whose serialised form does not occur
/// verbatim in `html` is left as it was.
pub fn standardize_code_blocks(html: &str) -> String {
    let doc = Html::parse_fragment(html);
    let mut replacements: Vec<(String, String)> = Vec::new();

    if let Ok(sel) = Selector::parse("figure[data-rehype-pretty-code-figure]") {
        for el in doc.select(&sel) {
            if let Some(canonical) = standardize_rehype_figure(&el) {
                replacements.push((el.html(), canonical));
            }
        }
    }

    if let Ok(sel) = Selector::parse("div.highlight") {
        for el in doc.select(&sel) {
            let class_attr = el.value().attr("class").unwrap_or("");
            // Only wrappers whose classes name a language are rewritten here;
            // the rest fall through to the generic `<pre>` pass below.
            if let Some(lang) = extract_github_language(class_attr) {
                if let Some(code) = extract_pre_text(&el) {
                    let cleaned = clean_code_content(&code);
                    replacements.push((el.html(), format_canonical_code_block(&lang, &cleaned)));
                }
            }
        }
    }

    if let Ok(sel) = Selector::parse("table.highlight-table, table.rouge-table, table.code-listing") {
        for el in doc.select(&sel) {
            if let Some((lang, code)) = extract_table_code(&el) {
                let cleaned = clean_code_content(&code);
                replacements.push((el.html(), format_canonical_code_block(&lang, &cleaned)));
            }
        }
    }

    if let Ok(sel) = Selector::parse("pre.shiki") {
        for el in doc.select(&sel) {
            let lang = detect_language_from_element(&el);
            if let Some(code) = extract_shiki_text(&el) {
                let cleaned = clean_code_content(&code);
                replacements.push((el.html(), format_canonical_code_block(&lang, &cleaned)));
            }
        }
    }

    if let Ok(sel) = Selector::parse("pre") {
        // Finds elements that really carry a `data-lang` attribute. Searching
        // the serialised HTML for the text `data-lang=` would also match code
        // samples that merely mention that attribute.
        let tagged_sel = Selector::parse("[data-lang]").ok();
        for pre in doc.select(&sel) {
            let already_tagged = pre.value().attr("data-lang").is_some()
                || tagged_sel
                    .as_ref()
                    .map_or(false, |tagged| pre.select(tagged).next().is_some());
            if already_tagged {
                continue;
            }
            let pre_html = pre.html();
            // Skip a `<pre>` that sits inside (or is) an element already
            // claimed by one of the earlier passes.
            if replacements.iter().any(|(orig, _)| orig.contains(&pre_html)) {
                continue;
            }
            let lang = detect_language_from_pre(&pre);
            if let Some(text) = extract_code_text_from_pre(&pre) {
                let cleaned = clean_code_content(&text);
                replacements.push((pre_html, format_canonical_code_block(&lang, &cleaned)));
            }
        }
    }

    let mut output = html.to_string();
    for (original, canonical) in &replacements {
        output = output.replacen(original.as_str(), canonical, 1);
    }
    output
}
fn detect_language_from_pre(pre: &scraper::ElementRef) -> String {
if let Some(lang) = pre.value().attr("data-lang").or(pre.value().attr("data-language")) {
return normalize_language(lang);
}
if let Some(lang) = detect_language_from_classes(pre.value().attr("class").unwrap_or("")) {
return lang;
}
if let Ok(code_sel) = Selector::parse("code") {
if let Some(code_el) = pre.select(&code_sel).next() {
if let Some(lang) = code_el.value().attr("data-lang").or(code_el.value().attr("data-language")) {
return normalize_language(lang);
}
if let Some(lang) = detect_language_from_classes(code_el.value().attr("class").unwrap_or("")) {
return lang;
}
}
}
String::new()
}
/// Determines the language of an arbitrary element that wraps code.
///
/// Candidates are checked in order: `el` itself, then the first `<code>`
/// element selected from it. For each candidate a non-blank `data-lang`, then
/// a non-blank `data-language`, is returned via `normalize_language`;
/// otherwise the `class` attribute is tried. Returns an empty string when no
/// candidate names a language.
///
/// Blank attribute values are ignored so an empty `data-lang=""` does not
/// suppress the remaining hints.
fn detect_language_from_element(el: &scraper::ElementRef) -> String {
    let code_sel = Selector::parse("code").ok();
    let first_code = code_sel.as_ref().and_then(|sel| el.select(sel).next());

    for candidate in std::iter::once(*el).chain(first_code) {
        let node = candidate.value();
        let declared = node
            .attr("data-lang")
            .filter(|value| !value.trim().is_empty())
            .or_else(|| {
                node.attr("data-language")
                    .filter(|value| !value.trim().is_empty())
            });
        if let Some(raw) = declared {
            return normalize_language(raw);
        }
        if let Some(lang) = detect_language_from_classes(node.attr("class").unwrap_or("")) {
            return lang;
        }
    }
    String::new()
}
/// Derives a normalised language name from a `class` attribute value.
///
/// Three strategies are tried in order, and the first hit wins:
/// 1. the first whitespace-separated token matching `language-*`, `lang-*`
///    or `highlight-source-*` (tested in that order per token);
/// 2. a `brush: <name>` declaration anywhere in the whole attribute;
/// 3. the first token accepted by `is_known_language`.
///
/// Returns `None` when no strategy matches.
fn detect_language_from_classes(classes: &str) -> Option<String> {
    let prefixed = classes.split_whitespace().find_map(|token| {
        [&*LANGUAGE_CLASS_RE, &*LANG_CLASS_RE, &*HIGHLIGHT_SOURCE_RE]
            .iter()
            .find_map(|re| re.captures(token))
            .map(|caps| normalize_language(&caps[1]))
    });

    prefixed
        .or_else(|| {
            BRUSH_RE
                .captures(classes)
                .map(|caps| normalize_language(&caps[1]))
        })
        .or_else(|| {
            classes
                .split_whitespace()
                .find(|token| is_known_language(*token))
                .map(|token| normalize_language(token))
        })
}
/// Resolves the language for a `div.highlight` wrapper from its class list.
///
/// A `highlight-source-<name>` token anywhere in the list takes priority;
/// failing that, the general class-based detection is used.
fn extract_github_language(class_attr: &str) -> Option<String> {
    class_attr
        .split_whitespace()
        .find_map(|token| HIGHLIGHT_SOURCE_RE.captures(token))
        .map(|caps| normalize_language(&caps[1]))
        .or_else(|| detect_language_from_classes(class_attr))
}
/// Returns the concatenated text of the first `<pre>` selected from `el`,
/// or `None` when the selector fails to parse or no `<pre>` is found.
fn extract_pre_text(el: &scraper::ElementRef) -> Option<String> {
    let pre_selector = Selector::parse("pre").ok()?;
    let first_pre = el.select(&pre_selector).next();
    first_pre.map(|pre| pre.text().collect::<String>())
}
/// Extracts the plain text of a Shiki-rendered block.
///
/// When `el` has a `<code>` element, the text of each `span.line` inside it
/// is joined with `\n`; if there are no such spans the `<code>` text is
/// returned as-is. Without a `<code>` element the text of `el` is returned.
/// `None` is produced only if a selector fails to parse.
fn extract_shiki_text(el: &scraper::ElementRef) -> Option<String> {
    let code_selector = Selector::parse("code").ok()?;
    let code = match el.select(&code_selector).next() {
        Some(code) => code,
        None => return Some(el.text().collect::<String>()),
    };

    let line_selector = Selector::parse("span.line").ok()?;
    let rendered_lines: Vec<String> = code
        .select(&line_selector)
        .map(|line| line.text().collect::<String>())
        .collect();

    if rendered_lines.is_empty() {
        Some(code.text().collect::<String>())
    } else {
        Some(rendered_lines.join("\n"))
    }
}
/// Pulls `(language, code text)` out of a highlight table.
///
/// The first `<td>` whose `class` attribute contains `code` is used as the
/// code cell. If no cell is marked that way and the table has at least two
/// cells, the last cell is used instead. The language is detected from the
/// table element itself. Returns `None` when no code cell can be chosen.
fn extract_table_code(el: &scraper::ElementRef) -> Option<(String, String)> {
    let cell_selector = Selector::parse("td").ok()?;
    let cells: Vec<_> = el.select(&cell_selector).collect();

    let marked = cells.iter().find(|cell| {
        let classes = cell.value().attr("class").unwrap_or("");
        classes.contains("code") || classes.contains("rouge-code")
    });

    let code_cell = match marked {
        Some(cell) => cell,
        // Unmarked layout: assume the right-most cell holds the code.
        None if cells.len() >= 2 => cells.last()?,
        None => return None,
    };

    let code_text = code_cell.text().collect::<String>();
    Some((detect_language_from_element(el), code_text))
}
/// Extracts the plain text of a `<pre>` block.
///
/// If the `<pre>` has a `<code>` element, the text of each `span.line` in it
/// is joined with `\n`; when there are no such spans the `<code>` text is
/// used directly. Without a `<code>` element the text of the `<pre>` itself
/// is returned. The result is always `Some`.
fn extract_code_text_from_pre(pre: &scraper::ElementRef) -> Option<String> {
    let code_selector = Selector::parse("code").ok();
    let code = match code_selector
        .as_ref()
        .and_then(|sel| pre.select(sel).next())
    {
        Some(code) => code,
        None => return Some(pre.text().collect::<String>()),
    };

    let line_texts: Vec<String> = match Selector::parse("span.line") {
        Ok(line_selector) => code
            .select(&line_selector)
            .map(|line| line.text().collect::<String>())
            .collect(),
        Err(_) => Vec::new(),
    };

    if line_texts.is_empty() {
        Some(code.text().collect::<String>())
    } else {
        Some(line_texts.join("\n"))
    }
}
/// Builds the canonical block for a rehype-pretty-code `<figure>`.
///
/// Uses the first `<pre>` selected from the figure: its language is detected,
/// its text extracted and cleaned, and the result formatted canonically.
/// Returns `None` when the figure has no `<pre>`.
fn standardize_rehype_figure(el: &scraper::ElementRef) -> Option<String> {
    let pre_selector = Selector::parse("pre").ok()?;
    let first_pre = el.select(&pre_selector).next()?;
    let language = detect_language_from_pre(&first_pre);
    extract_code_text_from_pre(&first_pre)
        .map(|raw| format_canonical_code_block(&language, &clean_code_content(&raw)))
}
/// Normalises extracted code text.
///
/// Steps, in order:
/// 1. every tab and every U+00A0 (no-break space) becomes a space;
/// 2. if the text has more than two lines and each of the first five
///    non-blank lines starts with a line-number prefix, that prefix is
///    removed from every line that has one;
/// 3. runs of three or more newlines collapse to two;
/// 4. trailing whitespace and leading blank lines are removed.
///
/// Step 4 deliberately keeps the indentation of the first non-blank line:
/// trimming it would shift that line relative to the rest of the snippet.
fn clean_code_content(code: &str) -> String {
    let mut s = code.replace('\t', " ").replace('\u{00a0}', " ");

    let lines: Vec<&str> = s.lines().collect();
    let has_line_numbers = lines.len() > 2
        && lines
            .iter()
            .filter(|l| !l.trim().is_empty())
            .take(5)
            .all(|l| LINE_NUMBER_RE.is_match(l));
    if has_line_numbers {
        s = lines
            .iter()
            .map(|l| LINE_NUMBER_RE.replace(l, "").into_owned())
            .collect::<Vec<_>>()
            .join("\n");
    }

    let collapsed = MULTI_NEWLINE_RE.replace_all(&s, "\n\n");
    let body = collapsed.trim_end();

    // Start of the line holding the first non-whitespace character; anything
    // before it is blank lines only.
    let first_line_start = match body.find(|c: char| !c.is_whitespace()) {
        Some(idx) => body[..idx].rfind('\n').map_or(0, |nl| nl + 1),
        None => body.len(),
    };
    body[first_line_start..].to_string()
}
/// Escapes text for use as HTML element content.
///
/// `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`; every other
/// character is copied unchanged. The input is scanned once, so an ampersand
/// produced by escaping is never escaped a second time.
fn html_escape_code(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Renders the canonical `<pre><code>` markup for a code block.
///
/// `code` is escaped with `html_escape_code`. When `lang` is empty a bare
/// `<pre><code>` pair is produced; otherwise the `<code>` element carries
/// `class="language-<lang>"` and `data-lang="<lang>"`.
///
/// `lang` originates from attributes of the input document, so it is escaped
/// for an attribute context before being written; an embedded quote would
/// otherwise end the attribute and allow markup to be injected.
fn format_canonical_code_block(lang: &str, code: &str) -> String {
    let escaped = html_escape_code(code);
    if lang.is_empty() {
        return format!("<pre><code>{}</code></pre>", escaped);
    }

    // `&` must be handled first so the entities added below stay intact.
    let lang_attr = lang
        .replace('&', "&amp;")
        .replace('"', "&quot;")
        .replace('<', "&lt;")
        .replace('>', "&gt;");
    format!(
        "<pre><code class=\"language-{}\" data-lang=\"{}\">{}</code></pre>",
        lang_attr, lang_attr, escaped
    )
}
// Unit tests for code-block standardisation and its helpers.
#[cfg(test)]
mod tests {
use super::*;
// A Prism-style `language-python` block keeps its code and gains `data-lang`.
#[test]
fn test_prism_code_block() {
let html = r#"<pre class="language-python"><code class="language-python">print("hello")</code></pre>"#;
let result = standardize_code_blocks(html);
assert!(result.contains("data-lang=\"python\""));
assert!(result.contains("print(\"hello\")"));
}
// A `brush: ruby` class on the `<pre>` is recognised as the language.
#[test]
fn test_brush_wordpress() {
let html = r#"<pre class="brush: ruby"><code>puts "hi"</code></pre>"#;
let result = standardize_code_blocks(html);
assert!(result.contains("data-lang=\"ruby\""));
}
// Bare names and each supported class prefix resolve to a normalised language.
#[test]
fn test_language_detection_bare() {
assert_eq!(detect_language_from_classes("python"), Some("python".into()));
assert_eq!(detect_language_from_classes("language-js"), Some("javascript".into()));
assert_eq!(detect_language_from_classes("lang-ts"), Some("typescript".into()));
assert_eq!(detect_language_from_classes("highlight-source-go"), Some("go".into()));
}
// A tab in front of code is turned into space indentation.
#[test]
fn test_clean_code_content_tabs() {
let code = "fn main() {\n\tprintln!(\"hi\");\n}";
let cleaned = clean_code_content(code);
assert!(cleaned.contains(" println!"));
}
// A no-break space (U+00A0) is replaced by an ordinary space.
#[test]
fn test_clean_code_content_nbsp() {
let code = "let\u{00a0}x = 1;";
let cleaned = clean_code_content(code);
assert_eq!(cleaned, "let x = 1;");
}
}