use ego_tree::NodeId;
use markup5ever::{QualName, ns};
use scraper::{Html, Node};
use crate::dom;
/// Class-name prefixes that syntax highlighters put in front of a language
/// name (for example `language-rust`).
///
/// `detect_language_from_classes` strips each prefix from a lower-cased class
/// token and accepts the remainder when it is a known language.
const HIGHLIGHTER_PREFIXES: &[&str] = &[
"language-",
"lang-",
"syntax-",
"highlight-",
"code-",
"cm-s-", ];
/// Class-name suffixes that may follow a language name (for example
/// `javascript-code`); stripped by `detect_language_from_classes` before the
/// remainder is checked against the known languages.
const HIGHLIGHTER_SUFFIXES: &[&str] = &["-code", "-snippet"];
/// Class tokens that mark an element as a line-number gutter.
///
/// `should_remove_as_line_number` compares each lower-cased class token of an
/// element against this list using exact equality.
const LINE_NUMBER_CLASSES: &[&str] = &[
"linenumber",
"line-number",
"line-numbers",
"lineno",
"linenos",
"rouge-gutter",
"hljs-ln-numbers",
"hljs-ln-line",
"lnt",
"react-syntax-highlighter-line-number",
"code-line-number",
"gutter",
];
/// Class tokens that mark an element as a "copy to clipboard" control.
///
/// `is_copy_button` compares each lower-cased class token of an element
/// against this list using exact equality.
const BUTTON_CLASSES: &[&str] = &[
"copy-button",
"copy-code-button",
"copybutton",
"code-copy",
"btn-copy",
"copy",
];
/// Attribute names whose mere presence (with any value) makes `is_copy_button`
/// treat an element as a copy control.
const BUTTON_DATA_ATTRS: &[&str] = &["data-copy", "data-clipboard-text"];
/// Language identifiers accepted by `is_known_language`.
///
/// Every entry is lower-case; callers lower-case their candidate before the
/// lookup, and the lookup itself is an exact match against this slice.
const LANGUAGES: &[&str] = &[
"actionscript",
"ada",
"agda",
"apache",
"applescript",
"arduino",
"asm",
"assembly",
"astro",
"awk",
"bash",
"basic",
"batch",
"brainfuck",
"bsl",
"c",
"ceylon",
"clojure",
"cmake",
"cobol",
"coffeescript",
"cpp",
"crystal",
"csharp",
"css",
"csv",
"cuda",
"d",
"dart",
"delphi",
"diff",
"django",
"dockerfile",
"dotnet",
"eiffel",
"elixir",
"elm",
"emacs",
"erb",
"erlang",
"excel",
"fish",
"fortran",
"fsharp",
"gdscript",
"gherkin",
"glsl",
"go",
"graphql",
"groovy",
"haml",
"handlebars",
"haskell",
"haxe",
"hlsl",
"html",
"http",
"ini",
"io",
"java",
"javascript",
"jinja",
"json",
"jsonnet",
"jsx",
"julia",
"jupyter",
"kotlin",
"latex",
"lean",
"less",
"liquid",
"lisp",
"livescript",
"llvm",
"lua",
"makefile",
"markdown",
"markup",
"mathematica",
"matlab",
"mdx",
"mermaid",
"mips",
"moonscript",
"nginx",
"nim",
"nix",
"nushell",
"objc",
"objective-c",
"ocaml",
"openscad",
"pascal",
"perl",
"php",
"plaintext",
"plsql",
"postcss",
"powershell",
"prisma",
"processing",
"prolog",
"protobuf",
"puppet",
"purescript",
"python",
"r",
"racket",
"razor",
"regex",
"rescript",
"rest",
"ruby",
"rust",
"sass",
"scala",
"scheme",
"scss",
"sh",
"shell",
"smalltalk",
"smarty",
"solidity",
"sparql",
"sql",
"stan",
"stylus",
"svelte",
"svg",
"swift",
"tcl",
"terraform",
"tex",
"text",
"toml",
"tsx",
"twig",
"typescript",
"v",
"vala",
"vb",
"vbnet",
"verilog",
"vhdl",
"vim",
"vue",
"wasm",
"wolfram",
"xml",
"xquery",
"xslt",
"yaml",
"zig",
"zsh",
// Abbreviated names, kept after the alphabetical section.
"js",
"ts",
"rb",
"py",
"rs",
"cs",
"hs",
"yml",
"md",
];
pub fn standardize_code_blocks(html: &mut Html, main_content: NodeId) {
remove_line_numbers(html, main_content);
remove_copy_buttons(html, main_content);
normalize_pre_code_blocks(html, main_content);
}
/// Removes every element below `main_content` that
/// `should_remove_as_line_number` flags.
///
/// All matches are gathered before the first removal so the tree is never
/// mutated while candidates are still being inspected.
fn remove_line_numbers(html: &mut Html, main_content: NodeId) {
    let doomed: Vec<NodeId> = dom::all_descendant_elements(html, main_content)
        .into_iter()
        .filter(|&candidate| should_remove_as_line_number(html, candidate))
        .collect();

    for node_id in doomed {
        dom::remove_node(html, node_id);
    }
}
/// Reports whether `node_id` is an element that carries line numbers.
///
/// Returns `false` when the id is not in the tree or does not refer to an
/// element. Otherwise the element matches when either
/// * one of its class tokens, lower-cased, equals an entry of
///   `LINE_NUMBER_CLASSES`, or
/// * it is a `td` whose raw `class` attribute contains the substring
///   `hljs-ln-numbers` (case-sensitive).
fn should_remove_as_line_number(html: &Html, node_id: NodeId) -> bool {
    let Some(Node::Element(el)) = html.tree.get(node_id).map(|node| node.value()) else {
        return false;
    };

    let raw_classes = el.attr("class").unwrap_or("");
    let has_gutter_class = raw_classes.split_whitespace().any(|token| {
        let token = token.to_ascii_lowercase();
        LINE_NUMBER_CLASSES.contains(&token.as_str())
    });

    let tag: &str = el.name.local.as_ref();
    has_gutter_class || (tag == "td" && raw_classes.contains("hljs-ln-numbers"))
}
/// Removes every element below `main_content` that `is_copy_button` flags.
///
/// Candidates are collected first and removed afterwards, so removal never
/// interferes with the scan.
fn remove_copy_buttons(html: &mut Html, main_content: NodeId) {
    let buttons: Vec<NodeId> = dom::all_descendant_elements(html, main_content)
        .into_iter()
        .filter(|&candidate| is_copy_button(html, candidate))
        .collect();

    for button_id in buttons {
        dom::remove_node(html, button_id);
    }
}
/// Reports whether `node_id` is an element that looks like a copy control.
///
/// Returns `false` when the id is not in the tree or does not refer to an
/// element. An element matches when it has any attribute listed in
/// `BUTTON_DATA_ATTRS`, or when one of its class tokens, lower-cased, equals
/// an entry of `BUTTON_CLASSES`.
///
/// NOTE(review): the tag name is not inspected, so a generic token such as
/// `copy` matches any element, not only buttons — confirm this is intended.
fn is_copy_button(html: &Html, node_id: NodeId) -> bool {
    let Some(Node::Element(el)) = html.tree.get(node_id).map(|node| node.value()) else {
        return false;
    };

    let has_copy_attr = BUTTON_DATA_ATTRS
        .iter()
        .any(|name| el.attr(name).is_some());

    has_copy_attr
        || el
            .attr("class")
            .unwrap_or("")
            .split_whitespace()
            .any(|token| BUTTON_CLASSES.contains(&token.to_ascii_lowercase().as_str()))
}
/// Applies `normalize_single_pre` to every `pre` element found below
/// `main_content`.
///
/// The list of `pre` ids is taken once, before any of them is rewritten.
fn normalize_pre_code_blocks(html: &mut Html, main_content: NodeId) {
    dom::descendant_elements_by_tag(html, main_content, "pre")
        .into_iter()
        .for_each(|pre_id| normalize_single_pre(html, pre_id));
}
/// Rebuilds one `<pre>` so that it holds a single `<code>` child containing
/// the block's plain text.
///
/// Steps, in order:
/// 1. detect the language and extract the text while the old subtree exists;
/// 2. remove every existing child of the `<pre>`;
/// 3. create a fresh `<code>` (with `data-lang` when a language was found)
///    holding one text node, and append it to the `<pre>`;
/// 4. clear all attributes of the `<pre>` itself.
///
/// The function returns early, without further changes, if a node it needs
/// cannot be looked up in the tree.
fn normalize_single_pre(html: &mut Html, pre_id: NodeId) {
    // Both values depend on the original children, so read them first.
    let language = detect_language(html, pre_id);
    let code_text = extract_code_text(html, pre_id);

    let old_children: Vec<NodeId> = match html.tree.get(pre_id) {
        Some(pre_ref) => pre_ref.children().map(|child| child.id()).collect(),
        None => return,
    };
    for child_id in old_children {
        dom::remove_node(html, child_id);
    }

    // New nodes start out as orphans and are attached below.
    let code_id = html
        .tree
        .orphan(Node::Element(build_code_element(&language)))
        .id();
    let text_id = html
        .tree
        .orphan(Node::Text(scraper::node::Text {
            text: code_text.into(),
        }))
        .id();

    match html.tree.get_mut(code_id) {
        Some(mut code_node) => {
            code_node.append_id(text_id);
        }
        None => return,
    }

    let Some(mut pre_node) = html.tree.get_mut(pre_id) else {
        return;
    };
    pre_node.append_id(code_id);
    if let Node::Element(pre_el) = pre_node.value() {
        pre_el.attrs.clear();
    }
}
/// Creates a detached HTML `<code>` element.
///
/// When `language` is non-empty the element gets a single `data-lang`
/// attribute holding it; otherwise the element has no attributes.
fn build_code_element(language: &str) -> scraper::node::Element {
    let lang_attr = (!language.is_empty()).then(|| markup5ever::Attribute {
        name: QualName::new(None, ns!(), markup5ever::LocalName::from("data-lang")),
        value: language.into(),
    });

    scraper::node::Element::new(
        QualName::new(None, ns!(html), markup5ever::local_name!("code")),
        lang_attr.into_iter().collect(),
    )
}
/// Finds the language of the code block rooted at `pre_id`.
///
/// Sources are consulted in this order and the first hit wins:
/// 1. the `<pre>` element itself;
/// 2. each `code` element below it, in the order returned by
///    `dom::descendant_elements_by_tag`;
/// 3. up to three ancestor elements, nearest first.
///
/// Returns an empty string when none of them names a known language.
fn detect_language(html: &Html, pre_id: NodeId) -> String {
    let probe = |id: NodeId| detect_language_from_node(html, id);

    let from_code_descendants = || {
        dom::descendant_elements_by_tag(html, pre_id, "code")
            .into_iter()
            .find_map(|code_id| probe(code_id))
    };

    let from_ancestors = || {
        let mut cursor = pre_id;
        for _ in 0..3 {
            // Stop climbing as soon as there is no parent element.
            cursor = dom::parent_element(html, cursor)?;
            if let found @ Some(_) = probe(cursor) {
                return found;
            }
        }
        None
    };

    probe(pre_id)
        .or_else(from_code_descendants)
        .or_else(from_ancestors)
        .unwrap_or_default()
}
/// Tries to read a language name from a single element.
///
/// The attributes `data-language`, `data-lang` and `language` are checked in
/// that order; the first one whose lower-cased value is a known language is
/// returned. If none qualifies, the element's `class` attribute is handed to
/// `detect_language_from_classes`.
///
/// Returns `None` when the id is not in the tree, is not an element, or no
/// language can be found.
fn detect_language_from_node(html: &Html, node_id: NodeId) -> Option<String> {
    let Node::Element(el) = html.tree.get(node_id)?.value() else {
        return None;
    };

    ["data-language", "data-lang", "language"]
        .iter()
        .filter_map(|name| el.attr(name))
        .map(str::to_ascii_lowercase)
        .find(|candidate| is_known_language(candidate))
        .or_else(|| detect_language_from_classes(el.attr("class").unwrap_or("")))
}
fn detect_language_from_classes(classes: &str) -> Option<String> {
for class in classes.split_whitespace() {
let lower = class.to_ascii_lowercase();
for prefix in HIGHLIGHTER_PREFIXES {
if let Some(lang) = lower.strip_prefix(prefix)
&& is_known_language(lang)
{
return Some(lang.to_string());
}
}
for suffix in HIGHLIGHTER_SUFFIXES {
if let Some(lang) = lower.strip_suffix(suffix)
&& is_known_language(lang)
{
return Some(lang.to_string());
}
}
if is_known_language(&lower) {
return Some(lower);
}
}
None
}
/// Returns `true` when `name` is exactly one of the entries in `LANGUAGES`.
///
/// The comparison is case-sensitive; callers lower-case beforehand.
fn is_known_language(name: &str) -> bool {
    LANGUAGES.iter().any(|&known| known == name)
}
/// Returns the plain text of the subtree at `node_id`, cleaned up for use as
/// code.
///
/// The text is gathered by `collect_code_text`, then:
/// * tabs and non-breaking spaces are replaced by `" "`;
/// * trailing whitespace is removed;
/// * leading blank (whitespace-only) lines are removed, but the indentation
///   of the first line that has content is kept, so the relative indentation
///   between the first line and the rest is preserved;
/// * runs of newlines are shortened by `collapse_blank_lines`.
fn extract_code_text(html: &Html, node_id: NodeId) -> String {
    let mut buf = String::new();
    collect_code_text(html, node_id, &mut buf);
    let cleaned = buf.replace('\t', " ").replace('\u{00a0}', " ");

    let body = cleaned.trim_end();
    // Cut at the start of the line that holds the first non-whitespace
    // character instead of at the character itself; a plain `trim()` would
    // de-indent only the first line and skew it against the lines below.
    let start = body
        .find(|c: char| !c.is_whitespace())
        .and_then(|first| body[..first].rfind('\n'))
        .map_or(0, |newline| newline + 1);

    collapse_blank_lines(&body[start..])
}
/// Appends the text content of the subtree at `node_id` to `buf`, depth
/// first and in document order.
///
/// * text nodes are appended verbatim;
/// * a `br` element contributes a single `'\n'`;
/// * `button` and `style` elements are skipped together with their content;
/// * any other element is descended into;
/// * all other node kinds, and ids missing from the tree, contribute nothing.
fn collect_code_text(html: &Html, node_id: NodeId, buf: &mut String) {
    let Some(node) = html.tree.get(node_id) else {
        return;
    };

    if let Node::Text(text) = node.value() {
        buf.push_str(text);
        return;
    }
    let Node::Element(el) = node.value() else {
        return;
    };

    let tag: &str = el.name.local.as_ref();
    match tag {
        "br" => buf.push('\n'),
        "button" | "style" => {}
        _ => node
            .children()
            .for_each(|child| collect_code_text(html, child.id(), buf)),
    }
}
/// Limits every run of consecutive `'\n'` characters in `s` to at most two,
/// i.e. at most one empty line between lines of text.
///
/// Only directly adjacent newlines count as a run; any other character,
/// including spaces, starts a new run.
fn collapse_blank_lines(s: &str) -> String {
    let mut run_length = 0u32;
    let mut out = String::with_capacity(s.len());
    out.extend(s.chars().filter(|&ch| {
        run_length = if ch == '\n' { run_length + 1 } else { 0 };
        run_length <= 2
    }));
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html_str` as a document, runs `standardize_code_blocks` on the
    /// whole tree and returns the serialized result.
    fn parse_and_standardize(html_str: &str) -> String {
        let mut doc = Html::parse_document(html_str);
        let root = doc.tree.root().id();
        standardize_code_blocks(&mut doc, root);
        dom::outer_html(&doc, root)
    }

    // An already-normalized block comes out unchanged.
    #[test]
    fn basic_pre_code_passthrough() {
        let html = r"<html><body><pre><code>fn main() {}</code></pre></body></html>";
        let result = parse_and_standardize(html);
        assert!(result.contains("<pre><code>fn main() {}</code></pre>"));
    }

    // `language-` prefix on the inner <code>.
    #[test]
    fn detects_language_from_class() {
        let html = r#"<html><body>
<pre><code class="language-rust">fn main() {}</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="rust""#));
        assert!(result.contains("fn main() {}"));
    }

    // `lang-` prefix.
    #[test]
    fn detects_prismjs_lang_prefix() {
        let html = r#"<html><body>
<pre><code class="lang-python">print("hi")</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="python""#));
    }

    // An unrelated class before the language class is skipped.
    #[test]
    fn detects_highlightjs_hljs_class() {
        let html = r#"<html><body>
<pre><code class="hljs language-javascript">let x = 1;</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="javascript""#));
    }

    // Language given as an attribute on the <pre>.
    #[test]
    fn detects_data_language_attribute() {
        let html = r#"<html><body>
<pre data-language="go"><code>fmt.Println("hi")</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="go""#));
    }

    // Highlighter markup is flattened to text.
    #[test]
    fn strips_syntax_highlighting_spans() {
        let html = r#"<html><body>
<pre><code class="language-rust">
<span class="keyword">fn</span> <span class="function">main</span>() {}
</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="rust""#));
        assert!(!result.contains("<span"));
        assert!(result.contains("fn"));
        assert!(result.contains("main"));
    }

    // Inline line-number spans are dropped, code text stays.
    #[test]
    fn removes_line_numbers() {
        let html = r#"<html><body>
<pre><code class="language-python">
<span class="lineno">1</span>print("hello")
<span class="lineno">2</span>print("world")
</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(!result.contains("lineno"));
        assert!(result.contains("print"));
    }

    // Copy button matched by class.
    #[test]
    fn removes_copy_buttons() {
        let html = r#"<html><body>
<pre>
<button class="copy-button">Copy</button>
<code class="language-rust">let x = 1;</code>
</pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(!result.contains("copy-button"));
        assert!(!result.contains("Copy"));
        assert!(result.contains("let x = 1;"));
    }

    // Copy button matched by data attribute.
    #[test]
    fn removes_data_copy_buttons() {
        let html = r#"<html><body>
<pre>
<button data-copy="true">Copy</button>
<code>x = 1</code>
</pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(!result.contains("button"));
        assert!(result.contains("x = 1"));
    }

    // Language taken from an ancestor of the <pre>.
    #[test]
    fn detects_language_from_parent_div() {
        let html = r#"<html><body>
<div class="language-typescript">
<pre><code>const x: number = 1;</code></pre>
</div>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="typescript""#));
    }

    // A class that is itself a language name.
    #[test]
    fn handles_bare_language_class() {
        let html = r#"<html><body>
<pre><code class="python">x = 1</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"data-lang="python""#));
    }

    // Unknown classes must not produce a data-lang attribute.
    #[test]
    fn no_language_when_unknown() {
        let html = r#"<html><body>
<pre><code class="someRandomClass">stuff</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(!result.contains("data-lang"));
        assert!(result.contains("stuff"));
    }

    // Tabs and non-breaking spaces become regular spaces.
    #[test]
    fn normalizes_tabs_and_nbsp() {
        let html = "<html><body><pre><code>x\tfoo\u{00a0}bar</code></pre></body></html>";
        let result = parse_and_standardize(html);
        assert!(
            result.contains("x foo bar"),
            "Expected normalized whitespace in: {result:?}"
        );
    }

    // The gutter cell has to live inside a real table: an HTML parser ignores
    // a stray `<td>` start tag in body content, so a fixture without the
    // table would pass without ever creating the element under test.
    #[test]
    fn removes_rouge_gutter() {
        let html = r#"<html><body>
<table class="rouge-table"><tbody><tr>
<td class="rouge-gutter gl"><pre class="lineno">1</pre></td>
<td class="rouge-code"><pre>puts "hi"</pre></td>
</tr></tbody></table>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(!result.contains("rouge-gutter"));
        // The code cell must survive the gutter removal.
        assert!(result.contains("puts"));
    }

    // A <pre> with bare text gets a <code> wrapper.
    #[test]
    fn handles_pre_without_code() {
        let html = r"<html><body>
<pre>plain preformatted text</pre>
</body></html>";
        let result = parse_and_standardize(html);
        assert!(result.contains("<pre><code>plain preformatted text</code></pre>"));
    }

    // <br> elements turn into newline characters.
    #[test]
    fn preserves_br_as_newlines() {
        let html = r"<html><body>
<pre><code>line1<br>line2<br>line3</code></pre>
</body></html>";
        let result = parse_and_standardize(html);
        assert!(result.contains("line1\nline2\nline3"));
    }

    // Five newlines in a row shrink to two.
    #[test]
    fn collapses_excess_blank_lines() {
        let input = "a\n\n\n\n\nb";
        let result = collapse_blank_lines(input);
        assert_eq!(result, "a\n\nb");
    }

    #[test]
    fn is_known_language_works() {
        assert!(is_known_language("rust"));
        assert!(is_known_language("javascript"));
        assert!(is_known_language("typescript"));
        assert!(is_known_language("python"));
        assert!(is_known_language("go"));
        assert!(is_known_language("nix"));
        assert!(!is_known_language("notareallang"));
    }

    // `-code` suffix.
    #[test]
    fn detects_code_suffix_pattern() {
        assert_eq!(
            detect_language_from_classes("javascript-code"),
            Some("javascript".to_string())
        );
    }

    // `-snippet` suffix.
    #[test]
    fn detects_snippet_suffix_pattern() {
        assert_eq!(
            detect_language_from_classes("python-snippet"),
            Some("python".to_string())
        );
    }

    // Attributes of the <pre> are dropped; the language moves to the <code>.
    #[test]
    fn pre_attributes_are_stripped() {
        let html = r#"<html><body>
<pre class="highlight" data-lang="rust"><code>let x = 1;</code></pre>
</body></html>"#;
        let result = parse_and_standardize(html);
        assert!(result.contains(r#"<pre><code data-lang="rust">"#));
    }
}