mod diff;
mod dom;
mod pretty;
mod runtime;
mod select;
pub use diff::diff_html;
pub use pretty::pretty_print;
pub use select::select_html;
use std::cell::Cell;
use std::fmt::Write as _;
use std::time::Duration;
// Per-thread verbosity flag. Starts as `false`; written by `set_verbose` and
// read by `is_verbose`. Because it is thread-local, each thread sees its own value.
thread_local! {
static VERBOSE: Cell<bool> = const { Cell::new(false) };
}
/// Turns verbose diagnostics on or off for the calling thread.
///
/// The flag lives in a thread-local, so other threads are unaffected.
pub fn set_verbose(v: bool) {
    VERBOSE.set(v);
}
/// Reports whether verbose diagnostics are enabled on the calling thread.
fn is_verbose() -> bool {
    VERBOSE.get()
}
/// Wraps rendered HTML in a small JSON report.
///
/// The object has three fields: `raw_bytes` (passed through unchanged),
/// `rendered_bytes` (the byte length of `html`) and `html` (the markup,
/// escaped so it is a valid JSON string). The output ends with a newline.
#[must_use]
pub fn to_json(raw_bytes: usize, html: &str) -> String {
    let rendered_bytes = html.len();
    let escaped = json_escape(html);
    format!(
        "{{\n \"raw_bytes\": {raw_bytes},\n \"rendered_bytes\": {rendered_bytes},\n \"html\": \"{escaped}\"\n}}\n"
    )
}
/// Escapes `s` so it can be placed between double quotes in a JSON document.
///
/// Quotes, backslashes, `\n`, `\r` and `\t` get their short escapes; any other
/// character below U+0020 is written as a `\uXXXX` escape. Everything else is
/// copied through unchanged.
fn json_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let short_form = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            _ => None,
        };
        if let Some(seq) = short_form {
            escaped.push_str(seq);
        } else if u32::from(ch) < 0x20 {
            // Remaining control characters have no short escape in JSON.
            write!(escaped, "\\u{:04x}", u32::from(ch)).unwrap();
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
/// HTTP settings applied to outgoing requests made while rendering.
///
/// `Default` yields no user agent override, no extra headers, no proxy and
/// `forward_headers == false`.
#[derive(Default, Clone)]
pub struct HttpConfig {
/// When set, `apply` sends this value as the `User-Agent` header.
pub user_agent: Option<String>,
/// Extra `(name, value)` headers that `apply` sets on every request, in order.
pub headers: Vec<(String, String)>,
/// Proxy URL handed to `ureq::Proxy::new` by `agent`; if it is rejected the
/// error is printed to stderr and the agent is built without a proxy.
pub proxy: Option<String>,
/// NOTE(review): not read anywhere in this file — presumably consumed by the
/// JS runtime when scripts issue their own requests; confirm against `runtime`.
pub forward_headers: bool,
}
impl HttpConfig {
    /// Builds a `ureq` agent that honours the configured proxy.
    ///
    /// An invalid proxy URL is not fatal: the problem is reported on stderr and
    /// a proxy-less agent is returned instead.
    #[must_use]
    pub fn agent(&self) -> ureq::Agent {
        let builder = ureq::AgentBuilder::new();
        let Some(proxy_url) = self.proxy.as_ref() else {
            return builder.build();
        };
        match ureq::Proxy::new(proxy_url) {
            Ok(proxy) => builder.proxy(proxy).build(),
            Err(e) => {
                eprintln!("[proxy error] {proxy_url}: {e}");
                builder.build()
            }
        }
    }

    /// Decorates `req` with the configured `User-Agent` (if any) followed by
    /// every entry of `headers`, and returns the updated request.
    pub fn apply(&self, req: ureq::Request) -> ureq::Request {
        let with_agent = match &self.user_agent {
            Some(ua) => req.set("User-Agent", ua),
            None => req,
        };
        self.headers
            .iter()
            .fold(with_agent, |acc, (name, value)| acc.set(name, value))
    }
}
/// Turns a script `src` attribute into an absolute URL that can be fetched.
///
/// * `data:` and `blob:` sources are never fetched (`None`).
/// * `http://` / `https://` sources are returned as-is.
/// * Protocol-relative sources (`//host/...`) are given the `https:` scheme.
/// * Anything else is joined onto `base`; `None` is returned when there is no
///   base, the base does not parse, or the join fails.
fn resolve_url(src: &str, base: Option<&str>) -> Option<String> {
    let has_prefix = |prefixes: &[&str]| prefixes.iter().any(|p| src.starts_with(*p));

    if has_prefix(&["data:", "blob:"]) {
        return None;
    }
    if has_prefix(&["http://", "https://"]) {
        return Some(src.to_owned());
    }
    if src.starts_with("//") {
        return Some(format!("https:{src}"));
    }

    url::Url::parse(base?)
        .ok()?
        .join(src)
        .ok()
        .map(|joined| joined.to_string())
}
fn fetch_script(url: &str, cfg: &HttpConfig) -> Option<String> {
let body = match cfg.apply(cfg.agent().get(url)).call() {
Ok(r) => r.into_string().ok()?,
Err(e) => {
eprintln!("[fetch error] {url}: {e}");
return None;
}
};
let trimmed = body.trim_start();
if trimmed.starts_with("import ")
|| trimmed.starts_with("import{")
|| trimmed.starts_with("export ")
{
if let Some(target) = single_reexport_target(trimmed)
&& let Some(resolved) = resolve_url(target, Some(url))
{
if is_verbose() {
eprintln!("[module-shim] {url} → {resolved}");
}
return fetch_script(&resolved, cfg);
}
if is_verbose() {
eprintln!("[skip] {url}: ES module syntax requires a module loader");
}
return None;
}
Some(body)
}
/// Recognises a module that is nothing but one side-effect import of a path,
/// e.g. `import './bundle.js';`, and returns that path.
///
/// Returns `None` unless all of the following hold:
/// * the trimmed source starts with `import ` followed by a quoted specifier,
/// * nothing but whitespace and semicolons follows the closing quote,
/// * the specifier starts with `./`, `../` or `/` (bare package names such as
///   `'react'` are rejected).
fn single_reexport_target(src: &str) -> Option<&str> {
    let after_keyword = src.trim().strip_prefix("import ")?.trim_start();

    // The specifier must open with a quote and close with the same quote.
    let quote = after_keyword
        .chars()
        .next()
        .filter(|c| matches!(c, '\'' | '"'))?;
    let (specifier, remainder) = after_keyword[1..].split_once(quote)?;

    let leftover = remainder.trim().trim_start_matches(';').trim();
    let is_path_like = ["./", "../", "/"]
        .iter()
        .any(|prefix| specifier.starts_with(*prefix));

    (leftover.is_empty() && is_path_like).then_some(specifier)
}
/// Collects the source text of every script in document order.
///
/// Inline scripts are passed through untouched. External scripts are resolved
/// against `page_url` and downloaded with `cfg`; ones that cannot be resolved
/// or fetched are dropped. `max_remote`, when set, caps the number of
/// *successfully fetched* external scripts — failed fetches do not count
/// towards it. Inline scripts are never limited.
fn load_scripts(
    sources: Vec<dom::ScriptSource>,
    page_url: Option<&str>,
    cfg: &HttpConfig,
    max_remote: Option<usize>,
) -> Vec<String> {
    let mut collected = Vec::new();
    let mut fetched_count = 0usize;

    for source in sources {
        let src = match source {
            dom::ScriptSource::Inline(code) => {
                collected.push(code);
                continue;
            }
            dom::ScriptSource::External(src) => src,
        };

        let limit_reached = matches!(max_remote, Some(max) if fetched_count >= max);
        if limit_reached {
            if is_verbose() {
                eprintln!("[skip] --max-scripts limit reached, skipping {src}");
            }
            continue;
        }

        let Some(url) = resolve_url(&src, page_url) else {
            continue;
        };
        if is_verbose() {
            eprintln!("[fetch] {url}");
        }
        if let Some(code) = fetch_script(&url, cfg) {
            fetched_count += 1;
            collected.push(code);
        }
    }

    collected
}
/// Escapes `raw` for use inside a single-quoted JavaScript string literal.
///
/// Besides backslashes and single quotes, line terminators are escaped too: a
/// raw newline or carriage return inside a quoted literal is a syntax error,
/// and U+2028 / U+2029 are rejected by engines predating ES2019.
fn escape_js_single_quoted(raw: &str) -> String {
    let mut out = String::with_capacity(raw.len());
    for c in raw.chars() {
        match c {
            '\\' => out.push_str("\\\\"),
            '\'' => out.push_str("\\'"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\u{2028}' => out.push_str("\\u2028"),
            '\u{2029}' => out.push_str("\\u2029"),
            other => out.push(other),
        }
    }
    out
}

/// Generates a JavaScript prelude that defines `_r_meta`, an object keyed by
/// meta name whose values mimic a `<meta>` element (`name`, `content`,
/// `getAttribute`, `hasAttribute`).
///
/// Returns an empty string when `meta` is empty. Names and contents are
/// escaped so arbitrary text — including multi-line descriptions — cannot break
/// out of the generated string literals. Entry order follows the map's
/// iteration order and is therefore unspecified.
fn build_meta_script(meta: &std::collections::HashMap<String, String>) -> String {
    if meta.is_empty() {
        return String::new();
    }
    let mut out = String::from("var _r_meta = {");
    for (name, content) in meta {
        let name_esc = escape_js_single_quoted(name);
        let content_esc = escape_js_single_quoted(content);
        write!(
            out,
            "'{name_esc}':{{name:'{name_esc}',content:'{content_esc}',\
             getAttribute:function(n){{return n==='content'?this.content:n==='name'?this.name:null;}},\
             hasAttribute:function(n){{return n==='content'||n==='name';}}}},"
        )
        .unwrap();
    }
    out.push_str("};");
    out
}
/// Executes the scripts of a document and returns the resulting HTML.
///
/// * `input` — an HTML document, or bare JavaScript when `is_js` is true (it
///   is then wrapped in a minimal HTML page with a single `<script>`).
/// * `page_url` — base for resolving external scripts; also passed to the
///   JavaScript runtime.
/// * `cfg` — HTTP settings used for script downloads and passed to the runtime.
/// * `clean` — when true the output is post-processed by `clean_document`.
/// * `max_scripts` — cap on fetched external scripts (see `load_scripts`).
/// * `script_timeout` — selects a runtime with or without a timeout.
///
/// A script derived from the document's meta data, if any, runs before all
/// other scripts. After execution, console messages are echoed to stderr in
/// verbose mode.
///
/// The runtime's body markup is only used when the original body was small
/// (under 512 bytes of non-script content) or the rendered body is at least
/// half that size; otherwise an empty body string is handed to the serializer.
///
/// # Errors
///
/// Propagates any error returned by the runtime's `execute`.
pub fn render(
    input: &str,
    is_js: bool,
    page_url: Option<&str>,
    cfg: &HttpConfig,
    clean: bool,
    max_scripts: Option<usize>,
    script_timeout: Option<Duration>,
) -> anyhow::Result<String> {
    let html = if is_js {
        format!("<!DOCTYPE html><html><head></head><body><script>{input}</script></body></html>")
    } else {
        input.to_owned()
    };
    let doc = dom::parse(&html);

    // Assemble the script list; the meta prelude must come first.
    let meta_prelude = build_meta_script(&doc.collect_meta());
    let mut scripts = load_scripts(doc.extract_scripts(), page_url, cfg, max_scripts);
    if !meta_prelude.is_empty() {
        scripts.insert(0, meta_prelude);
    }

    let rt = if let Some(limit) = script_timeout {
        runtime::JsRuntime::with_timeout(limit)
    } else {
        runtime::JsRuntime::without_timeout()
    };
    rt.execute(&scripts, page_url, cfg)?;

    let console_lines = runtime::JsRuntime::logged_messages();
    for line in console_lines {
        if is_verbose() {
            eprintln!("[console] {line}");
        }
    }

    let body_html = runtime::JsRuntime::body_inner_html();
    let raw_body_len = raw_body_content_len(&html);
    let trust_rendered_body = raw_body_len < 512 || body_html.len() * 2 >= raw_body_len;
    let effective_body = if trust_rendered_body {
        body_html.as_str()
    } else {
        ""
    };

    let injected = runtime::JsRuntime::written_html();
    let serialized = doc.serialize_with_body_and_injection(effective_body, &injected);
    if clean {
        Ok(clean_document(serialized))
    } else {
        Ok(serialized)
    }
}
/// Strips script-related markup from rendered HTML.
///
/// Three passes run in this order: `<script>` elements are removed, then
/// module/script preload `<link>` tags, and finally `<noscript>` wrappers are
/// removed while their contents are kept.
#[must_use]
pub fn clean_document(html: String) -> String {
    let without_scripts = remove_script_elements(html);
    let without_preloads = remove_preload_links(without_scripts);
    unwrap_noscript(without_preloads)
}
/// Deletes every `<script ...>...</script>` element from `html`.
///
/// A match for `<script` only counts as a script tag when the next byte ends
/// the tag name (whitespace, `>`, `/`, or end of input). Look-alikes such as
/// `<scripted>` are left in place and scanning continues after them, so later
/// real script elements are still removed. An unterminated script element is
/// removed through to the end of the input. Matching is case-sensitive.
fn remove_script_elements(mut html: String) -> String {
    const OPEN: &str = "<script";
    const CLOSE: &str = "</script>";

    let mut pos = 0;
    while let Some(start) = html[pos..].find(OPEN).map(|p| pos + p) {
        let after_name = start + OPEN.len();
        let next = html.as_bytes().get(after_name).copied();
        let is_script_tag = matches!(
            next,
            Some(b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/') | None
        );
        if !is_script_tag {
            // Different element whose name merely starts with "script":
            // step past it instead of giving up on the rest of the document.
            pos = after_name;
            continue;
        }
        let end = html[start..]
            .find(CLOSE)
            .map_or(html.len(), |p| start + p + CLOSE.len());
        html.drain(start..end);
        // Whatever followed the removed element now begins at `start`.
        pos = start;
    }
    html
}
/// Removes `<link>` tags that preload JavaScript.
///
/// A tag is dropped when its text contains `modulepreload`, or contains both
/// `preload` and `as="script"`. All other `<link>` tags are kept. Scanning
/// stops at the first `<link` that has no closing `>`.
fn remove_preload_links(mut html: String) -> String {
    const OPEN: &str = "<link";

    let mut cursor = 0;
    while let Some(tag_start) = html[cursor..].find(OPEN).map(|off| cursor + off) {
        let Some(tag_end) = html[tag_start..].find('>').map(|off| tag_start + off + 1) else {
            break;
        };

        let tag = &html[tag_start..tag_end];
        let preloads_script = tag.contains("modulepreload")
            || (tag.contains("preload") && tag.contains("as=\"script\""));

        if preloads_script {
            // The cursor stays put: the following text slid into this position.
            html.drain(tag_start..tag_end);
        } else {
            cursor = tag_end;
        }
    }
    html
}
/// Removes `<noscript ...>` and `</noscript>` tags but keeps what is between
/// them, so fallback content becomes part of the regular document.
///
/// An opening tag without a matching close is still removed. Processing stops
/// if an opening `<noscript` has no terminating `>`.
fn unwrap_noscript(mut html: String) -> String {
    const OPEN: &str = "<noscript";
    const CLOSE: &str = "</noscript>";

    while let Some(open_at) = html.find(OPEN) {
        let Some(open_len) = html[open_at..].find('>').map(|off| off + 1) else {
            break;
        };
        html.drain(open_at..open_at + open_len);

        // After the drain, the wrapped content begins at `open_at`.
        if let Some(off) = html[open_at..].find(CLOSE) {
            let close_at = open_at + off;
            html.drain(close_at..close_at + CLOSE.len());
        }
    }
    html
}
/// Measures, in bytes, how much non-script markup the original body holds.
///
/// The body is the text between the end of the first `<body ...>` tag and the
/// last `</body>`; when either is missing, the start or end of the input (or
/// the first `>` found) is used instead. Every `<script` element inside it —
/// up to its `</script>`, else the next `/>`, else the end of the body — is
/// excluded from the count.
fn raw_body_content_len(html: &str) -> usize {
    const SCRIPT_OPEN: &str = "<script";
    const SCRIPT_CLOSE: &str = "</script>";
    const SELF_CLOSE: &str = "/>";

    let body_tag_at = html.find("<body").unwrap_or(0);
    let content_start = match html[body_tag_at..].find('>') {
        Some(i) => body_tag_at + i + 1,
        None => 0,
    };
    let content_end = html.rfind("</body>").unwrap_or(html.len());
    let body = &html[content_start.min(content_end)..content_end];

    // Add up the bytes taken by script elements, then subtract them at the end.
    let mut script_bytes = 0;
    let mut rest = body;
    while let Some(open) = rest.find(SCRIPT_OPEN) {
        let tail = &rest[open..];
        let span = tail
            .find(SCRIPT_CLOSE)
            .map(|e| e + SCRIPT_CLOSE.len())
            .or_else(|| tail.find(SELF_CLOSE).map(|e| e + SELF_CLOSE.len()))
            .unwrap_or(tail.len());
        script_bytes += span;
        rest = &tail[span..];
    }
    body.len() - script_bytes
}
/// Downloads the page at `url` and renders it.
///
/// The page is treated as HTML, `url` serves as the base URL, no limit is
/// placed on external scripts and script execution gets a 30-second timeout.
///
/// # Errors
///
/// Fails when the request fails, the response body cannot be read as text, or
/// `render` itself returns an error.
pub fn render_url(url: &str, cfg: &HttpConfig, clean: bool) -> anyhow::Result<String> {
    let response = cfg.apply(cfg.agent().get(url)).call()?;
    let page = response.into_string()?;
    let timeout = Duration::from_secs(30);
    render(&page, false, Some(url), cfg, clean, None, Some(timeout))
}
// Tests for the rendering pipeline. Most of them feed HTML or bare JavaScript
// through `render` and assert on the produced markup; a few drive
// `runtime::JsRuntime` or the pure helpers (`to_json`, `single_reexport_target`)
// directly.
#[cfg(test)]
mod tests {
use super::*;
// Shorthand for `render` with the default HTTP config, no cleaning, no
// script limit and no timeout.
fn render_simple(input: &str, is_js: bool, page_url: Option<&str>) -> anyhow::Result<String> {
render(
input,
is_js,
page_url,
&HttpConfig::default(),
false,
None,
None,
)
}
// `document.write` output from an inline script appears alongside the static markup.
#[test]
fn html_inline_script_document_write() {
let input = concat!(
"<!DOCTYPE html><html><head><title>Test</title></head>",
"<body><h1>Before</h1>",
r#"<script>document.write("<p>Hello from JS!</p>"); console.log("done");</script>"#,
"</body></html>"
);
let out = render_simple(input, false, None).unwrap();
assert!(out.contains("<h1>Before</h1>"), "static content preserved");
assert!(
out.contains("<p>Hello from JS!</p>"),
"document.write injected"
);
}
// Bare-JS mode (`is_js == true`): repeated `document.write` calls in a loop.
#[test]
fn js_file_mode_loop() {
let js = concat!(
r#"document.write("<ul>");"#,
"\n",
r#"for (let i = 1; i <= 3; i++) { document.write("<li>Item " + i + "</li>"); }"#,
"\n",
r#"document.write("</ul>");"#,
"\n",
r#"console.log("rendered", 3, "items");"#,
);
let out = render_simple(js, true, None).unwrap();
assert!(out.contains("<li>Item 1</li>"), "first item");
assert!(out.contains("<li>Item 2</li>"), "second item");
assert!(out.contains("<li>Item 3</li>"), "third item");
}
// Console arguments are expected to be joined with a single space per call.
#[test]
fn console_messages_captured() {
let js = r#"console.log("hello", "world"); console.warn("oops");"#;
let rt = runtime::JsRuntime::with_timeout(std::time::Duration::from_secs(30));
rt.execute(&[js.to_owned()], None, &HttpConfig::default())
.unwrap();
let msgs = runtime::JsRuntime::logged_messages();
assert_eq!(msgs[0], "hello world");
assert_eq!(msgs[1], "oops");
}
#[test]
fn document_writeln_adds_newline() {
let js = r#"document.writeln("line1"); document.writeln("line2");"#;
let out = render_simple(js, true, None).unwrap();
assert!(out.contains("line1\nline2\n"), "writeln appends newline");
}
#[test]
fn window_aliases_global() {
let js = r#"window.document.write("<p>via window</p>");"#;
let out = render_simple(js, true, None).unwrap();
assert!(
out.contains("<p>via window</p>"),
"window.document.write works"
);
}
// A throwing script must not prevent later scripts from running.
#[test]
fn script_errors_are_non_fatal() {
let html = concat!(
"<!DOCTYPE html><html><body>",
"<script>throw new Error('deliberate');</script>",
"<script>document.write('<p>survived</p>');</script>",
"</body></html>"
);
let out = render_simple(html, false, None).unwrap();
assert!(
out.contains("<p>survived</p>"),
"rendering continues after script error"
);
}
#[test]
fn location_href_reflects_page_url() {
let js = r#"document.write(window.location.href);"#;
let out = render_simple(js, true, Some("https://example.com/page")).unwrap();
assert!(
out.contains("https://example.com/page"),
"location.href set from page_url"
);
}
// Touches several browser globals; only the user agent prefix is asserted,
// the other lookups just have to not throw before `document.write` runs.
#[test]
fn common_globals_accessible() {
let js = r#"
var ua = window.navigator.userAgent;
var tid = window.setTimeout(function(){}, 100);
var mq = window.matchMedia('(max-width: 768px)');
var mo = new window.MutationObserver(function(){});
document.write('<p>' + ua + '</p>');
"#;
let out = render_simple(js, true, None).unwrap();
assert!(out.contains("<p>rakers/"), "navigator.userAgent accessible");
}
#[test]
fn document_create_element_is_accessible() {
let js = r#"
var el = document.createElement('div');
el.className = 'test';
document.write('<p>' + el.className + '</p>');
"#;
let out = render_simple(js, true, None).unwrap();
assert!(out.contains("<p>test</p>"), "createElement stub works");
}
// Timer callbacks must have run by the time the body is read back.
#[test]
fn settimeout_callback_flushed() {
let html = concat!(
"<!DOCTYPE html><html><body>",
r#"<div id="app"></div>"#,
"<script>setTimeout(function() {",
r#"document.getElementById('app').innerHTML = '<h1>Rendered via setTimeout</h1>';"#,
"}, 0);</script>",
"</body></html>"
);
let out = render_simple(html, false, None).unwrap();
assert!(
out.contains("<h1>Rendered via setTimeout</h1>"),
"setTimeout callback flushed before readback"
);
}
#[test]
fn body_inner_html_set_directly() {
let js = r#"document.body.innerHTML = '<h1>Set directly</h1>';"#;
let out = render_simple(js, true, None).unwrap();
assert!(
out.contains("<h1>Set directly</h1>"),
"body.innerHTML = '...' captured"
);
}
#[test]
fn append_child_to_body() {
let js = r#"
var h1 = document.createElement('h1');
h1.innerHTML = 'Appended';
document.body.appendChild(h1);
"#;
let out = render_simple(js, true, None).unwrap();
assert!(
out.contains("<h1>Appended</h1>"),
"appendChild serialized into output"
);
}
#[test]
fn nested_elements_serialized() {
let js = r#"
var ul = document.createElement('ul');
for (var i = 1; i <= 3; i++) {
var li = document.createElement('li');
li.innerHTML = 'Item ' + i;
ul.appendChild(li);
}
document.body.appendChild(ul);
"#;
let out = render_simple(js, true, None).unwrap();
assert!(out.contains("<li>Item 1</li>"), "nested li 1");
assert!(out.contains("<li>Item 3</li>"), "nested li 3");
}
// Note: the wrapper page built for bare-JS input contains no element with
// id `app`, so this relies on `getElementById` still returning a usable object.
#[test]
fn get_element_by_id_content_with_append() {
let js = r#"
var app = document.getElementById('app');
app.innerHTML = '<p>App content</p>';
document.body.appendChild(app);
"#;
let out = render_simple(js, true, None).unwrap();
assert!(
out.contains("<p>App content</p>"),
"getElementById + appendChild captured"
);
}
// End-to-end check of `clean == true`: scripts and script preloads go away,
// stylesheet links and noscript contents stay.
#[test]
fn clean_removes_scripts_and_unwraps_noscript() {
let html = concat!(
"<!DOCTYPE html><html><head>",
r#"<link rel="modulepreload" href="/bundle.js">"#,
r#"<link rel="preload" as="script" href="/chunk.js">"#,
r#"<link rel="stylesheet" href="/style.css">"#, "</head><body>",
"<h1>Hello</h1>",
r#"<script src="/app.js"></script>"#,
"<script>var x = 1;</script>",
"<noscript><p>JS required</p></noscript>",
"</body></html>",
);
let out = render(html, false, None, &HttpConfig::default(), true, None, None).unwrap();
assert!(!out.contains("<script"), "script tags removed");
assert!(!out.contains("modulepreload"), "modulepreload link removed");
assert!(
!out.contains(r#"as="script""#),
"preload-script link removed"
);
assert!(
out.contains(r#"rel="stylesheet""#),
"stylesheet link preserved"
);
assert!(!out.contains("<noscript"), "noscript tags removed");
assert!(
out.contains("<p>JS required</p>"),
"noscript content preserved"
);
assert!(out.contains("<h1>Hello</h1>"), "regular content preserved");
}
// Ignored unless the `rquickjs` feature is enabled (see the ignore reason).
// An infinite loop in the first script must be interrupted so the second runs.
#[test]
#[cfg_attr(not(feature = "rquickjs"), ignore = "boa has no interrupt handler")]
fn script_timeout_is_non_fatal() {
let rt = runtime::JsRuntime::with_timeout(std::time::Duration::from_millis(100));
rt.execute(
&[
"while(true){}".to_owned(),
"document.write('<p>survived</p>');".to_owned(),
],
None,
&HttpConfig::default(),
)
.unwrap();
assert!(
runtime::JsRuntime::written_html().contains("<p>survived</p>"),
"second script must run after timeout interrupts the first"
);
}
#[test]
fn to_json_fields() {
let out = to_json(100, "<h1>hi</h1>");
assert!(out.contains("\"raw_bytes\": 100"), "raw_bytes field");
assert!(
out.contains("\"rendered_bytes\": 11"),
"rendered_bytes field"
);
assert!(out.contains("\"html\""), "html field present");
assert!(out.contains("<h1>hi</h1>"), "html content");
}
#[test]
fn to_json_escapes_special_chars() {
let out = to_json(0, "say \"hello\"\nline2\\end");
assert!(
out.contains(r#"say \"hello\"\nline2\\end"#),
"quotes, newline, backslash escaped: {out}"
);
}
// The expected markup also appears as a string literal inside the script
// source, so only the output after the first `</script>` is inspected.
#[test]
#[cfg_attr(not(feature = "rquickjs"), ignore = "boa microtask draining differs")]
fn fetch_stub_resolves_then_chain() {
let js = concat!(
"window.fetch('/api/data')",
".then(function(r){ return r.text(); })",
".then(function(t){ document.write('<p>fetch-ok</p>'); });",
);
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
let after_script = out.find("</script>").map(|i| &out[i..]).unwrap_or("");
assert!(
after_script.contains("<p>fetch-ok</p>"),
"fetch .then() chain must fire, got: {out}"
);
}
#[test]
#[cfg_attr(not(feature = "rquickjs"), ignore = "boa microtask draining differs")]
fn fetch_stub_json_resolves() {
let js = concat!(
"window.fetch('/api').then(function(r){ return r.json(); })",
".then(function(d){ document.write('<p>json-ok</p>'); });",
);
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
let after_script = out.find("</script>").map(|i| &out[i..]).unwrap_or("");
assert!(
after_script.contains("<p>json-ok</p>"),
"fetch.json() chain must fire, got: {out}"
);
}
#[test]
fn xhr_stub_fires_onload() {
let js = concat!(
"var xhr = new XMLHttpRequest();",
"xhr.open('GET', '/api/data');",
"xhr.onload = function() { document.write('<p>xhr-ok</p>'); };",
"xhr.send();",
);
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
let after_script = out.find("</script>").map(|i| &out[i..]).unwrap_or("");
assert!(
after_script.contains("<p>xhr-ok</p>"),
"XHR onload must fire, got: {out}"
);
}
// NOTE(review): unlike the sibling XHR/fetch tests this one searches the whole
// output, which includes the script source containing the same literal.
#[test]
fn xhr_stub_fires_addeventlistener_load() {
let js = concat!(
"var xhr = new XMLHttpRequest();",
"xhr.open('GET', '/api');",
"xhr.addEventListener('load', function() { document.write('<p>xhr-addev-ok</p>'); });",
"xhr.send();",
);
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
assert!(
out.contains("<p>xhr-addev-ok</p>"),
"XHR addEventListener('load') must fire, got: {out}"
);
}
#[test]
fn location_pathname_reflects_page_url() {
let js = r#"document.write(window.location.pathname)"#;
let out = render(
js,
true,
Some("https://example.com/foo/bar"),
&HttpConfig::default(),
false,
None,
None,
)
.unwrap();
assert!(
out.contains("/foo/bar"),
"pathname should be /foo/bar, got: {out}"
);
}
// Each component is written with a trailing `|` so the assertions can match
// it as a delimited token.
#[test]
fn location_fields_parsed_from_url() {
let js = concat!(
"document.write(window.location.protocol + '|');",
"document.write(window.location.hostname + '|');",
"document.write(window.location.pathname + '|');",
"document.write(window.location.search + '|');",
"document.write(window.location.hash);",
);
let out = render(
js,
true,
Some("https://example.com/path?q=1#sec"),
&HttpConfig::default(),
false,
None,
None,
)
.unwrap();
assert!(out.contains("https:|"), "protocol wrong: {out}");
assert!(out.contains("example.com|"), "hostname wrong: {out}");
assert!(out.contains("/path|"), "pathname wrong: {out}");
assert!(out.contains("?q=1|"), "search wrong: {out}");
assert!(out.contains("#sec"), "hash wrong: {out}");
}
#[test]
fn location_defaults_when_no_url() {
let js = r#"document.write(window.location.href)"#;
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
assert!(
out.contains("about:blank"),
"href should be about:blank when no URL given, got: {out}"
);
}
#[test]
fn history_state_updated_by_push() {
let js = concat!(
"window.history.pushState({page:1}, '');",
"document.write(JSON.stringify(window.history.state));",
);
let out = render(js, true, None, &HttpConfig::default(), false, None, None).unwrap();
assert!(
out.contains(r#""page""#) && out.contains('1'.to_string().as_str()),
"history.state should reflect pushed state, got: {out}"
);
}
// Accepts a lone side-effect import of a relative/absolute path; rejects
// multiple imports, named imports, bare package specifiers and non-module code.
#[test]
fn single_reexport_target_detects_shim() {
assert_eq!(
single_reexport_target("import './bundle.js'"),
Some("./bundle.js")
);
assert_eq!(
single_reexport_target("import \"../dist/app.js\";"),
Some("../dist/app.js")
);
assert_eq!(
single_reexport_target("import '/assets/main.js'\n"),
Some("/assets/main.js")
);
assert_eq!(
single_reexport_target("import './a.js'\nimport './b.js'"),
None
);
assert_eq!(
single_reexport_target("import { foo } from './lib.js'"),
None
);
assert_eq!(single_reexport_target("import 'react'"), None);
assert_eq!(single_reexport_target("(function(){ var x = 1; })()"), None);
}
// The page has only an inline script, so having a proxy configured must not
// affect the result.
#[test]
fn proxy_config_does_not_break_inline_rendering() {
let cfg = HttpConfig {
proxy: Some("socks5://127.0.0.1:9050".to_owned()),
..Default::default()
};
let html = r#"<html><body><script>document.write('<p>ok</p>');</script></body></html>"#;
let out = render(html, false, None, &cfg, false, None, None).unwrap();
assert!(
out.contains("<p>ok</p>"),
"inline script renders with proxy configured"
);
}
// The script issues a synchronous XHR (third `open` argument is `false`) with
// the proxy set to 127.0.0.1:1; rendering must still finish.
#[test]
fn proxy_fetch_failure_is_non_fatal() {
let cfg = HttpConfig {
proxy: Some("socks5://127.0.0.1:1".to_owned()),
..Default::default()
};
let html = concat!(
"<html><body><script>",
"var x = new XMLHttpRequest();",
"x.open('GET','http://example.com/data.json',false);",
"try { x.send(); } catch(e) {}",
"document.write('<p>done</p>');",
"</script></body></html>"
);
let out = render(html, false, None, &cfg, false, None, None).unwrap();
assert!(
out.contains("<p>done</p>"),
"render completes despite proxy failure"
);
}
}