#![cfg(feature = "html")]
#![expect(
clippy::expect_used,
reason = "integration test: panicking on unexpected input is the assertion"
)]
use std::fs;
use std::path::PathBuf;
use rama_http::protocols::html::tokenizer::{
Cdata, Comment, Doctype, EndTag, StartTag, Text, TokenSink, Tokenizer,
};
use serde_json::Value;
/// Token sink that records the bytes handed to it, in arrival order.
///
/// Starts out empty via [`Default`].
#[derive(Default)]
struct Identity {
    /// Accumulated bytes of every token received so far.
    out: Vec<u8>,
}
impl Identity {
    /// Appends `raw` to the end of the recorded output.
    fn record(&mut self, raw: &[u8]) {
        self.out.extend_from_slice(raw);
    }
}

/// Every callback does the same thing: append the token's `raw()` bytes to
/// `out`, so the sink ends up holding the tokens concatenated in the order
/// they were delivered.
impl TokenSink for Identity {
    fn start_tag(&mut self, tag: &StartTag<'_>) {
        self.record(tag.raw());
    }

    fn end_tag(&mut self, tag: &EndTag<'_>) {
        self.record(tag.raw());
    }

    fn text(&mut self, text: &Text<'_>) {
        self.record(text.raw());
    }

    fn comment(&mut self, comment: &Comment<'_>) {
        self.record(comment.raw());
    }

    fn cdata(&mut self, cdata: &Cdata<'_>) {
        self.record(cdata.raw());
    }

    fn doctype(&mut self, doctype: &Doctype<'_>) {
        self.record(doctype.raw());
    }
}
/// Returns `<CARGO_MANIFEST_DIR>/tests/html5lib-tokenizer`, the directory the
/// test below scans for `.test` files.
///
/// The manifest directory is baked in at compile time through `env!`.
fn corpus_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    dir.push("tests");
    dir.push("html5lib-tokenizer");
    dir
}
/// Decodes literal `\uXXXX` escapes (backslash, `u`, exactly four hex digits)
/// embedded in `s`, as used by corpus entries flagged `doubleEscaped`.
///
/// * Each escape denotes one UTF-16 code unit. A high surrogate immediately
///   followed by an escaped low surrogate is combined into the single scalar
///   value the pair encodes.
/// * A surrogate without a partner cannot be stored in a Rust `String`, so it
///   is replaced with U+FFFD.
/// * A `\u` that is not followed by four hex digits is not an escape and is
///   copied through unchanged, together with whatever follows it.
/// * All other text is copied verbatim.
fn decode_double_escaped(s: &str) -> String {
    const ESCAPE: &str = "\\u";

    /// Parses the first four bytes of `text` as a hexadecimal code unit.
    ///
    /// Returns `None` when fewer than four bytes remain, when byte 4 is not
    /// a character boundary, or when any of the four is not a hex digit (the
    /// explicit digit check also rejects the leading `+` that
    /// `from_str_radix` would otherwise accept).
    fn hex4(text: &str) -> Option<u32> {
        let digits = text.get(..4)?;
        if digits.bytes().all(|b| b.is_ascii_hexdigit()) {
            u32::from_str_radix(digits, 16).ok()
        } else {
            None
        }
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(pos) = rest.find(ESCAPE) {
        // Everything before the marker is ordinary text.
        out.push_str(&rest[..pos]);
        let after = &rest[pos + ESCAPE.len()..];

        let Some(unit) = hex4(after) else {
            // Malformed escape: keep the marker literally and resume scanning
            // right behind it so the following characters are copied as text.
            out.push_str(ESCAPE);
            rest = after;
            continue;
        };

        let mut consumed = 4;
        let mut code_point = unit;
        if (0xD800..=0xDBFF).contains(&unit) {
            // High surrogate: look for a directly following escaped low
            // surrogate and merge the two units into one scalar value.
            let low = after[consumed..]
                .strip_prefix(ESCAPE)
                .and_then(hex4)
                .filter(|low| (0xDC00..=0xDFFF).contains(low));
            if let Some(low) = low {
                code_point = 0x1_0000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                consumed += ESCAPE.len() + 4;
            }
        }

        // `from_u32` only fails for unpaired surrogates here.
        out.push(char::from_u32(code_point).unwrap_or('\u{FFFD}'));
        rest = &after[consumed..];
    }
    out.push_str(rest);
    out
}
fn assert_identity(input: &[u8], origin: &str) {
let mut sink = Identity::default();
Tokenizer::new()
.with_strict(false)
.tokenize(input, &mut sink)
.expect("lenient tokenizer never errors");
assert_eq!(
sink.out,
input,
"identity failed in {origin}: {:?}",
String::from_utf8_lossy(input)
);
}
/// Feeds every `input` string found in the corpus directory's `.test` files
/// through [`assert_identity`].
///
/// Each file is parsed as JSON; cases are taken from the `tests` array, or
/// from `xmlViolationTests` when `tests` is absent. Entries without a string
/// `input` are skipped, and inputs flagged `doubleEscaped: true` are decoded
/// with [`decode_double_escaped`] first. The test fails if no `.test` file or
/// no case at all was found.
#[test]
fn html5lib_tokenizer_identity() {
    // Gather the `.test` files, sorted so the run order is stable.
    let mut test_files: Vec<PathBuf> = Vec::new();
    for entry in fs::read_dir(corpus_dir()).expect("open html5lib-tokenizer corpus dir") {
        let candidate = entry.expect("dir entry").path();
        if candidate.extension().and_then(|ext| ext.to_str()) == Some("test") {
            test_files.push(candidate);
        }
    }
    test_files.sort();

    let files = test_files.len();
    let mut cases = 0_usize;
    for path in &test_files {
        let raw = fs::read_to_string(path).expect("read test file");
        let document: Value = serde_json::from_str(&raw).expect("parse test JSON");
        let origin = match path.file_name().and_then(|name| name.to_str()) {
            Some(name) => name,
            None => "<unknown>",
        };

        // Prefer "tests"; fall back to "xmlViolationTests" only when absent.
        let suite = match document.get("tests") {
            Some(found) => Some(found),
            None => document.get("xmlViolationTests"),
        };
        let Some(tests) = suite.and_then(Value::as_array) else {
            continue;
        };

        for test in tests {
            let Some(input) = test.get("input").and_then(Value::as_str) else {
                continue;
            };
            let double_escaped = matches!(test.get("doubleEscaped"), Some(Value::Bool(true)));

            // Only allocate when the input actually needs decoding.
            let decoded;
            let bytes = if double_escaped {
                decoded = decode_double_escaped(input);
                decoded.as_bytes()
            } else {
                input.as_bytes()
            };
            assert_identity(bytes, origin);
            cases += 1;
        }
    }

    assert!(files > 0, "no html5lib .test files found in corpus dir");
    assert!(cases > 0, "no test cases found in html5lib corpus");
    eprintln!("html5lib tokenizer identity: {cases} inputs across {files} files");
}