use crate::tokenizer::{
TagKind, TokenKind, Tokenizer, state_cdata_section, state_plaintext, state_raw_text,
state_rc_data, state_script_data,
};
use serde_json::Value;
use std::fs;
use std::path::Path;
/// A token in the normalized shape used to compare tokenizer output with the
/// expected output listed in the JSON test files.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
/// A DOCTYPE token; absent name/identifiers are represented as `None`.
Doctype {
name: Option<String>,
public_id: Option<String>,
system_id: Option<String>,
// NOTE(review): despite its name, both producers in this file store the
// force-quirks sense here: the expected side negates the flag read from
// the test file and the actual side copies the tokenizer's `force_quirks`.
correctness: bool,
},
/// A start tag with its attributes as `(name, value)` pairs.
StartTag {
name: String,
attributes: Vec<(String, String)>,
self_closing: bool,
},
/// An end tag; only the tag name is compared.
EndTag {
name: String,
},
/// A comment and its text.
Comment {
data: String,
},
/// A run of character data.
Character {
data: String,
},
}
/// Tokenizer state a test case starts in.
///
/// Parsed from the state names listed in a test description; the `Debug`
/// output is also embedded in the generated test-case name.
#[derive(Debug)]
enum InitialState {
Data,
Plaintext,
RcData,
RawText,
ScriptData,
CdataSection,
}
/// One runnable test: a single input paired with a single initial state.
#[derive(Debug)]
struct TestCase {
/// Human-readable label, used as the message when the final comparison fails.
name: String,
/// State the tokenizer is switched to before tokenizing.
initial_state: InitialState,
/// Text handed to the tokenizer (already unescaped if the test was double-escaped).
input: String,
/// Value passed to the tokenizer as the last emitted start tag name, if any.
last_start_tag: Option<String>,
/// Tokens the tokenizer is expected to produce before EOF.
expected_output: Vec<Token>,
}
/// Top-level structure of a `.test` file: an object with a `tests` array.
#[derive(serde::Deserialize)]
struct TestCollection {
tests: Vec<TestDescription>,
}
/// A single entry of the `tests` array, as written in the JSON file.
///
/// Field names are camelCase in the JSON (`doubleEscaped`, `initialStates`,
/// `lastStartTag`); the fields marked `default` may be omitted there.
#[derive(serde::Deserialize)]
#[serde(rename_all = "camelCase")]
struct TestDescription {
/// Free-text description, included in the generated test-case name.
description: String,
/// Tokenizer input, possibly still carrying `\uXXXX` escapes (see `double_escaped`).
input: String,
/// When true, the input and expected strings are passed through `unescape`.
#[serde(default)]
double_escaped: bool,
/// Names of the initial states to run this test in; empty means the data state only.
#[serde(default)]
initial_states: Vec<String>,
/// Last start tag name to preset on the tokenizer; an empty string is treated as absent.
#[serde(default)]
last_start_tag: String,
/// Expected tokens; each is an array whose first element is the token type name.
output: Vec<Vec<Value>>,
}
/// Undoes the extra `\uXXXX` escaping layer used by double-escaped test data.
///
/// When `double_escaped` is `false`, `s` is returned unchanged. Otherwise every
/// backslash must start a `\uXXXX` sequence with exactly four hexadecimal
/// digits. Because the escapes denote UTF-16 code units, a high surrogate that
/// is immediately followed by an escaped low surrogate is combined into the
/// single scalar value the pair encodes.
///
/// # Panics
///
/// Panics if a backslash is not followed by `u`, if fewer than four hex digits
/// follow, or if the escape denotes a lone surrogate (which a Rust `String`
/// cannot represent).
fn unescape(double_escaped: bool, s: &str) -> String {
    // Consumes exactly four hex digits and returns the code unit they encode.
    fn hex_unit(chars: &mut std::str::Chars<'_>) -> u32 {
        let hex: String = chars.by_ref().take(4).collect();
        // `from_str_radix` alone would accept short input and a leading `+`.
        assert!(
            hex.len() == 4 && hex.bytes().all(|b| b.is_ascii_hexdigit()),
            "invalid escape sequence: expected 4 hex digits, got {:?}",
            hex
        );
        u32::from_str_radix(&hex, 16).expect("hex digits were validated above")
    }

    if !double_escaped {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        assert_eq!(chars.next(), Some('u'), "invalid escape sequence");
        let mut unit = hex_unit(&mut chars);

        if (0xD800..=0xDBFF).contains(&unit) {
            // Look ahead on a copy so nothing is consumed unless a low
            // surrogate escape really follows.
            let mut ahead = chars.clone();
            if ahead.next() == Some('\\') && ahead.next() == Some('u') {
                let low = hex_unit(&mut ahead);
                if (0xDC00..=0xDFFF).contains(&low) {
                    unit = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                    chars = ahead;
                }
            }
        }

        match char::from_u32(unit) {
            Some(decoded) => out.push(decoded),
            None => panic!("invalid utf-8 codepoint: {:04X}", unit),
        }
    }
    out
}
fn to_test_case(category: &str, desc: TestDescription) -> Vec<TestCase> {
let initial_states: Vec<_> = if !desc.initial_states.is_empty() {
desc.initial_states
.into_iter()
.map(|s| match s.as_str() {
"Data state" => InitialState::Data,
"PLAINTEXT state" => InitialState::Plaintext,
"RCDATA state" => InitialState::RcData,
"RAWTEXT state" => InitialState::RawText,
"Script data state" => InitialState::ScriptData,
"CDATA section state" => InitialState::CdataSection,
s => panic!("unrecognized initial state {s}"),
})
.collect()
} else {
vec![InitialState::Data]
};
let input = unescape(desc.double_escaped, &desc.input);
let mut expected_tokens = Vec::new();
for mut items in desc.output {
let tok = match items[0].as_str().unwrap() {
"Character" => {
let text = items[1].as_str().unwrap().to_string();
Token::Character {
data: unescape(desc.double_escaped, &text),
}
}
"Comment" => {
let comment = items[1].as_str().unwrap().to_string();
Token::Comment {
data: unescape(desc.double_escaped, &comment),
}
}
"EndTag" => {
let name = items[1].as_str().unwrap().to_string();
Token::EndTag {
name: unescape(desc.double_escaped, &name),
}
}
"StartTag" => {
let name = items[1].as_str().unwrap().to_string();
let attributes = items[2]
.as_object()
.unwrap()
.iter()
.map(|(k, v)| (k.clone(), v.as_str().unwrap().to_string()))
.collect();
let self_closing = items
.get(3)
.map(|v| v.as_bool().unwrap())
.unwrap_or_default();
Token::StartTag {
name: unescape(desc.double_escaped, &name),
attributes,
self_closing,
}
}
"DOCTYPE" => {
let name = match &mut items[1] {
Value::Null => None,
Value::String(s) => Some(unescape(desc.double_escaped, s)),
c => panic!("unexpected doctype name value: {:?}", c),
};
let public_id = match &mut items[2] {
Value::Null => None,
Value::String(s) => Some(unescape(desc.double_escaped, s)),
c => panic!("unexpected doctype public id value: {:?}", c),
};
let system_id = match &mut items[3] {
Value::Null => None,
Value::String(s) => Some(unescape(desc.double_escaped, s)),
c => panic!("unexpected doctype system id value: {:?}", c),
};
let correctness = !items[4].as_bool().unwrap();
Token::Doctype {
name,
public_id,
system_id,
correctness,
}
}
c => panic!("unexpected test token type: {}", c),
};
expected_tokens.push(tok);
}
initial_states
.into_iter()
.map(|s| {
let name = format!(
"[{}] {} // Initial State: {:?}",
&category, desc.description, &s,
);
TestCase {
name,
initial_state: s,
input: input.clone(),
last_start_tag: if desc.last_start_tag.is_empty() {
None
} else {
Some(desc.last_start_tag.clone())
},
expected_output: expected_tokens.clone(),
}
})
.collect::<Vec<_>>()
}
/// Runs every test of `tokenizer/testfiles/<category>.test` against the tokenizer.
///
/// For each test case the tokenizer is created over the case's input, switched
/// to the requested initial state (the data state needs no switch), given the
/// last start tag name, and then drained until it reports EOF. The produced
/// tokens are converted to [`Token`] values, with start-tag attributes sorted
/// by name, and compared to the expected output.
///
/// # Panics
///
/// Panics if the file cannot be read or parsed, if a case produces more than
/// 1000 tokens without reaching EOF, or if the produced tokens differ from the
/// expected ones (the assertion message is the test-case name).
fn run_test_category(category: &str) {
    let path = Path::new("tokenizer")
        .join("testfiles")
        .join(format!("{}.test", category));
    let json_str = fs::read_to_string(&path)
        .unwrap_or_else(|e| panic!("failed to read {}: {}", path.display(), e));
    let collection: TestCollection = serde_json::from_str(&json_str)
        .unwrap_or_else(|e| panic!("failed to parse {}: {}", path.display(), e));

    let test_cases = collection
        .tests
        .into_iter()
        .flat_map(|t| to_test_case(category, t));

    for test_case in test_cases {
        let mut tokenizer = Tokenizer::new(&test_case.input);
        match test_case.initial_state {
            InitialState::CdataSection => tokenizer.switch_to(state_cdata_section),
            InitialState::Plaintext => tokenizer.switch_to(state_plaintext),
            InitialState::RawText => tokenizer.switch_to(state_raw_text),
            InitialState::RcData => tokenizer.switch_to(state_rc_data),
            InitialState::ScriptData => tokenizer.switch_to(state_script_data),
            InitialState::Data => {}
        };
        tokenizer.set_last_emitted_start_tag_name(test_case.last_start_tag);

        // Guard against a tokenizer that never reaches EOF.
        let max_tokens = 1000;
        let mut tokens = Vec::new();
        loop {
            if tokens.len() > max_tokens {
                panic!(
                    "no eof token after {} tokens: {}",
                    max_tokens, test_case.name
                );
            }
            let next = tokenizer.next_token();
            let token = match next.kind {
                TokenKind::Eof => break,
                TokenKind::Comment(s) => Token::Comment { data: s },
                TokenKind::Doctype(d) => Token::Doctype {
                    name: d.name,
                    correctness: d.force_quirks,
                    public_id: d.public_identifier,
                    system_id: d.system_identifier,
                },
                TokenKind::Tag(t) => match &t.kind {
                    TagKind::Start => {
                        let mut attributes: Vec<_> = t
                            .attributes
                            .into_iter()
                            .map(|a| (a.name, a.value))
                            .collect();
                        // Sort by name so the comparison does not depend on
                        // the order in which attributes were tokenized.
                        attributes.sort_by(|a, b| a.0.cmp(&b.0));
                        Token::StartTag {
                            name: t.name,
                            attributes,
                            self_closing: t.self_closing,
                        }
                    }
                    TagKind::End => Token::EndTag { name: t.name },
                },
                TokenKind::Text(t) => Token::Character { data: t },
            };
            tokens.push(token);
        }

        assert_eq!(tokens, test_case.expected_output, "{}", test_case.name);
    }
}
/// Runs the `contentModelFlags` test category.
#[test]
fn content_model_flags() {
run_test_category("contentModelFlags");
}
/// Runs the `domjs` test category.
#[test]
fn domjs() {
run_test_category("domjs");
}
/// Runs the `entities` test category.
#[test]
fn entities() {
run_test_category("entities");
}
/// Runs the `namedEntities` test category.
#[test]
fn named_entities() {
run_test_category("namedEntities");
}
/// Runs the `numericEntities` test category.
#[test]
fn numeric_entities() {
run_test_category("numericEntities");
}
/// Runs the `pendingSpecChanges` test category.
#[test]
fn pending_spec_changes() {
run_test_category("pendingSpecChanges");
}
/// Runs the `test1` through `test4` categories, in that order, as one test.
#[test]
fn test_1234() {
    for category in ["test1", "test2", "test3", "test4"] {
        run_test_category(category);
    }
}
/// Runs the `unicodeChars` test category.
#[test]
fn unicode_chars() {
run_test_category("unicodeChars");
}
/// Runs the `unicodeCharsProblematic` test category; ignored by default for
/// the reason given in the `ignore` attribute.
#[test]
#[ignore = "this crate only supports utf8 input"]
fn unicode_chars_problematic() {
run_test_category("unicodeCharsProblematic");
}
/// Runs the `xmlViolation` test category; ignored by default for the reason
/// given in the `ignore` attribute.
#[test]
#[ignore = "todo: figure out what these are actually testing"]
fn xml_violation() {
run_test_category("xmlViolation");
}