extern crate rustc_serialize;
#[macro_use] extern crate string_cache;
extern crate tendril;
extern crate test;
extern crate html5ever;
mod foreach_html5lib_test;
use foreach_html5lib_test::foreach_html5lib_test;
use std::{char, env};
use std::ffi::OsStr;
use std::mem::replace;
use std::default::Default;
use std::path::Path;
use test::{TestDesc, TestDescAndFn, DynTestName, DynTestFn};
use test::ShouldPanic::No;
use rustc_serialize::json::Json;
use std::collections::BTreeMap;
use std::borrow::Cow::Borrowed;
use html5ever::tokenizer::{Doctype, Attribute, StartTag, EndTag, Tag};
use html5ever::tokenizer::{Token, DoctypeToken, TagToken, CommentToken};
use html5ever::tokenizer::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
use html5ever::tokenizer::{TokenSink, Tokenizer, TokenizerOpts};
use html5ever::tokenizer::states::{Plaintext, RawData, Rcdata, Rawtext};
use string_cache::{Atom, QualName};
use tendril::{StrTendril, SliceExt};
/// Enumerate ways of cutting `s` into at most `n` chunks (chunks may be
/// empty), so the tokenizer can be exercised across buffer boundaries.
/// Cuts are only made at char boundaries.
fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
    // Base case: the only 1-chunk split is the whole string.
    if n == 1 {
        return vec![vec![s.to_tendril()]];
    }

    // Candidate cut points: every char boundary, plus the end of the string
    // (which produces an empty final chunk).
    let mut cut_points: Vec<usize> = s.char_indices().map(|(idx, _)| idx).collect();
    cut_points.push(s.len());

    let mut result = vec![];
    for cut in cut_points {
        let tail = &s[cut..];
        // Recursively split the head into up to n-1 chunks, then append
        // the tail as the final chunk.
        for mut head in splits(&s[..cut], n - 1) {
            head.push(tail.to_tendril());
            result.push(head);
        }
    }
    // Also keep every split into fewer than n chunks.
    result.extend(splits(s, n - 1));
    result
}
/// A `TokenSink` that records the tokens it receives, normalized so they
/// can be compared against the html5lib expected output.
struct TokenLogger {
tokens: Vec<Token>, // finished tokens, in arrival order
current_str: StrTendril, // pending character data, coalesced until a non-character token arrives
exact_errors: bool, // when true, ParseError tokens are recorded (as empty placeholders)
}
impl TokenLogger {
    /// Create an empty logger. `exact_errors` controls whether parse
    /// errors are recorded or silently dropped.
    fn new(exact_errors: bool) -> TokenLogger {
        TokenLogger {
            tokens: Vec::new(),
            current_str: StrTendril::new(),
            exact_errors: exact_errors,
        }
    }

    /// Flush any buffered character data, then record `token`.
    fn push(&mut self, token: Token) {
        self.finish_str();
        self.tokens.push(token);
    }

    /// Emit the buffered character data as one `CharacterTokens`, if any.
    fn finish_str(&mut self) {
        if self.current_str.len() > 0 {
            // Swap an empty tendril in so the buffer is reset without cloning.
            let buffered = replace(&mut self.current_str, StrTendril::new());
            self.tokens.push(CharacterTokens(buffered));
        }
    }

    /// Consume the logger and return the completed token list.
    fn get_tokens(mut self) -> Vec<Token> {
        self.finish_str();
        self.tokens
    }
}
impl TokenSink for TokenLogger {
    /// Record one token, normalizing it the same way the expected-output
    /// decoder does so the two streams compare equal.
    fn process_token(&mut self, token: Token) {
        match token {
            // Character data is buffered and coalesced into a single token.
            CharacterTokens(cs) => self.current_str.push_slice(&cs),
            NullCharacterToken => self.current_str.push_char('\0'),

            // All parse errors are collapsed to one canonical empty-message
            // form, and dropped entirely unless exact matching is on.
            ParseError(_) => {
                if self.exact_errors {
                    self.push(ParseError(Borrowed("")));
                }
            }

            TagToken(mut tag) => {
                match tag.kind {
                    // End tags never carry attributes or self-closing flags
                    // in the expected output; strip them before comparison.
                    EndTag => {
                        tag.self_closing = false;
                        tag.attrs = Vec::new();
                    }
                    // Sort attributes so ordering differences don't matter.
                    _ => tag.attrs.sort_by(|a, b| a.name.cmp(&b.name)),
                }
                self.push(TagToken(tag));
            }

            // EOF is implicit in the expected output.
            EOFToken => (),

            other => self.push(other),
        }
    }
}
/// Feed `input` (a sequence of buffer chunks) through the tokenizer and
/// return the normalized token stream captured by a `TokenLogger`.
fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> {
    let logger = TokenLogger::new(opts.exact_errors);
    let mut tokenizer = Tokenizer::new(logger, opts);
    for chunk in input {
        tokenizer.feed(chunk);
    }
    tokenizer.end();
    // Recover the sink from the tokenizer and drain its tokens.
    tokenizer.unwrap().get_tokens()
}
/// Convenience accessors for `Json` values. Every method panics on a type
/// mismatch, which is acceptable in this test harness: malformed test data
/// should abort loudly.
trait JsonExt: Sized {
/// Returns the contained string as an owned `String`.
fn get_str(&self) -> String;
/// Returns the contained string as a tendril.
fn get_tendril(&self) -> StrTendril;
/// Returns `None` for JSON null, otherwise the string as a tendril.
fn get_nullable_tendril(&self) -> Option<StrTendril>;
/// Returns the contained boolean.
fn get_bool(&self) -> bool;
/// Returns a reference to the object's key/value map.
fn get_obj<'t>(&'t self) -> &'t BTreeMap<String, Self>;
/// Returns a reference to the array's element list.
fn get_list<'t>(&'t self) -> &'t Vec<Self>;
/// Looks up `key` in an object, panicking if absent.
// NOTE(review): call sites like `js.find("input").unwrap()` appear to hit
// Json's *inherent* `find` (which returns Option) rather than this trait
// method — inherent methods take priority. Verify before relying on this.
fn find<'t>(&'t self, key: &str) -> &'t Self;
}
impl JsonExt for Json {
    /// Extract an owned `String`; panics if the value is not a JSON string.
    fn get_str(&self) -> String {
        match *self {
            Json::String(ref s) => s.to_string(),
            _ => panic!("Json::get_str: not a String"),
        }
    }

    /// Extract the string as a tendril; panics if not a JSON string.
    fn get_tendril(&self) -> StrTendril {
        match *self {
            Json::String(ref s) => s.to_tendril(),
            _ => panic!("Json::get_tendril: not a String"),
        }
    }

    /// Extract an optional string: JSON null maps to `None`.
    fn get_nullable_tendril(&self) -> Option<StrTendril> {
        match *self {
            Json::Null => None,
            Json::String(ref s) => Some(s.to_tendril()),
            _ => panic!("Json::get_nullable_tendril: not a String"),
        }
    }

    /// Extract a boolean; panics if not a JSON boolean.
    fn get_bool(&self) -> bool {
        match *self {
            Json::Boolean(b) => b,
            _ => panic!("Json::get_bool: not a Boolean"),
        }
    }

    /// Borrow the object's key/value map; panics if not a JSON object.
    fn get_obj<'t>(&'t self) -> &'t BTreeMap<String, Json> {
        match *self {
            Json::Object(ref m) => &*m,
            _ => panic!("Json::get_obj: not an Object"),
        }
    }

    /// Borrow the array's element list; panics if not a JSON array.
    fn get_list<'t>(&'t self) -> &'t Vec<Json> {
        match *self {
            Json::Array(ref m) => m,
            _ => panic!("Json::get_list: not an Array"),
        }
    }

    /// Look up `key` in an object, panicking if the key is absent.
    fn find<'t>(&'t self, key: &str) -> &'t Json {
        // BTreeMap<String, _> supports borrowed &str lookups via Borrow,
        // so the previous `&key.to_string()` String allocation per lookup
        // was unnecessary.
        self.get_obj().get(key).unwrap()
    }
}
/// Decode one token from its html5lib JSON array form,
/// e.g. `["StartTag", "a", {"href": "x"}]`.
fn json_to_token(js: &Json) -> Token {
    let parts = js.get_list();
    // Everything after the token-kind tag is an argument.
    let args: Vec<&Json> = parts[1..].iter().collect();
    match &*parts[0].get_str() {
        "DOCTYPE" => DoctypeToken(Doctype {
            name: args[0].get_nullable_tendril(),
            public_id: args[1].get_nullable_tendril(),
            system_id: args[2].get_nullable_tendril(),
            // The fourth JSON field is negated here — presumably it encodes
            // "correctness", the inverse of force-quirks (per html5lib's
            // test format); the `!` must be preserved.
            force_quirks: !args[3].get_bool(),
        }),

        "StartTag" => TagToken(Tag {
            kind: StartTag,
            name: Atom::from(&*args[0].get_str()),
            attrs: args[1].get_obj().iter().map(|(key, value)| Attribute {
                name: QualName::new(ns!(), Atom::from(&**key)),
                value: value.get_tendril(),
            }).collect(),
            // The optional third argument flags a self-closing start tag.
            self_closing: args.get(2).map_or(false, |j| j.get_bool()),
        }),

        "EndTag" => TagToken(Tag {
            kind: EndTag,
            name: Atom::from(&*args[0].get_str()),
            attrs: Vec::new(),
            self_closing: false,
        }),

        "Comment" => CommentToken(args[0].get_tendril()),

        "Character" => CharacterTokens(args[0].get_tendril()),

        _ => panic!("don't understand token {:?}", parts),
    }
}
/// Decode an html5lib "output" array into a token list, run through the
/// same `TokenLogger` normalization applied to the tokenizer's real output
/// so the two streams compare equal.
fn json_to_tokens(js: &Json, exact_errors: bool) -> Vec<Token> {
    let mut logger = TokenLogger::new(exact_errors);
    for tok in js.get_list() {
        match *tok {
            // A bare "ParseError" string stands in for an error of any kind.
            Json::String(ref s) if &s[..] == "ParseError" =>
                logger.process_token(ParseError(Borrowed(""))),
            _ => logger.process_token(json_to_token(tok)),
        }
    }
    logger.get_tokens()
}
/// Expand `\uXXXX` escapes in `s`.
///
/// Returns `None` when an escape is truncated or does not decode to a valid
/// `char` (e.g. a lone surrogate); callers treat `None` as "skip this test
/// case". Panics on any backslash escape other than `\u`.
fn unescape(s: &str) -> Option<String> {
    let mut out = String::with_capacity(s.len());
    let mut it = s.chars().peekable();
    loop {
        match it.next() {
            None => return Some(out),
            Some('\\') => {
                if it.peek() != Some(&'u') {
                    panic!("can't understand escape");
                }
                drop(it.next()); // consume the 'u'
                let hex: String = it.by_ref().take(4).collect();
                // Fix: a truncated escape such as a trailing "\u12" used to
                // be silently decoded as U+0012, because `take(4)` yields
                // fewer than 4 chars and from_str_radix accepts them.
                // Require exactly four hex digits.
                if hex.chars().count() != 4 {
                    return None;
                }
                match u32::from_str_radix(&hex, 16).ok()
                          .and_then(char::from_u32) {
                    // Invalid scalar values (e.g. surrogates D800-DFFF)
                    // cannot be represented as a Rust char.
                    None => return None,
                    Some(c) => out.push(c),
                }
            }
            Some(c) => out.push(c),
        }
    }
}
/// Recursively expand `\uXXXX` escapes in every string inside `js`.
/// Panics (via unwrap) if a string fails to unescape.
fn unescape_json(js: &Json) -> Json {
    match *js {
        // Only strings hold escapes; containers recurse, scalars are copied.
        Json::String(ref s) => Json::String(unescape(s).unwrap()),
        Json::Array(ref items) =>
            Json::Array(items.iter().map(unescape_json).collect()),
        Json::Object(ref obj) => Json::Object(
            obj.iter()
               .map(|(k, v)| (k.clone(), unescape_json(v)))
               .collect()),
        ref other => other.clone(),
    }
}
/// Wrap one tokenizer test case as a libtest dynamic test. The input is
/// re-tokenized under every split into at most 3 chunks, exercising the
/// tokenizer's handling of buffer boundaries.
fn mk_test(desc: String, input: String, expect: Vec<Token>, opts: TokenizerOpts)
    -> TestDescAndFn {
    TestDescAndFn {
        desc: TestDesc {
            name: DynTestName(desc),
            ignore: false,
            should_panic: No,
        },
        testfn: DynTestFn(Box::new(move || {
            for chunks in splits(&input, 3) {
                // Clone: tokenize consumes the chunks, but we still need
                // them for the failure message below.
                let got = tokenize(chunks.clone(), opts.clone());
                if got != expect {
                    panic!("\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
                        chunks, got, expect);
                }
            }
        })),
    }
}
/// Expand one html5lib JSON test object into concrete test cases and push
/// them onto `tests`. Each case is emitted once per initial-state override,
/// and each of those both with and without exact error checking.
fn mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Json) {
    let obj = js.get_obj();
    let mut input = js.find("input").unwrap().get_str();
    let mut expect = js.find("output").unwrap().clone();
    let desc = format!("tok: {}: {}",
        filename, js.find("description").unwrap().get_str());

    // "doubleEscaped" tests store input/output with \uXXXX escapes that
    // must be expanded first; cases we can't unescape (e.g. surrogates)
    // are skipped entirely.
    // (Lookups use borrowed &str keys: `&"...".to_string()` allocated a
    // String per lookup for no benefit.)
    if obj.get("doubleEscaped").map_or(false, |j| j.get_bool()) {
        match unescape(&input) {
            None => return,
            Some(i) => input = i,
        }
        expect = unescape_json(&expect);
    }

    // Optional: pretend this start tag was the last one seen before the input.
    let start_tag = obj.get("lastStartTag").map(|s| s.get_str());

    // Optional list of tokenizer states to start from; default is one run
    // with no override.
    let state_overrides = match obj.get("initialStates") {
        Some(&Json::Array(ref xs)) => xs.iter().map(|s|
            Some(match &s.get_str()[..] {
                "PLAINTEXT state" => Plaintext,
                "RAWTEXT state" => RawData(Rawtext),
                "RCDATA state" => RawData(Rcdata),
                s => panic!("don't know state {}", s),
            })).collect(),
        None => vec!(None),
        _ => panic!("don't understand initialStates value"),
    };

    for state in state_overrides.into_iter() {
        for &exact_errors in [false, true].iter() {
            // Make the test name reflect the state/error-mode variant.
            let mut newdesc = desc.clone();
            if let Some(s) = state {
                newdesc = format!("{} (in state {:?})", newdesc, s);
            }
            if exact_errors {
                newdesc = format!("{} (exact errors)", newdesc);
            }

            let expect_toks = json_to_tokens(&expect, exact_errors);
            tests.push(mk_test(newdesc, input.clone(), expect_toks, TokenizerOpts {
                exact_errors: exact_errors,
                initial_state: state,
                last_start_tag_name: start_tag.clone(),
                discard_bom: false,
                .. Default::default()
            }));
        }
    }
}
fn tests(src_dir: &Path) -> Vec<TestDescAndFn> {
let mut tests = vec!();
foreach_html5lib_test(src_dir, "tokenizer",
OsStr::new("test"), |path, mut file| {
let js = Json::from_reader(&mut file).ok().expect("json parse error");
match js.get_obj().get(&"tests".to_string()) {
Some(&Json::Array(ref lst)) => {
for test in lst.iter() {
mk_tests(&mut tests, path.file_name().unwrap().to_str().unwrap(), test);
}
}
_ => (),
}
});
tests
}
/// Build the html5lib tokenizer test list and hand it to the libtest runner.
fn main() {
    let args: Vec<_> = env::args().collect();
    let src_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    test::test_main(&args, tests(src_dir));
}