htmlite 0.20.0

An HTML manipulation toolkit
Documentation
use crate::tokenizer::{
    TagKind, TokenKind, Tokenizer, state_cdata_section, state_plaintext, state_raw_text,
    state_rc_data, state_script_data,
};
use serde_json::Value;
use std::fs;
use std::path::Path;

/// A tokenizer token in the shape used for comparison in these tests.
///
/// Both the expected output parsed from a test file and the tokens actually
/// produced by the tokenizer are converted into this type, so the two lists
/// can be compared directly with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token {
    /// Built from a `"DOCTYPE"` test entry or from `TokenKind::Doctype`.
    Doctype {
        name: Option<String>,
        public_id: Option<String>,
        system_id: Option<String>,
        // NOTE: despite its name this field carries the force-quirks flag.
        // Expected values are the *negation* of the fifth element of the test
        // file's DOCTYPE entry, and actual values are copied from the
        // tokenizer's `force_quirks`.
        correctness: bool,
    },
    /// Built from a `"StartTag"` test entry or from a `TagKind::Start` tag.
    StartTag {
        name: String,
        // `(name, value)` pairs; the tokenizer side sorts these by name before
        // the comparison.
        attributes: Vec<(String, String)>,
        self_closing: bool,
    },
    /// Built from an `"EndTag"` test entry or from a `TagKind::End` tag.
    EndTag {
        name: String,
    },
    /// Built from a `"Comment"` test entry or from `TokenKind::Comment`.
    Comment {
        data: String,
    },
    /// Built from a `"Character"` test entry or from `TokenKind::Text`.
    Character {
        data: String,
    },
}

/// The tokenizer state a test case starts in.
///
/// Parsed from the strings in a test's `initialStates` list; `Data` is used
/// when the list is empty. The `Debug` output is embedded in the test case
/// name that is reported on failure.
#[derive(Debug)]
enum InitialState {
    Data,
    Plaintext,
    RcData,
    RawText,
    ScriptData,
    CdataSection,
}

/// One runnable test: a single test description paired with a single initial
/// tokenizer state.
#[derive(Debug)]
struct TestCase {
    // Human-readable label (category, description and initial state) used as
    // the assertion message.
    name: String,
    initial_state: InitialState,
    // Tokenizer input, already unescaped for "doubleEscaped" tests.
    input: String,
    // `None` when the test description gives no (or an empty) last start tag.
    last_start_tag: Option<String>,
    expected_output: Vec<Token>,
}

/// Top-level shape of a `.test` file: a JSON object with a `tests` array.
#[derive(serde::Deserialize)]
struct TestCollection {
    tests: Vec<TestDescription>,
}

/// A single entry of a test file's `tests` array, as written in the JSON.
///
/// Field names are mapped from camelCase keys (`doubleEscaped`,
/// `initialStates`, `lastStartTag`). Fields marked `#[serde(default)]` may be
/// absent from the JSON.
#[derive(serde::Deserialize)]
#[serde(rename_all = "camelCase")]
struct TestDescription {
    description: String,
    input: String,
    // When true, `input` and the strings in `output` carry an extra layer of
    // `\uXXXX` escaping that must be undone before use.
    #[serde(default)]
    double_escaped: bool,
    // Names of the tokenizer states to run the test from; empty means the
    // data state only.
    #[serde(default)]
    initial_states: Vec<String>,
    // Empty string means "not set".
    #[serde(default)]
    last_start_tag: String,
    // Each inner array is one expected token: a type tag followed by its
    // type-specific fields.
    output: Vec<Vec<Value>>,
}

/// Undoes the extra escaping layer used by "doubleEscaped" tests.
///
/// When `double_escaped` is false, `s` is returned unchanged. Otherwise every
/// `\uXXXX` sequence (a backslash, `u`, and exactly four hexadecimal digits) is
/// replaced by the character with that code point; all other characters are
/// copied through as-is.
///
/// # Panics
///
/// Panics if a backslash is not followed by `u` and exactly four hex digits,
/// or if the escape names a value that is not a valid `char` (for example a
/// surrogate code point, which a Rust `String` cannot hold).
fn unescape(double_escaped: bool, s: &str) -> String {
    if !double_escaped {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }

        assert_eq!(chars.next(), Some('u'), "invalid escape sequence");
        let hex: String = chars.by_ref().take(4).collect();
        // `from_str_radix` alone would accept a truncated escape at the end of
        // the input ("\u41") or a leading sign ("\u+041"), so check the shape
        // of the escape explicitly.
        assert!(
            hex.len() == 4 && hex.bytes().all(|b| b.is_ascii_hexdigit()),
            "invalid escape sequence, expected four hex digits: {:?}",
            hex
        );

        match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
            // Surrogate code points end up here; tests that need them are
            // expected to be skipped before reaching this point.
            None => panic!("invalid utf-8 codepoint: {}", hex),
            Some(decoded) => out.push(decoded),
        }
    }
    out
}

fn to_test_case(category: &str, desc: TestDescription) -> Vec<TestCase> {
    let initial_states: Vec<_> = if !desc.initial_states.is_empty() {
        desc.initial_states
            .into_iter()
            .map(|s| match s.as_str() {
                "Data state" => InitialState::Data,
                "PLAINTEXT state" => InitialState::Plaintext,
                "RCDATA state" => InitialState::RcData,
                "RAWTEXT state" => InitialState::RawText,
                "Script data state" => InitialState::ScriptData,
                "CDATA section state" => InitialState::CdataSection,
                s => panic!("unrecognized initial state {s}"),
            })
            .collect()
    } else {
        vec![InitialState::Data]
    };

    let input = unescape(desc.double_escaped, &desc.input);

    let mut expected_tokens = Vec::new();

    for mut items in desc.output {
        let tok = match items[0].as_str().unwrap() {
            "Character" => {
                let text = items[1].as_str().unwrap().to_string();
                Token::Character {
                    data: unescape(desc.double_escaped, &text),
                }
            }
            "Comment" => {
                let comment = items[1].as_str().unwrap().to_string();
                Token::Comment {
                    data: unescape(desc.double_escaped, &comment),
                }
            }
            "EndTag" => {
                let name = items[1].as_str().unwrap().to_string();
                Token::EndTag {
                    name: unescape(desc.double_escaped, &name),
                }
            }
            "StartTag" => {
                let name = items[1].as_str().unwrap().to_string();

                let attributes = items[2]
                    .as_object()
                    .unwrap()
                    .iter()
                    .map(|(k, v)| (k.clone(), v.as_str().unwrap().to_string()))
                    .collect();

                let self_closing = items
                    .get(3)
                    .map(|v| v.as_bool().unwrap())
                    .unwrap_or_default();

                Token::StartTag {
                    name: unescape(desc.double_escaped, &name),
                    attributes,
                    self_closing,
                }
            }
            "DOCTYPE" => {
                let name = match &mut items[1] {
                    Value::Null => None,
                    Value::String(s) => Some(unescape(desc.double_escaped, s)),
                    c => panic!("unexpected doctype name value: {:?}", c),
                };

                let public_id = match &mut items[2] {
                    Value::Null => None,
                    Value::String(s) => Some(unescape(desc.double_escaped, s)),
                    c => panic!("unexpected doctype public id value: {:?}", c),
                };
                let system_id = match &mut items[3] {
                    Value::Null => None,
                    Value::String(s) => Some(unescape(desc.double_escaped, s)),
                    c => panic!("unexpected doctype system id value: {:?}", c),
                };
                let correctness = !items[4].as_bool().unwrap();
                Token::Doctype {
                    name,
                    public_id,
                    system_id,
                    correctness,
                }
            }
            c => panic!("unexpected test token type: {}", c),
        };

        expected_tokens.push(tok);
    }

    initial_states
        .into_iter()
        .map(|s| {
            let name = format!(
                "[{}] {} // Initial State: {:?}",
                &category, desc.description, &s,
            );

            TestCase {
                name,
                initial_state: s,
                input: input.clone(),
                last_start_tag: if desc.last_start_tag.is_empty() {
                    None
                } else {
                    Some(desc.last_start_tag.clone())
                },
                expected_output: expected_tokens.clone(),
            }
        })
        .collect::<Vec<_>>()
}

/// Runs every test in `tokenizer/testfiles/<category>.test`.
///
/// Each test description is expanded into one case per initial state. For each
/// case the tokenizer is run to EOF, its tokens are converted to the local
/// `Token` type (start-tag attributes sorted by name), and the result is
/// compared with the expected output.
///
/// # Panics
///
/// Panics if the file cannot be read or parsed, if the tokenizer produces more
/// than `MAX_TOKENS` tokens without reaching EOF, or if the produced tokens
/// differ from the expected ones (the message names the failing case).
fn run_test_category(category: &str) {
    // Don't let a broken implementation cause the test to loop forever.
    const MAX_TOKENS: usize = 1000;

    let path = Path::new("tokenizer")
        .join("testfiles")
        .join(format!("{}.test", category));
    // Include the path in the failure so a missing or malformed fixture is
    // easy to identify.
    let json_str = fs::read_to_string(&path)
        .unwrap_or_else(|err| panic!("failed to read {}: {}", path.display(), err));
    let collection: TestCollection = serde_json::from_str(&json_str)
        .unwrap_or_else(|err| panic!("failed to parse {}: {}", path.display(), err));

    let test_cases = collection
        .tests
        .into_iter()
        .flat_map(|t| to_test_case(category, t));

    for test_case in test_cases {
        let mut tokenizer = Tokenizer::new(&test_case.input);
        match test_case.initial_state {
            InitialState::CdataSection => tokenizer.switch_to(state_cdata_section),
            InitialState::Plaintext => tokenizer.switch_to(state_plaintext),
            InitialState::RawText => tokenizer.switch_to(state_raw_text),
            InitialState::RcData => tokenizer.switch_to(state_rc_data),
            InitialState::ScriptData => tokenizer.switch_to(state_script_data),
            // The data state needs no switch.
            InitialState::Data => {}
        };
        tokenizer.set_last_emitted_start_tag_name(test_case.last_start_tag);

        let mut tokens = Vec::new();
        loop {
            if tokens.len() > MAX_TOKENS {
                panic!("no eof token after {} tokens", MAX_TOKENS);
            }

            let next = tokenizer.next_token();
            let token = match next.kind {
                TokenKind::Eof => break,
                TokenKind::Comment(s) => Token::Comment { data: s },
                TokenKind::Doctype(d) => Token::Doctype {
                    name: d.name,
                    correctness: d.force_quirks,
                    public_id: d.public_identifier,
                    system_id: d.system_identifier,
                },
                TokenKind::Tag(t) => match &t.kind {
                    TagKind::Start => Token::StartTag {
                        name: t.name,
                        attributes: {
                            // Sort by name so attribute order does not affect
                            // the comparison.
                            let mut attr: Vec<_> = t
                                .attributes
                                .into_iter()
                                .map(|a| (a.name, a.value))
                                .collect();
                            attr.sort_by(|a, b| a.0.cmp(&b.0));
                            attr
                        },
                        self_closing: t.self_closing,
                    },
                    TagKind::End => Token::EndTag { name: t.name },
                },
                TokenKind::Text(t) => Token::Character { data: t },
            };
            tokens.push(token);
        }

        assert_eq!(tokens, test_case.expected_output, "{}", test_case.name);
    }
}

/// Runs every test in the `contentModelFlags.test` collection.
#[test]
fn content_model_flags() {
    run_test_category("contentModelFlags");
}

/// Runs every test in the `domjs.test` collection.
#[test]
fn domjs() {
    run_test_category("domjs");
}

/// Runs every test in the `entities.test` collection.
#[test]
fn entities() {
    run_test_category("entities");
}

/// Runs every test in the `namedEntities.test` collection.
#[test]
fn named_entities() {
    run_test_category("namedEntities");
}

/// Runs every test in the `numericEntities.test` collection.
#[test]
fn numeric_entities() {
    run_test_category("numericEntities");
}

/// Runs every test in the `pendingSpecChanges.test` collection.
#[test]
fn pending_spec_changes() {
    run_test_category("pendingSpecChanges");
}

/// Runs the `test1.test` through `test4.test` collections, in order, as a
/// single test; the first failing collection aborts the rest.
#[test]
fn test_1234() {
    for category in ["test1", "test2", "test3", "test4"] {
        run_test_category(category);
    }
}

/// Runs every test in the `unicodeChars.test` collection.
#[test]
fn unicode_chars() {
    run_test_category("unicodeChars");
}

/// Runs every test in the `unicodeCharsProblematic.test` collection.
///
/// Ignored by default; see the `ignore` reason below.
#[test]
#[ignore = "this crate only supports utf8 input"]
fn unicode_chars_problematic() {
    run_test_category("unicodeCharsProblematic");
}

/// Runs every test in the `xmlViolation.test` collection.
///
/// Ignored by default; see the `ignore` reason below.
#[test]
#[ignore = "todo: figure out what these are actually testing"]
fn xml_violation() {
    run_test_category("xmlViolation");
}