xhtml_minimizer 0.1.4

Minimize XHTML files.
Documentation
use xml_tokens::serializer::*;
use xml_tokens::*;

/// Options controlling how a token stream is written back out as XHTML.
///
/// All fields are plain flags, so the type is cheap to copy, compare and print.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct XHTMLSerializer {
    /// Write entity/character references back as references instead of expanding them.
    keep_references: bool,
    /// Write text tokens unchanged instead of minimizing their whitespace.
    keep_whitespace: bool,
    /// Write comment tokens instead of dropping them.
    keep_comments: bool,
    /// Write the `<?xml ...?>` declaration tokens instead of dropping them.
    keep_xml_declaration: bool,
}
impl XHTMLSerializer {
    /// Creates a serializer with the given output options.
    ///
    /// * `keep_references` - write character references back as references
    ///   instead of expanding them to the character they denote.
    /// * `keep_whitespace` - write text tokens unchanged instead of minimizing
    ///   their whitespace.
    /// * `keep_comments` - write comments instead of dropping them.
    /// * `keep_xml_declaration` - write the XML declaration instead of dropping it.
    pub fn new(
        keep_references: bool,
        keep_whitespace: bool,
        keep_comments: bool,
        keep_xml_declaration: bool,
    ) -> XHTMLSerializer {
        XHTMLSerializer {
            keep_references,
            keep_whitespace,
            keep_comments,
            keep_xml_declaration,
        }
    }

    /// Serializes `tokens` into a string, applying the options this serializer
    /// was created with.
    ///
    /// Tokens are written in order. An empty token list yields an empty string.
    pub fn serialize(&self, tokens: &Vec<Token>) -> String {
        let mut xml = String::new();

        for (i, token) in tokens.iter().enumerate() {
            match token {
                Token::XMLDeclStart => {
                    if self.keep_xml_declaration {
                        xml.push_str("<?xml");
                    }
                }
                Token::XMLVersion(xml_version) => {
                    if self.keep_xml_declaration {
                        match xml_version {
                            XMLVersion::Version1_0 => xml.push_str(" version=\"1.0\""),
                            XMLVersion::Version1_1 => xml.push_str(" version=\"1.1\""),
                        }
                    }
                }
                Token::XMLEncoding(enc_name) => {
                    if self.keep_xml_declaration {
                        xml.push_str(&format!(" encoding=\"{}\"", enc_name.get_as_str()));
                    }
                }
                Token::XMLStandalone(standalone) => {
                    if self.keep_xml_declaration {
                        if *standalone {
                            xml.push_str(" standalone=\"yes\"");
                        } else {
                            xml.push_str(" standalone=\"no\"");
                        }
                    }
                }
                Token::XMLDeclEnd => {
                    if self.keep_xml_declaration {
                        xml.push_str(XML_DECL_END_AS_STR);
                    }
                }
                Token::DoctypeDeclStart => xml.push_str("<!DOCTYPE "),
                Token::DoctypeName(name) => xml.push_str(name.get_as_str()),
                Token::DoctypeDeclEnd => xml.push_str(DOCTYPE_DECL_END_AS_STR),
                Token::Comment(comment) => {
                    if self.keep_comments {
                        xml.push_str(&format!("<!--{}-->", comment.get_as_str()))
                    }
                }
                Token::PIStart => xml.push_str("<?"),
                Token::PITarget(target) => xml.push_str(target.get_as_str()),
                Token::PIData(data) => xml.push_str(data.get_as_str()),
                Token::PIEnd => xml.push_str(PI_END_AS_STR),
                Token::ElementStart(qname) => match qname.get_prefix_as_str() {
                    Some(prefix) => {
                        xml.push_str(&format!("<{}:{}", prefix, qname.get_local_part_as_str()))
                    }
                    None => xml.push_str(&format!("<{}", qname.get_local_part_as_str())),
                },
                Token::ElementEmptyEnd => xml.push_str("/>"),
                Token::ElementSTagEnd => xml.push_str(ELEMENT_STAG_END_AS_STR),
                Token::ElementEnd(qname) => match qname.get_prefix_as_str() {
                    Some(prefix) => {
                        xml.push_str(&format!("</{}:{}>", prefix, qname.get_local_part_as_str()))
                    }
                    None => xml.push_str(&format!("</{}>", qname.get_local_part_as_str())),
                },
                Token::AttributeStart => {}
                Token::AttributeName(qname) => match qname.get_prefix_as_str() {
                    Some(prefix) => {
                        xml.push_str(&format!(" {}:{}", prefix, qname.get_local_part_as_str()))
                    }
                    None => xml.push_str(&format!(" {}", qname.get_local_part_as_str())),
                },
                Token::AttributeValueStart => xml.push_str("=\""),
                Token::AttributeValue(attribute_value) => {
                    xml.push_str(attribute_value.get_as_str())
                }
                Token::AttributeValueEnd => {}
                // The closing quote is written when the whole attribute ends.
                Token::AttributeEnd => xml.push_str("\""),
                Token::NamespaceStart => xml.push_str(" xmlns"),
                Token::NamespaceDefault => {}
                // NOTE(review): no ':' is written between "xmlns" and the prefix
                // here. Confirm the prefix token already carries it; otherwise a
                // prefixed declaration is serialized as `xmlnsfoo="..."`.
                Token::NamespacePrefix(nc_name) => xml.push_str(nc_name.get_as_str()),
                Token::NamespaceValue(namespace_value) => {
                    xml.push_str(&format!("=\"{}\"", namespace_value.get_as_str()))
                }
                Token::NamespaceEnd => {}
                Token::Text(text) => {
                    if self.keep_whitespace {
                        xml.push_str(text.get_as_str());
                    } else {
                        // Leading whitespace is only kept when the closest
                        // preceding token that will be written permits it; with
                        // no such token it is dropped.
                        let allow_head_whitespace =
                            Self::get_previous_token(tokens, i, self.keep_comments)
                                .map_or(false, Self::allow_following_whitespace);

                        // Same rule for trailing whitespace and the closest
                        // following token that will be written.
                        let allow_tail_whitespace =
                            Self::get_next_token(tokens, i, self.keep_comments)
                                .map_or(false, Self::allow_preceeding_whitespace);

                        if allow_head_whitespace && allow_tail_whitespace {
                            xml.push_str(&text.deduplicate_whitespace());
                        } else if allow_head_whitespace {
                            xml.push_str(&text.normalize_space_deduplicate_head());
                        } else if allow_tail_whitespace {
                            xml.push_str(&text.normalize_space_deduplicate_tail());
                        } else {
                            xml.push_str(&text.normalize_space());
                        }
                    }
                }
                Token::CDATASection(cdata) => {
                    xml.push_str(&format!("<![CDATA[{}]]>", cdata.get_as_str()))
                }
                Token::EntityRef(name) => {
                    // TODO expand named entities when `keep_references` is false and
                    // the replacement can be written legally; until then the
                    // reference is always written back unchanged.
                    // https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
                    xml.push_str(&format!("&{};", name.get_as_str()));
                }
                Token::DecCharRef(dec_char_ref) => {
                    if self.keep_references {
                        xml.push_str(&format!("&#{};", dec_char_ref.get_as_u32()))
                    } else {
                        let character = dec_char_ref.get_as_char();
                        if Self::can_expand_char(character) {
                            xml.push(character)
                        } else {
                            // Expanding would change the meaning of the markup.
                            xml.push_str(&format!("&#{};", dec_char_ref.get_as_u32()))
                        }
                    }
                }
                Token::HexCharRef(hex_char_ref) => {
                    // The code point after "&#x" must be written in hexadecimal.
                    if self.keep_references {
                        xml.push_str(&format!("&#x{:X};", hex_char_ref.get_as_u32()))
                    } else {
                        let character = hex_char_ref.get_as_char();
                        if Self::can_expand_char(character) {
                            xml.push(character)
                        } else {
                            // Expanding would change the meaning of the markup.
                            xml.push_str(&format!("&#x{:X};", hex_char_ref.get_as_u32()))
                        }
                    }
                }
            }
        }

        xml
    }

    /// Returns `true` when `token` is a comment that will not be written.
    fn is_dropped_comment(token: &Token, keep_comments: bool) -> bool {
        match token {
            Token::Comment(_) => !keep_comments,
            _ => false,
        }
    }

    /// Returns `true` when `character` can replace a character reference without
    /// changing how the output is parsed.
    ///
    /// Markup delimiters and quotes must stay escaped. Tab, line feed and
    /// carriage return are kept as references as well, because written literally
    /// they are subject to line-end and attribute-value normalization.
    fn can_expand_char(character: char) -> bool {
        match character {
            '<' | '>' | '&' | '"' | '\'' => false,
            '\t' | '\n' | '\r' => false,
            _ => true,
        }
    }

    /// Returns the closest token before `index` that will be written, skipping
    /// comments when they are dropped, or `None` when there is no such token.
    fn get_previous_token(tokens: &[Token], index: usize, keep_comments: bool) -> Option<&Token> {
        tokens
            .get(..index)?
            .iter()
            .rev()
            .find(|token| !Self::is_dropped_comment(token, keep_comments))
    }

    /// Returns the closest token after `index` that will be written, skipping
    /// comments when they are dropped, or `None` when there is no such token.
    fn get_next_token(tokens: &[Token], index: usize, keep_comments: bool) -> Option<&Token> {
        tokens
            .iter()
            .skip(index + 1)
            .find(|token| !Self::is_dropped_comment(token, keep_comments))
    }

    /// Returns `true` when whitespace may be kept directly before `token`.
    ///
    /// That is the case for the start of an inline element, CDATA sections,
    /// comments, references and processing instructions.
    fn allow_preceeding_whitespace(token: &Token) -> bool {
        match token {
            Token::ElementStart(qname) => {
                XHTMLSerializer::is_inline_element_local_name(qname.get_local_part_as_str())
            }
            Token::CDATASection(_)
            | Token::Comment(_)
            | Token::EntityRef(_)
            | Token::DecCharRef(_)
            | Token::HexCharRef(_)
            | Token::PIStart => true,
            _ => false,
        }
    }

    /// Returns `true` when whitespace may be kept directly after `token`.
    ///
    /// That is the case for the end of an inline element, CDATA sections,
    /// comments, references and processing instructions.
    fn allow_following_whitespace(token: &Token) -> bool {
        match token {
            Token::ElementEnd(qname) => {
                XHTMLSerializer::is_inline_element_local_name(qname.get_local_part_as_str())
            }
            Token::CDATASection(_)
            | Token::Comment(_)
            | Token::EntityRef(_)
            | Token::DecCharRef(_)
            | Token::HexCharRef(_)
            | Token::PIEnd => true,
            _ => false,
        }
    }

    /// Returns `true` when `local_name` is one of the element names this
    /// serializer treats as inline, i.e. whitespace next to its tags is kept.
    fn is_inline_element_local_name(local_name: &str) -> bool {
        match local_name {
            "a" | "abbr" | "b" | "bdi" | "cite" | "code" | "data" | "dfn" | "em" | "i"
            | "kbd" | "mark" | "q" | "s" | "samp" | "small" | "span" | "strong" | "sub"
            | "sup" | "time" | "u" | "var" => true,
            _ => false,
        }
    }
}

#[cfg(test)]
mod tests {
    /// Placeholder check; it only confirms that the test harness runs.
    #[test]
    fn it_works() {
        let sum = 2 + 2;
        assert_eq!(sum, 4);
    }
}