pub mod serializer;
use std::char;
// Literal character sequences for the XML declaration (`<?xml version="1.0" ...?>`).
const XML_DECL_START: [char; 5] = ['<', '?', 'x', 'm', 'l'];
const XML_DECL_VERSION: [char; 7] = ['v', 'e', 'r', 's', 'i', 'o', 'n'];
// Version numbers are munched as "1." followed by digits.
const XML_DECL_VERSION_PREFIX: [char; 2] = ['1', '.'];
const XML_DECL_ENCODING: [char; 8] = ['e', 'n', 'c', 'o', 'd', 'i', 'n', 'g'];
const XML_DECL_STANDALONE: [char; 10] = ['s', 't', 'a', 'n', 'd', 'a', 'l', 'o', 'n', 'e'];
const XML_DECL_END: [char; 2] = ['?', '>'];
// Legal values of the standalone pseudo-attribute.
const YES: [char; 3] = ['y', 'e', 's'];
const NO: [char; 2] = ['n', 'o'];
// Comment, CDATA and doctype delimiters.
const COMMENT_START: [char; 4] = ['<', '!', '-', '-'];
const COMMENT_END: [char; 3] = ['-', '-', '>'];
const CDATA_START: [char; 9] = ['<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['];
const CDATA_END: [char; 3] = [']', ']', '>'];
const DOCTYPE_START: [char; 9] = ['<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E'];
const DOCTYPE_END: char = STAG_END;
// Processing instructions share their delimiters with the XML declaration.
const PI_START: [char; 2] = ['<', '?'];
const PI_END: [char; 2] = XML_DECL_END;
const EQUALS: char = '=';
const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"';
// Element tag delimiters: <name ...>, </name>, <name .../>.
const STAG_START: char = '<';
const STAG_END: char = '>';
const ETAG_START: [char; 2] = ['<', '/'];
const ETAG_END: char = STAG_END;
const EMPTY_TAG_END: [char; 2] = ['/', '>'];
// Namespace declaration attribute prefix ('xmlns' or 'xmlns:prefix').
const XMLNS: [char; 5] = ['x', 'm', 'l', 'n', 's'];
// References: &name; , &#xHHHH; , &#NNNN; .
const ENTITY_REFERENCE_START: char = '&';
const HEXIDECIMAL_CHAR_REFERENCE_START: [char; 3] = ['&', '#', 'x'];
const DECIMAL_CHAR_REFERENCE_START: [char; 2] = ['&', '#'];
const REFERENCE_END: char = ';';
const HYPHEN: char = '-';
const COLON: char = ':';
/// The lexical tokens emitted by the tokenizer, in document order.
#[derive(Debug, PartialEq)]
pub enum Token {
    // XML declaration: <?xml version=... encoding=... standalone=...?>
    XMLDeclStart,
    XMLVersion(XMLVersion),
    XMLEncoding(EncName),
    XMLStandalone(bool),
    XMLDeclEnd,
    // Doctype declaration: <!DOCTYPE name>
    DoctypeDeclStart,
    DoctypeName(Name),
    DoctypeDeclEnd,
    Comment(Comment),
    // Processing instruction: <?target data?>
    PIStart,
    PITarget(PITarget),
    PIData(PIData),
    PIEnd,
    // Element tags: STag, EmptyElemTag end, STag end, ETag.
    ElementStart(QName),
    ElementEmptyEnd,
    ElementSTagEnd,
    ElementEnd(QName),
    // One attribute is emitted as Start..Name..ValueStart..Value..ValueEnd..End.
    AttributeStart,
    AttributeName(QName),
    AttributeValueStart,
    AttributeValue(AttributeValue),
    AttributeValueEnd,
    AttributeEnd,
    // Namespace declarations ('xmlns' default or 'xmlns:prefix').
    NamespaceStart,
    NamespaceDefault,
    NamespacePrefix(NCName),
    NamespaceValue(NamespaceValue),
    NamespaceEnd,
    // Element content.
    Text(Text),
    CDATASection(CDATASection),
    EntityRef(Name),
    DecCharRef(DecCharRef),
    HexCharRef(HexCharRef),
}
/// The XML specification version a document declares. The version decides
/// which characters are legal (see `is_xml_char`).
#[derive(Debug, PartialEq)]
pub enum XMLVersion {
    Version1_0,
    Version1_1,
}
/// An encoding name from the XML declaration's `encoding` pseudo-attribute
/// (production: an ASCII letter followed by letters, digits, '.', '_' or '-').
#[derive(Debug, PartialEq)]
pub struct EncName {
    enc_name: String,
}
impl EncName {
    /// Wraps a string without re-checking it; callers must already have
    /// validated every character with the predicates below.
    fn new_unvalidated(enc_name: String) -> EncName {
        EncName { enc_name }
    }
    /// First character of an encoding name: ASCII letters only.
    fn is_valid_start_char(c: char) -> bool {
        // Idiom: equivalent to matching 'A'..='Z' | 'a'..='z'.
        c.is_ascii_alphabetic()
    }
    /// Subsequent characters: ASCII letters, digits, '.', '_' or '-'.
    fn is_valid_char(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-')
    }
    /// The encoding name as text, e.g. "UTF-8".
    pub fn get_as_str(&self) -> &str {
        &self.enc_name
    }
}
/// An XML `Name` (colons permitted, unlike `NCName`), used for doctype and
/// entity-reference names and PI targets.
#[derive(Debug, PartialEq)]
pub struct Name {
    name: String,
}
impl Name {
    // Wrap a string whose characters were validated while munching.
    fn new_unvalidated(name: String) -> Name {
        Name { name }
    }
    // The NameStartChar production.
    fn is_valid_start_char(c: char) -> bool {
        matches!(c,
            'a'..='z'
                | 'A'..='Z'
                | ':'
                | '_'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{370}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}')
    }
    // The NameChar production: NameStartChar plus digits, '-', '.', U+B7 and
    // the combining/extender ranges.
    fn is_valid_char(c: char) -> bool {
        matches!(c,
            'a'..='z'
                | 'A'..='Z'
                | ':' | '_' | '-' | '.' | '\u{B7}'
                | '0'..='9'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{300}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{203F}'..='\u{2040}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}')
    }
    /// The name as text.
    pub fn get_as_str(&self) -> &str {
        &self.name
    }
}
/// The text of a comment (`<!--` ... `-->`), without the delimiters.
#[derive(Debug, PartialEq)]
pub struct Comment {
    comment: String,
}
impl Comment {
    // Wrap text whose characters were validated while munching.
    fn new_unvalidated(comment: String) -> Comment {
        Comment { comment }
    }
    // True for any legal XML character except '-'. Hyphens need lookahead in
    // the tokenizer because "--" may not appear inside a comment.
    fn is_valid_char_minus_hyphen(c: char, version: &XMLVersion) -> bool {
        c != HYPHEN && is_xml_char(c, version)
    }
    /// The comment text.
    pub fn get_as_str(&self) -> &str {
        &self.comment
    }
}
// The `Char` production: which characters may appear at all in a document of
// the given version. XML 1.1 additionally admits the C0 controls other than
// NUL, which 1.0 forbids.
fn is_xml_char(c: char, version: &XMLVersion) -> bool {
    match version {
        XMLVersion::Version1_0 => matches!(c,
            '\u{9}' | '\u{A}' | '\u{D}'
                | '\u{20}'..='\u{D7FF}'
                | '\u{E000}'..='\u{FFFD}'
                | '\u{10000}'..='\u{10FFFF}'),
        XMLVersion::Version1_1 => matches!(c,
            '\u{1}'..='\u{D7FF}'
                | '\u{E000}'..='\u{FFFD}'
                | '\u{10000}'..='\u{10FFFF}'),
    }
}
/// The target of a processing instruction (`<?target ...?>`).
#[derive(Debug, PartialEq)]
pub struct PITarget {
    target: String,
}
impl PITarget {
    // Wrap a string that was validated while munching.
    fn new_unvalidated(target: String) -> PITarget {
        PITarget { target }
    }
    // PI targets follow the `Name` productions, so delegate.
    fn is_valid_start_char(c: char) -> bool {
        Name::is_valid_start_char(c)
    }
    fn is_valid_char(c: char) -> bool {
        Name::is_valid_char(c)
    }
    /// The target name as text.
    pub fn get_as_str(&self) -> &str {
        &self.target
    }
}
/// The data portion of a processing instruction: everything between the
/// target and the closing '?>'.
#[derive(Debug, PartialEq)]
pub struct PIData {
    data: String,
}
impl PIData {
    // Wrap a string that was validated while munching.
    pub fn new_unvalidated(data: String) -> PIData {
        PIData { data }
    }
    // PI data must be separated from the target by whitespace.
    fn is_valid_start_char(c: char) -> bool {
        is_whitespace(c)
    }
    // Any legal XML character may follow (the tokenizer watches for '?>').
    fn is_valid_char(c: char, version: &XMLVersion) -> bool {
        is_xml_char(c, version)
    }
    /// The data as text, including the leading whitespace.
    pub fn get_as_str(&self) -> &str {
        &self.data
    }
}
// The XML `S` production: tab, line feed, carriage return and space only
// (deliberately narrower than Unicode whitespace).
fn is_whitespace(c: char) -> bool {
    matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}')
}
/// A namespace-qualified name: an optional prefix and a local part, split on
/// the first ':' by the tokenizer.
#[derive(Debug, PartialEq)]
pub struct QName {
    prefix: Option<String>,
    local_part: String,
}
impl QName {
    /// Wraps parts that were validated while munching.
    fn new_unvalidated(prefix: Option<String>, local_part: String) -> QName {
        QName { prefix, local_part }
    }
    // Both the prefix and the local part are NCNames, so delegate.
    fn is_valid_start_char(c: char) -> bool {
        NCName::is_valid_start_char(c)
    }
    fn is_valid_char(c: char) -> bool {
        NCName::is_valid_char(c)
    }
    /// The prefix (the part before ':'), if one was present.
    pub fn get_prefix_as_str(&self) -> Option<&str> {
        // Idiom: as_deref replaces the manual match-to-Some(&prefix).
        self.prefix.as_deref()
    }
    /// The local part (the part after the ':' or the whole name).
    pub fn get_local_part_as_str(&self) -> &str {
        &self.local_part
    }
}
/// The literal text of an attribute value, without the surrounding quotes.
#[derive(Debug, PartialEq)]
pub struct AttributeValue {
    value: String,
}
impl AttributeValue {
    // Wrap a string that was validated while munching.
    fn new_unvalidated(value: String) -> AttributeValue {
        AttributeValue { value }
    }
    // Inside single quotes: anything except '<', '&' and the closing quote.
    fn is_valid_inside_single_quotes_char(c: char) -> bool {
        !matches!(c, STAG_START | ENTITY_REFERENCE_START | SINGLE_QUOTE)
    }
    // Inside double quotes: anything except '<', '&' and the closing quote.
    fn is_valid_inside_double_quotes_char(c: char) -> bool {
        !matches!(c, STAG_START | ENTITY_REFERENCE_START | DOUBLE_QUOTE)
    }
    /// The value as text.
    pub fn get_as_str(&self) -> &str {
        &self.value
    }
}
/// A "non-colonized" name: like `Name` but with ':' excluded, used for
/// namespace prefixes and the parts of a `QName`.
#[derive(Debug, PartialEq)]
pub struct NCName {
    nc_name: String,
}
impl NCName {
    // Wrap a string whose characters were validated while munching.
    fn new_unvalidated(nc_name: String) -> NCName {
        NCName { nc_name }
    }
    // NCNameStartChar: NameStartChar without ':'.
    fn is_valid_start_char(c: char) -> bool {
        matches!(c,
            'a'..='z'
                | 'A'..='Z'
                | '_'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{370}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}')
    }
    // NCNameChar: NameChar without ':'.
    fn is_valid_char(c: char) -> bool {
        matches!(c,
            'a'..='z'
                | 'A'..='Z'
                | '_' | '-' | '.' | '\u{B7}'
                | '0'..='9'
                | '\u{C0}'..='\u{D6}'
                | '\u{D8}'..='\u{F6}'
                | '\u{F8}'..='\u{2FF}'
                | '\u{300}'..='\u{37D}'
                | '\u{37F}'..='\u{1FFF}'
                | '\u{200C}'..='\u{200D}'
                | '\u{203F}'..='\u{2040}'
                | '\u{2070}'..='\u{218F}'
                | '\u{2C00}'..='\u{2FEF}'
                | '\u{3001}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFFD}'
                | '\u{10000}'..='\u{EFFFF}')
    }
    /// The name as text.
    pub fn get_as_str(&self) -> &str {
        &self.nc_name
    }
}
/// The IRI value of a namespace declaration.
#[derive(Debug, PartialEq)]
pub struct NamespaceValue {
    value: String,
}
impl NamespaceValue {
    // Wrap a string that was validated while munching.
    fn new_unvalidated(value: String) -> NamespaceValue {
        NamespaceValue { value }
    }
    // Characters legal in a namespace value. Version 1.0 allows only the
    // ASCII URI characters; version 1.1 additionally allows the non-ASCII
    // IRI ranges below.
    fn is_valid_char(c: char, version: &XMLVersion) -> bool {
        match version {
            XMLVersion::Version1_0 => match c {
                ':' | '/' | '?' | '#' | '[' | ']' | '@' => true,
                '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
                'a'..='z' => true,
                'A'..='Z' => true,
                '0'..='9' => true,
                '-' | '.' | '_' | '~' => true,
                _ => false,
            },
            XMLVersion::Version1_1 => match c {
                ':' | '/' | '?' | '#' | '[' | ']' | '@' => true,
                '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
                'a'..='z' => true,
                'A'..='Z' => true,
                '0'..='9' => true,
                '-' | '.' | '_' | '~' => true,
                '\u{A0}'..='\u{D7FF}' => true,
                '\u{F900}'..='\u{FDCF}' => true,
                '\u{FDF0}'..='\u{FFEF}' => true,
                '\u{10000}'..='\u{1FFFD}' => true,
                '\u{20000}'..='\u{2FFFD}' => true,
                '\u{30000}'..='\u{3FFFD}' => true,
                '\u{40000}'..='\u{4FFFD}' => true,
                '\u{50000}'..='\u{5FFFD}' => true,
                '\u{60000}'..='\u{6FFFD}' => true,
                '\u{70000}'..='\u{7FFFD}' => true,
                '\u{80000}'..='\u{8FFFD}' => true,
                '\u{90000}'..='\u{9FFFD}' => true,
                '\u{A0000}'..='\u{AFFFD}' => true,
                '\u{B0000}'..='\u{BFFFD}' => true,
                '\u{C0000}'..='\u{CFFFD}' => true,
                '\u{D0000}'..='\u{DFFFD}' => true,
                '\u{E1000}'..='\u{EFFFD}' => true,
                // NOTE(review): this private-use range is out of ascending
                // order relative to its neighbors; the values themselves look
                // deliberate — confirm against the IRI spec before reordering.
                '\u{E000}'..='\u{F8FF}' => true,
                '\u{F0000}'..='\u{FFFFD}' => true,
                '\u{100000}'..='\u{10FFFD}' => true,
                _ => false,
            },
        }
    }
    /// The value as text.
    pub fn get_as_str(&self) -> &str {
        &self.value
    }
}
/// Character data appearing between markup.
#[derive(Debug, PartialEq)]
pub struct Text {
    text: String,
}
impl Text {
    // Wrap text that was validated while munching.
    fn new_unvalidated(text: String) -> Text {
        Text { text }
    }
    // Character data may contain anything except the markup-start characters
    // '<' and '&' (those must be escaped).
    fn is_valid_char(c: char) -> bool {
        match c {
            STAG_START | ENTITY_REFERENCE_START => false,
            _ => true,
        }
    }
    /// The raw text.
    pub fn get_as_str(&self) -> &str {
        &self.text
    }
    /// Collapses every whitespace run to a single space and trims both ends
    /// (like XPath `normalize-space`).
    pub fn normalize_space(&self) -> String {
        let collection: Vec<&str> = self.text.split_whitespace().collect();
        collection.join(" ")
    }
    /// Like `normalize_space`, but keeps one leading/trailing space when the
    /// original text began/ended with whitespace. Non-empty all-whitespace
    /// text collapses to a single space; empty text stays empty.
    pub fn deduplicate_whitespace(&self) -> String {
        let normalized_space = self.normalize_space();
        if self.text.is_empty() {
            return normalized_space;
        } else if normalized_space.is_empty() {
            return String::from(" ");
        }
        // Idiom: bind the flags directly instead of if/else assignment.
        // unwrap is safe: the text is non-empty here.
        let whitespace_head = is_whitespace(self.text.chars().next().unwrap());
        let whitespace_tail = is_whitespace(self.text.chars().last().unwrap());
        match (whitespace_head, whitespace_tail) {
            (true, true) => format!(" {} ", normalized_space),
            (true, false) => format!(" {}", normalized_space),
            (false, true) => format!("{} ", normalized_space),
            (false, false) => normalized_space,
        }
    }
    /// `normalize_space`, preserving a single leading space if present.
    pub fn normalize_space_deduplicate_head(&self) -> String {
        let normalized_space = self.normalize_space();
        if self.text.is_empty() {
            return normalized_space;
        } else if normalized_space.is_empty() {
            return String::from(" ");
        }
        if is_whitespace(self.text.chars().next().unwrap()) {
            format!(" {}", normalized_space)
        } else {
            normalized_space
        }
    }
    /// `normalize_space`, preserving a single trailing space if present.
    pub fn normalize_space_deduplicate_tail(&self) -> String {
        let normalized_space = self.normalize_space();
        if self.text.is_empty() {
            return normalized_space;
        } else if normalized_space.is_empty() {
            return String::from(" ");
        }
        if is_whitespace(self.text.chars().last().unwrap()) {
            format!("{} ", normalized_space)
        } else {
            normalized_space
        }
    }
}
// Unit tests for the Text whitespace-normalization helpers.
#[cfg(test)]
mod text_tests {
    use super::*;
    #[test]
    fn normalize_space() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(text.normalize_space(), String::from("a b c"));
    }
    #[test]
    fn deduplicate_whitespace() {
        // Leading and trailing whitespace collapse to single spaces.
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(text.deduplicate_whitespace(), String::from(" a b c "));
    }
    #[test]
    fn normalize_space_deduplicate_head() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(
            text.normalize_space_deduplicate_head(),
            String::from(" a b c")
        );
    }
    #[test]
    fn normalize_space_deduplicate_tail() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(
            text.normalize_space_deduplicate_tail(),
            String::from("a b c ")
        );
    }
}
/// The contents of a CDATA section (`<![CDATA[` ... `]]>`), delimiters excluded.
#[derive(Debug, PartialEq)]
pub struct CDATASection {
    data: String,
}
impl CDATASection {
    // Wrap text that was validated while munching.
    fn new_unvalidated(data: String) -> CDATASection {
        CDATASection { data }
    }
    // Any legal XML character may appear (the tokenizer watches for ']]>').
    fn is_valid_char(c: char, version: &XMLVersion) -> bool {
        is_xml_char(c, version)
    }
    /// The raw CDATA text.
    pub fn get_as_str(&self) -> &str {
        &self.data
    }
}
/// A character produced from a decimal character reference (`&#NNN;`).
#[derive(Debug, PartialEq)]
pub struct DecCharRef {
    character: char,
}
impl DecCharRef {
    fn new_unvalidated(character: char) -> DecCharRef {
        DecCharRef { character }
    }
    /// Parses the digits of a decimal reference into a character, rejecting
    /// values that are not a Unicode scalar or not a legal XML character for
    /// `version`.
    pub fn new_from_string(
        dec_code: String,
        version: &XMLVersion,
    ) -> Result<DecCharRef, ParseTokenError> {
        // Idiom: flattened from nested matches with empty arms; any failure
        // falls through to the single Err below.
        if let Ok(code_point) = dec_code.parse::<u32>() {
            if let Some(c) = char::from_u32(code_point) {
                if is_xml_char(c, version) {
                    return Ok(DecCharRef::new_unvalidated(c));
                }
            }
        }
        Err(ParseTokenError::new(ParseTokenErrorKind::DecCharRef))
    }
    /// The referenced character.
    pub fn get_as_char(&self) -> char {
        self.character
    }
    /// The referenced code point as a number.
    pub fn get_as_u32(&self) -> u32 {
        self.character as u32
    }
}
// Unit tests for decimal character-reference parsing.
#[cfg(test)]
mod dec_char_ref_tests {
    use super::*;
    #[test]
    fn new_from_string_test() {
        // 169 is the decimal code point of the copyright sign.
        let result = DecCharRef::new_from_string(String::from("169"), &XMLVersion::Version1_0);
        match result {
            Ok(dec_char_ref) => {
                assert_eq!(dec_char_ref.get_as_char(), '©');
            }
            // panic! with a message instead of the opaque `assert!(false)`.
            Err(_error) => panic!("expected '169' to parse as a decimal character reference"),
        }
    }
}
/// A character produced from a hexadecimal character reference (`&#xHHH;`).
#[derive(Debug, PartialEq)]
pub struct HexCharRef {
    character: char,
}
impl HexCharRef {
    fn new_unvalidated(character: char) -> HexCharRef {
        HexCharRef { character }
    }
    /// Parses the hex digits of a reference into a character, rejecting
    /// values that are not a Unicode scalar or not a legal XML character for
    /// `version`.
    pub fn new_from_string(
        hex_code: String,
        version: &XMLVersion,
    ) -> Result<HexCharRef, ParseTokenError> {
        // Idiom: flattened from nested matches with empty arms; any failure
        // falls through to the single Err below.
        if let Ok(code_point) = u32::from_str_radix(&hex_code, 16) {
            if let Some(c) = char::from_u32(code_point) {
                if is_xml_char(c, version) {
                    return Ok(HexCharRef::new_unvalidated(c));
                }
            }
        }
        Err(ParseTokenError::new(ParseTokenErrorKind::HexCharRef))
    }
    /// The referenced character.
    pub fn get_as_char(&self) -> char {
        self.character
    }
    /// The referenced code point as a number.
    pub fn get_as_u32(&self) -> u32 {
        self.character as u32
    }
}
// Unit tests for hexadecimal character-reference parsing.
#[cfg(test)]
mod hex_char_ref_tests {
    use super::*;
    #[test]
    fn new_from_string_test() {
        // 0x1F61E is U+1F61E (a supplementary-plane emoji).
        let result = HexCharRef::new_from_string(String::from("1f61e"), &XMLVersion::Version1_0);
        match result {
            Ok(hex_char_ref) => {
                assert_eq!(hex_char_ref.get_as_char(), '😞');
            }
            // panic! with a message instead of the opaque `assert!(false)`.
            Err(_error) => panic!("expected '1f61e' to parse as a hex character reference"),
        }
    }
}
/// A recursive-descent XML tokenizer. Create with `Tokenizer::new`, run
/// `tokenize_document`, then read `tokens` and `error_messages`.
pub struct Tokenizer {
    // The document as characters, a cursor into it, and the total length.
    c: Vec<char>,
    i: usize,
    length: usize,
    // Start index of the span currently being captured; presumably managed by
    // start_span()/get_span() (defined later in this file) — get_span(n)
    // appears to drop n trailing characters. TODO confirm.
    span_start: usize,
    pub tokens: Vec<Token>,
    // Updated when the XML declaration names a version; defaults to 1.0.
    version: XMLVersion,
    warning_messages: Vec<String>,
    pub error_messages: Vec<String>,
    // Sticky error flag; once set, the munch_* chain unwinds.
    error: bool,
}
impl Tokenizer {
/// Creates a tokenizer over the given document text, defaulting to XML 1.0
/// until a version declaration says otherwise.
pub fn new(xml: String) -> Tokenizer {
    let c: Vec<char> = xml.chars().collect();
    let length = c.len();
    Tokenizer {
        // Idiom: field-init shorthand instead of `c: c` / `length: length`.
        c,
        i: 0,
        length,
        span_start: 0,
        tokens: Vec::new(),
        version: XMLVersion::Version1_0,
        warning_messages: Vec::new(),
        error_messages: Vec::new(),
        error: false,
    }
}
/// Tokenizes the whole document; returns true on success. Tokens accumulate
/// in `self.tokens` and problems in `self.error_messages`.
pub fn tokenize_document(&mut self) -> bool {
    self.munch_document()
}
// document ::= prolog element Misc*
fn munch_document(&mut self) -> bool {
    self.munch_prolog() && self.munch_element() && self.munch_misc_asterisk()
}
// prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
fn munch_prolog(&mut self) -> bool {
    self.munch_xml_decl_eroteme()
        && self.munch_misc_asterisk()
        && self.munch_doctypedecl_misc_asterisk_eroteme()
}
// XMLDecl? — the declaration itself is optional ("eroteme" = '?'), so only
// the error flag decides success, not whether one was found.
fn munch_xml_decl_eroteme(&mut self) -> bool {
    self.munch_xml_decl();
    !self.error
}
// Munches an XML declaration. Returns false without error when none is
// present; returns false with the error flag set when one is malformed.
fn munch_xml_decl(&mut self) -> bool {
    if !self.munch_xml_decl_start() {
        return false;
    }
    if !self.munch_version_info() {
        self.error("An XML declaration must have a version attribute.");
        return false;
    }
    // encoding and standalone are optional; only bail out when munching them
    // raised an error.
    self.munch_encoding_decl();
    if self.error {
        return false;
    }
    self.munch_sd_decl();
    if self.error {
        return false;
    }
    self.munch_s_eroteme();
    if !self.munch_xml_decl_end() {
        self.error("An XML declaration must end with '?>'.");
        return false;
    } else {
        return true;
    }
}
// Munches the literal '<?xml' and emits `XMLDeclStart`.
fn munch_xml_decl_start(&mut self) -> bool {
    if !self.munch_sequence(&XML_DECL_START) {
        return false;
    }
    self.tokens.push(Token::XMLDeclStart);
    true
}
// Munches VersionInfo: S 'version' Eq quoted VersionNum. Returns false
// without error when no version attribute is present at all.
fn munch_version_info(&mut self) -> bool {
    self.munch_s();
    if !self.munch_version() {
        return false;
    }
    if !self.munch_eq() {
        self.error("Expected an '=' after version attribute name in XML declaration.");
        return false;
    }
    // Remember which quote opened the value; the same kind must close it.
    let double_quotes: bool;
    if self.munch_double_quote() {
        double_quotes = true;
    } else if self.munch_single_quote() {
        double_quotes = false;
    } else {
        self.error("Expected a single or double quote.");
        return false;
    }
    if !self.munch_version_num() {
        self.error("Expected legal version number in XML declaration.");
        // Bug fix: previously fell through after the error and went on to
        // munch the closing quote, unlike munch_encoding_decl/munch_sd_decl.
        return false;
    }
    if double_quotes && !self.munch_double_quote() {
        self.error("Expected closing double quote following version value in XML declaration.");
        return false;
    }
    if !double_quotes && !self.munch_single_quote() {
        self.error("Expected closing single quote following version value in XML declaration.");
        return false;
    }
    true
}
// Munches required whitespace (S): at least one whitespace character.
fn munch_s(&mut self) -> bool {
    if self.i < self.length && is_whitespace(self.c[self.i]) {
        self.i += 1;
        return self.munch_s_eroteme();
    }
    false
}
// Munches optional whitespace (S?); always succeeds.
fn munch_s_eroteme(&mut self) -> bool {
    while self.i < self.length && is_whitespace(self.c[self.i]) {
        self.i += 1;
    }
    true
}
// Munches the literal 'version'.
fn munch_version(&mut self) -> bool {
    self.munch_sequence(&XML_DECL_VERSION)
}
// Eq ::= S? '=' S?
fn munch_eq(&mut self) -> bool {
    self.munch_s_eroteme() && self.munch_character(EQUALS) && self.munch_s_eroteme()
}
// Munches a single quote character.
fn munch_single_quote(&mut self) -> bool {
    self.munch_character(SINGLE_QUOTE)
}
// Munches a double quote character.
fn munch_double_quote(&mut self) -> bool {
    self.munch_character(DOUBLE_QUOTE)
}
// Munches a VersionNum ('1.' digits), records the version on the tokenizer
// and emits an `XMLVersion` token. Unknown 1.x versions tokenize as 1.1 with
// a warning.
fn munch_version_num(&mut self) -> bool {
    if self.munch_sequence(&XML_DECL_VERSION_PREFIX) {
        self.start_span();
        if self.munch_digits() {
            let span = self.get_span(0);
            match span.as_ref() {
                "0" => {
                    self.version = XMLVersion::Version1_0;
                    self.tokens.push(Token::XMLVersion(XMLVersion::Version1_0));
                    return true;
                }
                "1" => {
                    self.version = XMLVersion::Version1_1;
                    self.tokens.push(Token::XMLVersion(XMLVersion::Version1_1));
                    return true;
                }
                _ => {
                    self.version = XMLVersion::Version1_1;
                    self.warning(&format!(
                        "Unknown XML version 1.{} tokenizing as if it were version '1.1'.",
                        span
                    ));
                    // Bug fix: honor the warning's promise to continue as 1.1
                    // instead of falling through to `false`, which made the
                    // caller report an illegal version number anyway.
                    self.tokens.push(Token::XMLVersion(XMLVersion::Version1_1));
                    return true;
                }
            }
        }
    }
    false
}
// Munches one or more ASCII digits; true if at least one was consumed.
fn munch_digits(&mut self) -> bool {
    let start = self.i;
    while self.i < self.length && is_digit(self.c[self.i]) {
        self.i += 1;
    }
    self.i > start
}
// Munches an optional EncodingDecl: S 'encoding' Eq quoted EncName. Returns
// false without error when no encoding attribute is present.
fn munch_encoding_decl(&mut self) -> bool {
    self.munch_s();
    if !self.munch_encoding() {
        return false;
    }
    if !self.munch_eq() {
        self.error("Expected an '=' after encoding attribute name in XML declaration.");
        return false;
    }
    // Remember which quote opened the value; the same kind must close it.
    let double_quotes: bool;
    if self.munch_double_quote() {
        double_quotes = true;
    } else if self.munch_single_quote() {
        double_quotes = false;
    } else {
        self.error("Expected a single or double quote.");
        return false;
    }
    if !self.munch_enc_name() {
        self.error("Expected legal encoding value in XML declaration.");
        return false;
    }
    if double_quotes && !self.munch_double_quote() {
        self.error(
            "Expected closing double quote following encoding value in XML declaration.",
        );
        return false;
    }
    if !double_quotes && !self.munch_single_quote() {
        self.error(
            "Expected closing single quote following encoding value in XML declaration.",
        );
        return false;
    }
    true
}
// Munches the literal 'encoding'.
fn munch_encoding(&mut self) -> bool {
    self.munch_sequence(&XML_DECL_ENCODING)
}
// Munches an encoding name and emits an `XMLEncoding` token.
fn munch_enc_name(&mut self) -> bool {
    if self.i >= self.length || !EncName::is_valid_start_char(self.c[self.i]) {
        return false;
    }
    self.start_span();
    self.i += 1;
    while self.i < self.length && EncName::is_valid_char(self.c[self.i]) {
        self.i += 1;
    }
    let span = self.get_span(0);
    self.tokens
        .push(Token::XMLEncoding(EncName::new_unvalidated(span)));
    true
}
// Munches an optional SDDecl: S 'standalone' Eq quoted ('yes' | 'no').
// Returns false without error when no standalone attribute is present.
fn munch_sd_decl(&mut self) -> bool {
    self.munch_s();
    if !self.munch_standalone() {
        return false;
    }
    if !self.munch_eq() {
        self.error("Expected an '=' after standalone attribute name in XML declaration.");
        return false;
    }
    // Remember which quote opened the value; the same kind must close it.
    let double_quotes: bool;
    if self.munch_double_quote() {
        double_quotes = true;
    } else if self.munch_single_quote() {
        double_quotes = false;
    } else {
        self.error("Expected a single or double quote.");
        return false;
    }
    if !self.munch_yes_no() {
        self.error("Expected yes or no for standalone value in XML declaration.");
        return false;
    }
    if double_quotes && !self.munch_double_quote() {
        self.error(
            "Expected closing double quote following standalone value in XML declaration.",
        );
        return false;
    }
    if !double_quotes && !self.munch_single_quote() {
        self.error(
            "Expected closing single quote following standalone value in XML declaration.",
        );
        return false;
    }
    true
}
// Munches the literal 'standalone'.
fn munch_standalone(&mut self) -> bool {
    self.munch_sequence(&XML_DECL_STANDALONE)
}
// Munches the standalone value 'yes' or 'no', emitting `XMLStandalone`.
fn munch_yes_no(&mut self) -> bool {
    if self.munch_sequence(&YES) {
        self.tokens.push(Token::XMLStandalone(true));
        return true;
    }
    if self.munch_sequence(&NO) {
        self.tokens.push(Token::XMLStandalone(false));
        return true;
    }
    false
}
// Munches the '?>' that closes the XML declaration and emits `XMLDeclEnd`.
fn munch_xml_decl_end(&mut self) -> bool {
    if !self.munch_sequence(&XML_DECL_END) {
        return false;
    }
    self.tokens.push(Token::XMLDeclEnd);
    true
}
// Munches Misc*: any run of comments, processing instructions and whitespace.
fn munch_misc_asterisk(&mut self) -> bool {
    while self.i < self.length && self.munch_misc() {}
    !self.error
}
// Munches a single Misc item (Comment | PI | S); false when none matched or
// an error occurred while munching one.
fn munch_misc(&mut self) -> bool {
    // Short-circuit OR preserves the original try order: comment, PI, S.
    if self.munch_comment() || self.munch_pi() || self.munch_s() {
        !self.error
    } else {
        false
    }
}
// Munches a comment ('<!--' ... '-->') and emits a `Comment` token. A lone
// '-' is accepted only when followed by another comment character, which
// enforces the "no '--' inside a comment" rule.
fn munch_comment(&mut self) -> bool {
    if !self.munch_sequence(&COMMENT_START) {
        return false;
    }
    self.start_span();
    while self.i < self.length {
        if Comment::is_valid_char_minus_hyphen(self.c[self.i], &self.version) {
            self.i += 1;
            continue;
        } else if self.munch_sequence(&COMMENT_END) {
            // Trim the munched '-->' off the captured span.
            let span = self.get_span(COMMENT_END.len());
            self.tokens
                .push(Token::Comment(Comment::new_unvalidated(span)));
            return true;
        } else if self.i + 1 < self.length
            && self.c[self.i] == HYPHEN
            && Comment::is_valid_char_minus_hyphen(self.c[self.i + 1], &self.version)
        {
            self.i += 1;
            continue;
        } else {
            self.error("Illegal character in comment.");
            return false;
        }
    }
    self.error("Comment must end with the character sequence '-->'.");
    return false;
}
// Munches a processing instruction: '<?' target data '?>'.
fn munch_pi(&mut self) -> bool {
    if !self.munch_sequence(&PI_START) {
        return false;
    }
    self.tokens.push(Token::PIStart);
    self.munch_pi_target() && self.munch_pi_data()
}
// Munches a PI target name and emits a `PITarget` token. The target may not
// be 'xml' in any case variation (reserved for the XML declaration).
fn munch_pi_target(&mut self) -> bool {
    // Bug fix: bounds-check before indexing so a document ending right after
    // '<?' reports an error instead of panicking on an out-of-range index.
    if self.i < self.length && PITarget::is_valid_start_char(self.c[self.i]) {
        self.start_span();
        self.i += 1;
        while self.i < self.length && PITarget::is_valid_char(self.c[self.i]) {
            self.i += 1;
        }
        let pi_target = self.get_span(0);
        if pi_target.to_lowercase() == "xml" {
            self.error("Illegal processing instruction target. The string 'xml' and all case variations are reserved.");
            return false;
        }
        self.tokens
            .push(Token::PITarget(PITarget::new_unvalidated(pi_target)));
        return true;
    }
    self.error("Illegal start character for processing instruction target.");
    false
}
// Munches optional PI data (whitespace then anything up to '?>'), emitting a
// `PIData` token when data is present, then `PIEnd`.
fn munch_pi_data(&mut self) -> bool {
    if self.munch_sequence(&PI_END) {
        self.tokens.push(Token::PIEnd);
        return true;
    // Bug fix: bounds-check before indexing so a document that ends inside a
    // PI reports an error instead of panicking on an out-of-range index.
    } else if self.i < self.length && PIData::is_valid_start_char(self.c[self.i]) {
        self.start_span();
        self.i += 1;
        while self.i < self.length {
            if self.munch_sequence(&PI_END) {
                // Trim the munched '?>' off the captured span.
                let pi_data = self.get_span(PI_END.len());
                self.tokens
                    .push(Token::PIData(PIData::new_unvalidated(pi_data)));
                self.tokens.push(Token::PIEnd);
                return true;
            } else if PIData::is_valid_char(self.c[self.i], &self.version) {
                self.i += 1;
            } else {
                self.error("Illegal character in processing instruction data.");
                return false;
            }
        }
        self.error("Processing instruction must end with the '?>' character sequence.");
        return false;
    } else {
        self.error("Illegal start character in processing instruction data.");
        false
    }
}
// Munches (doctypedecl Misc*)? — an optional doctype declaration followed by
// trailing Misc items; absence of a doctype is not an error.
fn munch_doctypedecl_misc_asterisk_eroteme(&mut self) -> bool {
    if self.munch_doctypedecl() {
        self.munch_misc_asterisk()
    } else {
        !self.error
    }
}
// Munches a (simplified) doctype declaration: '<!DOCTYPE' S Name '>'.
// Returns false without error when no doctype is present.
fn munch_doctypedecl(&mut self) -> bool {
    if !self.munch_sequence(&DOCTYPE_START) {
        return false;
    }
    if !self.munch_s() {
        self.error("Doctypedecl must have an S following Doctype start.");
        return false;
    }
    // munch_doctype_name also emits the DoctypeDeclStart token.
    if !self.munch_doctype_name() {
        self.error("Doctypedecl must have a doctype name.");
        return false;
    }
    if self.munch_character(DOCTYPE_END) {
        self.tokens.push(Token::DoctypeDeclEnd);
        return true;
    } else {
        self.error("Doctypedecl must end in a '>' character.");
        return false;
    }
}
fn munch_doctype_name(&mut self) -> bool {
if Name::is_valid_start_char(self.c[self.i]) {
self.start_span();
self.i += 1;
while self.i < self.length {
if Name::is_valid_char(self.c[self.i]) {
self.i += 1;
} else {
let span = self.get_span(0);
self.tokens.push(Token::DoctypeDeclStart);
self.tokens.push(Token::DoctypeName(Name { name: span }));
return true;
}
}
self.error("Doctype declaration must end with a '>' character.");
return false;
}
self.error("Illegal first character of doctype name.");
false
}
// Munches element ::= EmptyElemTag | STag content ETag, starting at '<'.
fn munch_element(&mut self) -> bool {
    if !self.munch_character(STAG_START) {
        return false;
    }
    if !self.munch_element_name() {
        self.error("Expected an element name.");
        return false;
    }
    self.munch_s_attibute_asterisk();
    if self.error {
        return false;
    }
    // '/>' closes an empty element: no content, no ETag.
    if self.munch_empty_element_end() {
        return !self.error;
    }
    if self.munch_character(STAG_END) {
        self.tokens.push(Token::ElementSTagEnd);
    } else {
        self.error("Expected end of STag.");
        return false;
    }
    return self.munch_content();
}
// Munches an STag's qname and emits an `ElementStart` token, splitting the
// name on the first ':' into prefix and local part.
fn munch_element_name(&mut self) -> bool {
    // Bug fix: bounds-check before indexing so a document ending right after
    // '<' reports an error instead of panicking.
    if self.i < self.length && QName::is_valid_start_char(self.c[self.i]) {
        self.start_span();
        self.i += 1;
        let mut prefix_defined = false;
        let mut prefix = String::new();
        while self.i < self.length {
            if QName::is_valid_char(self.c[self.i]) {
                self.i += 1;
            } else if self.c[self.i] == COLON {
                // Everything captured so far is the prefix; restart the span
                // for the local part.
                prefix = self.get_span(0);
                prefix_defined = true;
                self.i += 1;
                self.start_span();
            } else {
                let local_part = self.get_span(0);
                if prefix_defined {
                    self.tokens.push(Token::ElementStart(QName::new_unvalidated(
                        Some(prefix),
                        local_part,
                    )));
                } else {
                    self.tokens.push(Token::ElementStart(QName::new_unvalidated(
                        None, local_part,
                    )));
                }
                return true;
            }
        }
        self.error("Premature end of input in element tag.");
        return false;
    }
    self.error("Expected name start character.");
    false
}
// Munches (S Attribute)*: whitespace-separated attributes inside an STag.
// (sic: 'attibute' — the name is kept for compatibility with callers.)
fn munch_s_attibute_asterisk(&mut self) -> bool {
    while self.i < self.length {
        // Each step needs whitespace and then an attribute; stop otherwise.
        if !(self.munch_s() && self.munch_attribute()) {
            break;
        }
    }
    !self.error
}
// Munches '/>' and emits `ElementEmptyEnd`.
fn munch_empty_element_end(&mut self) -> bool {
    if !self.munch_sequence(&EMPTY_TAG_END) {
        return false;
    }
    self.tokens.push(Token::ElementEmptyEnd);
    true
}
// Munches element content until the matching ETag. The try order matters:
// CDATA, comments and PIs all start with '<' and must be attempted before
// treating '<' as a child element.
fn munch_content(&mut self) -> bool {
    while self.i < self.length {
        if self.munch_etag() {
            return true;
        } else if self.munch_cd_sect() {
            continue;
        } else if self.munch_comment() {
            continue;
        } else if self.munch_pi() {
            continue;
        } else if self.munch_element() {
            continue;
        } else if self.munch_reference() {
            continue;
        } else if self.munch_char_data() {
            continue;
        } else {
            return false;
        }
    }
    !self.error
}
// Munches character data up to the next '<' or '&' and emits a `Text` token.
// The literal sequence ']]>' may not appear in character data.
fn munch_char_data(&mut self) -> bool {
    if self.i < self.length && Text::is_valid_char(self.c[self.i]) {
        self.start_span();
        self.i += 1;
        while self.i < self.length {
            if self.munch_sequence(&CDATA_END) {
                self.error("Illegal CDATA END in character data.");
                return false;
            } else if Text::is_valid_char(self.c[self.i]) {
                self.i += 1;
            } else {
                break;
            }
        }
        let span = self.get_span(0);
        self.tokens.push(Token::Text(Text::new_unvalidated(span)));
        return true;
    }
    false
}
// Munches a CDATA section ('<![CDATA[' ... ']]>') and emits a
// `CDATASection` token.
fn munch_cd_sect(&mut self) -> bool {
    if self.munch_sequence(&CDATA_START) {
        self.start_span();
        while self.i < self.length {
            if self.munch_sequence(&CDATA_END) {
                // Trim the munched ']]>' off the captured span.
                let span = self.get_span(CDATA_END.len());
                self.tokens
                    .push(Token::CDATASection(CDATASection::new_unvalidated(span)));
                return true;
            } else if CDATASection::is_valid_char(self.c[self.i], &self.version) {
                self.i += 1;
            } else {
                self.error("Illegal character in CDATA Section.");
                return false;
            }
        }
        self.error("CDATA Section must end with a ']]>' character sequence.");
        return false;
    }
    false
}
// Munches Reference: CharRef | EntityRef.
fn munch_reference(&mut self) -> bool {
    if self.munch_char_ref() || self.munch_entity_ref() {
        !self.error
    } else {
        false
    }
}
// Munches CharRef, trying hex ('&#x...;') before decimal ('&#...;') because
// the hex prefix is the longer match.
fn munch_char_ref(&mut self) -> bool {
    if self.munch_hexidecimal_char_ref() || self.munch_decimal_char_ref() {
        !self.error
    } else {
        false
    }
}
// Munches a hexadecimal character reference '&#xHHH;' and emits a
// `HexCharRef` token. (sic: 'hexidecimal' — name kept for callers.)
fn munch_hexidecimal_char_ref(&mut self) -> bool {
    if !self.munch_sequence(&HEXIDECIMAL_CHAR_REFERENCE_START) {
        return false;
    }
    self.start_span();
    while self.i < self.length {
        if is_hexidecimal_digit(self.c[self.i]) {
            self.i += 1;
        } else if self.munch_character(REFERENCE_END) {
            // Trim the munched ';' off the captured digits before parsing.
            match HexCharRef::new_from_string(self.get_span(1), &self.version) {
                Ok(hex_char_ref) => {
                    self.tokens.push(Token::HexCharRef(hex_char_ref));
                    return true;
                }
                Err(parse_char_ref_err) => {
                    self.error(
                        "Failed to parse hexidecimal character reference to a character.",
                    );
                    self.error(parse_char_ref_err.message());
                    return false;
                }
            }
        } else {
            self.error("Illegal character in hexidecimal character reference.");
            return false;
        }
    }
    self.error("Expected a ';' character to terminate the hexidecimal character reference.");
    false
}
// Munches a decimal character reference '&#NNN;' and emits a `DecCharRef`
// token.
fn munch_decimal_char_ref(&mut self) -> bool {
    if !self.munch_sequence(&DECIMAL_CHAR_REFERENCE_START) {
        return false;
    }
    self.start_span();
    while self.i < self.length {
        if is_digit(self.c[self.i]) {
            self.i += 1;
        } else if self.munch_character(REFERENCE_END) {
            // Trim the munched ';' off the captured digits before parsing.
            match DecCharRef::new_from_string(self.get_span(1), &self.version) {
                Ok(dec_char_ref) => {
                    self.tokens.push(Token::DecCharRef(dec_char_ref));
                    return true;
                }
                Err(parse_char_ref_err) => {
                    // Typo fix: these messages previously said "decidecimal".
                    self.error(
                        "Failed to parse decimal character reference to a character.",
                    );
                    self.error(parse_char_ref_err.message());
                    return false;
                }
            }
        } else {
            self.error("Illegal character in decimal character reference.");
            return false;
        }
    }
    self.error("Expected a ';' character to terminate the decimal character reference.");
    false
}
// Munches an entity reference '&name;' and emits an `EntityRef` token.
fn munch_entity_ref(&mut self) -> bool {
    if self.munch_character(ENTITY_REFERENCE_START) {
        self.start_span();
        // Bug fix: bounds-check before indexing so input ending in '&'
        // reports an error instead of panicking.
        if self.i < self.length && Name::is_valid_start_char(self.c[self.i]) {
            self.i += 1;
        } else {
            self.error("Expected a legal name start character in entity reference.");
            return false;
        }
        while self.i < self.length {
            if Name::is_valid_char(self.c[self.i]) {
                self.i += 1;
            } else if self.munch_character(REFERENCE_END) {
                // Trim the munched ';' off the captured name.
                let span = self.get_span(1);
                self.tokens
                    .push(Token::EntityRef(Name::new_unvalidated(span)));
                return true;
            } else {
                self.error("Illegal character in entity reference name.");
                return false;
            }
        }
        self.error("Entity reference must end with a ';' character.");
        return false;
    }
    false
}
// Munches an ETag '</qname>' and emits an `ElementEnd` token, splitting the
// qname on the first ':' into prefix and local part.
fn munch_etag(&mut self) -> bool {
    if !self.munch_sequence(&ETAG_START) {
        return false;
    }
    // Bug fix: bounds-check before indexing so input ending in '</' reports
    // an error instead of panicking.
    if self.i >= self.length || !QName::is_valid_start_char(self.c[self.i]) {
        self.error("Expected a qname start character after ETag start.");
        return false;
    }
    let mut prefix_defined = false;
    let mut prefix = String::new();
    self.start_span();
    self.i += 1;
    while self.i < self.length {
        if QName::is_valid_char(self.c[self.i]) {
            self.i += 1;
        } else if self.c[self.i] == COLON {
            // Everything captured so far is the prefix; restart the span for
            // the local part.
            prefix = self.get_span(0);
            prefix_defined = true;
            self.i += 1;
            self.start_span();
        } else if self.munch_character(ETAG_END) {
            // Trim the munched '>' off the captured span.
            let local_part = self.get_span(1);
            if prefix_defined {
                self.tokens.push(Token::ElementEnd(QName::new_unvalidated(
                    Some(prefix),
                    local_part,
                )));
            } else {
                self.tokens
                    .push(Token::ElementEnd(QName::new_unvalidated(None, local_part)));
            }
            return true;
        } else {
            self.error("Illegal character in ETag name.");
            return false;
        }
    }
    self.error("ETag must finish with a closing '>' character.");
    false
}
// Munches a single attribute. Namespace declarations ('xmlns...') are
// recognized first and tokenized separately from ordinary attributes.
fn munch_attribute(&mut self) -> bool {
    if self.munch_namespace() {
        return true;
    }
    if !self.munch_attribute_name() {
        return false;
    }
    if !self.munch_eq() {
        self.error("Expected an '=' character after an attribute name.");
        return false;
    }
    if !self.munch_attribute_value() {
        self.error("Expected an attribute value.");
        return false;
    }
    true
}
// Munches a namespace declaration: 'xmlns' ('=' value for the default
// namespace, or ':' prefix '=' value for a prefixed one).
fn munch_namespace(&mut self) -> bool {
    if self.munch_sequence(&XMLNS) {
        self.tokens.push(Token::NamespaceStart);
    } else {
        return false;
    }
    if self.munch_character(COLON) {
        if !self.munch_namespace_prefix() {
            self.error("Expected a namespace prefix after the character sequence 'xmlns:'.");
            return false;
        }
    } else {
        // Bare 'xmlns' declares the default namespace.
        self.tokens.push(Token::NamespaceDefault);
    }
    if !self.munch_eq() {
        self.error("Expected an '=' character after a namespace attribute name.");
        return false;
    }
    self.munch_namespace_value()
}
// Munches the NCName after 'xmlns:' and emits a `NamespacePrefix` token.
fn munch_namespace_prefix(&mut self) -> bool {
    // Bug fix: bounds-check before indexing so input ending right after
    // 'xmlns:' reports an error instead of panicking.
    if self.i < self.length && NCName::is_valid_start_char(self.c[self.i]) {
        self.start_span();
        self.i += 1;
        while self.i < self.length && NCName::is_valid_char(self.c[self.i]) {
            self.i += 1;
        }
        let prefix = self.get_span(0);
        self.tokens
            .push(Token::NamespacePrefix(NCName::new_unvalidated(prefix)));
        return true;
    }
    false
}
/// Munches a quoted namespace value and pushes
/// `Token::NamespaceValue` followed by `Token::NamespaceEnd`.
fn munch_namespace_value(&mut self) -> bool {
    // Remember which quote opened the value; the same one must close it.
    let double_quotes = if self.munch_double_quote() {
        true
    } else if self.munch_single_quote() {
        false
    } else {
        self.error("Expected a single or double quote.");
        return false;
    };
    self.start_span();
    while self.i < self.length {
        let closed = if double_quotes {
            self.munch_character(DOUBLE_QUOTE)
        } else {
            self.munch_character(SINGLE_QUOTE)
        };
        if closed {
            // get_span(1) drops the closing quote just consumed.
            let value = self.get_span(1);
            self.tokens
                .push(Token::NamespaceValue(NamespaceValue::new_unvalidated(value)));
            self.tokens.push(Token::NamespaceEnd);
            return true;
        }
        if NamespaceValue::is_valid_char(self.c[self.i], &self.version) {
            self.i += 1;
        } else {
            self.error("Illegal character in namespace value.");
            return false;
        }
    }
    // Ran off the end of input before the closing quote.
    if double_quotes {
        self.error("Expected closing double quote following namespace value.");
    } else {
        self.error("Expected closing single quote following namespace value.");
    }
    false
}
/// Munches an attribute `QName` (optionally prefixed) and pushes
/// `Token::AttributeStart` + `Token::AttributeName`. Returns `false`
/// (without an error) when no name start character is present.
fn munch_attribute_name(&mut self) -> bool {
    // Bounds guard: the caller may already sit at end of input, and
    // indexing `self.c[self.i]` there would panic.
    if self.i >= self.length || !QName::is_valid_start_char(self.c[self.i]) {
        return false;
    }
    let mut prefix: Option<String> = None;
    self.start_span();
    self.i += 1;
    while self.i < self.length {
        if QName::is_valid_char(self.c[self.i]) {
            self.i += 1;
        } else if self.c[self.i] == COLON {
            // Name up to the colon is the prefix; restart the span for
            // the local part.
            prefix = Some(self.get_span(0));
            self.i += 1;
            self.start_span();
        } else {
            // First non-name character ends the name (it is not consumed).
            break;
        }
    }
    let local_part = self.get_span(0);
    self.tokens.push(Token::AttributeStart);
    self.tokens
        .push(Token::AttributeName(QName::new_unvalidated(prefix, local_part)));
    true
}
/// Munches a quoted attribute value, splitting it around embedded
/// references: text runs become `Token::AttributeValue`, references are
/// delegated to `munch_reference`, and the closing quote pushes
/// `Token::AttributeEnd`.
fn munch_attribute_value(&mut self) -> bool {
    // Remember which quote opened the value; the same one must close it.
    let double_quotes = if self.munch_double_quote() {
        true
    } else if self.munch_single_quote() {
        false
    } else {
        self.error("Expected a single or double quote.");
        return false;
    };
    self.tokens.push(Token::AttributeValueStart);
    self.start_span();
    while self.i < self.length {
        if double_quotes && AttributeValue::is_valid_inside_double_quotes_char(self.c[self.i])
            || !double_quotes
                && AttributeValue::is_valid_inside_single_quotes_char(self.c[self.i])
        {
            self.i += 1;
        } else if self.c[self.i] == ENTITY_REFERENCE_START {
            // Flush the text gathered so far, then munch the reference.
            let span = self.get_span(0);
            self.tokens
                .push(Token::AttributeValue(AttributeValue::new_unvalidated(span)));
            // Propagate reference failure: the original ignored this
            // result, and a failed munch that consumes nothing would
            // leave `i` stuck on '&' and loop forever.
            if !self.munch_reference() {
                return false;
            }
            self.start_span();
        } else if (double_quotes && self.munch_double_quote())
            || (!double_quotes && self.munch_single_quote())
        {
            // get_span(1) drops the closing quote just consumed.
            let span = self.get_span(1);
            self.tokens
                .push(Token::AttributeValue(AttributeValue::new_unvalidated(span)));
            self.tokens.push(Token::AttributeEnd);
            return true;
        } else {
            self.error("Illegal character in attribute value.");
            return false;
        }
    }
    // Ran off the end of input before the closing quote.
    if double_quotes {
        self.error("Expected closing double quote following attribute value.");
    } else {
        // Typo fix: was "attriubte".
        self.error("Expected closing single quote following attribute value.");
    }
    false
}
/// Consumes a single expected character, advancing `i` only on a match.
fn munch_character(&mut self, character: char) -> bool {
    let hit = self.i < self.length && self.c[self.i] == character;
    if hit {
        self.i += 1;
    }
    hit
}
/// Consumes `sequence` if the upcoming characters match it exactly;
/// advances `i` past the sequence only on a full match.
fn munch_sequence(&mut self, sequence: &[char]) -> bool {
    let end = self.i + sequence.len();
    let hit = end <= self.length && self.c[self.i..end] == *sequence;
    if hit {
        self.i = end;
    }
    hit
}
// Records the current read position as the start of a span to be
// extracted later with `get_span`.
fn start_span(&mut self) {
    self.span_start = self.i;
}
// Collects the characters from `span_start` up to (but excluding)
// `self.i - span_end_offset` into a String. `span_end_offset` lets a
// caller drop trailing delimiter characters it has already consumed
// (e.g. a closing quote or '>'); callers must ensure the offset does not
// exceed `self.i` or the subtraction will underflow.
fn get_span(&mut self, span_end_offset: usize) -> String {
    let span_end = self.i - span_end_offset;
    self.c[self.span_start..span_end].iter().collect()
}
/// Records a warning, prefixed with the offending position and character
/// when the cursor is still inside the input.
fn warning(&mut self, msg: &str) {
    let entry = if self.i < self.length {
        format!("c[{}]={} {}", self.i, self.c[self.i], msg)
    } else {
        format!("Out of bounds: {}", msg)
    };
    self.warning_messages.push(entry);
}
/// Records an error, prefixed with the offending position and character
/// when the cursor is still inside the input.
fn error(&mut self, msg: &str) {
    let entry = if self.i < self.length {
        format!("c[{}]={} {}", self.i, self.c[self.i], msg)
    } else {
        format!("Out of bounds: {}", msg)
    };
    self.error_messages.push(entry);
}
}
// Unit tests for the tokenizer. Each test drives one `munch_*` production
// against a small input and checks the emitted token stream plus the error
// flag.
//
// NOTE(review): several test-input string literals had their XML
// references stripped (e.g. an input shown as "&" paired with an expected
// `EntityRef("amp")`). The references below are reconstructed from the
// expected token streams: `&amp;`, `&#65;` ('A'), `&#x1F61E;` ('😞'),
// `&#x2014;` ('—').
#[cfg(test)]
mod tokenizer_tests {
    use super::*;
    #[test]
    fn tokenize_document_hit() {
        let mut tok = Tokenizer::new(String::from("<a/>"));
        assert!(tok.tokenize_document());
        assert_eq!(
            tok.tokens,
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementEmptyEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_document_hit() {
        let mut tok = Tokenizer::new(String::from("<a/>"));
        assert!(tok.munch_document());
        assert_eq!(
            tok.tokens,
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementEmptyEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_prolog_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_prolog());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_xml_decl_eroteme_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_xml_decl_eroteme());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_xml_decl_hit() {
        let mut tok = Tokenizer::new(String::from("<?xml version='1.0'?>"));
        assert!(tok.munch_xml_decl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLDeclEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<?xml version='1.0' encoding='utf-8'?>"));
        assert!(tok.munch_xml_decl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLDeclEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<?xml version='1.0' standalone='yes'?>"));
        // Fix: the result was previously discarded; assert like the others.
        assert!(tok.munch_xml_decl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLStandalone(true),
                Token::XMLDeclEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from(
            "<?xml version='1.0' encoding='utf-8' standalone='no'?>",
        ));
        assert!(tok.munch_xml_decl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLStandalone(false),
                Token::XMLDeclEnd
            ]
        );
        assert!(!tok.error);
        let mut tok =
            Tokenizer::new(String::from("<?xml version = \"1.0\" encoding = \"utf-8\" standalone = \"no\" ?>"));
        assert!(tok.munch_xml_decl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLStandalone(false),
                Token::XMLDeclEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_xml_decl_start_hit() {
        let mut tok = Tokenizer::new(String::from("<?xml"));
        assert!(tok.munch_xml_decl_start());
        assert_eq!(tok.tokens, vec![Token::XMLDeclStart,]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_version_info_hit() {
        let mut tok = Tokenizer::new(String::from(" version='1.0'"));
        assert!(tok.munch_version_info());
        assert_eq!(tok.tokens, vec![Token::XMLVersion(XMLVersion::Version1_0),]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_s_hit() {
        let mut tok = Tokenizer::new(String::from(" "));
        assert!(tok.munch_s());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_s_eroteme_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_s_eroteme());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from(" \n \t \r "));
        assert!(tok.munch_s_eroteme());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_version_hit() {
        let mut tok = Tokenizer::new(String::from("version"));
        assert!(tok.munch_version());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_eq_hit() {
        let mut tok = Tokenizer::new(String::from("="));
        assert!(tok.munch_eq());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from(" = "));
        assert!(tok.munch_eq());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_single_quote_hit() {
        let mut tok = Tokenizer::new(String::from("'"));
        assert!(tok.munch_single_quote());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_double_quote_hit() {
        let mut tok = Tokenizer::new(String::from("\""));
        assert!(tok.munch_double_quote());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_version_num_hit() {
        let mut tok = Tokenizer::new(String::from("1.0"));
        assert!(tok.munch_version_num());
        assert_eq!(tok.tokens, vec![Token::XMLVersion(XMLVersion::Version1_0)]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("1.1"));
        assert!(tok.munch_version_num());
        assert_eq!(tok.tokens, vec![Token::XMLVersion(XMLVersion::Version1_1)]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_digits_hit() {
        let mut tok = Tokenizer::new(String::from("1234567890"));
        assert!(tok.munch_digits());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_encoding_decl_hit() {
        let mut tok = Tokenizer::new(String::from(" encoding='utf-8'"));
        assert!(tok.munch_encoding_decl());
        assert_eq!(
            tok.tokens,
            vec![Token::XMLEncoding(EncName::new_unvalidated(String::from(
                "utf-8"
            ))),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from(" encoding = \"UTF-8\" "));
        assert!(tok.munch_encoding_decl());
        assert_eq!(
            tok.tokens,
            vec![Token::XMLEncoding(EncName::new_unvalidated(String::from(
                "UTF-8"
            ))),]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_encoding_hit() {
        let mut tok = Tokenizer::new(String::from("encoding"));
        assert!(tok.munch_encoding());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_enc_name_hit() {
        let mut tok = Tokenizer::new(String::from("iso8859-1"));
        assert!(tok.munch_enc_name());
        assert_eq!(
            tok.tokens,
            vec![Token::XMLEncoding(EncName::new_unvalidated(String::from(
                "iso8859-1"
            )))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_sd_decl_hit() {
        let mut tok = Tokenizer::new(String::from(" standalone='yes'"));
        assert!(tok.munch_sd_decl());
        assert_eq!(tok.tokens, vec![Token::XMLStandalone(true)]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_standalone_hit() {
        let mut tok = Tokenizer::new(String::from("standalone"));
        assert!(tok.munch_standalone());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_yes_no_hit() {
        let mut tok = Tokenizer::new(String::from("yes"));
        assert!(tok.munch_yes_no());
        assert_eq!(tok.tokens, vec![Token::XMLStandalone(true)]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("no"));
        assert!(tok.munch_yes_no());
        assert_eq!(tok.tokens, vec![Token::XMLStandalone(false)]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_xml_decl_end_hit() {
        let mut tok = Tokenizer::new(String::from("?>"));
        assert!(tok.munch_xml_decl_end());
        assert_eq!(tok.tokens, vec![Token::XMLDeclEnd]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_misc_asterisk_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_misc_asterisk());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_misc_hit() {
        let mut tok = Tokenizer::new(String::from(" "));
        assert!(tok.munch_misc());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_comment_hit() {
        let mut tok = Tokenizer::new(String::from("<!---->"));
        assert!(tok.munch_comment());
        assert_eq!(
            tok.tokens,
            vec![Token::Comment(Comment::new_unvalidated(String::from("")))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!--My comment text-->"));
        assert!(tok.munch_comment());
        assert_eq!(
            tok.tokens,
            vec![Token::Comment(Comment::new_unvalidated(String::from(
                "My comment text"
            )))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!-- My - comment - text -->"));
        assert!(tok.munch_comment());
        assert_eq!(
            tok.tokens,
            vec![Token::Comment(Comment::new_unvalidated(String::from(
                " My - comment - text "
            )))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_pi_hit() {
        let mut tok = Tokenizer::new(String::from("<?mypi my pi data?>"));
        assert!(tok.munch_pi());
        assert_eq!(
            tok.tokens,
            vec![
                Token::PIStart,
                Token::PITarget(PITarget {
                    target: String::from("mypi")
                }),
                Token::PIData(PIData::new_unvalidated(String::from(" my pi data"))),
                Token::PIEnd,
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_pi_target() {
        let mut tok = Tokenizer::new(String::from("mypi"));
        assert!(tok.munch_pi_target());
        assert_eq!(
            tok.tokens,
            vec![Token::PITarget(PITarget {
                target: String::from("mypi")
            }),]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_pi_data() {
        let mut tok = Tokenizer::new(String::from("?>"));
        assert!(tok.munch_pi_data());
        assert_eq!(tok.tokens, vec![Token::PIEnd]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from(" Valid PI data is empty or starts with S.?>"));
        assert!(tok.munch_pi_data());
        assert_eq!(
            tok.tokens,
            vec![
                Token::PIData(PIData {
                    data: String::from(" Valid PI data is empty or starts with S.")
                }),
                Token::PIEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_doctypedecl_misc_asterisk_eroteme_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_doctypedecl_misc_asterisk_eroteme());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!DOCTYPE html>"));
        assert!(tok.munch_doctypedecl_misc_asterisk_eroteme());
        assert_eq!(
            tok.tokens,
            vec![
                Token::DoctypeDeclStart,
                Token::DoctypeName(Name::new_unvalidated(String::from("html"))),
                Token::DoctypeDeclEnd,
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!DOCTYPE html>"));
        assert!(tok.munch_doctypedecl_misc_asterisk_eroteme());
        assert_eq!(
            tok.tokens,
            vec![
                Token::DoctypeDeclStart,
                Token::DoctypeName(Name::new_unvalidated(String::from("html"))),
                Token::DoctypeDeclEnd,
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!DOCTYPE html> <!--Comment-->"));
        assert!(tok.munch_doctypedecl_misc_asterisk_eroteme());
        assert_eq!(
            tok.tokens,
            vec![
                Token::DoctypeDeclStart,
                Token::DoctypeName(Name::new_unvalidated(String::from("html"))),
                Token::DoctypeDeclEnd,
                Token::Comment(Comment::new_unvalidated(String::from("Comment"))),
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_doctypedecl_hit() {
        let mut tok = Tokenizer::new(String::from("<!DOCTYPE html>"));
        assert!(tok.munch_doctypedecl());
        assert_eq!(
            tok.tokens,
            vec![
                Token::DoctypeDeclStart,
                Token::DoctypeName(Name::new_unvalidated(String::from("html"))),
                Token::DoctypeDeclEnd,
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_element_hit() {
        let mut tok = Tokenizer::new(String::from("<a/>"));
        assert!(tok.munch_element());
        assert_eq!(
            tok.tokens,
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementEmptyEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_element_name_hit() {
        let mut tok = Tokenizer::new(String::from("emptyelementname/>"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                None,
                String::from("emptyelementname")
            )),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("emptyelementname />"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                None,
                String::from("emptyelementname")
            )),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("validname>"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                None,
                String::from("validname")
            )),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("prefix:emptyelementname/>"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                Some(String::from("prefix")),
                String::from("emptyelementname")
            )),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("prefix:emptyelementname />"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                Some(String::from("prefix")),
                String::from("emptyelementname")
            )),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("prefix:validname>"));
        assert!(tok.munch_element_name());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementStart(QName::new_unvalidated(
                Some(String::from("prefix")),
                String::from("validname")
            )),]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_s_attibute_asterisk_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_s_attibute_asterisk());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_empty_element_end_hit() {
        let mut tok = Tokenizer::new(String::from("/>"));
        assert!(tok.munch_empty_element_end());
        assert_eq!(tok.tokens, vec![Token::ElementEmptyEnd]);
        assert!(!tok.error);
    }
    #[test]
    fn munch_content_hit() {
        let mut tok = Tokenizer::new(String::from(""));
        assert!(tok.munch_content());
        assert_eq!(tok.tokens, vec![]);
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("character data"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![Token::Text(Text::new_unvalidated(String::from(
                "character data"
            )))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("character data"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![Token::Text(Text::new_unvalidated(String::from(
                "character data"
            )))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<a/>"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementEmptyEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<a></a>"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementSTagEnd,
                Token::ElementEnd(QName::new_unvalidated(None, String::from("a"))),
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("&amp;"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![Token::EntityRef(Name::new_unvalidated(String::from("amp"))),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<![CDATA[mycdata]]>"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![Token::CDATASection(CDATASection::new_unvalidated(
                String::from("mycdata")
            ))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<?pi my pi data?>"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![
                Token::PIStart,
                Token::PITarget(PITarget::new_unvalidated(String::from("pi"))),
                Token::PIData(PIData::new_unvalidated(String::from(" my pi data"))),
                Token::PIEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("<!--my comment-->"));
        assert!(tok.munch_content());
        assert_eq!(
            tok.tokens,
            vec![Token::Comment(Comment::new_unvalidated(String::from(
                "my comment"
            ))),]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_char_data_hit() {
        let mut tok = Tokenizer::new(String::from("Valid character data<"));
        assert!(tok.munch_char_data());
        assert_eq!(
            tok.tokens,
            vec![Token::Text(Text::new_unvalidated(String::from(
                "Valid character data"
            )))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_cd_sect_hit() {
        let mut tok = Tokenizer::new(String::from("<![CDATA[Valid cdata section]]>"));
        assert!(tok.munch_cd_sect());
        assert_eq!(
            tok.tokens,
            vec![Token::CDATASection(CDATASection::new_unvalidated(
                String::from("Valid cdata section")
            ))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_reference_hit() {
        let mut tok = Tokenizer::new(String::from("&amp;"));
        assert!(tok.munch_reference());
        assert_eq!(
            tok.tokens,
            vec![Token::EntityRef(Name::new_unvalidated(String::from("amp"))),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("&#65;"));
        assert!(tok.munch_reference());
        assert_eq!(
            tok.tokens,
            vec![Token::DecCharRef(DecCharRef::new_unvalidated('A')),]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("&#x1F61E;"));
        assert!(tok.munch_reference());
        assert_eq!(
            tok.tokens,
            vec![Token::HexCharRef(HexCharRef::new_unvalidated('😞'))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_char_ref_hit() {
        let mut tok = Tokenizer::new(String::from("&#65;"));
        assert!(tok.munch_char_ref());
        assert_eq!(
            tok.tokens,
            vec![Token::DecCharRef(DecCharRef::new_unvalidated('A'))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("&#x1F61E;"));
        assert!(tok.munch_char_ref());
        assert_eq!(
            tok.tokens,
            vec![Token::HexCharRef(HexCharRef::new_unvalidated('😞'))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_hexidecimal_char_ref_hit() {
        let mut tok = Tokenizer::new(String::from("&#x1F61E;"));
        assert!(tok.munch_hexidecimal_char_ref());
        assert_eq!(
            tok.tokens,
            vec![Token::HexCharRef(HexCharRef::new_unvalidated('😞'))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_decimal_char_ref_hit() {
        let mut tok = Tokenizer::new(String::from("&#65;"));
        assert!(tok.munch_decimal_char_ref());
        assert_eq!(
            tok.tokens,
            vec![Token::DecCharRef(DecCharRef::new_unvalidated('A'))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_entity_ref_hit() {
        let mut tok = Tokenizer::new(String::from("&amp;"));
        assert!(tok.munch_reference());
        assert_eq!(
            tok.tokens,
            vec![Token::EntityRef(Name::new_unvalidated(String::from("amp")))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_etag_hit() {
        let mut tok = Tokenizer::new(String::from("</etag>"));
        assert!(tok.munch_etag());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementEnd(QName::new_unvalidated(
                None,
                String::from("etag")
            ))]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("</prefix:etag>"));
        assert!(tok.munch_etag());
        assert_eq!(
            tok.tokens,
            vec![Token::ElementEnd(QName::new_unvalidated(
                Some(String::from("prefix")),
                String::from("etag")
            ))]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_attribute_hit() {
        let mut tok = Tokenizer::new(String::from("name='value'"));
        assert!(tok.munch_attribute());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeStart,
                Token::AttributeName(QName::new_unvalidated(None, String::from("name"))),
                Token::AttributeValueStart,
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from("value"))),
                Token::AttributeEnd
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_namespace_hit() {
        let mut tok = Tokenizer::new(String::from("xmlns='http://defaultnamespace.com'"));
        assert!(tok.munch_namespace());
        assert_eq!(
            tok.tokens,
            vec![
                Token::NamespaceStart,
                Token::NamespaceDefault,
                Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
                    "http://defaultnamespace.com"
                ))),
                Token::NamespaceEnd,
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("xmlns:prefix='http://prefixednamespace.com'"));
        assert!(tok.munch_namespace());
        assert_eq!(
            tok.tokens,
            vec![
                Token::NamespaceStart,
                Token::NamespacePrefix(NCName::new_unvalidated(String::from("prefix"))),
                Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
                    "http://prefixednamespace.com"
                ))),
                Token::NamespaceEnd,
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_namespace_prefix_hit() {
        let mut tok = Tokenizer::new(String::from("validprefix"));
        assert!(tok.munch_namespace_prefix());
        assert_eq!(
            tok.tokens,
            vec![Token::NamespacePrefix(NCName::new_unvalidated(
                String::from("validprefix")
            )),]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_namespace_value_hit() {
        let mut tok = Tokenizer::new(String::from("'namespacevalue'"));
        assert!(tok.munch_namespace_value());
        assert_eq!(
            tok.tokens,
            vec![
                Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
                    "namespacevalue"
                ))),
                Token::NamespaceEnd,
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_attribute_name_hit() {
        let mut tok = Tokenizer::new(String::from("validname"));
        assert!(tok.munch_attribute_name());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeStart,
                Token::AttributeName(QName::new_unvalidated(None, String::from("validname"))),
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("prefix:validname"));
        assert!(tok.munch_attribute_name());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeStart,
                Token::AttributeName(QName::new_unvalidated(
                    Some(String::from("prefix")),
                    String::from("validname")
                )),
            ]
        );
        assert!(!tok.error);
    }
    #[test]
    fn munch_attribute_value_hit() {
        let mut tok = Tokenizer::new(String::from("'value'"));
        assert!(tok.munch_attribute_value());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeValueStart,
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from("value"))),
                Token::AttributeEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("'this &amp; that'"));
        assert!(tok.munch_attribute_value());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeValueStart,
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from("this "))),
                Token::EntityRef(Name::new_unvalidated(String::from("amp"))),
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" that"))),
                Token::AttributeEnd
            ]
        );
        assert!(!tok.error);
        let mut tok = Tokenizer::new(String::from("\"This &amp; that &#x2014; and &#65;.\""));
        assert!(tok.munch_attribute_value());
        assert_eq!(
            tok.tokens,
            vec![
                Token::AttributeValueStart,
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from("This "))),
                Token::EntityRef(Name::new_unvalidated(String::from("amp"))),
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" that "))),
                Token::HexCharRef(HexCharRef::new_unvalidated('—')),
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" and "))),
                Token::DecCharRef(DecCharRef::new_unvalidated('A')),
                Token::AttributeValue(AttributeValue::new_unvalidated(String::from("."))),
                Token::AttributeEnd
            ]
        );
        assert!(!tok.error);
    }
}
/// Returns `true` for the ASCII digits `'0'..='9'` (XML `[0-9]`).
fn is_digit(c: char) -> bool {
    // `char::is_ascii_digit` matches exactly '0'..='9'.
    c.is_ascii_digit()
}
/// Returns `true` for ASCII hexadecimal digits `[0-9a-fA-F]`.
fn is_hexidecimal_digit(c: char) -> bool {
    // `char::is_ascii_hexdigit` matches exactly 0-9, a-f, A-F.
    c.is_ascii_hexdigit()
}
/// Error produced when converting a raw token payload into its typed
/// form; the specific failure is identified by `ParseTokenErrorKind`.
pub struct ParseTokenError {
    // Which conversion failed.
    kind: ParseTokenErrorKind,
}
impl ParseTokenError {
    /// Wraps a `ParseTokenErrorKind` in a `ParseTokenError`.
    pub fn new(kind: ParseTokenErrorKind) -> ParseTokenError {
        ParseTokenError { kind }
    }
    /// Returns a static human-readable description of the error kind.
    fn message(&self) -> &str {
        match self.kind {
            ParseTokenErrorKind::PITarget => "Error parsing processing instruction target.",
            ParseTokenErrorKind::DecCharRef => "Error parsing Decimal Character Reference value.",
            ParseTokenErrorKind::HexCharRef => {
                "Error parsing Hexidecimal Character Reference value."
            }
            ParseTokenErrorKind::FromU32 => "Error converting u32 to char.",
        }
    }
}
/// The specific failure behind a `ParseTokenError`.
pub enum ParseTokenErrorKind {
    /// Processing-instruction target could not be parsed.
    PITarget,
    /// Decimal character reference value could not be parsed.
    DecCharRef,
    /// Hexadecimal character reference value could not be parsed.
    HexCharRef,
    /// A `u32` code point could not be converted to `char`.
    FromU32,
}